---
comments:
giscus:
repo: xinzhuohkust/comments
hypothesis:
theme: clean
title: "Sentiment Analysis"
description: "revised version"
date: last-modified
author:
- name: Xinzhuo HUANG
url: https://github.com/xinzhuohkust
eamil: xhuangcb@connect.ust.hk
affiliations:
- name: HKUST, SOSC
title-block-banner: "#CDC0B0"
highlight-style:
light: custom-light.theme
dark: custom-dark.theme
format:
html:
code-summary: "Show the code"
code-line-numbers: true
code-block-bg: true
code-block-border-left: "#8ebdd4"
code-tools:
caption: "Source Code"
smooth-scroll: true
linestretch: 1.2
df-print: paged
theme: simplex
code-fold: true
toc: true
toc-location: left
number-sections: true
monofont: Helvetica
mainfont: Cambria Math
link-citations: yes
---
```{=html}
<style>
body {text-align: justify}
</style>
```
```{r filename="Package Management", warning=FALSE, message=FALSE}
setwd ("D:/OneDrive - HKUST Connect/SOSC/paper_with_jean/sentiment_analysis/connected_case" )
require ("pacman" )
p_load (tidyverse, tidyfst, purrr, furrr, textclean, tictoc, progress, lubridate, sentimentr, rvest, purrr, furrr, sf, terra, tmap, DT, reticulate, tidytext, lme4, modelsummary, sf, leaflet, countrycode)
```
# Data Wrangling
* Fix the mistake of labeling non-democracy using v2x_libdem and v2x_regime simultaneously.
* Create a new variable **nondemocracy_robust** which only use v2x_regime as reference.
* Make some adjustments to the covariates and fix other mistakes.
* Please find all the raw data [ here ](https://drive.google.com/drive/folders/1BHMTpECZItVR3LEuKmod5P1oFEZTUEco?usp=sharing) .
## standard country codes
```{r message=FALSE, warning=FALSE}
"https://www.worlddata.info/countrycodes.php" %>%
read_html () %>%
html_element ("table" ) %>%
html_table (header = T) %>%
dplyr:: select (1 , 3 ) %>%
purrr:: set_names (nm = c ("country_name" , "country_code" )) %>%
add_row (country_name = "Palestine/Gaza" , country_code = "PSG" ) -> standard_code
```
## democracy
```{r message=FALSE, warning=FALSE}
read_rds ("D: \\ OneDrive - HKUST Connect \\ SOSC \\ paper_with_jean \\ V-Dem-CY-Core-v12.rds" ) %>%
tibble () %>%
filter (year %in% c (2007 , 2008 , 2011 , 2018 )) %>%
dplyr:: select (v2x_libdem, v2x_regime, country_name, country_text_id, year) %>%
mutate (nondemocracy = if_else (v2x_libdem < median (v2x_libdem), 1 , 0 ),
nondemocracy_robust = if_else (v2x_regime %in% 2 : 3 , 0 , 1 )) %>%
rename (country_code = country_text_id) %>%
dplyr:: select (country_name, nondemocracy, country_code, year, nondemocracy_robust) -> democracy
```
## aid
```{r message=FALSE, warning=FALSE}
rio:: import ("D:/OneDrive - HKUST Connect/桌面/Ra_Hong/China_aid_v2.xlsx" , setclass = "tibble" ) %>%
dplyr:: select (recipient, year, ` Amount (Constant USD2017) ` ) %>%
rename (amount = ` Amount (Constant USD2017) ` , country_name = recipient) %>%
mutate (country_name = case_when (country_name == "Viet Nam" ~ "Vietnam" ,
country_name == "Lao People's Democratic Republic" ~ "Laos" ,
country_name == "Democratic People's Republic of Korea" ~ "North Korea" ,
country_name == "Congo" ~ "Republic of the Congo" ,
country_name == "Brunei Darussalam" ~ "Brunei" ,
country_name == "Cabo Verde" ~ "Cape Verde" ,
country_name == "Kyrgyz Republic" ~ "Kyrgyzstan" ,
country_name == "Micronesia" ~ "Federated States of Micronesia" ,
country_name == "Sint Maarten (Dutch part)" ~ "Sint Maarten" ,
country_name == "Timor-Leste" ~ "East Timor" ,
country_name == "Sint Maarten (Dutch part)" ~ "Sint Maarten" ,
country_name == "Syrian Arab Republic" ~ "Syria" ,
country_name == "Cote d'Ivoire" ~ "Ivory Coast" ,
country_name == "Myanmar" ~ "Burma" ,
country_name == "West Bank and Gaza Strip" ~ "Palestine/Gaza" ,
TRUE ~ country_name)) %>% # repair name
left_join (standard_code) %>%
filter (! str_detect (country_name, "egion" )) %>%
group_by (country_code, year) %>%
summarise (amount = sum (amount, na.rm = T)) %>%
arrange (year) %>%
pivot_wider (names_from = year, values_from = amount) %>%
mutate (sum_2008 = rowSums (across (` 2000 ` : ` 2008 ` ), na.rm = T), # cumulatively
sum_2009 = rowSums (across (` 2000 ` : ` 2009 ` ), na.rm = T),
sum_2012 = rowSums (across (` 2000 ` : ` 2011 ` ), na.rm = T),
sum_2019 = rowSums (across (` 2000 ` : ` 2017 ` ), na.rm = T)) %>%
dplyr:: select (country_code, sum_2008: sum_2019) %>%
pivot_longer (cols = sum_2008: sum_2019, names_to = "year" ) %>%
mutate (year = map_chr (year, ~ str_extract (., "[[:digit:]]{4}" )) |> as.numeric ()) %>%
rename (cum_amount = value) %>%
ungroup () -> aid
```
## bri
```{r message=FALSE, warning=FALSE}
rio:: import ("D:/OneDrive - HKUST Connect/桌面/Ra_Hong/BRI_country.xlsx" , setclass = "tibble" ) %>%
dplyr:: select (BRI_dummy, ` Country Code ` ) %>%
set_names (c ("BRI_dummy" , "country_code" )) %>%
rename (BRI = BRI_dummy) -> BRI
```
## interactive map
```{r message=FALSE, warning=FALSE, fig.dim = c(10, 13)}
#| column: page
democracy %>%
mutate (year = year + 1 ) %>%
left_join (aid) %>%
left_join (BRI) %>%
select (4 , 1 , 3 , 2 , 6 , 7 , 5 ) %>%
replace_na (list (BRI = 0 , cum_amount = 0 )) %>%
mutate (aid = if_else (cum_amount > 0 , 1 , 0 )) -> all_variables
read_sf ("D:/OneDrive - HKUST Connect/桌面/Ra_Hong/China/map/ne_50m_admin_0_countries.shp" ) %>%
rename (country_code = ADM0_A3) %>%
left_join (filter (all_variables, year == 2019 )) %>%
mutate (nondemocracy = if_else (nondemocracy == 1 , "Not Democracy" , "Democracy" )) -> world_shape
colorFactor (palette = c ("#be64ac" , "#5ac8c8" ), domain = world_shape$ nondemocracy) -> pal # color palette
read_rds ("geo_point_world.Rdata" ) %>%
left_join (all_variables) -> point_geo # lat-long
point_geo %>%
filter (BRI == 1 ) %>%
mutate (BRI = "BRI" ) %>%
filter (year == 2008 ) -> BRI_geo # BRI point
point_geo %>%
filter (cum_amount > 0 & year == 2019 ) -> aid_geo # aid point
leaflet () %>%
addProviderTiles (providers$ CartoDB.Positron,
options = tileOptions (tileSize = 256 , minZoom = 2 , maxZoom = 20 )) %>%
addPolygons (data = sf:: st_as_sf (world_shape),
fillColor = ~ pal (nondemocracy),
stroke = T,
color = "#8e92c1" ,
weight = 2 ,
smoothFactor = 1 ,
opacity = 1 ,
fillOpacity = 0.2 ,
label = ~ country_name,
highlightOptions = highlightOptions (color = "green" ,
weight = 2 ,
bringToFront = TRUE ),
labelOptions = labelOptions (style = list ("font-weight" = "normal" , padding = "3px 8px" ),
textsize = "12px" ,
direction = "auto" )) %>%
addMarkers (data = BRI_geo,
lng = ~ longitude - 0.02 ,
lat = ~ latitude - 0.02 ,
label = "BRI Country" ,
clusterOptions = markerClusterOptions ()) %>%
addLegend (na.label = "Missing" ,
data = sf:: st_as_sf (world_shape),
pal = pal,
values = ~ nondemocracy,
opacity = 0.3 ,
title = NULL ,
position = "bottomright" ) %>%
addCircles (opacity = 1 ,
color = "#5a64ac" ,
data = aid_geo,
lng = ~ longitude,
lat = ~ latitude,
weight = 6.6 ,
radius = ~ log (cum_amount), label = "Aid Recipient" , popup = ~ str_c (format (round (cum_amount / 1e6 , 1 ), trim = TRUE ), "Million" ))
```
# Mixed Models
Sample data (n = 2000, N = 43,780) is as follows.
```{r message=FALSE, warning=FALSE}
options (digits = 3 )
read_rds ("sentence_level_result.Rdata" ) %>%
left_join (read_rds ("basic_info.Rdata" )) %>%
mutate (year = year + 1 ) %>%
left_join (all_variables) %>%
group_by (country_code, events) %>%
mutate (caseid = cur_group_id () |> as_factor (),
treatment = if_else (day >= 0 , 1 , 0 ),
day_f = as_factor (day)) %>%
filter (day < 10 & day >= - 10 ) %>%
ungroup () -> regression_data
regression_data %>%
dplyr:: select (- c (link, sentiment_sr)) %>%
sample_n (2000 ) %>%
datatable (filter = "top" , options = list (pageLength = 5 , autoWidth = TRUE ))
```
## utilize nondemocracy
```{r warning=FALSE, message=FALSE}
regression_data %>%
pivot_longer (cols = compound: pos, names_to = "sentiment_name" ) %>%
group_nest (sentiment_name) %>%
mutate (models = map (data, ~ lmer (value ~ 1 + treatment * (aid + nondemocracy + BRI) + (1 | caseid) + (1 | day_f), data = .))) -> models
modelsummary (list ("neu" = models$ models[[3 ]],
"pos" = models$ models[[4 ]],
"neg" = models$ models[[2 ]],
"compound" = models$ models[[1 ]]),
stars = c ("*" = 0.10 , "**" = 0.05 , "***" = 0.01 ),
coef_map = c ("(Intercept)" = "Constant" ,
"treatment" = "Treatment" ,
"aid" = "Aid recipient" ,
"nondemocracy" = "Not democracy" ,
"BRI" = "BRI member" ,
"treatmentaid" = "Treatment * Aid recipient" ,
"treatmentnondemocracy" = "Treatment * Not democracy" ,
"treatmentBRI" = "Treatment * BRI member" ))
```
## utilize nondemocracy_robust
```{r}
regression_data %>%
pivot_longer (cols = compound: pos, names_to = "sentiment_name" ) %>%
group_nest (sentiment_name) %>%
mutate (models = map (data, ~ lmer (value ~ 1 + treatment * (aid + nondemocracy_robust + BRI) + (1 | caseid) + (1 | day_f), data = .))) -> models_robust
modelsummary (list ("neu" = models_robust$ models[[3 ]],
"pos" = models_robust$ models[[4 ]],
"neg" = models_robust$ models[[2 ]],
"compound" = models_robust$ models[[1 ]]),
stars = c ("*" = 0.10 , "**" = 0.05 , "***" = 0.01 ),
coef_map = c ("(Intercept)" = "Constant" ,
"treatment" = "Treatment" ,
"aid" = "Aid recipient" ,
"nondemocracy_robust" = "Not democracy" ,
"BRI" = "BRI member" ,
"treatmentaid" = "Treatment * Aid recipient" ,
"treatmentnondemocracy_robust" = "Treatment * Not democracy" ,
"treatmentBRI" = "Treatment * BRI member" ))
```
# Reply
## question 1
Both question 1 and question 2 are caused by different treatment coding (i.e., when does the treatment effect begin). Since I tried two ways before and after which brought ambiguities and inconsistencies. I am very sorry. Please code the treatment based on the **day**.
```{r warning=FALSE, message=FALSE}
full <- read_rds ("sentence_level_result.Rdata" ) %>%
left_join (read_rds ("basic_info.Rdata" )) %>%
mutate (year = year + 1 ) %>% # since we use previous year's democracy information
left_join (all_variables) %>%
group_by (country_code, events) %>%
mutate (caseid = cur_group_id () |> as_factor (),
day_f = as_factor (day)) %>%
ungroup () # all the data
before <- full %>%
mutate (treatment = if_else (day > 0 , 1 , 0 )) %>%
filter (day > - 10 & day <= 10 ) # before
after <- full %>%
mutate (treatment = if_else (day >= 0 , 1 , 0 )) %>%
filter (day >= - 10 & day < 10 ) # after
question_1_country <- c ("Kazakhstan" , "Malaysia" , "Turkmenistan" , "United States of America" ) %>%
imap (~ countrycode (., origin = "country.name" , destination = "iso3c" )) %>%
unlist ()
before_n <- before %>%
filter (country_code %in% question_1_country) %>%
count (events, country_code, treatment, name = "before_n" )
after_n <- after %>%
filter (country_code %in% question_1_country) %>%
count (events, country_code, treatment, name = "after_n" )
before_n %>%
left_join (after_n) %>%
mutate (diff = after_n - before_n) %>%
datatable ()
```
## question 2
```{r warning=FALSE, message=FALSE}
question_2_country <- c ("Brazil" , "Cambodia" , "Croatia" , "Malaysia" , "Nepal" , "New Zealand" , "Senegal" , "Vietnam" ) %>%
imap (~ countrycode (., origin = "country.name" , destination = "iso3c" )) %>%
unlist ()
before_n_2 <- before %>%
ungroup () %>%
filter (country_code %in% question_2_country & events == "chenguangcheng" ) %>%
count (events, country_code, treatment, name = "before_n" )
after_n_2 <- after %>%
ungroup () %>%
filter (country_code %in% question_2_country & events == "chenguangcheng" ) %>%
count (events, country_code, treatment, name = "after_n" )
before_n_2 %>%
left_join (after_n_2) %>%
mutate (diff = after_n - before_n) %>%
datatable ()
```
## question 3
As I have mentioned, only **sentimentr** in R will commit errors when it processes a very long sentence without proper punctuation. Since we choose the data calculated by the asent, so **no** articles will be dropped.
**Asent** in Python:
```{r}
full %>%
pull (compound) %>%
quantile ()
```
**Sentimentr** in R:
```{r}
full %>%
pull (sentiment_sr) %>%
quantile ()
```