Package Management

Show the code

setwd("D:/OneDrive - HKUST Connect/SOSC/paper_with_jean/sentiment_analysis/connected_case")

require("pacman")

p_load(tidyverse, tidyfst, purrr, furrr, textclean, tictoc, progress, lubridate, sentimentr, rvest, purrr, furrr, sf, terra, tmap, DT, reticulate, tidytext, lme4, modelsummary, sf, leaflet, countrycode)

1 Data Wrangling

Fix the mistake of labeling non-democracy using v2x_libdem and v2x_regime simultaneously.
Create a new variable nondemocracy_robust which only use v2x_regime as reference.
Make some adjustments to the covariates and fix other mistakes.
Please find all the raw data here.

1.1 standard country codes

Show the code

"https://www.worlddata.info/countrycodes.php" %>% 
  read_html() %>% 
  html_element("table") %>% 
  html_table(header = T) %>% 
  dplyr::select(1, 3) %>% 
  purrr::set_names(nm = c("country_name", "country_code")) %>% 
  add_row(country_name = "Palestine/Gaza", country_code = "PSG") -> standard_code

1.2 democracy

Show the code

read_rds("D:\\OneDrive - HKUST Connect\\SOSC\\paper_with_jean\\V-Dem-CY-Core-v12.rds") %>% 
  tibble() %>% 
  filter(year %in% c(2007, 2008, 2011, 2018)) %>% 
  dplyr::select(v2x_libdem, v2x_regime, country_name, country_text_id, year)  %>% 
  mutate(nondemocracy = if_else(v2x_libdem < median(v2x_libdem), 1, 0),
         nondemocracy_robust = if_else(v2x_regime %in% 2:3, 0, 1)) %>% 
  rename(country_code =  country_text_id) %>% 
  dplyr::select(country_name, nondemocracy, country_code, year, nondemocracy_robust) -> democracy

1.3 aid

Show the code

rio::import("D:/OneDrive - HKUST Connect/桌面/Ra_Hong/China_aid_v2.xlsx", setclass = "tibble") %>% 
  dplyr::select(recipient, year, `Amount (Constant USD2017)`) %>% 
  rename(amount = `Amount (Constant USD2017)`, country_name = recipient) %>% 
  mutate(country_name = case_when(country_name == "Viet Nam" ~ "Vietnam",
                                  country_name == "Lao People's Democratic Republic" ~ "Laos",
                                  country_name == "Democratic People's Republic of Korea" ~ "North Korea",
                                  country_name == "Congo" ~ "Republic of the Congo",
                                  country_name == "Brunei Darussalam" ~ "Brunei",
                                  country_name == "Cabo Verde" ~ "Cape Verde",
                                  country_name == "Kyrgyz Republic" ~ "Kyrgyzstan",
                                  country_name == "Micronesia" ~ "Federated States of Micronesia",
                                  country_name == "Sint Maarten (Dutch part)" ~ "Sint Maarten",
                                  country_name == "Timor-Leste" ~ "East Timor",
                                  country_name == "Sint Maarten (Dutch part)" ~ "Sint Maarten",
                                  country_name == "Syrian Arab Republic" ~ "Syria",
                                  country_name == "Cote d'Ivoire" ~ "Ivory Coast",
                                  country_name == "Myanmar" ~ "Burma",
                                  country_name == "West Bank and Gaza Strip" ~ "Palestine/Gaza",
                                  TRUE ~ country_name)) %>% # repair name 
  left_join(standard_code) %>% 
  filter(!str_detect(country_name, "egion")) %>% 
  group_by(country_code, year) %>% 
  summarise(amount = sum(amount, na.rm = T)) %>% 
  arrange(year) %>% 
  pivot_wider(names_from = year, values_from = amount) %>% 
  mutate(sum_2008 = rowSums(across(`2000`:`2008`), na.rm = T), # cumulatively 
         sum_2009 = rowSums(across(`2000`:`2009`), na.rm = T), 
         sum_2012 = rowSums(across(`2000`:`2011`), na.rm = T),
         sum_2019 = rowSums(across(`2000`:`2017`), na.rm = T)) %>% 
  dplyr::select(country_code, sum_2008:sum_2019) %>% 
  pivot_longer(cols = sum_2008:sum_2019, names_to = "year") %>% 
  mutate(year = map_chr(year, ~ str_extract(., "[[:digit:]]{4}")) |> as.numeric()) %>% 
  rename(cum_amount = value) %>% 
  ungroup() -> aid

1.4 bri

Show the code

rio::import("D:/OneDrive - HKUST Connect/桌面/Ra_Hong/BRI_country.xlsx", setclass = "tibble") %>% 
  dplyr::select(BRI_dummy, `Country Code`) %>% 
  set_names(c("BRI_dummy", "country_code")) %>% 
  rename(BRI = BRI_dummy) -> BRI

1.5 interactive map

Show the code

democracy %>% 
  mutate(year = year + 1) %>% 
  left_join(aid) %>% 
  left_join(BRI) %>% 
  select(4, 1, 3, 2, 6, 7, 5) %>% 
  replace_na(list(BRI = 0, cum_amount = 0)) %>% 
  mutate(aid = if_else(cum_amount > 0, 1, 0)) -> all_variables

read_sf("D:/OneDrive - HKUST Connect/桌面/Ra_Hong/China/map/ne_50m_admin_0_countries.shp") %>% 
  rename(country_code = ADM0_A3) %>% 
  left_join(filter(all_variables, year == 2019)) %>% 
  mutate(nondemocracy = if_else(nondemocracy == 1, "Not Democracy", "Democracy")) -> world_shape 

colorFactor(palette = c("#be64ac", "#5ac8c8"), domain = world_shape$nondemocracy) -> pal # color palette

read_rds("geo_point_world.Rdata") %>% 
  left_join(all_variables) -> point_geo # lat-long

point_geo %>% 
  filter(BRI == 1) %>% 
  mutate(BRI = "BRI") %>% 
  filter(year == 2008) -> BRI_geo # BRI point 

point_geo %>% 
  filter(cum_amount > 0 & year == 2019) -> aid_geo # aid point 

leaflet() %>% 
  addProviderTiles(providers$CartoDB.Positron,
                   options = tileOptions(tileSize = 256, minZoom = 2, maxZoom = 20)) %>% 
  addPolygons(data = sf::st_as_sf(world_shape),
              fillColor = ~ pal(nondemocracy),
              stroke = T,
              color = "#8e92c1",
              weight = 2,
              smoothFactor = 1,
              opacity = 1,
              fillOpacity = 0.2,
              label = ~ country_name,
              highlightOptions = highlightOptions(color = "green",
                                                  weight = 2,
                                                  bringToFront = TRUE),
              labelOptions = labelOptions(style = list("font-weight" = "normal", padding = "3px 8px"),
                                          textsize = "12px",
                                          direction = "auto")) %>% 
  addMarkers(data = BRI_geo, 
                   lng = ~ longitude - 0.02, 
                   lat = ~ latitude - 0.02,
             label = "BRI Country",
  clusterOptions = markerClusterOptions()) %>% 
  addLegend(na.label = "Missing",
    data = sf::st_as_sf(world_shape),
              pal = pal,
    values = ~ nondemocracy,
            opacity = 0.3,
            title = NULL,
            position = "bottomright") %>% 
  addCircles(opacity = 1,
      color = "#5a64ac", 
      data = aid_geo, 
      lng = ~ longitude,
      lat = ~ latitude,
      weight = 6.6,
      radius = ~ log(cum_amount), label = "Aid Recipient", popup = ~ str_c(format(round(cum_amount / 1e6, 1), trim = TRUE), "Million"))

2 Mixed Models

Sample data (n = 2000, N = 43,780) is as follows.

Show the code

options(digits = 3)

read_rds("sentence_level_result.Rdata") %>% 
  left_join(read_rds("basic_info.Rdata")) %>% 
  mutate(year = year + 1) %>% 
  left_join(all_variables) %>% 
  group_by(country_code, events) %>% 
  mutate(caseid = cur_group_id() |> as_factor(), 
         treatment = if_else(day >= 0, 1, 0),
         day_f = as_factor(day)) %>% 
  filter(day < 10 & day >= -10) %>% 
  ungroup() -> regression_data

regression_data %>% 
  dplyr::select(-c(link, sentiment_sr)) %>% 
  sample_n(2000) %>%
  datatable(filter = "top", options = list(pageLength = 5, autoWidth = TRUE))

2.1 utilize nondemocracy

Show the code

regression_data %>% 
  pivot_longer(cols = compound:pos, names_to = "sentiment_name") %>% 
  group_nest(sentiment_name) %>% 
  mutate(models = map(data, ~ lmer(value ~  1 + treatment * (aid + nondemocracy + BRI) + (1 | caseid) + (1 | day_f), data = .))) -> models

modelsummary(list("neu" = models$models[[3]],
                  "pos" = models$models[[4]],
                  "neg" = models$models[[2]],
                  "compound" = models$models[[1]]),
             stars = c("*" = 0.10, "**" = 0.05, "***" = 0.01),
             coef_map = c("(Intercept)" = "Constant",
                            "treatment" = "Treatment",
                            "aid" = "Aid recipient",
                            "nondemocracy" = "Not democracy",
                            "BRI" = "BRI member",
                            "treatmentaid" = "Treatment * Aid recipient",
                            "treatmentnondemocracy" = "Treatment * Not democracy",
                            "treatmentBRI" = "Treatment * BRI member"))

	neu	pos	neg	compound
Constant	0.566***	0.072***	0.053***	0.076***
	(0.024)	(0.002)	(0.003)	(0.021)
Treatment	0.004	-0.004***	0.002*	-0.015**
	(0.003)	(0.001)	(0.001)	(0.007)
Aid recipient	0.068	-0.0008	-0.013**	0.107***
	(0.044)	(0.005)	(0.006)	(0.039)
Not democracy	-0.125**	-0.004	0.029***	-0.226***
	(0.062)	(0.008)	(0.008)	(0.055)
BRI member	0.091*	0.002	-0.026***	0.193***
	(0.049)	(0.007)	(0.007)	(0.045)
Treatment * Aid recipient	0.010	0.007*	-0.004	0.033**
	(0.012)	(0.004)	(0.003)	(0.016)
Treatment * Not democracy	0.002	-0.007	-0.009	0.051
	(0.026)	(0.009)	(0.006)	(0.035)
Treatment * BRI member	-0.022	0.005	0.011*	-0.066**
	(0.024)	(0.008)	(0.006)	(0.032)
Num.Obs.	43780	43780	43780	43780
R2 Marg.	0.008	0.0009	0.015	0.025
R2 Cond.	0.199	0.018	0.074	0.107
AIC	-33564.8	-129158.5	-158019.2	-7041.7
BIC	-33469.2	-129062.9	-157923.6	-6946.1
ICC	0.2	0.02	0.06	0.08
RMSE	0.16	0.06	0.04	0.22
* p < 0.1, p < 0.05, * p < 0.01

2.2 utilize nondemocracy_robust

Show the code

regression_data %>% 
  pivot_longer(cols = compound:pos, names_to = "sentiment_name") %>% 
  group_nest(sentiment_name) %>% 
  mutate(models = map(data, ~ lmer(value ~  1 + treatment * (aid + nondemocracy_robust + BRI) + (1 | caseid) + (1 | day_f), data = .))) -> models_robust

modelsummary(list("neu" = models_robust$models[[3]],
                  "pos" = models_robust$models[[4]],
                  "neg" = models_robust$models[[2]],
                  "compound" = models_robust$models[[1]]),
             stars = c("*" = 0.10, "**" = 0.05, "***" = 0.01),
             coef_map = c("(Intercept)" = "Constant",
                            "treatment" = "Treatment",
                            "aid" = "Aid recipient",
                            "nondemocracy_robust" = "Not democracy",
                            "BRI" = "BRI member",
                            "treatmentaid" = "Treatment * Aid recipient",
                            "treatmentnondemocracy_robust" = "Treatment * Not democracy",
                            "treatmentBRI" = "Treatment * BRI member"))

	neu	pos	neg	compound
Constant	0.570***	0.072***	0.052***	0.086***
	(0.024)	(0.002)	(0.003)	(0.022)
Treatment	0.004	-0.003***	0.002*	-0.015**
	(0.003)	(0.001)	(0.001)	(0.007)
Aid recipient	0.051	-0.002	-0.007	0.067*
	(0.040)	(0.005)	(0.006)	(0.037)
Not democracy	-0.151**	-0.006	0.024**	-0.231***
	(0.062)	(0.009)	(0.010)	(0.060)
BRI member	0.133**	0.006	-0.028***	0.242***
	(0.056)	(0.009)	(0.009)	(0.056)
Treatment * Aid recipient	0.007	0.005	-0.005*	0.034**
	(0.011)	(0.004)	(0.003)	(0.015)
Treatment * Not democracy	0.056	0.004	-0.015*	0.134**
	(0.039)	(0.013)	(0.009)	(0.052)
Treatment * BRI member	-0.073*	-0.005	0.018*	-0.151***
	(0.038)	(0.013)	(0.009)	(0.052)
Num.Obs.	43780	43780	43780	43780
R2 Marg.	0.006	0.0008	0.014	0.022
R2 Cond.	0.193	0.017	0.085	0.115
AIC	-33568.4	-129158.4	-158016.8	-7044.3
BIC	-33472.9	-129062.9	-157921.2	-6948.8
ICC	0.2	0.02	0.07	0.09
RMSE	0.16	0.06	0.04	0.22
* p < 0.1, p < 0.05, * p < 0.01

3 Reply

3.1 question 1

Both question 1 and question 2 are caused by different treatment coding (i.e., when does the treatment effect begin). Since I tried two ways before and after which brought ambiguities and inconsistencies. I am very sorry. Please code the treatment based on the day.

Show the code

full <- read_rds("sentence_level_result.Rdata") %>% 
  left_join(read_rds("basic_info.Rdata")) %>% 
  mutate(year = year + 1) %>% # since we use previous year's democracy information
  left_join(all_variables) %>% 
  group_by(country_code, events) %>% 
  mutate(caseid = cur_group_id() |> as_factor(), 
         day_f = as_factor(day)) %>% 
  ungroup() # all the data 

before <- full %>% 
  mutate(treatment = if_else(day > 0, 1, 0)) %>% 
  filter(day > -10 & day <= 10) # before

after <- full %>% 
  mutate(treatment = if_else(day >= 0, 1, 0)) %>% 
  filter(day >= -10 & day < 10) # after

question_1_country <- c("Kazakhstan", "Malaysia", "Turkmenistan", "United States of America") %>% 
  imap(~ countrycode(., origin = "country.name", destination = "iso3c")) %>% 
  unlist() 

before_n <- before %>% 
  filter(country_code %in% question_1_country) %>% 
  count(events, country_code, treatment, name = "before_n")

after_n <- after %>% 
  filter(country_code %in% question_1_country) %>% 
  count(events, country_code, treatment, name = "after_n")

before_n %>% 
  left_join(after_n) %>% 
  mutate(diff = after_n - before_n) %>% 
  datatable()

3.2 question 2

Show the code

question_2_country <- c("Brazil", "Cambodia", "Croatia", "Malaysia", "Nepal", "New Zealand", "Senegal", "Vietnam") %>% 
  imap(~ countrycode(., origin = "country.name", destination = "iso3c")) %>% 
  unlist()

before_n_2 <- before %>% 
  ungroup() %>% 
  filter(country_code %in% question_2_country & events == "chenguangcheng") %>% 
  count(events, country_code, treatment, name = "before_n")

after_n_2 <- after %>% 
  ungroup() %>% 
  filter(country_code %in% question_2_country & events == "chenguangcheng") %>% 
  count(events, country_code, treatment, name = "after_n")

before_n_2 %>% 
  left_join(after_n_2) %>% 
  mutate(diff = after_n - before_n) %>% 
  datatable()

3.3 question 3

As I have mentioned, only sentimentr in R will commit errors when it processes a very long sentence without proper punctuation. Since we choose the data calculated by the asent, so no articles will be dropped.

Asent in Python:

Show the code

full %>% 
  pull(compound) %>% 
  quantile()

     0%     25%     50%     75%    100% 
-0.9951 -0.0154  0.1088  0.2401  0.9988

Sentimentr in R:

Show the code

full %>% 
  pull(sentiment_sr) %>% 
  quantile()

     0%     25%     50%     75%    100% 
-0.6905 -0.0119  0.0772  0.1646  3.5448