Sentiment Analysis

revised version
Author
Affiliation

HKUST, SOSC

Published

November 26, 2022

Package Management
Show the code
setwd("D:/OneDrive - HKUST Connect/SOSC/paper_with_jean/sentiment_analysis/connected_case")

require("pacman")

p_load(tidyverse, tidyfst, purrr, furrr, textclean, tictoc, progress, lubridate, sentimentr, rvest, purrr, furrr, sf, terra, tmap, DT, reticulate, tidytext, lme4, modelsummary, sf, leaflet, countrycode)

1 Data Wrangling

  • Fix the mistake of labeling non-democracy using v2x_libdem and v2x_regime simultaneously.

  • Create a new variable nondemocracy_robust which only use v2x_regime as reference.

  • Make some adjustments to the covariates and fix other mistakes.

  • Please find all the raw data here.

1.1 standard country codes

Show the code
"https://www.worlddata.info/countrycodes.php" %>% 
  read_html() %>% 
  html_element("table") %>% 
  html_table(header = T) %>% 
  dplyr::select(1, 3) %>% 
  purrr::set_names(nm = c("country_name", "country_code")) %>% 
  add_row(country_name = "Palestine/Gaza", country_code = "PSG") -> standard_code

1.2 democracy

Show the code
read_rds("D:\\OneDrive - HKUST Connect\\SOSC\\paper_with_jean\\V-Dem-CY-Core-v12.rds") %>% 
  tibble() %>% 
  filter(year %in% c(2007, 2008, 2011, 2018)) %>% 
  dplyr::select(v2x_libdem, v2x_regime, country_name, country_text_id, year)  %>% 
  mutate(nondemocracy = if_else(v2x_libdem < median(v2x_libdem), 1, 0),
         nondemocracy_robust = if_else(v2x_regime %in% 2:3, 0, 1)) %>% 
  rename(country_code =  country_text_id) %>% 
  dplyr::select(country_name, nondemocracy, country_code, year, nondemocracy_robust) -> democracy

1.3 aid

Show the code
rio::import("D:/OneDrive - HKUST Connect/桌面/Ra_Hong/China_aid_v2.xlsx", setclass = "tibble") %>% 
  dplyr::select(recipient, year, `Amount (Constant USD2017)`) %>% 
  rename(amount = `Amount (Constant USD2017)`, country_name = recipient) %>% 
  mutate(country_name = case_when(country_name == "Viet Nam" ~ "Vietnam",
                                  country_name == "Lao People's Democratic Republic" ~ "Laos",
                                  country_name == "Democratic People's Republic of Korea" ~ "North Korea",
                                  country_name == "Congo" ~ "Republic of the Congo",
                                  country_name == "Brunei Darussalam" ~ "Brunei",
                                  country_name == "Cabo Verde" ~ "Cape Verde",
                                  country_name == "Kyrgyz Republic" ~ "Kyrgyzstan",
                                  country_name == "Micronesia" ~ "Federated States of Micronesia",
                                  country_name == "Sint Maarten (Dutch part)" ~ "Sint Maarten",
                                  country_name == "Timor-Leste" ~ "East Timor",
                                  country_name == "Sint Maarten (Dutch part)" ~ "Sint Maarten",
                                  country_name == "Syrian Arab Republic" ~ "Syria",
                                  country_name == "Cote d'Ivoire" ~ "Ivory Coast",
                                  country_name == "Myanmar" ~ "Burma",
                                  country_name == "West Bank and Gaza Strip" ~ "Palestine/Gaza",
                                  TRUE ~ country_name)) %>% # repair name 
  left_join(standard_code) %>% 
  filter(!str_detect(country_name, "egion")) %>% 
  group_by(country_code, year) %>% 
  summarise(amount = sum(amount, na.rm = T)) %>% 
  arrange(year) %>% 
  pivot_wider(names_from = year, values_from = amount) %>% 
  mutate(sum_2008 = rowSums(across(`2000`:`2008`), na.rm = T), # cumulatively 
         sum_2009 = rowSums(across(`2000`:`2009`), na.rm = T), 
         sum_2012 = rowSums(across(`2000`:`2011`), na.rm = T),
         sum_2019 = rowSums(across(`2000`:`2017`), na.rm = T)) %>% 
  dplyr::select(country_code, sum_2008:sum_2019) %>% 
  pivot_longer(cols = sum_2008:sum_2019, names_to = "year") %>% 
  mutate(year = map_chr(year, ~ str_extract(., "[[:digit:]]{4}")) |> as.numeric()) %>% 
  rename(cum_amount = value) %>% 
  ungroup() -> aid

1.4 bri

Show the code
rio::import("D:/OneDrive - HKUST Connect/桌面/Ra_Hong/BRI_country.xlsx", setclass = "tibble") %>% 
  dplyr::select(BRI_dummy, `Country Code`) %>% 
  set_names(c("BRI_dummy", "country_code")) %>% 
  rename(BRI = BRI_dummy) -> BRI

1.5 interactive map

Show the code
democracy %>% 
  mutate(year = year + 1) %>% 
  left_join(aid) %>% 
  left_join(BRI) %>% 
  select(4, 1, 3, 2, 6, 7, 5) %>% 
  replace_na(list(BRI = 0, cum_amount = 0)) %>% 
  mutate(aid = if_else(cum_amount > 0, 1, 0)) -> all_variables

read_sf("D:/OneDrive - HKUST Connect/桌面/Ra_Hong/China/map/ne_50m_admin_0_countries.shp") %>% 
  rename(country_code = ADM0_A3) %>% 
  left_join(filter(all_variables, year == 2019)) %>% 
  mutate(nondemocracy = if_else(nondemocracy == 1, "Not Democracy", "Democracy")) -> world_shape 

colorFactor(palette = c("#be64ac", "#5ac8c8"), domain = world_shape$nondemocracy) -> pal # color palette

read_rds("geo_point_world.Rdata") %>% 
  left_join(all_variables) -> point_geo # lat-long

point_geo %>% 
  filter(BRI == 1) %>% 
  mutate(BRI = "BRI") %>% 
  filter(year == 2008) -> BRI_geo # BRI point 

point_geo %>% 
  filter(cum_amount > 0 & year == 2019) -> aid_geo # aid point 

leaflet() %>% 
  addProviderTiles(providers$CartoDB.Positron,
                   options = tileOptions(tileSize = 256, minZoom = 2, maxZoom = 20)) %>% 
  addPolygons(data = sf::st_as_sf(world_shape),
              fillColor = ~ pal(nondemocracy),
              stroke = T,
              color = "#8e92c1",
              weight = 2,
              smoothFactor = 1,
              opacity = 1,
              fillOpacity = 0.2,
              label = ~ country_name,
              highlightOptions = highlightOptions(color = "green",
                                                  weight = 2,
                                                  bringToFront = TRUE),
              labelOptions = labelOptions(style = list("font-weight" = "normal", padding = "3px 8px"),
                                          textsize = "12px",
                                          direction = "auto")) %>% 
  addMarkers(data = BRI_geo, 
                   lng = ~ longitude - 0.02, 
                   lat = ~ latitude - 0.02,
             label = "BRI Country",
  clusterOptions = markerClusterOptions()) %>% 
  addLegend(na.label = "Missing",
    data = sf::st_as_sf(world_shape),
              pal = pal,
    values = ~ nondemocracy,
            opacity = 0.3,
            title = NULL,
            position = "bottomright") %>% 
  addCircles(opacity = 1,
      color = "#5a64ac", 
      data = aid_geo, 
      lng = ~ longitude,
      lat = ~ latitude,
      weight = 6.6,
      radius = ~ log(cum_amount), label = "Aid Recipient", popup = ~ str_c(format(round(cum_amount / 1e6, 1), trim = TRUE), "Million"))

2 Mixed Models

Sample data (n = 2000, N = 43,780) is as follows.

Show the code
options(digits = 3)

read_rds("sentence_level_result.Rdata") %>% 
  left_join(read_rds("basic_info.Rdata")) %>% 
  mutate(year = year + 1) %>% 
  left_join(all_variables) %>% 
  group_by(country_code, events) %>% 
  mutate(caseid = cur_group_id() |> as_factor(), 
         treatment = if_else(day >= 0, 1, 0),
         day_f = as_factor(day)) %>% 
  filter(day < 10 & day >= -10) %>% 
  ungroup() -> regression_data

regression_data %>% 
  dplyr::select(-c(link, sentiment_sr)) %>% 
  sample_n(2000) %>%
  datatable(filter = "top", options = list(pageLength = 5, autoWidth = TRUE)) 

2.1 utilize nondemocracy

Show the code
regression_data %>% 
  pivot_longer(cols = compound:pos, names_to = "sentiment_name") %>% 
  group_nest(sentiment_name) %>% 
  mutate(models = map(data, ~ lmer(value ~  1 + treatment * (aid + nondemocracy + BRI) + (1 | caseid) + (1 | day_f), data = .))) -> models

modelsummary(list("neu" = models$models[[3]],
                  "pos" = models$models[[4]],
                  "neg" = models$models[[2]],
                  "compound" = models$models[[1]]),
             stars = c("*" = 0.10, "**" = 0.05, "***" = 0.01),
             coef_map = c("(Intercept)" = "Constant",
                            "treatment" = "Treatment",
                            "aid" = "Aid recipient",
                            "nondemocracy" = "Not democracy",
                            "BRI" = "BRI member",
                            "treatmentaid" = "Treatment * Aid recipient",
                            "treatmentnondemocracy" = "Treatment * Not democracy",
                            "treatmentBRI" = "Treatment * BRI member"))
neu pos neg compound
Constant 0.566*** 0.072*** 0.053*** 0.076***
(0.024) (0.002) (0.003) (0.021)
Treatment 0.004 -0.004*** 0.002* -0.015**
(0.003) (0.001) (0.001) (0.007)
Aid recipient 0.068 -0.0008 -0.013** 0.107***
(0.044) (0.005) (0.006) (0.039)
Not democracy -0.125** -0.004 0.029*** -0.226***
(0.062) (0.008) (0.008) (0.055)
BRI member 0.091* 0.002 -0.026*** 0.193***
(0.049) (0.007) (0.007) (0.045)
Treatment * Aid recipient 0.010 0.007* -0.004 0.033**
(0.012) (0.004) (0.003) (0.016)
Treatment * Not democracy 0.002 -0.007 -0.009 0.051
(0.026) (0.009) (0.006) (0.035)
Treatment * BRI member -0.022 0.005 0.011* -0.066**
(0.024) (0.008) (0.006) (0.032)
Num.Obs. 43780 43780 43780 43780
R2 Marg. 0.008 0.0009 0.015 0.025
R2 Cond. 0.199 0.018 0.074 0.107
AIC -33564.8 -129158.5 -158019.2 -7041.7
BIC -33469.2 -129062.9 -157923.6 -6946.1
ICC 0.2 0.02 0.06 0.08
RMSE 0.16 0.06 0.04 0.22
* p < 0.1, ** p < 0.05, *** p < 0.01

2.2 utilize nondemocracy_robust

Show the code
regression_data %>% 
  pivot_longer(cols = compound:pos, names_to = "sentiment_name") %>% 
  group_nest(sentiment_name) %>% 
  mutate(models = map(data, ~ lmer(value ~  1 + treatment * (aid + nondemocracy_robust + BRI) + (1 | caseid) + (1 | day_f), data = .))) -> models_robust

modelsummary(list("neu" = models_robust$models[[3]],
                  "pos" = models_robust$models[[4]],
                  "neg" = models_robust$models[[2]],
                  "compound" = models_robust$models[[1]]),
             stars = c("*" = 0.10, "**" = 0.05, "***" = 0.01),
             coef_map = c("(Intercept)" = "Constant",
                            "treatment" = "Treatment",
                            "aid" = "Aid recipient",
                            "nondemocracy_robust" = "Not democracy",
                            "BRI" = "BRI member",
                            "treatmentaid" = "Treatment * Aid recipient",
                            "treatmentnondemocracy_robust" = "Treatment * Not democracy",
                            "treatmentBRI" = "Treatment * BRI member"))
neu pos neg compound
Constant 0.570*** 0.072*** 0.052*** 0.086***
(0.024) (0.002) (0.003) (0.022)
Treatment 0.004 -0.003*** 0.002* -0.015**
(0.003) (0.001) (0.001) (0.007)
Aid recipient 0.051 -0.002 -0.007 0.067*
(0.040) (0.005) (0.006) (0.037)
Not democracy -0.151** -0.006 0.024** -0.231***
(0.062) (0.009) (0.010) (0.060)
BRI member 0.133** 0.006 -0.028*** 0.242***
(0.056) (0.009) (0.009) (0.056)
Treatment * Aid recipient 0.007 0.005 -0.005* 0.034**
(0.011) (0.004) (0.003) (0.015)
Treatment * Not democracy 0.056 0.004 -0.015* 0.134**
(0.039) (0.013) (0.009) (0.052)
Treatment * BRI member -0.073* -0.005 0.018* -0.151***
(0.038) (0.013) (0.009) (0.052)
Num.Obs. 43780 43780 43780 43780
R2 Marg. 0.006 0.0008 0.014 0.022
R2 Cond. 0.193 0.017 0.085 0.115
AIC -33568.4 -129158.4 -158016.8 -7044.3
BIC -33472.9 -129062.9 -157921.2 -6948.8
ICC 0.2 0.02 0.07 0.09
RMSE 0.16 0.06 0.04 0.22
* p < 0.1, ** p < 0.05, *** p < 0.01

3 Reply

3.1 question 1

Both question 1 and question 2 are caused by different treatment coding (i.e., when does the treatment effect begin). Since I tried two ways before and after which brought ambiguities and inconsistencies. I am very sorry. Please code the treatment based on the day.

Show the code
full <- read_rds("sentence_level_result.Rdata") %>% 
  left_join(read_rds("basic_info.Rdata")) %>% 
  mutate(year = year + 1) %>% # since we use previous year's democracy information
  left_join(all_variables) %>% 
  group_by(country_code, events) %>% 
  mutate(caseid = cur_group_id() |> as_factor(), 
         day_f = as_factor(day)) %>% 
  ungroup() # all the data 

before <- full %>% 
  mutate(treatment = if_else(day > 0, 1, 0)) %>% 
  filter(day > -10 & day <= 10) # before

after <- full %>% 
  mutate(treatment = if_else(day >= 0, 1, 0)) %>% 
  filter(day >= -10 & day < 10) # after

question_1_country <- c("Kazakhstan", "Malaysia", "Turkmenistan", "United States of America") %>% 
  imap(~ countrycode(., origin = "country.name", destination = "iso3c")) %>% 
  unlist() 

before_n <- before %>% 
  filter(country_code %in% question_1_country) %>% 
  count(events, country_code, treatment, name = "before_n")

after_n <- after %>% 
  filter(country_code %in% question_1_country) %>% 
  count(events, country_code, treatment, name = "after_n")

before_n %>% 
  left_join(after_n) %>% 
  mutate(diff = after_n - before_n) %>% 
  datatable()

3.2 question 2

Show the code
question_2_country <- c("Brazil", "Cambodia", "Croatia", "Malaysia", "Nepal", "New Zealand", "Senegal", "Vietnam") %>% 
  imap(~ countrycode(., origin = "country.name", destination = "iso3c")) %>% 
  unlist()

before_n_2 <- before %>% 
  ungroup() %>% 
  filter(country_code %in% question_2_country & events == "chenguangcheng") %>% 
  count(events, country_code, treatment, name = "before_n")

after_n_2 <- after %>% 
  ungroup() %>% 
  filter(country_code %in% question_2_country & events == "chenguangcheng") %>% 
  count(events, country_code, treatment, name = "after_n")

before_n_2 %>% 
  left_join(after_n_2) %>% 
  mutate(diff = after_n - before_n) %>% 
  datatable()

3.3 question 3

As I have mentioned, only sentimentr in R will commit errors when it processes a very long sentence without proper punctuation. Since we choose the data calculated by the asent, so no articles will be dropped.

Asent in Python:

Show the code
full %>% 
  pull(compound) %>% 
  quantile()
     0%     25%     50%     75%    100% 
-0.9951 -0.0154  0.1088  0.2401  0.9988 

Sentimentr in R:

Show the code
full %>% 
  pull(sentiment_sr) %>% 
  quantile()
     0%     25%     50%     75%    100% 
-0.6905 -0.0119  0.0772  0.1646  3.5448