Text Analysis
Wordscores
library(countrycode)
library(classInt)
library(tidyverse)
library(maps)
library(rworldmap)
library(quanteda)
library(readtext)
library(RColorBrewer)
Load the UNGD data. Make sure you’ve downloaded the dataset from http://dx.doi.org/10.7910/DVN/0TJX8Y and placed it in your working directory.
# Read every speech from the archive; the country, session and year document
# variables are parsed from the "_"-separated file names.
ungd_debates <- readtext(
  file = "UNGDC 1970-2016.zip",
  docvarsfrom = "filenames",
  docvarnames = c("country", "session", "year"),
  dvsep = "_",
  ignore_missing_files = TRUE,
  verbosity = 0
)
Create a corpus from the texts.
# Turn the readtext data frame into a quanteda corpus.
ungd_corpus <- ungd_debates %>%
  corpus()
Create a subset that only includes debates from 2014. We focus on 2014 to analyse the ideological positions of countries during the crisis in Ukraine.
# Keep only the speeches whose `year` document variable equals 2014.
ungd_2014 <- corpus_subset(
  ungd_corpus,
  subset = year == 2014
)
Save the docvars in a data.frame that we’ll use later.
# Document variables of the 2014 subset as a plain data.frame.
ungd_data <- ungd_2014 %>%
  docvars() %>%
  as.data.frame()
Create a document-feature matrix.
# Build the document-feature matrix (numbers, punctuation and English
# stopwords removed, remaining terms stemmed), then trim it with the
# thresholds min_count = 10 and min_docfreq = 5.
ungd_dfm <- ungd_2014 %>%
  dfm(
    remove_numbers = TRUE,
    remove_punct = TRUE,
    remove = stopwords("english"),
    stem = TRUE
  ) %>%
  dfm_trim(min_count = 10, min_docfreq = 5)
Assuming that the U.S. and Russia represent the two extremes of ideological positions over the Ukrainian crisis, we set the reference scores as follows:
- RUS = -1
- USA = 1
First, we find the index of RUS and USA debates.
# Row positions of the two reference speeches in ungd_data.
usa_index <- which(ungd_data[["country"]] == "USA")
rus_index <- which(ungd_data[["country"]] == "RUS")
Then we set the reference scores for RUS and USA and leave the rest as NA.
# One reference score per document: NA everywhere except the two anchor
# speeches (RUS = -1, USA = 1).
refscores <- rep(NA, nrow(ungd_dfm)) %>%
  replace(rus_index, -1) %>%
  replace(usa_index, 1)
Fit a wordscores model using the reference scores.
# Fit Wordscores on the trimmed dfm; documents whose reference score is NA
# are the ones to be scored later by predict().
wordscores_model <- textmodel(
  ungd_dfm,
  refscores,
  smooth = 1,
  scale = "linear",
  model = "wordscores"
)
Extract the wordscores, rescale them, and then save them in the ungd_data data.frame we created earlier.
# Score every document with "mv" rescaling, then copy the rescaled text
# scores into ungd_data.
wordscores <- predict(
  wordscores_model,
  rescaling = "mv"
)
ungd_data[["wordscore"]] <- wordscores@textscores[["textscore_mv"]]
You can use the wordscore estimates as explanatory variables to understand how the policy dimension affects some other response variable that you’re interested in.
Let’s see what ungd_data looks like:
# Preview the first six rows.
head(ungd_data, n = 6L)
doc_id country session year wordscore
text7121 Session 69 - 2014/AFG_69_2014.txt AFG 69 2014 -0.3351900
text7122 Session 69 - 2014/AGO_69_2014.txt AGO 69 2014 -0.4436961
text7123 Session 69 - 2014/ALB_69_2014.txt ALB 69 2014 -0.3489190
text7124 Session 69 - 2014/AND_69_2014.txt AND 69 2014 -0.1707580
text7125 Session 69 - 2014/ARE_69_2014.txt ARE 69 2014 -0.2457200
text7126 Session 69 - 2014/ARG_69_2014.txt ARG 69 2014 -0.2114322
Use classIntervals to create breaks on the continuous scale from -1 to 1. We’ll need this for plotting.
# Bin the wordscores into classes for the choropleth maps.
#
# The plotting code further down hard-codes a 9-colour "Blues" palette and
# 9 legend labels, so the number of classes is fixed at 9 here instead of
# being left to the default (Sturges' rule, which yields 9 only for this
# particular number of speeches).
#
# The "bclust" style uses bagged clustering, which is randomised; seeding the
# RNG makes the breaks, and therefore the maps, reproducible between runs.
set.seed(1)
class_intervals <- classIntervals(
  ungd_data$wordscore,
  n = 9,
  style = "bclust",
  rtimes = 10
)
Committee Member: 1(1) 2(1) 3(1) 4(1) 5(1) 6(1) 7(1) 8(1) 9(1) 10(1)
Computing Hierarchical Clustering
Plotting with rworldmap
You can use the rworldmap package for plotting.
# Attach the scores to rworldmap's country polygons, matching on the ISO3
# code stored in the `country` column.
spatial_data <- joinCountryData2Map(
  ungd_data,
  nameJoinColumn = "country",
  joinCode = "ISO3"
)
192 codes from your data successfully matched countries in the map
2 codes from your data failed to match with a country code in the map
51 codes from the map weren't represented in your data
# Draw the choropleth without the built-in legend; the value returned by
# mapCountryData() is kept so it can be handed to addMapLegend() below.
wordscore_map <- mapCountryData(
  spatial_data,
  nameColumnToPlot = "wordscore",
  mapTitle = "Russia vs USA: Wordscores 2014",
  catMethod = class_intervals$brks,
  colourPalette = brewer.pal(9, "Blues"),
  missingCountryCol = "grey",
  addLegend = FALSE
)

# Add a customised legend: the map parameters from above plus the legend
# layout options, passed together as one argument list.
do.call(
  what = addMapLegend,
  args = c(
    wordscore_map,
    list(
      legendLabels = "limits",
      labelFontSize = 0.7,
      legendShrink = 0.7,
      legendMar = 5,
      legendWidth = 0.5
    )
  )
)
Plotting with ggplot
# Discretise the scores with the class-interval breaks; include.lowest keeps
# the minimum score inside the first bin.
ungd_data[["wordscore_int"]] <- cut(
  x = ungd_data[["wordscore"]],
  breaks = class_intervals[["brks"]],
  include.lowest = TRUE
)
Merge map data with wordscores for plotting.
# World polygons from the maps package, with an ISO3 code derived from each
# region name so they can be joined to ungd_data.
world_map <- map_data("world")
world_map[["country"]] <- countrycode(
  sourcevar = world_map[["region"]],
  origin = "country.name",
  destination = "iso3c"
)
Warning in countrycode(world_map$region, "country.name", "iso3c"): Some values were not matched unambiguously: Ascension Island, Azores, Barbuda, Bonaire, Canary Islands, Chagos Archipelago, Grenadines, Heard Island, Kosovo, Madeira Islands, Micronesia, Saba, Saint Martin, Siachen Glacier, Sint Eustatius, Virgin Islands
# Keep only polygons whose ISO3 code appears in ungd_data, then drop every
# column the plot does not use.
world_map <- world_map %>%
  inner_join(ungd_data, by = "country") %>%
  dplyr::select(lat, long, group, wordscore_int)
Create the plot and customize the appearance.
# Choropleth of the binned wordscores: filled country polygons with thin grey
# outlines, and a single-row legend along the bottom labelled only at its
# two ends (-1 and 1).
ggplot(
  data = world_map,
  mapping = aes(x = long, y = lat, group = group)
) +
  geom_polygon(mapping = aes(fill = wordscore_int)) +
  geom_path(colour = "grey", size = 0.05) +
  scale_fill_brewer(
    palette = "Blues",
    labels = c(-1, rep("", 7), 1),
    guide = guide_legend(
      label.position = "bottom",
      label.hjust = 0.5,
      nrow = 1
    )
  ) +
  coord_fixed() +
  ggtitle("Russia vs USA: Wordscores 2014") +
  theme_void() +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.title = element_blank(),
    legend.key.height = unit(2, "mm"),
    legend.key.width = unit(15, "mm"),
    plot.title = element_text(hjust = 0.5)
  )