Document Similarity

This example shows how to use document similarity with quanteda.

library(quanteda)
library(quantedaData)
library(readtext)

We’ll compare Barack Obama’s 2008 election victory speech against the State-Of-The-Union (SOTU) corpus. First, let’s load the SOTU dataset and keep only the docvars that we care about.

# Build a corpus from the bundled SOTU data and keep only the three
# docvars used in the rest of this example.
sotu_corpus <- corpus_subset(
  corpus(data_corpus_SOTU),
  select = c(FirstName, President, party)
)

# Preview the remaining document variables.
head(docvars(sotu_corpus))
                 FirstName  President       party
Washington-1790     George Washington Independent
Washington-1790b    George Washington Independent
Washington-1791     George Washington Independent
Washington-1792     George Washington Independent
Washington-1793     George Washington Independent
Washington-1794     George Washington Independent

Now load Obama’s 2008 election victory speech and set the docvars.

# Read the speech text from the course website and wrap it in a corpus
# whose single document is named "Obama_2008".
obama2008 <- readtext(
  "https://uclspp.github.io/datasets/data/obama_2008.txt"
)
obama2008_corpus <- corpus(obama2008)
docnames(obama2008_corpus) <- "Obama_2008"

# Give the speech the same three docvars that were kept for the SOTU
# corpus, then drop every other docvar.
docvars(obama2008_corpus, "FirstName") <- "Barack"
docvars(obama2008_corpus, "President") <- "Obama"
docvars(obama2008_corpus, "party") <- "Democratic"
obama2008_corpus <- corpus_subset(
  obama2008_corpus,
  select = c(FirstName, President, party)
)

# Preview the resulting document variables.
head(docvars(obama2008_corpus))
           FirstName President      party
Obama_2008    Barack     Obama Democratic

Create a document-feature matrix combining the State-Of-The-Union corpus with Obama’s 2008 speech.

# One document-feature matrix over the SOTU addresses plus the Obama
# speech, with stemming enabled and English stopwords, punctuation and
# numbers removed.
my_dfm <- dfm(
  c(sotu_corpus, obama2008_corpus),
  stem = TRUE,
  remove = stopwords("english"),
  remove_numbers = TRUE,
  remove_punct = TRUE
)

Use textstat_simil to get similarity between SOTU addresses and Obama_2008.

# Cosine similarity between the "Obama_2008" document and the documents
# in the matrix.
doc_similarity <- textstat_simil(
  my_dfm,
  "Obama_2008",
  method = "cosine",
  margin = "documents"
)

We convert the result from textstat_simil to a data.frame and sort in descending order.

# Go through a plain matrix to obtain a data.frame with a single
# "Obama_2008" column.
doc_similarity <- as.data.frame(as.matrix(doc_similarity))

# Order rows from most to least similar; drop = FALSE keeps the
# one-column result a data.frame so the row names survive.
doc_similarity_sorted <- doc_similarity[
  order(doc_similarity[["Obama_2008"]], decreasing = TRUE),
  ,
  drop = FALSE
]

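Top 10 most similar documents (the first row is Obama_2008 itself, which has a similarity of 1 with itself; the rest are SOTU addresses):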

# Show the ten rows with the highest similarity to Obama_2008.
head(doc_similarity_sorted, n = 10)
            Obama_2008
Obama_2008   1.0000000
Obama-2011   0.6363987
Bush-1990    0.6156319
su2017.txt   0.6066511
Obama-2010   0.5996327
Obama-2014   0.5941031
Bush-2006    0.5931273
Obama-2016   0.5891589
Reagan-1987  0.5778080
Bush-1991    0.5753668

Compare the speeches again, this time using a TF-IDF-weighted document-feature matrix.

# Same comparison as before, but computed on a TF-IDF-weighted copy of
# the document-feature matrix.
doc_similarity <- textstat_simil(
  dfm_weight(my_dfm, type = "tfidf"),
  "Obama_2008",
  method = "cosine",
  margin = "documents"
)
doc_similarity <- as.data.frame(as.matrix(doc_similarity))

# Order rows from most to least similar, keeping the data.frame shape.
doc_similarity_sorted <- doc_similarity[
  order(doc_similarity[["Obama_2008"]], decreasing = TRUE),
  ,
  drop = FALSE
]

# Show the ten highest-scoring rows.
head(doc_similarity_sorted, n = 10)
             Obama_2008
Obama_2008    1.0000000
Obama-2010    0.2255179
Obama-2015    0.2227388
Obama-2011    0.2191556
Obama-2013    0.2135144
Obama-2009    0.2083599
Johnson-1969  0.2025908
Obama-2014    0.1945594
Bush-1989     0.1926864
Obama-2012    0.1899707