Document Similarity
This example shows how to use document similarity with quanteda.
library(quanteda)
library(quantedaData)
library(readtext)
We’ll compare Barack Obama’s 2008 election victory speech against the State of the Union (SOTU) corpus. First, let’s load the SOTU dataset and keep only the `docvars` that we care about.
# Build the SOTU corpus, keeping only the three docvars used in this example.
sotu_corpus <- corpus_subset(
  corpus(data_corpus_SOTU),
  select = c(FirstName, President, party)
)
# Preview the retained document variables.
head(docvars(sotu_corpus))
FirstName President party
Washington-1790 George Washington Independent
Washington-1790b George Washington Independent
Washington-1791 George Washington Independent
Washington-1792 George Washington Independent
Washington-1793 George Washington Independent
Washington-1794 George Washington Independent
Now load Obama’s 2008 election victory speech and set the `docvars`.
# Read the speech from its remote text file and wrap it in a corpus.
obama2008 <- readtext(
  "https://uclspp.github.io/datasets/data/obama_2008.txt"
)
obama2008_corpus <- corpus(obama2008)
docnames(obama2008_corpus) <- "Obama_2008"

# Give the speech the same docvar fields that were kept for the SOTU corpus.
docvars(obama2008_corpus, "FirstName") <- "Barack"
docvars(obama2008_corpus, "President") <- "Obama"
docvars(obama2008_corpus, "party") <- "Democratic"

# Keep only those three docvars, then preview them.
obama2008_corpus <- corpus_subset(
  obama2008_corpus,
  select = c(FirstName, President, party)
)
head(docvars(obama2008_corpus))
FirstName President party
Obama_2008 Barack Obama Democratic
Create a document-feature matrix combining the State-Of-The-Union corpus with Obama’s 2008 speech.
# Combine both corpora and build the document-feature matrix with stemming on,
# English stopwords removed, and punctuation and numbers dropped.
my_dfm <- dfm(
  c(sotu_corpus, obama2008_corpus),
  remove = stopwords("english"),
  stem = TRUE,
  remove_numbers = TRUE,
  remove_punct = TRUE
)
Use `textstat_simil` to get the similarity between the SOTU addresses and `Obama_2008`.
# Cosine similarity across documents, with "Obama_2008" as the selected
# document.
doc_similarity <- textstat_simil(
  my_dfm,
  "Obama_2008",
  method = "cosine",
  margin = "documents"
)
We convert the result from `textstat_simil` to a `data.frame` and sort it in descending order of similarity.
# Convert the similarity result to a data.frame (going through a matrix), then
# order the rows by the Obama_2008 column, highest first. drop = FALSE keeps
# the single-column result a data.frame.
doc_similarity <- as.matrix(doc_similarity)
doc_similarity <- as.data.frame(doc_similarity)
doc_similarity_sorted <- doc_similarity[
  order(doc_similarity[["Obama_2008"]], decreasing = TRUE), ,
  drop = FALSE
]
Top-10 most similar SOTU addresses:
# Show the first ten rows of the sorted similarity table.
head(doc_similarity_sorted, n = 10L)
Obama_2008
Obama_2008 1.0000000
Obama-2011 0.6363987
Bush-1990 0.6156319
su2017.txt 0.6066511
Obama-2010 0.5996327
Obama-2014 0.5941031
Bush-2006 0.5931273
Obama-2016 0.5891589
Reagan-1987 0.5778080
Bush-1991 0.5753668
Compare the speeches again, this time using a TF-IDF-weighted document-feature matrix.
# Repeat the cosine comparison on a tf-idf weighted version of the matrix.
doc_similarity <- textstat_simil(
  dfm_weight(my_dfm, type = "tfidf"),
  "Obama_2008",
  method = "cosine",
  margin = "documents"
)

# Same post-processing as before: convert to a data.frame, order by the
# Obama_2008 column (highest first), and show the first ten rows.
doc_similarity <- as.matrix(doc_similarity)
doc_similarity <- as.data.frame(doc_similarity)
doc_similarity_sorted <- doc_similarity[
  order(doc_similarity[["Obama_2008"]], decreasing = TRUE), ,
  drop = FALSE
]
head(doc_similarity_sorted, n = 10L)
Obama_2008
Obama_2008 1.0000000
Obama-2010 0.2255179
Obama-2015 0.2227388
Obama-2011 0.2191556
Obama-2013 0.2135144
Obama-2009 0.2083599
Johnson-1969 0.2025908
Obama-2014 0.1945594
Bush-1989 0.1926864
Obama-2012 0.1899707