9 Topic Models
9.1 Seminar
9.1.1 Exercise
Load the necessary packages up front.
library(quanteda)
library(quantedaData)
library(readtext)
library(tidyverse)
Load the fake-news data from Kaggle and create a corpus object using quanteda.
fake_news <- readtext("https://uclspp.github.io/datasets/data/fake-news.zip",
text_field = "text")
fake_news_corpus <- corpus(fake_news)
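Before going further, it is worth checking that the corpus looks as expected. A minimal sketch using quanteda helpers (the n argument to summary() just limits how many documents are printed):
# number of documents in the corpus
ndoc(fake_news_corpus)
# summary statistics for the first few documents
summary(fake_news_corpus, n = 5)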
Create a document-feature matrix using dfm. Remove tokens that appear infrequently and those fewer than 4 characters long.
fake_news_dfm <- dfm(fake_news_corpus,
                     stem = TRUE,
                     remove = stopwords(source = "smart"),
                     remove_punct = TRUE,
                     remove_numbers = TRUE)
fake_news_dfm <- dfm_trim(fake_news_dfm, min_count = 10, min_docfreq = 5)
fake_news_dfm <- dfm_select(fake_news_dfm, min_nchar = 4)
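It is a good idea to inspect the trimmed matrix before fitting any model. A quick sketch:
# dimensions of the document-feature matrix after trimming
dim(fake_news_dfm)
# the 20 most frequent remaining features
topfeatures(fake_news_dfm, 20)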
Using the topicmodels Package
library(topicmodels)
library(tidytext)
Convert the document-feature matrix to the format that the topicmodels
package requires.
fake_news_dtm_topicmodels <- convert(fake_news_dfm, to = "topicmodels")
Fit a topic model using LDA. Wrapping your code inside system.time
will tell you how long it took to run.
system.time(
topic_model <- LDA(fake_news_dtm_topicmodels, k = 4)
)
user system elapsed
448.964 0.588 449.622
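Because LDA is estimated with a stochastic algorithm, the topic numbering (and to some extent the results) can change between runs. If you want reproducible output, one option is to fix a seed through the control argument; the object name seeded_model and the seed value below are just illustrative, and this sketch is not run here.
# hypothetical example: fixing the seed makes the topic numbering reproducible
seeded_model <- LDA(fake_news_dtm_topicmodels, k = 4, control = list(seed = 12345))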
Get topic proportions for each document.
posterior_probs <- posterior(topic_model)
topic_proportions <- as.data.frame(posterior_probs$topics)
# show topic proportions for first 10 documents
head(topic_proportions, n = 10)
1 2 3 4
text1 0.0026802877 0.0026805035 0.5972145 0.3974246880
text2 0.0006370018 0.0006370317 0.8612489 0.1374770259
text3 0.0010118949 0.0010119414 0.9969642 0.0010119426
text4 0.0036705725 0.0036708808 0.9889876 0.0036709545
text5 0.0005646082 0.7096049318 0.2892658 0.0005646300
text6 0.0008998546 0.0008999118 0.9973003 0.0008998991
text7 0.0002338378 0.0002338511 0.9992985 0.0002338476
text8 0.0004418409 0.0295411333 0.9695752 0.0004418431
text9 0.0008908851 0.0008909248 0.9973273 0.0008909124
text10 0.0020156010 0.7395633852 0.2306829 0.0277380697
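To assign each document to its single most likely topic, you can use topics() from topicmodels, which is equivalent to taking the column with the highest proportion in each row above. A short sketch:
# most likely topic for each document
doc_topics <- topics(topic_model)
head(doc_topics, n = 10)
# how many documents fall into each topic
table(doc_topics)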
Get the top 10 words from each topic.
topic_terms <- tidy(topic_model) %>%
mutate(topic = paste("Topic", topic)) %>%
group_by(topic) %>%
mutate(term = reorder(term, desc(beta))) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, desc(beta)) %>%
mutate(order = row_number())
Plot the top words for each topic using ggplot.
ggplot(topic_terms, aes(order, beta)) +
geom_bar(stat = "identity", show.legend = FALSE) +
facet_wrap(~topic, scales = "free") +
coord_flip() +
scale_x_continuous(
trans = "reverse",
breaks = topic_terms$order,
labels = topic_terms$term,
expand = c(0,0)
) +
theme(
axis.title = element_blank()
)
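As a quick cross-check on the plot, terms() from topicmodels returns the top terms per topic directly:
# top 10 terms for each of the 4 topics
terms(topic_model, 10)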
Using the lda Package with LDAvis
library(lda)
library(LDAvis)
Convert the document-feature matrix to the format that the lda
package requires.
fake_news_dtm_lda <- convert(fake_news_dfm, to = "lda")
Fit a topic model using LDA.
set.seed(12345)
alpha <- 0.02
eta <- 0.02
# you can reduce the number of iterations if it takes too long to run
system.time(
topic_model <- lda.collapsed.gibbs.sampler(documents = fake_news_dtm_lda$documents,
K = 20,
alpha = alpha,
eta = eta,
vocab = fake_news_dtm_lda$vocab,
num.iterations = 500,
initial = NULL,
burnin = 0,
compute.log.likelihood = TRUE)
)
user system elapsed
487.800 0.040 487.903
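Because compute.log.likelihood = TRUE was set, the fitted object stores the log-likelihood at each iteration, which is useful for checking whether the sampler has roughly converged. A minimal sketch:
# first row holds the full log-likelihood at each iteration of the Gibbs sampler
plot(topic_model$log.likelihoods[1, ], type = "l",
     xlab = "Iteration", ylab = "Log-likelihood")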
Extract the phi and theta parameters from the trained model.
phi <- t(apply(topic_model$topics + eta, 1, function(x) x/sum(x)))
theta <- t(apply(topic_model$document_sums + alpha, 2, function(x) x/sum(x)))
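The lda package also provides top.topic.words() as a quick way to inspect the most probable words in each topic before building the visualization:
# top 10 words per topic; by.score = TRUE down-weights words common to all topics
top.topic.words(topic_model$topics, num.words = 10, by.score = TRUE)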
We also need to find the number of tokens in each document.
doc_length <- sapply(fake_news_dtm_lda$documents, function(x) { sum(x[2,]) })
Create a JSON object for our visualization.
json_vis <- createJSON(phi = phi,
theta = theta,
doc.length = doc_length,
vocab = fake_news_dtm_lda$vocab,
term.frequency = colSums(fake_news_dfm))
Run the visualization server. You should see an interactive topic model visualization in either the viewer tab or a separate browser window. You can stop the server by pressing the ESC key.
serVis(json_vis, open.browser = TRUE)
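If you would rather save the visualization to disk than run it interactively, serVis() can also write its files to a directory; the directory name below is just an example.
# write the visualization files to a folder instead of launching a browser
serVis(json_vis, out.dir = "fake_news_ldavis", open.browser = FALSE)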