Understanding Twitter Dynamics With R and Gephi: Text Analysis and Centrality

Social Network Centrality

Degree

Eigenvector

PageRank

Betweenness

Which Should I Use?

Hands-on Centrality Calculation

library("plyr")
library(igraph)
library(tidyverse)
library(NLP)
library("tm")
library(RColorBrewer)
library(wordcloud)
library(topicmodels)
library(SnowballC)
library("textmineR")
load("art1_tweets.RData")
# Delete isolated vertices (accounts with no connections):
Isolated <- which(degree(net) == 0)
net_clean <- delete.vertices(net, Isolated)
# Compute the four centrality measures for every account:
cent <- data.frame(bet   = betweenness(net_clean),
                   eig   = centr_eigen(net_clean)$vector,
                   prank = page_rank(net_clean)$vector,
                   degr  = degree(net_clean, mode = "in"))
cent <- cbind(account = rownames(cent), cent)
# Top 10 accounts by each measure:
top_n(cent, 10, degr)  %>% arrange(desc(degr))  %>% select(degr)
top_n(cent, 10, eig)   %>% arrange(desc(eig))   %>% select(eig)
top_n(cent, 10, prank) %>% arrange(desc(prank)) %>% select(prank)
top_n(cent, 10, bet)   %>% arrange(desc(bet))   %>% select(bet)
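As a quick illustration of how these measures can disagree, the same four scores can be computed on a small hand-built graph. This toy example is not part of the article's dataset; it only shows the mechanics.

# Toy example (illustrative only): a five-node directed graph
toy <- graph_from_literal(A -+ B, C -+ B, B -+ D, D -+ E, E -+ D)
data.frame(degr  = degree(toy, mode = "in"),
           eig   = centr_eigen(toy)$vector,
           prank = page_rank(toy)$vector,
           bet   = betweenness(toy))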

Text Analysis: R and LDA

# This function normalizes text by removing Twitter-related terms and noisy characters
sanitize_text <- function(text) {
  # Convert to ASCII to remove accented characters:
  text <- iconv(text, to = "ASCII", sub = " ")
  # Convert to lower case and delete the standalone "RT" token added by Twitter:
  text <- gsub("\\brt\\b", " ", tolower(text))
  # Delete links and user names:
  text <- gsub("@\\w+", " ", gsub("http\\S+", " ", text))
  # Delete tabs and punctuation:
  text <- gsub("[ |\t]{2,}", " ", gsub("[[:punct:]]", " ", text))
  text <- gsub("\\bamp\\b", " ", text) # Remove the residue of the HTML entity &amp;
  # Delete leading and trailing blanks:
  text <- gsub("^ ", "", gsub(" $", "", text))
  text <- gsub(" +", " ", text) # Collapse extra spaces
  return(text)
}
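# Quick sanity check of sanitize_text() on a made-up tweet (illustrative input,
# not taken from the dataset); the result should be roughly "great match messi":
sanitize_text("RT @user: Great match! https://example.com #Messi")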
# Detect communities:
my.com.fast <- cluster_louvain(as.undirected(simplify(net)))
largestCommunities <- order(sizes(my.com.fast), decreasing = TRUE)[1:3]
# Save the usernames of the biggest community:
community1 <- names(which(membership(my.com.fast) == largestCommunities[1]))
# Sanitize the text of the users of the biggest community and drop duplicates:
text <- unique(sanitize_text(tweets.df[which(tweets.df$screen_name %in% community1), ]$text))
text <- text[text != ''] # Delete empty entries
# Remove Spanish stopwords:
stopwords_regex <- paste(stopwords('es'), collapse = '\\b|\\b')
stopwords_regex <- paste0('\\b', stopwords_regex, '\\b')
text <- stringr::str_replace_all(text, stopwords_regex, '')
# Create the document-term matrix with unigrams and bigrams:
dtm <- CreateDtm(text,
                 doc_names = seq_along(text),
                 ngram_window = c(1, 2))
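Before fitting any models, it is worth a quick look at the matrix we just built (an optional check, not part of the original walkthrough):

dim(dtm)                # number of documents x vocabulary size
head(colnames(dtm), 10) # a sample of the unigrams and bigrams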

Topic Counts and Coherence Scores

tf <- TermDocFreq(dtm = dtm)
# Keep terms that appear more than once but in fewer than half of the documents:
tf_trimmed <- tf$term[tf$term_freq > 1 & tf$doc_freq < nrow(dtm) / 2]
# Create a folder to store trained models:
model_dir <- paste0("models_", digest::digest(tf_trimmed, algo = "sha1"))
if (!dir.exists(model_dir)) dir.create(model_dir)
# Define a function to infer LDA topics:
train_lda_model <- function(number_of_topics) {
  filename <- file.path(model_dir, paste0(number_of_topics, "_topics.rda"))
  # Check if the model already exists:
  if (!file.exists(filename)) {
    # To get exactly the same output on each run, use a constant seed:
    set.seed(12345)
    lda_model <- FitLdaModel(dtm = dtm, k = number_of_topics, iterations = 500)
    lda_model$k <- number_of_topics
    lda_model$coherence <- CalcProbCoherence(phi = lda_model$phi, dtm = dtm, M = 5)
    save(lda_model, file = filename)
  } else {
    load(filename)
  }
  lda_model
}
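# Optional illustrative check of a single run before launching the full sweep
# (assumes the dtm defined above; not part of the original flow):
lda_5 <- train_lda_model(5)
dim(lda_5$phi)        # 5 topics x vocabulary size
mean(lda_5$coherence) # average probabilistic coherence of the 5 topics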
# The number of topics that we are going to infer in each LDA training run:
topic_count <- seq(3, 20, by = 1)
# Train the models in parallel with the TmParallelApply function:
models <- TmParallelApply(X = topic_count,
                          FUN = train_lda_model,
                          export = c("dtm", "model_dir"))
# Collect the mean coherence score of each trained model:
coherence_by_topics_quantity <- data.frame(
  topic_number = sapply(models, function(model_instance) nrow(model_instance$phi)),
  score_coherence = sapply(models,
                           function(model_instance) mean(model_instance$coherence)),
  stringsAsFactors = FALSE)
# Plot mean coherence against the number of topics:
ggplot(coherence_by_topics_quantity, aes(x = topic_number, y = score_coherence)) +
  geom_point() +
  geom_line(group = 1) +
  ggtitle("Coherence by Topic") + theme_minimal() +
  scale_x_continuous(breaks = seq(1, 20, 1)) +
  ylab("Coherence Score") + xlab("Number of topics")
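# Show which topic count scored best before extracting that model
# (a quick check, not in the original text):
coherence_by_topics_quantity[which.max(coherence_by_topics_quantity$score_coherence), ]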
# Select the model with the highest mean coherence:
best_model <- models[which.max(coherence_by_topics_quantity$score_coherence)][[1]]
# Most important terms by topic:
best_model$top_terms <- GetTopTerms(phi = best_model$phi, M = 20)
top_terms <- as.data.frame(best_model$top_terms)
top_terms
# Gather all the tweets of each user in the biggest community into one document per user:
tweets.df.com1 <- tweets.df[which(tweets.df$screen_name %in% community1), ]
users_text <- ddply(tweets.df.com1,
                    ~screen_name,
                    summarise,
                    text = paste(text, collapse = " "))
# Sanitize each user's aggregated text:
users_text$text <- sanitize_text(users_text$text)
# Remove English stopwords:
stopwords_regex <- paste(stopwords('en'), collapse = '\\b|\\b')
stopwords_regex <- paste0('\\b', stopwords_regex, '\\b')
users_text$text <- stringr::str_replace_all(users_text$text, stopwords_regex, '')
# Create a document-term matrix with one document per user:
dtm.users.com1 <- CreateDtm(users_text$text,
                            doc_names = users_text$screen_name,
                            ngram_window = c(1, 2))
# Infer each user's topic distribution with the best model:
com1.users.topics <- predict(best_model, dtm.users.com1, method = "gibbs", iterations = 100)
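# Each row of the prediction is a user and each column a topic probability;
# rows should sum to roughly 1 (quick inspection, not in the original article):
head(com1.users.topics)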
# Get the subgraph of the first community:
net.com1 <- induced_subgraph(net, community1)
# Estimate the topic with the max score for each user:
com1.users.maxtopic <- cbind(users_text$screen_name,
                             colnames(com1.users.topics)[apply(com1.users.topics,
                                                               1,
                                                               which.max)])
# Order the users' topic data frame by the users' order in the graph:
com1.users.maxtopic <- com1.users.maxtopic[match(V(net.com1)$name,
                                                 com1.users.maxtopic[, 1]), ]
# Create a new vertex attribute holding the topic most discussed by each user:
V(net.com1)$topic <- com1.users.maxtopic[, 2]
# Export the graph to GML so it can be visualized in Gephi:
write_graph(simplify(net.com1), "messi_graph_topics.gml", format = "gml")
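As an optional sanity check (not part of the original article), the exported file can be read back to confirm that the topic attribute survived the round trip to GML:

check <- read_graph("messi_graph_topics.gml", format = "gml")
summary(check) # the vertex attribute list should include "topic"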

Inferring Important Topics and Applying Social Network Centrality
