Understanding Twitter Dynamics With R and Gephi: Text Analysis and Centrality

Social Network Centrality

Degree

Eigenvector

PageRank

Betweenness

Which Should I Use?
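
As a purely illustrative comparison (a toy random graph, not the article's Twitter data), the four measures can be computed side by side with igraph; the rankings they produce often disagree, which is why the choice depends on what kind of influence you want to capture:

# Toy example on a small random directed graph (not the Twitter dataset):
library(igraph)
set.seed(1)
toy <- sample_gnp(10, 0.3, directed = TRUE)
data.frame(indegree    = degree(toy, mode = "in"),
           eigenvector = centr_eigen(toy)$vector,
           pagerank    = page_rank(toy)$vector,
           betweenness = betweenness(toy))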

Hands-on Centrality Calculation

library("plyr")
library(igraph)
library(tidyverse)
library(NLP)
library("tm")
library(RColorBrewer)
library(wordcloud)
library(topicmodels)
library(SnowballC)
library("textmineR")
load("art1_tweets.RData")
# Remove isolated vertices (accounts with no connections):
Isolated <- which(degree(net) == 0)
net_clean <- delete.vertices(net, Isolated)
# Compute the four centrality measures for every account:
cent <- data.frame(bet   = betweenness(net_clean),
                   eig   = centr_eigen(net_clean)$vector,
                   prank = page_rank(net_clean)$vector,
                   degr  = degree(net_clean, mode = "in"))
cent <- cbind(account = rownames(cent), cent)
# List the top 10 accounts by each measure:
top_n(cent, 10, degr)  %>% arrange(desc(degr))  %>% select(degr)
top_n(cent, 10, eig)   %>% arrange(desc(eig))   %>% select(eig)
top_n(cent, 10, prank) %>% arrange(desc(prank)) %>% select(prank)
top_n(cent, 10, bet)   %>% arrange(desc(bet))   %>% select(bet)
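
If you also want to inspect these scores visually in Gephi, one option (a minimal sketch; the attribute names and the output filename centrality_scores.gml are examples, not part of the original article) is to attach them as vertex attributes and export the cleaned graph:

# Sketch: store the centrality scores as vertex attributes and export to GML.
# The rows of cent follow the vertex order of net_clean, so they can be
# assigned directly.
V(net_clean)$betweenness <- cent$bet
V(net_clean)$eigenvector <- cent$eig
V(net_clean)$pagerank    <- cent$prank
V(net_clean)$indegree    <- cent$degr
write_graph(simplify(net_clean), "centrality_scores.gml", format = "gml")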

Text Analysis: R and LDA

# This function normalizes text by removing Twitter-related terms and noisy characters
sanitize_text <- function(text) {
  # Convert to ASCII to remove accented characters:
  text <- iconv(text, to = "ASCII", sub = " ")
  # Convert to lowercase and remove the standalone "rt" token that Twitter adds to retweets:
  text <- gsub("\\brt\\b", " ", tolower(text))
  # Remove links and user names:
  text <- gsub("@\\w+", " ", gsub("http.+ |http.+$", " ", text))
  # Remove tabs and punctuation:
  text <- gsub("[ |\t]{2,}", " ", gsub("[[:punct:]]", " ", text))
  text <- gsub("\\bamp\\b", " ", text) # Remove the leftover of the HTML entity "&amp;"
  # Remove leading and trailing blanks:
  text <- gsub("^ ", "", gsub(" $", "", text))
  text <- gsub(" +", " ", text) # Collapse repeated spaces
  return(text)
}
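
As a quick check, here is what the function does to a made-up tweet (the text is purely illustrative):

# Illustrative example, not from the dataset:
sanitize_text("RT @some_user: Check this out!! https://t.co/abc123 #goal")
# Expected result, roughly: "check this out goal"
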
# Detect communities with the Louvain algorithm:
my.com.fast <- cluster_louvain(as.undirected(simplify(net)))
largestCommunities <- order(sizes(my.com.fast), decreasing = TRUE)[1:3]
# Save the usernames of the biggest community:
community1 <- names(which(membership(my.com.fast) == largestCommunities[1]))
# Sanitize the text of the users of the biggest community:
text <- unique(sanitize_text(tweets.df[which(tweets.df$screen_name %in% community1), ]$text))
text <- text[text != ''] # Remove empty entries
# Build a regex of Spanish stopwords and remove them from the text:
stopwords_regex <- paste(stopwords('es'), collapse = '\\b|\\b')
stopwords_regex <- paste0('\\b', stopwords_regex, '\\b')
text <- stringr::str_replace_all(text, stopwords_regex, '')
# Create the document-term matrix with unigrams and bigrams:
dtm <- CreateDtm(text,
                 doc_names = seq_along(text),
                 ngram_window = c(1, 2))
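
The wordcloud and RColorBrewer packages loaded at the top are not used elsewhere in the code shown here; one way to put them to work is to eyeball the community's most frequent terms before topic modeling (a minimal sketch, using the document-term matrix just built):

# Optional sketch: word cloud of the community's most frequent terms.
term_freqs <- TermDocFreq(dtm = dtm)
wordcloud(words = term_freqs$term,
          freq = term_freqs$term_freq,
          max.words = 100,
          random.order = FALSE,
          colors = brewer.pal(8, "Dark2"))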

Topic Counts and Coherence Scores

tf <- TermDocFreq(dtm = dtm)
# Keep terms that appear more than once but in fewer than half of the documents:
tf_trimmed <- tf$term[tf$term_freq > 1 & tf$doc_freq < nrow(dtm) / 2]
# Create a folder (named after a hash of the trimmed vocabulary) to cache trained models:
model_dir <- paste0("models_", digest::digest(tf_trimmed, algo = "sha1"))
if (!dir.exists(model_dir)) dir.create(model_dir)
# Define a function to infer LDA topics:
train_lda_model <- function(number_of_topics) {
  filename <- file.path(model_dir, paste0(number_of_topics, "_topics.rda"))
  # Check if the model already exists:
  if (!file.exists(filename)) {
    # To get exactly the same output on each run, use a constant seed:
    set.seed(12345)
    lda_model <- FitLdaModel(dtm = dtm, k = number_of_topics, iterations = 500)
    lda_model$k <- number_of_topics
    lda_model$coherence <- CalcProbCoherence(phi = lda_model$phi, dtm = dtm, M = 5)
    save(lda_model, file = filename)
  } else {
    load(filename)
  }
  lda_model
}
# The numbers of topics that we are going to infer in each LDA training run:
topic_count <- seq(3, 20, by = 1)
# Train the models in parallel with TmParallelApply:
models <- TmParallelApply(X = topic_count,
                          FUN = train_lda_model,
                          export = c("dtm", "model_dir"))
# Collect the number of topics and the mean coherence score of each model:
coherence_by_topics_quantity <- data.frame(
  topic_number = sapply(models, function(model_instance) nrow(model_instance$phi)),
  score_coherence = sapply(models,
                           function(model_instance) mean(model_instance$coherence)),
  stringsAsFactors = FALSE)
ggplot(coherence_by_topics_quantity, aes(x = topic_number, y = score_coherence)) +
  geom_point() +
  geom_line(group = 1) +
  ggtitle("Coherence by Topic") + theme_minimal() +
  scale_x_continuous(breaks = seq(1, 20, 1)) +
  ylab("Coherence Score") + xlab("Number of topics")
# Keep the model with the highest mean coherence score:
best_model <- models[[which.max(coherence_by_topics_quantity$score_coherence)]]
# Most important terms by topic:
best_model$top_terms <- GetTopTerms(phi = best_model$phi, M = 20)
top10 <- as.data.frame(best_model$top_terms)
top10
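
To get a feel for how much each topic is actually talked about, the topics' overall shares can be estimated from the document-topic matrix (a minimal sketch, assuming the fitted model exposes that matrix as theta, which FitLdaModel returns by default):

# Sketch: estimated share of each topic across the community's tweets,
# computed from the document-topic matrix of the best model.
topic_prevalence <- colSums(best_model$theta) / sum(best_model$theta) * 100
sort(topic_prevalence, decreasing = TRUE)
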
# Gather all the tweets of each user in the biggest community:
tweets.df.com1 <- tweets.df[which(tweets.df$screen_name %in% community1), ]
users_text <- ddply(tweets.df.com1,
                    ~screen_name,
                    summarise,
                    text = paste(text, collapse = " "))
# Sanitize each user's text:
users_text$text <- sanitize_text(users_text$text)
# Remove English stopwords:
stopwords_regex <- paste(stopwords('en'), collapse = '\\b|\\b')
stopwords_regex <- paste0('\\b', stopwords_regex, '\\b')
users_text$text <- stringr::str_replace_all(users_text$text, stopwords_regex, '')
# Create a document-term matrix with one document per user:
dtm.users.com1 <- CreateDtm(users_text$text,
                            doc_names = users_text$screen_name,
                            ngram_window = c(1, 2))
# Infer each user's topic distribution with the best model:
com1.users.topics <- predict(best_model, dtm.users.com1, method = "gibbs", iterations = 100)
# Get the subgraph of the first community:
net.com1 <- induced_subgraph(net, community1)
# Find the topic with the highest score for each user:
com1.users.maxtopic <- cbind(users_text$screen_name,
                             colnames(com1.users.topics)[apply(com1.users.topics,
                                                               1,
                                                               which.max)])
# Order the users' topic data frame by the users' order in the graph:
com1.users.maxtopic <- com1.users.maxtopic[match(V(net.com1)$name,
                                                 com1.users.maxtopic[, 1]), ]
# Store each user's most-discussed topic as a new vertex attribute:
V(net.com1)$topic <- com1.users.maxtopic[, 2]
# Export the annotated graph so it can be visualized in Gephi:
write_graph(simplify(net.com1), "messi_graph_topics.gml", format = "gml")
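
Before opening messi_graph_topics.gml in Gephi, you can get a rough preview inside R by coloring each user by their dominant topic (a minimal sketch; users whose tweets were all filtered out may end up with an NA topic and no color):

# Sketch: quick in-R preview of the community, colored by dominant topic.
topic_ids <- as.factor(V(net.com1)$topic)
plot(simplify(net.com1),
     vertex.color = rainbow(nlevels(topic_ids))[topic_ids],
     vertex.size = 3,
     vertex.label = NA,
     edge.arrow.size = 0.2)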

Inferring Important Topics and Applying Social Network Centrality
