-
Notifications
You must be signed in to change notification settings - Fork 21
/
DtmAndDendogramClustering.R
93 lines (74 loc) · 4.03 KB
/
DtmAndDendogramClustering.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#install.packages("ggplot2", dependencies = TRUE)
#install.packages("tm", dependencies = TRUE)
#install.packages("magrittr", dependencies = TRUE)
#install.packages("stopwords", dependencies = TRUE)
library("tm")
library("stopwords")
library("magrittr")
library("ggplot2")
#library(SnowballC)
#Creates a pdf-file
#May take a long time
#May not be human readable (for large files)
#EDIT this row
my_file <- "my_Scopus_TSE_articles_clean_data.RData"
#my_DtmAndDendogramClusterFile = function(my_file) {
print(paste("Dendogram Cluster, my_file: ", my_file))
my_temp_file = paste(my_data_dir, "/", sep="")
my_temp_file = paste(my_temp_file, my_file, sep="")
load(my_temp_file)
#-----Removing unnecessary words----------------
#various stopword lists can be used https://cran.r-project.org/web/packages/stopwords/stopwords.pdf
#stopword list is also context specific. Here you can do manual removals
#also automated methods tf/idf exist. EDIT
my_stopwords = c(stopwords::stopwords(language = "en", source = "snowball"),"myStopword1", "myStopword2")
#A good is to remove more words that we do not care about
Abstract_clean = removeWords(my_articles$Abstract_clean, my_stopwords)
Title = removeWords(my_articles$Title, my_stopwords)
#-----/Removing unnecessary words----------------
# Create corpus by appending title and abstract to character string
corpus = Corpus(VectorSource(paste(Title, Abstract_clean)))
dtm = DocumentTermMatrix(corpus, control=list(tolower=TRUE, stemming=FALSE,
stopwords=FALSE, wordLengths=c(3, Inf), removeNumbers=TRUE,
removePunctuation=TRUE, bounds=list(global=c(5,Inf))))
freq.terms = findFreqTerms(dtm, lowfreq=50)
#Is "and" or "are" meaningfull words? Lets get rid of them by setting stopwords=TRUE
dtm = DocumentTermMatrix(corpus, control=list(tolower=TRUE, stemming=FALSE,
stopwords=TRUE, wordLengths=c(3, Inf), removeNumbers=TRUE,
removePunctuation=TRUE, bounds=list(global=c(5,Inf))))
#Stemming can also be used but the results might not be any better.
dtm_stem = DocumentTermMatrix(corpus, control=list(tolower=TRUE, stemming=TRUE,
stopwords = c (stopwords("english"), my_stopwords),
wordLengths=c(3, Inf), removeNumbers=TRUE, removePunctuation=TRUE,
bounds=list(global=c(5,Inf)) ))
freq.terms = findFreqTerms(dtm_stem, lowfreq=50)
#Lets plot
#Convert to martix, compute colSums (total word counds), take only words with more than 500 occurecents,
#and convert to dataframe with two columns: terms and frequencies
#if too many or too little words showup EDIT number 500 accordingly
df <- dtm %>% as.matrix %>% colSums %>% subset (. >= 500) %>% data.frame(term=names(.), freq=.)
#do you see any new stopwords that could be added to the list
ggplot(df, aes(x = term, y = freq)) + geom_bar(stat = "identity") +xlab("Terms") + ylab("Count") + coord_flip()
#--------------------Cluster and and Plot dendogram ---------------------------------------
#We can subset to 200 papers if we have thounsands
#dtm <- dtm[1:200,]
# Compute distance matrix https://stat.ethz.ch/R-manual/R-devel/library/stats/html/dist.html
#https://en.wikipedia.org/wiki/Distance_matrix
#will be slow with thousands
dist.mat <- dist(as.matrix(dtm))
#Hirarcical clustering http://www.r-tutor.com/gpu-computing/clustering/hierarchical-cluster-analysis
h <- hclust(dist.mat, method = "ward.D")
#Finally, we plot to file. R-studio plotting view is not large enough.
my_pdf_file = my_temp_file
my_pdf_file = gsub("data/my", "data/pdf", my_pdf_file)
my_pdf_file = gsub(".RData", ".pdf", my_pdf_file)
print(paste("Printing the Cluster Dendogram file:", my_pdf_file))
print("Finished Dendogram Cluster")
pdf(my_pdf_file, width=40, height=15)
par(cex=0.7, mar=c(5, 8, 4, 1))
#Remember we removed stopwords from title so the variable Title no longer make sense
plot(h, labels = my_articles$Title, sub = "")
#when using smaller set e.g. 200 articles only
#plot(h, labels = my_articles$Title[1:200], sub = "")
dev.off()
#}