-
Notifications
You must be signed in to change notification settings - Fork 1
/
text_mining.R
102 lines (84 loc) · 2.79 KB
/
text_mining.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#' Mine text from PDF files and plot the word frequencies
#'
#' Reads every PDF file in `text_directory` into a `tm` corpus, cleans the
#' text, builds a term-document matrix and saves two figures into
#' `text_directory`: `word_cloud.jpg` (a word cloud of up to 200 terms) and
#' `frequent_words.jpg` (a bar plot of the ten most frequent terms).
#'
#' The working directory is changed to `text_directory` while the files are
#' read and is restored when the function exits, including on error.
#'
#' @import pdftools
#' @import tm
#' @import wordcloud
#' @import RColorBrewer
#'
#' @param text_directory Path to the directory that contains the PDF files.
#'   The two figures are written to this directory as well.
#' @param lower_case Convert the text to lower case?
#' @param remove_numbers Remove numbers from the text?
#' @param remove_common_words Remove common English stop words?
#' @param custom_stopwords Optional character vector of additional words to
#'   remove, or `NULL` to remove none.
#'
#' @return A list with three elements: `matrix`, the term-document matrix;
#'   `freqwords`, the terms returned by `findFreqTerms()` with `lowfreq = 4`;
#'   and `docs`, the corpus after all cleaning steps have been applied.
#'
#' @export
#'
text_mining <- function(text_directory,
                        lower_case = TRUE,
                        remove_numbers = TRUE,
                        remove_common_words = TRUE,
                        custom_stopwords = NULL) {
  # Resolve to an absolute path first: the working directory changes below,
  # and a relative path would otherwise point somewhere else afterwards
  # (the figures would be written to "<dir>/<dir>/...").
  text_directory <- normalizePath(text_directory, mustWork = TRUE)

  # The files are read relative to the directory; restore the caller's
  # working directory on exit so the function has no lasting side effect.
  old_wd <- setwd(text_directory)
  on.exit(setwd(old_wd), add = TRUE)

  # Match the real ".pdf" extension, regardless of case
  files <- list.files(pattern = "\\.pdf$", ignore.case = TRUE)
  if (length(files) == 0L) {
    stop("No PDF files found in ", text_directory, call. = FALSE)
  }

  # Read the PDF files into a corpus
  docs <- Corpus(
    URISource(files),
    readerControl = list(reader = readPDF)
  )

  # Convert the symbols "/", "@" and "|" to spaces
  to_space <- content_transformer(function(x) gsub("[/@|]", " ", x))
  docs <- tm_map(docs, to_space)

  if (lower_case) {
    docs <- tm_map(docs, content_transformer(tolower))
  }
  if (remove_numbers) {
    docs <- tm_map(docs, removeNumbers)
  }
  if (remove_common_words) {
    docs <- tm_map(docs, removeWords, stopwords("en"))
  }
  if (!is.null(custom_stopwords)) {
    docs <- tm_map(docs, removeWords, custom_stopwords)
  }

  # Punctuation is stripped only after the stop words, so that stop words
  # containing an apostrophe are still matched by removeWords.
  docs <- tm_map(docs, removePunctuation)
  docs <- tm_map(docs, stripWhitespace)

  # Term frequencies over all documents, most frequent first
  term_matrix <- TermDocumentMatrix(docs)
  term_counts <- sort(rowSums(as.matrix(term_matrix)), decreasing = TRUE)
  d <- data.frame(word = names(term_counts), freq = term_counts)

  output <- list(
    matrix = term_matrix,
    freqwords = findFreqTerms(term_matrix, lowfreq = 4),
    docs = docs
  )

  # Open a JPEG device, draw, and always close the device again, even when
  # the drawing code fails, so that no graphics device is leaked.
  save_jpeg <- function(file_name, draw) {
    jpeg(
      filename = file.path(text_directory, file_name),
      width = 130, height = 150,
      units = "mm", res = 1200
    )
    on.exit(dev.off(), add = TRUE)
    draw()
  }

  save_jpeg("word_cloud.jpg", function() {
    wordcloud(
      words = d$word, freq = d$freq, min.freq = 1,
      max.words = 200, random.order = FALSE, rot.per = 0.35,
      colors = brewer.pal(8, "Dark2")
    )
  })

  # Take at most ten rows: indexing 1:10 would add NA rows when the corpus
  # contains fewer than ten distinct terms.
  top_terms <- d[seq_len(min(10L, nrow(d))), ]
  save_jpeg("frequent_words.jpg", function() {
    barplot(
      top_terms$freq, las = 2, names.arg = top_terms$word,
      col = "lightblue", main = "Most frequent words",
      ylab = "Word frequencies"
    )
  })

  output
}