lda_4.py
#%%
import pandas as pd
import nltk
#%%
# labelled.csv and remainder.csv together contain all of the tweets (including the training-set tweets we worked on), with the Ukraine/Russia war tweets filtered out
data1 = pd.read_csv('labelled.csv',header=None)
data2 = pd.read_csv('remainder.csv',header=None)
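# header=None makes pandas assign integer column names (0, 1, 2, ...),
# which is why columns are addressed by number below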
#%%
data = pd.concat([data1, data2], ignore_index=True)  # reset the index so the two frames don't carry duplicate row labels
#%%
data = data.iloc[:, 1]  # column 1 holds the raw tweet text
data.name = 'tweets'    # a Series has a name, not columns
# %% NLP PRE-PROCESSING
import re
cleaned_tweets = []
words = []
for tweet in data:
    # drop URLs, bracketed spans, hashtags/mentions, and newlines
    clean = re.sub(r"(http[s]?\://\S+)|([\[\(].*[\)\]])|([#@]\S+)|\n", " ", tweet)
    clean = re.sub(r"\d", '', clean)      # drop digits
    clean = re.sub(r"'\S+", '', clean)    # drop apostrophe suffixes ('s, 're, ...)
    clean = clean.replace('.', '').replace(';', '').lower()
    words += re.findall(r"(?:\w+|'|’)+", clean)
    cleaned_tweets.append(clean)
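# Illustrative example (a made-up tweet, not from the dataset): the first
# substitution turns "Gas prices up again https://t.co/x #fuel @user" into
# roughly "Gas prices up again" plus leftover whitespace.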
# removing other symbols
corpus = [[re.sub('[^a-zA-Z ]', ' ', document)] for document in cleaned_tweets]
# tokenizing (nltk.word_tokenize needs the 'punkt' tokenizer models)
nltk.download('punkt')
corpus_tokenized = [nltk.word_tokenize(document[0]) for document in corpus]
# stop words
#stopwords = nltk.corpus.stopwords.words("english")
#corpus_tokenized = [[word for word in document if word not in stopwords] for document in corpus_tokenized]
# lemmatizing, e.g. 'prices' -> 'price' (WordNet defaults to noun POS);
# instantiate the lemmatizer once instead of once per word
nltk.download('wordnet')
lemmatizer = nltk.WordNetLemmatizer()
corpus_lemmatized = [[lemmatizer.lemmatize(word) for word in document] for document in corpus_tokenized]
# stitching back together (kept for reference; the gensim steps below work
# directly from corpus_lemmatized)
corpus = [' '.join(document) for document in corpus_lemmatized]
#%% GENSIM LDA
import gensim
from gensim import corpora
import matplotlib.pyplot as plt
#%% encoding
dictionary = corpora.Dictionary(corpus_lemmatized)
doc_term_matrix = [dictionary.doc2bow(rev) for rev in corpus_lemmatized]
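# doc2bow encodes each tweet as a sparse bag-of-words list of
# (token_id, count) pairs, e.g. [(0, 1), (5, 2)] for one occurrence of
# token 0 and two of token 5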
#%%
# Creating the object for LDA model using gensim library
LDA = gensim.models.ldamodel.LdaModel
# Build LDA model (2 topic)
lda_model = LDA(corpus=doc_term_matrix, id2word=dictionary, num_topics=2,
                random_state=100, chunksize=1000, passes=50, iterations=100)
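# chunksize = documents per training chunk, passes = full sweeps over the
# corpus, iterations = max inference iterations per document;
# random_state pins the seed so topic assignments are reproducible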
#%% topic display
posterior = lda_model.print_topics()        # list of (topic_id, 'p1*"w1" + ...') pairs
two_topic_LDA = pd.DataFrame(posterior)[1]  # keep just the word-weight strings
two_topic_LDA.index = ['topic ' + str(i) for i in range(2)]
two_topic_LDA.name = 'words'
print(two_topic_LDA)
#%% Perplexity and coherence scores (u_mass coherence)
print('\nPerplexity: ', lda_model.log_perplexity(doc_term_matrix, total_docs=10000))  # per-word likelihood bound; the corresponding perplexity is 2**(-bound), and lower perplexity is better
# Compute Coherence Score
from gensim.models.coherencemodel import CoherenceModel
coherence_model_lda = CoherenceModel(model=lda_model, texts=corpus_lemmatized, dictionary=dictionary , coherence='u_mass')
if __name__ == "__main__":
    # CoherenceModel can spawn worker processes, so guard the call
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)
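# u_mass coherence is typically negative; scores closer to zero usually
# indicate more coherent topics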
#%% GRAPH FUNCTION
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute u_mass coherence for various numbers of topics.

    Parameters
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : list of tokenized input texts
    limit : max number of topics

    Returns
    -------
    model_list : list of LDA topic models
    coherence_values : coherence value of the LDA model for each number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = LDA(corpus=corpus, num_topics=num_topics, id2word=dictionary)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='u_mass')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values
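# Note: the models in this sweep use gensim's default passes/iterations
# (far fewer than the 2-topic model above), so the sweep runs quickly but
# each individual fit is rougher.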
#%% list of coherence scores for different topic numbers
model_list, coherence_values = compute_coherence_values(dictionary=dictionary, corpus=doc_term_matrix, texts=corpus_lemmatized, start=2, limit=50, step=1)
#%%
# Show graph
limit = 50; start = 2; step = 1
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(["coherence_values"], loc='best')
plt.show()
#%%
# Print the coherence scores
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))
#%% the interactive visualization
import pyLDAvis
import pyLDAvis.gensim_models
vis_data = pyLDAvis.gensim_models.prepare(lda_model, doc_term_matrix, dictionary)
pyLDAvis.show(vis_data, open_browser=False, local=False)
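# Alternatively, pyLDAvis.save_html(vis_data, 'lda_vis.html') writes the
# same visualization to a standalone HTML file.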
#%% MATRIX TIME, THIS OUTPUT WILL GO INTO K-NN
# get_document_topics returns [(topic_id, probability), ...] for each tweet;
# minimum_probability=0 keeps both topics even when one probability is tiny
tweet_vectors = [lda_model.get_document_topics(bow, minimum_probability=0,
                                               minimum_phi_value=None, per_word_topics=False)
                 for bow in doc_term_matrix]
#%%
tweet_vectors_entries = [[tv[0][1], tv[1][1]] for tv in tweet_vectors]
LDA_tweet_frame = pd.DataFrame(tweet_vectors_entries, columns=['Topic 0', 'Topic 1'])
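# Each tweet is now a 2-d point (P(topic 0), P(topic 1)); these per-tweet
# topic mixtures are the feature vectors the k-NN step will consume.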
#%%
labelled = LDA_tweet_frame.iloc[:1175, :]   # the first 1175 rows came from labelled.csv
remainder = LDA_tweet_frame.iloc[1175:, :]
#%%
labelled = labelled.join(data1[2])  # column 2 of labelled.csv holds the hand-coded label
#%%
labelled.columns = ['Topic 0','Topic 1','label']
#%%
labelled.to_csv('labelled_LDA_vectors_withStopwords.csv',header=None)
remainder.to_csv('remainder_LDA_vectors_withStopwords.csv',header=None)