lda_2.py
#%%
import re

import pandas as pd
import nltk

# One-time downloads for the NLTK resources used below
# (punkt for word_tokenize, stopwords and wordnet for the corpora).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
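# The script assumes a DataFrame named `dataremoved` with a 'tweet' column of
# raw tweet text; it is never built in this file. A hypothetical loading step
# (the file name and format are assumptions, not part of the original) might
# look like:
# dataremoved = pd.read_json('tweets.json', lines=True)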
#%% NLP PRE-PROCESSING
cleaned_tweets = []
words = []
for tweet in dataremoved['tweet']:
    # Strip URLs, bracketed spans, hashtags/mentions, and newlines.
    clean = re.sub(r"(https?://\S+)|([\[\(].*[\)\]])|([#@]\S+)|\n", " ", tweet)
    # Drop digits and apostrophe suffixes (e.g. "'s", "'re").
    clean = re.sub(r"\d", '', clean)
    clean = re.sub(r"'\S+", '', clean)
    # Remove periods and semicolons, then lowercase.
    clean = clean.replace('.', '').replace(';', '').lower()
    words += re.findall(r"(?:\w+|'|’)+", clean)
    cleaned_tweets.append(clean)
# Replace any remaining non-alphabetic characters with spaces.
corpus = [re.sub('[^a-zA-Z ]', ' ', document) for document in cleaned_tweets]
# Tokenize each cleaned tweet into words.
corpus_tokenized = [nltk.word_tokenize(document) for document in corpus]
# Remove English stop words.
stopwords = nltk.corpus.stopwords.words("english")
corpus_tokenized = [[word for word in document if word not in stopwords]
                    for document in corpus_tokenized]
# Lemmatize, reusing one WordNetLemmatizer instead of building one per word.
lemmatizer = nltk.WordNetLemmatizer()
corpus_lemmatized = [[lemmatizer.lemmatize(word) for word in document]
                     for document in corpus_tokenized]
# Stitch the tokens back into one string per tweet.
corpus = [' '.join(document) for document in corpus_lemmatized]
#%% VECTORIZING CORPUS
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Binarizer

# Bag-of-words counts, then binarized to word presence/absence per tweet.
vec = CountVectorizer(tokenizer=nltk.word_tokenize)
freq = vec.fit_transform(corpus)
ohot = Binarizer().fit_transform(freq)
corpus_binary = np.asarray(ohot.todense())
#%% LDA
from sklearn.decomposition import LatentDirichletAllocation

ntopics = 2
# random_state pins the stochastic online updates so runs are reproducible.
lda = LatentDirichletAllocation(n_components=ntopics,
                                learning_method='online',
                                random_state=0)
lda.fit(corpus_binary)
# posterior[i, k] is the estimated proportion of topic k in tweet i.
posterior = lda.transform(corpus_binary)
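#%% Quick sanity check (an optional sketch, not in the original script):
# assign each tweet its dominant topic and count how many land in each.
dominant_topic = posterior.argmax(axis=1)
print(pd.Series(dominant_topic).value_counts())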
#%% INSPECTING THE WORD-TOPIC MATRIX
# lda.components_ has shape (ntopics, n_words); transpose to one row per word
# and normalize each row so the columns give a word's relative weight per topic.
wordTopics = pd.DataFrame(lda.components_.T, index=vec.get_feature_names_out())
wordTopics = wordTopics.apply(lambda x: x / sum(x), axis=1)
wordTopics = wordTopics.reset_index()
wordTopics.columns = ['word'] + ['topic ' + str(i) for i in range(ntopics)]
# Top ten words per topic. Note iloc[:10] rather than the original iloc[1:10],
# which silently dropped the highest-ranked word.
print(wordTopics.sort_values(by='topic 0', ascending=False)['word'].iloc[:10])
print(wordTopics.sort_values(by='topic 1', ascending=False)['word'].iloc[:10])
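#%% An alternative view (a sketch, not part of the original script): rank words
# within each topic directly from lda.components_, the way topic-model top
# words are usually reported, without the per-word row normalization above.
feature_names = vec.get_feature_names_out()
for k, topic in enumerate(lda.components_):
    top = topic.argsort()[::-1][:10]
    print('topic', k, ':', ', '.join(feature_names[i] for i in top))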