3. bag.py
# importing regex and nltk
import re, nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# first run may require downloading NLTK data:
# nltk.download('punkt'); nltk.download('stopwords'); nltk.download('wordnet')
# importing Counter to get word counts for bag of words
from collections import Counter
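# Counter tallies hashable items, e.g.:
#   Counter(["a", "b", "a"]) -> Counter({'a': 2, 'b': 1})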
# importing a passage from Through the Looking Glass
from looking_glass import looking_glass_text
# importing part-of-speech function for lemmatization
from part_of_speech import get_part_of_speech
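# get_part_of_speech ships with the exercise; a minimal sketch of the idea
# (assumed, not the module's exact code) picks the most common WordNet POS
# tag for a word so lemmatize() gets a useful hint:
#
#   from nltk.corpus import wordnet
#   def get_part_of_speech(word):
#       # tally the POS tag of every WordNet synset for the word
#       pos_counts = Counter(synset.pos() for synset in wordnet.synsets(word))
#       # default to noun when WordNet has no entry for the word
#       return pos_counts.most_common(1)[0][0] if pos_counts else 'n'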
# Change text to another string:
text = "I just doing math today"
# strip punctuation and lowercase the text
cleaned = re.sub(r'\W+', ' ', text).lower()
# split the cleaned text into word tokens
tokenized = word_tokenize(cleaned)
# filter out common English stopwords
stop_words = stopwords.words('english')
filtered = [word for word in tokenized if word not in stop_words]
# lemmatize each remaining token, using its most likely part of speech
normalizer = WordNetLemmatizer()
normalized = [normalizer.lemmatize(token, get_part_of_speech(token)) for token in filtered]
# Comment out the print statement below
# print(normalized)
# Define bag_of_looking_glass_words & print:
bag_of_looking_glass_words = Counter(normalized)
print(bag_of_looking_glass_words)
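# With the sample text above, the stopword filter should drop "i", "am",
# "just", and "doing" (all NLTK English stopwords), so the expected output
# is roughly: Counter({'math': 1, 'today': 1})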