-
Notifications
You must be signed in to change notification settings - Fork 1
/
lexicon.py
53 lines (44 loc) · 1.38 KB
/
lexicon.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import string
from collections import Counter
# Build a word-frequency lexicon from a text corpus.
#
# The corpus is lower-cased, stripped of ASCII punctuation and split into
# whitespace-separated words. Two reports are written:
#   * output_filename      -- one "word: count" line per distinct word,
#                             most frequent first;
#   * lexicon_counts_ouput -- for every occurrence count, how many distinct
#                             words have that count and what share of the
#                             vocabulary they represent.
#
# The code runs unchanged on Python 2.7 and Python 3.

data_filename = "texts/all.txt"
output_filename = "lexicon.txt"
# NOTE: the misspelling ("ouput") is kept so the existing name stays valid.
lexicon_counts_ouput = "lex_counts.txt"

# Characters removed from the corpus before it is split into words.
_PUNCTUATION = frozenset(string.punctuation)


def _tokenize(text):
    """Return the lower-cased, punctuation-free words found in *text*."""
    cleaned = ''.join(ch for ch in text.lower() if ch not in _PUNCTUATION)
    # split() without an argument splits on any run of whitespace, so
    # newlines and tabs separate words and no empty-string "words" are
    # produced (split(" ") did neither).
    return cleaned.split()


def _rank_by_frequency(word_map):
    """Return (word, count) pairs, most frequent first.

    Words with equal counts are ordered alphabetically so the output does
    not depend on dictionary iteration order.
    """
    return sorted(word_map.items(), key=lambda item: (-item[1], item[0]))


def _frequency_histogram(word_map):
    """Return (occurrences, number_of_words) pairs sorted by occurrences.

    Each pair says how many distinct words appear exactly *occurrences*
    times in the corpus.
    """
    return sorted(Counter(word_map.values()).items())


def main(data_path=data_filename, lexicon_path=output_filename,
         counts_path=lexicon_counts_ouput):
    """Read the corpus at *data_path* and write both frequency reports.

    Args:
        data_path: text file to analyse.
        lexicon_path: destination of the "word: count" listing.
        counts_path: destination of the occurrence-count summary.

    The number of unique words and every summary line are also printed to
    standard output.
    """
    with open(data_path, 'r') as f:
        word_map = Counter(_tokenize(f.read()))

    unique_words = len(word_map)
    print("Unique words: %d" % unique_words)

    # Text mode: the file receives str data, not bytes.
    with open(lexicon_path, 'w') as output:
        for word, count in _rank_by_frequency(word_map):
            output.write(word + ": " + str(count) + "\n")

    with open(counts_path, 'w') as output:
        output.write("# (number of time word appears): (number of words in this category), (percentage of total words)%\n")
        # For an empty corpus the histogram is empty, so the division below
        # is never reached with unique_words == 0.
        for occurrences, num_words in _frequency_histogram(word_map):
            # float() keeps the division exact on Python 2 as well.
            cur_str = "%s words: %d, %f%%" % (
                occurrences, num_words,
                num_words / float(unique_words) * 100)
            print(cur_str)
            output.write(cur_str + "\n")


if __name__ == "__main__":
    main()