# messenger.py
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 21 15:08:39 2016
@author: tibicen
"""
import time
import string
import os
from bs4 import BeautifulSoup
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill
# Console progress marker: printed once all of the imports above have run.
print('imported.')
def splitFile(source='messages.htm', linesPerPart=130):
    '''Split a text file into numbered part files that are easier to inspect.

    Inspection aid only; it is not part of the word-counting pipeline.

    Consecutive runs of ``linesPerPart`` lines read from ``source`` are
    written to 'messages0.htm', 'messages1.htm', ... in the current working
    directory.  The last part holds whatever lines remain; no part file is
    written when nothing remains (empty input, or a line count that is an
    exact multiple of ``linesPerPart``).

    Args:
        source: path of the UTF-8 encoded file to split.
        linesPerPart: maximum number of lines per part file (at least 1).

    Raises:
        ValueError: if ``linesPerPart`` is smaller than 1.
    '''
    if linesPerPart < 1:
        raise ValueError('linesPerPart must be at least 1')

    def writePart(index, lines):
        # The context manager closes the part file even if the write fails.
        with open('messages%d.htm' % index, 'w', encoding='UTF-8') as part:
            part.writelines(lines)

    partNr = 0
    pending = []
    with open(source, 'r', encoding='UTF-8') as f:
        for line in f:
            pending.append(line)
            if len(pending) == linesPerPart:
                writePart(partNr, pending)
                partNr += 1
                pending = []
    # Flush the remainder, but never create an empty trailing part file.
    if pending:
        writePart(partNr, pending)
def splitIntoTxtFiles():
    '''Split 'messages.htm' into one text file per conversation partner.

    The file is parsed with BeautifulSoup (lxml parser).  Inside every
    ``<div class="thread">`` the descendants are flattened into a list; for
    each element carrying the CSS class 'user', the node directly after it is
    used as the grouping key (presumably the sender's name) and the node five
    positions after it as the message text.  Each group is written to
    '<key>.txt', one message per line: into 'randomConversations' when the
    key starts with a digit, into 'realPersons' otherwise.  Both directories
    are created in the current working directory when missing.

    Progress and timing of each stage are printed to stdout.
    '''
    # %% OPENING
    print('opening file...', end='\t')
    t1 = time.time()
    with open('messages.htm', 'r', encoding='UTF-8') as f:
        # One read instead of concatenating line by line.
        html = f.read()
    t2 = time.time()
    print('file opened in %.2f.' % (t2 - t1))
    # %% CREATING SOUP
    print("creating soup...", end='\t')
    soup = BeautifulSoup(html, 'lxml')
    t3 = time.time()
    del html  # the raw text is no longer needed once it is parsed
    print('text souped in %.2f.' % (t3 - t2))
    # %% CREATING DICT
    print('creating dict...', end='\t')
    for folder in ('realPersons', 'randomConversations'):
        os.makedirs(folder, exist_ok=True)
    usersDict = {}
    for thread in soup.find_all('div', {'class': 'thread'}):
        nodes = list(thread.descendants)
        # The last seven nodes of a thread are never treated as user markers
        # (cut-off kept from the original; only offsets +1 and +5 are read).
        for n, node in enumerate(nodes[:-7]):
            try:
                isUser = (node.has_attr('class') and
                          'user' in node.attrs['class'])
            except (AttributeError, TypeError):
                # Nodes without has_attr() (plain text) cannot be markers.
                continue
            if not isUser:
                continue
            name = nodes[n + 1]
            message = nodes[n + 5]
            # Only text nodes are usable; skip when an offset lands on a tag.
            if not (isinstance(name, str) and isinstance(message, str)):
                continue
            # str() keeps plain strings as keys rather than parse-tree nodes.
            usersDict.setdefault(str(name), []).append(message + '\n')
    t4 = time.time()
    print('dict created in %.2f.' % (t4 - t3))
    # %% SAVING
    print('saving...', end='\t\t')
    digits = tuple(string.digits)
    for user, messages in usersDict.items():
        if user.startswith(digits):
            folder = 'randomConversations'
        else:
            folder = 'realPersons'
        # NOTE(review): the key is used verbatim as a file name; a key that
        # contains a path separator would make this open() fail.
        path = os.path.join(folder, user + '.txt')
        with open(path, 'w', encoding='UTF-8') as out:
            out.writelines(messages)
    t5 = time.time()
    print('saved in %.2f.' % (t5 - t4))
# %%
# Tokens that are never counted: single ASCII letters, the empty string left
# behind by consecutive separators, and common filler words.
# PUT YOUR OWN WORDS HERE
_BAD_WORDS = frozenset(string.ascii_lowercase) | frozenset([
    '', 'ma', 'w', 'i', 'na', 'z', 'a', 'bo', 'o', 'za', 'ze',
    'od', 'po', 'pod', 'no', 'do', 'co', 'że', 'jak',
    'czy', 'sie', 'już', 'to', 'się', 'też', 'coś', 'żeby',
    'są', 'we', 'te', 'ale', 'więc', 'tym', 'tam', 'com', 'http',
    'https', 'www', 'dla', 'pl', 'at', 'and', 'of', 'in', 'for', 'so',
    'am',
])


def getWordsDict(ppl, filename):
    '''Count the words of one person's text file and record them in ``ppl``.

    The file 'realPersons/<filename>' is read as UTF-8 and lower-cased; ASCII
    digits, punctuation and whitespace all act as word separators.  Words
    listed in ``_BAD_WORDS`` are ignored.

    Args:
        ppl: dict mapping a person's name to a ``{word: count}`` dict.  The
            entry for this person is replaced, and every counted word is also
            added to ``ppl['all']`` (created when missing).
        filename: file name inside 'realPersons'; the person's name is the
            file name without its '.txt' suffix.
    '''
    # Remove the suffix itself: str.rstrip('.txt') strips any trailing
    # '.', 't' or 'x' characters and would turn 'matt.txt' into 'ma'.
    suffix = '.txt'
    if filename.endswith(suffix):
        personName = filename[:-len(suffix)]
    else:
        personName = filename
    path = os.path.join('realPersons', filename)
    with open(path, 'r', encoding='UTF-8') as f:
        text = f.read().lower()
    # Turn every separator character into a space in a single pass.
    nonwords = string.digits + string.punctuation + string.whitespace
    text = text.translate(str.maketrans(nonwords, ' ' * len(nonwords)))
    personWords = ppl[personName] = {}
    allWords = ppl.setdefault('all', {})
    for w in text.split(' '):
        if w in _BAD_WORDS:
            continue
        personWords[w] = personWords.get(w, 0) + 1
        allWords[w] = allWords.get(w, 0) + 1
# %%
def createXLS(ppl, filename='ppl.xls'):
    '''Write every person's words to a spreadsheet, one column per person.

    Columns are ordered by the number of distinct words (largest first).
    Row 1 of a column holds the person's name; the rows below list that
    person's words from most to least frequent.  Selected words are
    highlighted with a background colour, all other words get a grey font.

    Args:
        ppl: dict mapping a name to a ``{word: count}`` dict.
        filename: path of the workbook to save.

    NOTE(review): openpyxl writes the Office Open XML (.xlsx) format whatever
    the extension is, so the default 'ppl.xls' name does not match the file
    content.  The default is kept for backward compatibility; pass a '.xlsx'
    name to get a matching extension.
    '''
    # CUSTOMIZE ALL THE WORDS FOR YOUR CONVINIENCE
    # (words, fill colour, font colour); earlier groups win on duplicates.
    groups = (
        (('nie',), 'fc5834', '000000'),
        (('tak',), 'a1c870', '000000'),
        (('ja', 'mi', 'mam', 'mnie', 'jestem', 'wiem',
          'bede', 'będę', 'mogę', 'chce', 'mialam',
          'bym', 'sam', 'mój', 'chcę', 'moje', 'mną',
          'chciałem', 'my', 'i'), '7ee0e9', 'ffffff'),
        (('ci', 'ty', 'masz', 'ciebie', 'jesteś',
          'możesz', 'chcesz', 'cię', 'będziesz', 'you'), 'd57ee3', 'ffffff'),
        (('mu', 'jej', 'on', 'ten', 'go', 'ona'), '7b89e8', 'ffffff'),
    )
    # Build each style once and share it between cells instead of creating
    # new PatternFill/Font objects for every single word.
    highlight = {}
    for groupWords, fillColor, fontColor in groups:
        style = (PatternFill(fill_type='solid', start_color=fillColor),
                 Font(color=fontColor))
        for word in groupWords:
            highlight.setdefault(word, style)
    plainFont = Font(color='5c5c5c')

    wb = Workbook()
    sh = wb.active
    # type of sorting here
    columns = sorted(ppl.items(), key=lambda item: len(item[1]), reverse=True)
    for col, (name, counts) in enumerate(columns, start=1):
        sh.cell(row=1, column=col).value = name
        ranked = sorted(counts.items(), key=lambda item: item[1],
                        reverse=True)
        for row, (word, _) in enumerate(ranked, start=2):
            cell = sh.cell(row=row, column=col)
            cell.value = word
            style = highlight.get(word)
            if style is None:
                cell.font = plainFont
            else:
                cell.fill, cell.font = style
    wb.save(filename)
if __name__ == '__main__':
    # The per-person text files are the input of the word count.  Build them
    # from 'messages.htm' when they are missing instead of failing on the
    # directory listing below.
    if not os.path.isdir('realPersons'):
        splitIntoTxtFiles()
    ppl = {'all': {}}  # ppl['person name'] = {'word1': wordCountNr}
    # Sorted so the processing order does not depend on the file system.
    files = sorted(os.listdir('realPersons'))
    # populate ppl dict with people records and words
    t1 = time.time()
    print('populating ppl dict...', end='\t')
    for filename in files:
        getWordsDict(ppl, filename)
    t2 = time.time()
    print('done in %.2f.' % (t2 - t1))
    print('creating xls file...', end='\t')
    createXLS(ppl)
    t3 = time.time()
    print('done and saved in %.2f.' % (t3 - t2))