-
Notifications
You must be signed in to change notification settings - Fork 0
/
telegram-stats.py
132 lines (111 loc) · 4.81 KB
/
telegram-stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# https://medium.com/mcd-unison/telegram-group-chat-analysis-5b49446c2ec9
import re
from collections import Counter
import emoji
import numpy
import pandas as pd
from matplotlib import pyplot
from wordcloud import WordCloud, STOPWORDS
df = pd.read_json('./result.json', dtype={'from_id': str})
df = df[df.from_id != 'nan'] # This is a telegram service, likely updates
# Grab all unique Id's, and re-write the names.
from_ids = df['from_id'].unique()
for from_id in from_ids:
df.loc[df.from_id == from_id, 'from'] = df['from']
# Plot Stickers sent / Messages sent
df[['type','from']].groupby(['from']).count().sort_values(['type'], ascending=False)
df[['media_type', 'id']].groupby('media_type', as_index=False).count()
df[['type', 'id']].groupby('type', as_index=False).count()
sticker_df = df.loc[df['media_type'] == 'sticker'][['from', 'id']]\
.groupby(['from'], as_index=False)\
.agg('count')\
.sort_values(['id'], ascending=False)
messages_df = df.loc[df['type'] == 'message'][['from', 'id']]\
.groupby(['from'], as_index=False)\
.agg('count')\
.sort_values(['id'], ascending=False)
import plotly.express as px
fig = px.pie(sticker_df, hole=.5, values=sticker_df['id'], names=sticker_df['from'],
title='Stickers sent')
fig.update_traces(textposition='inside', textinfo='value+label+percent')
fig.show()
import plotly.express as px
fig = px.pie(messages_df, hole=.5, values=messages_df['id'], names=messages_df['from'],
title='Messages sent')
fig.update_traces(textposition='inside', textinfo='value+label+percent')
fig.show()
# how emojis and words are distributed and counted
def get_emojis_in_message(row):
message = row.text
emojis = ""
# Telegram may save some messages as json
if message is None or type(message) != str:
return None
return emojis.join(char for char in message if emoji.is_emoji(char))
def get_words_count(row):
message = row.text
# Telegram may save some messages as json
if message is None or type(message) != str:
return None
return re.sub("[^\w]", " ", message).split().__len__()
df["emojis"] = df[["text"]].apply(get_emojis_in_message, axis=1)
df["word_count"] = df[["text"]].apply(get_words_count, axis=1)
people = df['from'].unique()
for name in people:
user_df = df[df["from"] == name]
words_per_message = numpy.sum(user_df['word_count'])
print('stats for ', name)
print(name, ' sent ', int(words_per_message), ' words, average ', words_per_message/user_df.shape[0], ' per message')
# how the emoji distribution is looking
total_emojis_list = list(df.emojis)
emoji_dict = dict(Counter(total_emojis_list))
emoji_dict = sorted(emoji_dict.items(), key=lambda x: x[1], reverse=True)
emoji_df = pd.DataFrame(emoji_dict, columns=['emoji', 'count'])
emoji_df.replace(to_replace='None', value=numpy.nan).dropna()
emoji_df.replace(to_replace=0, value=numpy.nan).dropna()
import plotly.express as px
fig = px.pie(emoji_df.loc[2:].head(60), hole=.5, values='count', names='emoji',
title='Emoji Distribution')
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()
# word cloud
text_df = df.text.dropna()
text = " ".join(review for review in df.text.dropna() if review is not None and type(review) == str)
print ("There are {} words in all the messages.".format(len(text)))
stopwords = set(STOPWORDS)
stopwords.update(["pero", "en", "que", "lo", "de", "si", "con","se","tengo","por", "la", "el", "ya", "los", "es", "tiene", "como","mi","te","un","esta","del", "tu", "Yo","eso", "pue","para","las","porque","al","bueno","al","donde","ese","son","una","ese","sí","son","le","está","estaba","dice","creo"])
# Generate a word cloud image
wordcloud = WordCloud(stopwords=stopwords, background_color="white").generate(text)
# Display the generated image:
pyplot.figure( figsize=(10,5))
pyplot.imshow(wordcloud, interpolation='bilinear')
pyplot.axis("off")
pyplot.show()
# number of words shared as time moves on.
df["datetime"] = pd.to_datetime(df['date'])
df.index = df['datetime']
date_df = df.resample("D").sum()
date_df.reset_index(inplace=True)
fig = px.line(date_df, x="datetime", y="word_count", title='Number of words shared as time moves on.')
fig.update_xaxes(nticks=30)
fig.show()
# how active we were each day of the week
def dayofweek(i):
l = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
return l[i];
day_df=pd.DataFrame(df["word_count"])
day_df['day_of_date'] = df['datetime'].dt.weekday
day_df['day_of_date'] = day_df["day_of_date"].apply(dayofweek)
day_df["messagecount"] = 1
day = day_df.groupby("day_of_date").sum()
day.reset_index(inplace=True)
fig = px.line_polar(day, r='messagecount', theta='day_of_date', line_close=True)
fig.update_traces(fill='toself')
fig.update_layout(
polar=dict(
radialaxis=dict(
visible=True
)),
showlegend=False
)
fig.show()