-
Notifications
You must be signed in to change notification settings - Fork 0
/
Twitter_Sentiment_Analysis.py
152 lines (122 loc) · 4.54 KB
/
Twitter_Sentiment_Analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
# coding: utf-8
# In[ ]:
#import other libraries
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
# import SQLContext and data types
from pyspark.sql import SQLContext
from pyspark.sql.types import *
# `sc` is the SparkContext; it is not created in this file and must already
# exist in the running session.
sqlContext = SQLContext(sc)

# Load the stored tweets from the parquet file (per the original note, the
# file holds the tweets collected over the last 30 seconds) and expose them
# to Spark SQL as the cached temporary table "tweets".
tweet_data = sqlContext.read.parquet(
    "swift://notebooks.spark/tweetsFull.parquet"
)
tweet_data.registerTempTable("tweets")
sqlContext.cacheTable("tweets")

# DataFrame over the whole table; cached because every later cell reuses it.
tweets = sqlContext.sql("SELECT * FROM tweets")
tweets.cache()
# In[ ]:
# One counter per tone column: how many tweets score above 60 for that tone.
# The tone columns are taken to be the last nine columns of the DataFrame.
sentiment_columns = tweets.columns[-9:]
sentimentDistribution = [0] * 9
for position, column_name in enumerate(sentiment_columns):
    query = "SELECT count(*) as sentCount FROM tweets where " + column_name + " > 60"
    # The query returns exactly one row carrying the count.
    first_row = sqlContext.sql(query).collect()[0]
    sentimentDistribution[position] = first_row.sentCount
# In[ ]:
get_ipython().magic('matplotlib inline')

# One green bar per tone column, showing the counts gathered above.
ind = np.arange(9)
width = 0.35
bar = plt.bar(ind, sentimentDistribution, width, color='g', label="distributions")

# Enlarge the current figure: 2.5x wider and 2x taller than the default.
figure = plt.gcf()
base_width, base_height = figure.get_size_inches()
figure.set_size_inches((base_width * 2.5, base_height * 2))

# Axis labels, title and one tick label per tone column.
plt.xlabel('Sentiment Tone')
plt.ylabel('Count of Tweets')
plt.title('Distribution of tweets by sentiments > 60%')
plt.xticks(ind + width, tweets.columns[-9:])
plt.legend()
plt.show()
# In[ ]:
# Count how often each hashtag occurs across all tweets and keep the ten
# most frequent ones as (hashtag, count) pairs.
from operator import add
import re


def _hashtags_in(text):
    """Return the whitespace-delimited tokens of *text* that start with '#'.

    A missing text (None or empty string) yields an empty list instead of
    raising inside the Spark job.
    """
    if not text:
        return []
    # Split on whitespace; the pattern used to be the literal letter "s",
    # which chopped words apart instead of separating them.
    return [word for word in re.split(r"\s+", text) if word.startswith("#")]


# `.rdd` is used explicitly so the pipeline does not depend on the
# DataFrame.flatMap shortcut that only exists in Spark 1.x.
rdd = (
    tweets.rdd
    .flatMap(lambda t: _hashtags_in(t.text))
    .map(lambda word: (word, 1))
    .reduceByKey(add, 10)
    # Most frequent hashtag first. Sorting on the count directly replaces
    # the swap / sortByKey / swap-back sequence, which relied on
    # tuple-unpacking lambdas that are a syntax error on Python 3.
    .sortBy(lambda pair: pair[1], ascending=False)
)
top10_tweets_with_tag = rdd.take(10)
# In[ ]:
# Pie chart of the most frequent hashtags, one wedge per (hashtag, count).
get_ipython().magic('matplotlib inline')

# Double the size of the current figure in both directions.
params = plt.gcf()
plSize = params.get_size_inches()
params.set_size_inches((plSize[0] * 2, plSize[1] * 2))

labels = [entry[0] for entry in top10_tweets_with_tag]
sizes = [int(entry[1]) for entry in top10_tweets_with_tag]
# Ten colours for up to ten wedges. With only nine, matplotlib cycles the
# list and the first and last wedges (which touch) share a colour.
colors = [
    'yellowgreen', 'gold', 'lightskyblue', 'lightcoral', "beige",
    "paleturquoise", "pink", "lightyellow", "coral", "plum",
]
plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', shadow=True, startangle=90)
# Equal aspect ratio so the pie is drawn as a circle.
plt.axis('equal')
plt.show()
# In[ ]:
# Names of the last nine DataFrame columns; expand() reads one attribute per
# name from each tweet row and treats it as a tone score.
cols = tweets.columns[-9:]
def expand(t):
    """Return one "tag-tone:score" string per top hashtag and tone for a tweet.

    For every hashtag in the module-level ``top10_tweets_with_tag`` whose
    text occurs in ``t.text``, one string is produced for each tone name in
    the module-level ``cols``; the score is the attribute of ``t`` with that
    tone name.

    Args:
        t: a tweet row exposing ``text`` plus one attribute per name in
            ``cols``.

    Returns:
        A list of strings shaped ``<hashtag>-<tone>:<score>``; empty when
        none of the hashtags occurs in the tweet text.
    """
    tagged_scores = []
    for entry in top10_tweets_with_tag:
        tag = entry[0]
        # NOTE: this is a substring test, so "#big" also matches a tweet
        # that only contains "#bigdata".
        if tag not in t.text:
            continue
        for tone in cols:
            # str.format replaces the Python-2-only unicode() builtin and
            # renders the same text on both Python 2 and Python 3.
            tagged_scores.append(
                u"{0}-{1}:{2}".format(tag, tone, getattr(t, tone))
            )
    return tagged_scores
def makeList(l):
    """Return *l* itself when it is already a list, otherwise ``[l]``.

    Lets a reduce step concatenate values with ``+`` whether an operand is
    still a single item or an already accumulated list.

    Args:
        l: any value.

    Returns:
        The same list object when *l* is a list; otherwise a new
        one-element list holding *l*.
    """
    if isinstance(l, list):
        return l
    return [l]
# Build, for every top hashtag, the mean score of each tone over the tweets
# that mention the hashtag.

# Hashtag texts only; computed once instead of once per tweet in the filter.
top_tags = [entry[0] for entry in top10_tweets_with_tag]


def _parse_tagged_score(fullTag):
    """Split a "tag-tone:score" string into ("tag-tone", float(score)).

    Splits on the LAST ':' so a hashtag that itself contains ':' is kept
    intact.
    """
    key, score = fullTag.rsplit(":", 1)
    return (key, float(score))


def _regroup_by_tag(pair):
    """Turn ("tag-tone", (total, count)) into (tag, (tone, mean)).

    Splits on the LAST '-' so hashtags such as "#COVID-19" survive; this
    assumes tone names contain no '-'. The mean is rounded to two decimals.
    """
    key, (total, count) = pair
    tag, tone = key.rsplit("-", 1)
    return (tag, (tone, round(total / count, 2)))


# Work on the row RDD directly; DataFrame.map is not available outside
# Spark 1.x.
rdd = tweets.rdd
# Keep only the tweets that mention at least one of the top hashtags.
rdd = rdd.filter(lambda t: any(tag in t.text for tag in top_tags))
# One "tag-tone:score" string per (hashtag, tone) of each remaining tweet.
rdd = rdd.flatMap(expand)
rdd = rdd.map(_parse_tagged_score)
# Accumulate (sum of scores, number of scores) per "tag-tone" key.
rdd = rdd.combineByKey(
    lambda score: (score, 1),
    lambda acc, score: (acc[0] + score, acc[1] + 1),
    lambda left, right: (left[0] + right[0], left[1] + right[1]),
)
# Re-key by hashtag with (tone, mean score) as the value.
rdd = rdd.map(_regroup_by_tag)
# Collect all (tone, mean) pairs of a hashtag into one list.
rdd = rdd.reduceByKey(lambda x, y: makeList(x) + makeList(y))
# Sort the pairs so every hashtag lists its tones in the same order.
rdd = rdd.mapValues(sorted)
# Split the pairs into parallel lists: ([tones], [mean scores]).
rdd = rdd.mapValues(
    lambda pairs: ([p[0] for p in pairs], [p[1] for p in pairs])
)
#sorting the tweet entries
def customCompare(key):
    """Return the tweet count recorded for hashtag *key*.

    Looks *key* up in the module-level ``top10_tweets_with_tag`` list of
    (hashtag, count) pairs. Intended as a sort key function.

    Args:
        key: the hashtag text to look up.

    Returns:
        The count stored with the first matching hashtag, or 0 when *key*
        is not in the list.
    """
    for (k, v) in top10_tweets_with_tag:
        if k == key:
            return v
    # Unknown hashtags fall through to 0.
    return 0
# Order the hashtags by the value customCompare returns for them, largest
# first.
rdd = rdd.sortByKey(ascending=False, keyfunc=customCompare)
# Keep at most ten (hashtag, ([tones], [mean scores])) entries.
top10_tweets_with_tagMeanScores = rdd.take(10)
# In[ ]:
# Grouped bar chart: for each of the first five hashtags, one bar per tone
# showing the mean score.
get_ipython().magic('matplotlib inline')

# Enlarge the current figure: 3x wider and 2x taller.
params = plt.gcf()
plSize = params.get_size_inches()
params.set_size_inches((plSize[0] * 3, plSize[1] * 2))

# The mean scores are stored as top10_tweets_with_tagMeanScores; the name
# previously used here (top10tagsMeanScores) is never defined and raised a
# NameError.
top5tagsMeanScores = top10_tweets_with_tagMeanScores[:5]

# Every entry is (hashtag, ([tone names], [mean scores])). The tone names of
# the first entry label the x axis; with no entries the chart stays empty
# instead of failing on the [0] lookup.
labels = top5tagsMeanScores[0][1][0] if top5tagsMeanScores else []
ind = np.arange(len(labels))
bar_width = 0.15
colors = ["beige", "paleturquoise", "pink", "lightyellow", "coral", "lightgreen", "gainsboro", "aquamarine", "c"]

# Each hashtag's bars are shifted right by one bar width so the hashtags sit
# side by side within every tone group.
for idx, (key, value) in enumerate(top5tagsMeanScores):
    plt.bar(ind + idx * bar_width, value[1], bar_width, color=colors[idx], label=key)

plt.xticks(ind + 0.3, labels)
plt.ylabel('Average Score')
plt.xlabel('Tones')
plt.title('Breakdown of top hashtags by sentiment tones')
plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc='center', ncol=5, mode="expand", borderaxespad=0.)
plt.show()