-
Notifications
You must be signed in to change notification settings - Fork 1
/
DeepNN.py
82 lines (63 loc) · 2.75 KB
/
DeepNN.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# Basic packages
import pandas as pd
import numpy as np
import re
import collections
import matplotlib.pyplot as plt
from wordcloud import WordCloud,STOPWORDS
# Packages for data preparation
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder
# Packages for modeling
from keras import models
from keras import layers
from keras import regularizers
# Importing the dataset
tweets_df = pd.read_csv('Tweets.csv')
tweets_df = tweets_df[['text', 'airline_sentiment']]
def remove_stopwords(input_text):
stopwords_list = stopwords.words('english')
# Some words which might indicate a certain sentiment are kept via a whitelist
whitelist = ["n't", "not", "no"]
words = input_text.split()
clean_words = [word for word in words if (word not in stopwords_list or word in whitelist) and len(word) > 1]
return " ".join(clean_words)
def remove_mentions(input_text):
return re.sub(r'@\w+', '', input_text)
tweets_df.text = tweets_df.text.apply(remove_stopwords).apply(remove_mentions)
X_train, X_test, y_train, y_test = train_test_split(tweets_df.text, tweets_df.airline_sentiment, test_size=0.1, random_state=37)
tk = Tokenizer(num_words= 10000,
filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
lower=True,
split=" ")
tk.fit_on_texts(X_train)
X_train_seq = tk.texts_to_sequences(X_train)
X_test_seq = tk.texts_to_sequences(X_test)
def one_hot_seq(seqs, nb_features = 10000):
ohs = np.zeros((len(seqs), nb_features))
for i, s in enumerate(seqs):
ohs[i, s] = 1.
return ohs
X_train_oh = one_hot_seq(X_train_seq)
X_test_oh = one_hot_seq(X_test_seq)
le = LabelEncoder()
y_train_le = le.fit_transform(y_train)
y_test_le = le.transform(y_test)
y_train_oh = to_categorical(y_train_le)
y_test_oh = to_categorical(y_test_le)
X_train_rest, X_valid, X_train_rest, y_valid = train_test_split(X_train_oh, y_train_oh, test_size=0.1, random_state=37)
assert X_valid.shape[0] == y_valid.shape[0]
assert X_train_rest.shape[0] == y_train_rest.shape[0]
drop_model = models.Sequential()
drop_model.add(layers.Dense(64, init = 'uniform', activation='relu', input_shape=(10000,)))
drop_model.add(layers.Dropout(0.5))
drop_model.add(layers.Dense(64,init = 'uniform', activation='relu'))
drop_model.add(layers.Dropout(0.5))
drop_model.add(layers.Dense(3, activation='softmax'))
#drop_model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
print(drop_model.summary())
drop_model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
drop_model.fit(X_train_rest,y_train_rest, batch_size = 64, nb_epoch = 5)