forked from FakeNewsChallenge/fnc-1-baseline
-
Notifications
You must be signed in to change notification settings - Fork 0
/
fnc_kfold.py
119 lines (90 loc) · 3.77 KB
/
fnc_kfold.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import sys
import numpy as np,io
from sklearn.ensemble import GradientBoostingClassifier
from feature_engineering import refuting_features, polarity_features, hand_features, gen_or_load_feats
from feature_engineering import word_overlap_features
from utils.dataset import DataSet
from utils.generate_test_splits import kfold_split, get_stances_for_folds
from utils.score import report_score, LABELS, score_submission
from utils.system import parse_params, check_version
#keras imports
from keras.models import Sequential
from keras.layers import Dense, Embedding,Activation,Dropout
from keras.layers import LSTM
from keras.utils.np_utils import to_categorical
def getmlpmodel(max_words):
    """Build and compile a one-hidden-layer MLP classifier.

    The stack is Dense(512) -> ReLU -> Dropout(0.5) -> Dense(4) -> softmax,
    compiled with categorical cross-entropy loss, the 'adam' optimizer and
    accuracy as the reported metric.

    Args:
        max_words: Width of one input feature vector; the first layer is
            created with ``input_shape=(max_words,)``.

    Returns:
        The compiled ``Sequential`` model.
    """
    print('Building model...')

    # Layers are listed in the order they are stacked.
    layer_stack = [
        Dense(512, input_shape=(max_words,)),
        Activation('relu'),
        Dropout(0.5),
        Dense(4),
        Activation('softmax'),
    ]

    network = Sequential()
    for layer in layer_stack:
        network.add(layer)

    network.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network
def generate_features(stances,dataset,name):
    """Build the feature matrix and label list for a collection of stances.

    Each stance is read through its 'Stance', 'Headline' and 'Body ID' keys;
    the body text is looked up in ``dataset.articles``. Four feature groups
    are produced via ``gen_or_load_feats`` (each given its own
    ``features/<group>.<name>.npy`` path) and joined column-wise in the order
    hand, polarity, refuting, overlap.

    Args:
        stances: Iterable of stance records (mappings with the keys above).
        dataset: Object exposing an ``articles`` mapping keyed by body id.
        name: Tag inserted into the feature file names.

    Returns:
        Tuple ``(X, y)``: the combined feature array and a list holding the
        index of each stance label within ``LABELS``.
    """
    # Single pass over the stances, collecting label index, headline and body.
    rows = [
        (
            LABELS.index(record['Stance']),
            record['Headline'],
            dataset.articles[record['Body ID']],
        )
        for record in stances
    ]
    y = [row[0] for row in rows]
    headlines = [row[1] for row in rows]
    bodies = [row[2] for row in rows]

    # (extractor, path prefix) pairs, evaluated in this order.
    extractors = (
        (word_overlap_features, "features/overlap."),
        (refuting_features, "features/refuting."),
        (polarity_features, "features/polarity."),
        (hand_features, "features/hand."),
    )
    X_overlap, X_refuting, X_polarity, X_hand = [
        gen_or_load_feats(extract, headlines, bodies, prefix+name+".npy")
        for extract, prefix in extractors
    ]

    X = np.c_[X_hand, X_polarity, X_refuting, X_overlap]
    return X,y
if __name__ == "__main__":
    # Script entry point: k-fold train an MLP on the precomputed features,
    # keep the model from the best-scoring fold, and report its score on the
    # hold-out set.

    #check_version()
    parse_params()

    # Load the dataset and split it into 10 folds plus a hold-out set.
    d = DataSet()
    print("got data")
    folds,hold_out = kfold_split(d,n_folds=10)
    fold_stances, hold_out_stances = get_stances_for_folds(d,folds,hold_out)

    Xs = dict()
    ys = dict()
    print("Load/Precompute all features now")

    # Load/Precompute all features now
    X_holdout,y_holdout = generate_features(hold_out_stances,d,"holdout")
    for fold in fold_stances:
        Xs[fold],ys[fold] = generate_features(fold_stances[fold],d,str(fold))
        print("got fold")

    best_score = 0
    best_fold = None

    # Classifier for each fold
    for fold in fold_stances:
        print("Folds")

        # Train on every fold except the current one, which is the test fold.
        ids = list(range(len(folds)))
        del ids[fold]

        X_train = np.vstack(tuple([Xs[i] for i in ids]))
        y_train = np.hstack(tuple([ys[i] for i in ids]))

        X_test = Xs[fold]
        y_test = ys[fold]

        # One-hot encode the integer labels to match the 4-way softmax output.
        y_train = to_categorical(y_train, 4)
        y_test = to_categorical(y_test, 4)
        print(y_train)
        print(X_train.shape)

        clf = getmlpmodel(X_train.shape[1])
        # NOTE(review): `nb_epoch` is the older Keras spelling of `epochs`;
        # left as-is because the installed Keras version is not visible here.
        clf.fit(X_train, y_train,
                batch_size=32,
                nb_epoch=20,
                verbose=1)

        # Both predictions and one-hot targets are per-class vectors, so the
        # class index is recovered with argmax before mapping to label names.
        predicted = [LABELS[int(np.argmax(a))] for a in clf.predict(X_test)]
        actual = [LABELS[int(np.argmax(a))] for a in y_test]

        fold_score, _ = score_submission(actual, predicted)
        max_fold_score, _ = score_submission(actual, actual)

        # Normalise by the score a perfect prediction would obtain.
        score = fold_score/max_fold_score

        print("Score for fold "+ str(fold) + " was - " + str(score))
        # Always keep the first trained model so that a run where every fold
        # scores 0 still has a model to evaluate on the hold-out set.
        if best_fold is None or score > best_score:
            best_score = score
            best_fold = clf

    if best_fold is None:
        raise RuntimeError("No folds were available, so no model was trained")

    #Run on Holdout set and report the final score on the holdout set
    # The model outputs one probability vector per sample, so argmax is needed
    # here as well; int() on a whole row would raise TypeError.
    predicted = [LABELS[int(np.argmax(a))] for a in best_fold.predict(X_holdout)]
    # y_holdout holds plain integer label indices (not one-hot encoded).
    actual = [LABELS[int(a)] for a in y_holdout]

    report_score(actual,predicted)