-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_handler.py
214 lines (177 loc) · 6.38 KB
/
data_handler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
import config
import logging
import numpy
# CSV locations read by getTrainData() / getTestData(). They start out empty
# and are assigned by getDotaData() / getPulsarData() before any file is read.
train_file_path = test_file_path = ""

# Module-level accumulators that the loaders append parsed rows to.
# Every name is bound to its own list object; none of them are shared.
train_data_features, train_data_target = [], []
test_data_features, test_data_target = [], []
def getTrainData(first, last, isDota):
    """Load lines ``first`` through ``last`` of the training CSV.

    The file named by the module-level ``train_file_path`` is read line by
    line. For every selected line the first comma-separated column is parsed
    as the integer target and all remaining columns as float features.

    Parsed rows are appended to the module-level ``train_data_features`` and
    ``train_data_target`` lists, and those same list objects are returned.

    NOTE(review): because the module-level lists are appended to and never
    cleared here, calling this function more than once accumulates rows from
    every call. Confirm with callers whether that is intended.

    Args:
        first: 1-based number of the first line to load (inclusive).
        last: 1-based number of the last line to load (inclusive).
        isDota: Unused. Both datasets are parsed identically; the parameter
            is kept so existing callers keep working.

    Returns:
        A two-element list ``[features, targets]``:
        element 0 is the feature data, element 1 is the target data.

    Raises:
        ValueError: If a selected line contains a field that cannot be
            parsed as a number.
    """
    # The context manager guarantees the file is closed, including when we
    # stop early or a row fails to parse.
    with open(train_file_path, 'r') as csv_file:
        for line_number, line in enumerate(csv_file, start=1):
            if line_number > last:
                # Everything requested has been read; skip the rest of the file.
                break
            if line_number < first:
                continue
            fields = line.rstrip('\n').split(',')
            train_data_features.append([float(value) for value in fields[1:]])
            train_data_target.append(int(fields[0]))
    return [train_data_features, train_data_target]
def getTestData(isDota):
    """Load every line of the test CSV.

    The file named by the module-level ``test_file_path`` is read line by
    line. For every line the first comma-separated column is parsed as the
    integer target and all remaining columns as float features.

    Parsed rows are appended to the module-level ``test_data_features`` and
    ``test_data_target`` lists, and those same list objects are returned.

    NOTE(review): because the module-level lists are appended to and never
    cleared here, calling this function more than once accumulates rows from
    every call. Confirm with callers whether that is intended.

    Args:
        isDota: Unused. Both datasets are parsed identically; the parameter
            is kept so existing callers keep working.

    Returns:
        A two-element list ``[features, targets]``:
        element 0 is the feature data, element 1 is the target data.

    Raises:
        ValueError: If a line contains a field that cannot be parsed as a
            number.
    """
    # The context manager guarantees the file is closed, including when a
    # row fails to parse.
    with open(test_file_path, 'r') as csv_file:
        for line in csv_file:
            fields = line.rstrip('\n').split(',')
            test_data_features.append([float(value) for value in fields[1:]])
            test_data_target.append(int(fields[0]))
    return [test_data_features, test_data_target]
def getDotaData(first, last):
    """Load the Dota training and test sets.

    Points the module-level ``train_file_path`` / ``test_file_path`` at the
    Dota CSV files named in ``config`` and then reads them through
    getTrainData() and getTestData().

    Args:
        first: first line of the training CSV to record.
        last: last line of the training CSV to record.

    Returns:
        A four-element list, or an empty list (after printing a message)
        when ``last`` exceeds 92650 or ``first`` is below 1:
        Element 0: train feature data
        Element 1: train target data
        Element 2: test feature data
        Element 3: test target data
    """
    global train_file_path, test_file_path

    line_limit = 92650
    if last > line_limit or first < 1:
        print("ENTER VALID LINES")
        return []

    train_file_path = config.dota_train_data_path
    test_file_path = config.dota_test_data_path

    train_features, train_targets = getTrainData(first, last, True)
    test_features, test_targets = getTestData(True)
    return [train_features, train_targets, test_features, test_targets]
def getPulsarData(first, last):
    """Load the pulsar (HTRU2) training and test sets.

    Points the module-level ``train_file_path`` / ``test_file_path`` at the
    pulsar CSV files named in ``config`` and then reads them through
    getTrainData() and getTestData().

    Args:
        first: first line of the training CSV to record.
        last: last line of the training CSV to record.

    Returns:
        A four-element list, or an empty list (after printing a message)
        when ``last`` exceeds 10000 or ``first`` is below 1:
        Element 0: train feature data
        Element 1: train target data
        Element 2: test feature data
        Element 3: test target data
    """
    global train_file_path, test_file_path

    line_limit = 10000
    if last > line_limit or first < 1:
        print("ENTER VALID LINES")
        return []

    train_file_path = config.pulsar_train_data_path
    test_file_path = config.pulsar_test_data_path

    train_features, train_targets = getTrainData(first, last, False)
    test_features, test_targets = getTestData(False)
    return [train_features, train_targets, test_features, test_targets]
def _percent(part, whole):
    """Return ``part`` as a percentage of ``whole``; 0.0 when ``whole`` is 0."""
    if whole == 0:
        return 0.0
    return part * 100.0 / whole


def recordResults(pos, neg, outFile, targets, test_data_prediction, approach, write):
    """Score a set of binary predictions and optionally save a CSV report.

    A prediction counts as correct when it is within 0.1 of its target.
    Wrong predictions are further classified as false positives (predicted
    ``pos``, target ``neg``) or false negatives (predicted ``neg``, target
    ``pos``) after truncating both values with ``int()``.

    Args:
        pos: the positive target value.
        neg: the negative target value.
        outFile: name of the file the report is stored in.
        targets: the correct answers, indexable in step with the predictions.
        test_data_prediction: the answers produced by the query.
        approach: string prepended to ``outFile`` to form the report path.
        write: whether to write the report file. The report is always
            written when no prediction was correct.

    Returns:
        A list of seven percentages:
        Element 1: % of targets correctly predicted
        Element 2: % of targets that are positive
        Element 3: % of targets that are negative
        Element 4: % of positive values correctly predicted
        Element 5: % of negative values correctly predicted
        Element 6: % of predictions that were false positives
        Element 7: % of predictions that were false negatives

        A class with no targets has no errors, so its "correctly predicted"
        percentage is reported as 100.0 instead of dividing by zero. With no
        predictions at all, every percentage taken over the total is 0.0.

    Raises:
        IndexError: If ``targets`` is shorter than ``test_data_prediction``.
    """
    all_count = len(test_data_prediction)
    correct_count = 0
    variance = 0  # running count of wrong predictions seen so far
    positive_count = 0
    negative_count = 0
    false_positive_count = 0
    false_negative_count = 0
    fileLines = ["Prediction,Target,Variance,,Is correct,False positive,False Negative\n"]

    for i, prediction in enumerate(test_data_prediction):
        target = targets[i]
        falsePositive = 0
        falseNegative = 0
        isCorrect = abs(target - prediction) < 0.1
        if isCorrect:
            correct_count += 1
        else:
            variance += 1
            # Compare by value (==). Identity (`is`) only happens to work for
            # small cached ints and fails for floats, numpy scalars, etc.
            falsePositive = int(int(prediction) == pos and int(target) == neg)
            falseNegative = int(int(prediction) == neg and int(target) == pos)
        if target == pos:
            positive_count += 1
        else:
            negative_count += 1
        # The empty field keeps the blank spacer column of the header.
        fields = [prediction, target, variance, "", isCorrect, falsePositive, falseNegative]
        fileLines.append(",".join(str(field) for field in fields) + "\n")
        false_negative_count += falseNegative
        false_positive_count += falsePositive

    # Summary information; _percent() guards every possibly-zero denominator.
    per_correct = _percent(correct_count, all_count)
    per_pos = _percent(positive_count, all_count)
    per_neg = _percent(negative_count, all_count)
    per_pos_cor = 100 - _percent(false_negative_count, positive_count)
    per_neg_cor = 100 - _percent(false_positive_count, negative_count)
    fal_pos_per = _percent(false_positive_count, all_count)
    fal_neg_per = _percent(false_negative_count, all_count)

    # Always keep the per-instance details when nothing was predicted correctly.
    if correct_count == 0:
        write = True

    if write:
        # The context manager closes the file even if a write fails.
        with open(approach + outFile, "w") as results:
            results.write("Summary\n")
            results.write("Percent correct," + str(per_correct) + "\n")
            results.write("Percent positive," + str(per_pos) + "\n")
            results.write("Percent negative," + str(per_neg) + "\n")
            results.write("Percent positive correct," + str(per_pos_cor) + "\n")
            results.write("Percent negative correct," + str(per_neg_cor) + "\n")
            results.write("False positive percentage," + str(fal_pos_per) + "\n")
            results.write("False negative percentage," + str(fal_neg_per) + "\n\n")
            results.writelines(fileLines)

    average_variance = 1.0 * variance / all_count if all_count else 0.0
    logging.info('average variance %d / %d = %f', variance, all_count, average_variance)
    logging.info('Percent correct: %.5f%%', per_correct)
    logging.info('Percent False correct: %.5f%%', per_neg_cor)
    logging.info('Percent Positive correct: %.5f%%', per_pos_cor)
    return [per_correct, per_pos, per_neg, per_pos_cor, per_neg_cor, fal_pos_per, fal_neg_per]
#end record results