baseline.jl
training_file = "C:\\Users\\batuh\\Desktop\\COMP541\\Datasets\\seq2seq_atis\\train.txt"
inputs = []
outputs = []
input_tokens = []
output_tokens = []
size_input = 1   # max input sequence length; updated while reading
size_output = 1  # max output sequence length; updated while reading
f = open(training_file)
# Read the training file and store its contents in "inputs" and "outputs".
# Each line holds a tab-separated pair of space-tokenized sequences; the
# unique tokens are collected in "input_tokens" and "output_tokens".
while !eof(f)
    global size_input, size_output
    s = readline(f)
    input, output = split(s, "\t")
    push!(inputs, input)
    push!(outputs, output)
    tokens = split(input, " ")
    if length(tokens) > size_input
        size_input = length(tokens)
    end
    for token in tokens
        if !(token in input_tokens)
            push!(input_tokens, token)
        end
    end
    tokens = split(output, " ")
    if length(tokens) > size_output
        size_output = length(tokens)
    end
    for token in tokens
        if !(token in output_tokens)
            push!(output_tokens, token)
        end
    end
end
close(f)
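# Illustrative sanity check (an addition, not part of the original script):
# every non-empty line should have produced exactly one input/output pair.
@assert length(inputs) == length(outputs)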
K = length(input_tokens)   # input vocabulary size
L = length(output_tokens)  # output vocabulary size
# Create a one-hot vector for each token, with lookup tables in both directions.
one_hot_to_token_input = Dict()
one_hot_to_token_output = Dict()
token_to_one_hot_input = Dict()
token_to_one_hot_output = Dict()
for (i, token) in enumerate(input_tokens)
    x = zeros(K)
    x[i] = 1
    one_hot_to_token_input[x] = token
    token_to_one_hot_input[token] = x
end
for (i, token) in enumerate(output_tokens)
    y = zeros(L)
    y[i] = 1
    one_hot_to_token_output[y] = token
    token_to_one_hot_output[token] = y
end
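# Illustrative sanity check (an addition, not part of the original script):
# the dictionaries should round-trip for every token, since Julia arrays hash
# by value and each token received a unique one-hot index.
@assert all(one_hot_to_token_input[token_to_one_hot_input[t]] == t for t in input_tokens)
@assert all(one_hot_to_token_output[token_to_one_hot_output[t]] == t for t in output_tokens)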
# Transform the input and output sequences into concatenated one-hot representations:
model_input = []
model_output = []
for input in inputs
    tokens = split(input, " ")
    seq = zeros(size_input*K)
    for (i, token) in enumerate(tokens)
        seq[(K*(i-1)+1):(K*i)] = token_to_one_hot_input[token]
    end
    push!(model_input, seq)
end
for output in outputs
    tokens = split(output, " ")
    seq = zeros(size_output*L)
    for (i, token) in enumerate(tokens)
        seq[(L*(i-1)+1):(L*i)] = token_to_one_hot_output[token]
    end
    push!(model_output, seq)
end
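# Illustrative check (an addition): each encoded sequence carries exactly one
# 1 per token, so counting the ones recovers the original token count.
@assert count(i -> i == 1, model_input[1]) == length(split(inputs[1], " "))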
# Generate random predictions and store them in "predictions". Each prediction
# gets input_len + 15 one-hot slots (the offset used by the original baseline),
# clamped so it never indexes past the maximum output length.
predictions = []
for j in 1:length(model_input)
    prediction = zeros(size_output*L)
    input_len = count(i -> i == 1, model_input[j])  # number of input tokens
    for k in 0:min(input_len+14, size_output-1)
        a = rand(1:L)
        prediction[k*L + a] = 1
    end
    push!(predictions, prediction)
end
# Compute the token-level F1 score of the random baseline against the gold outputs.
F1_scores = 0.0
for (j, prediction) in enumerate(predictions)
    global F1_scores
    pred_length = count(i -> i == 1, prediction)       # number of predicted tokens
    gold_length = count(i -> i == 1, model_output[j])  # number of gold tokens
    gold_tokens = split(outputs[j], " ")
    correct = 0
    pred_tokens = []
    for k in 0:pred_length-1
        vector1 = prediction[(k*L+1):((k+1)*L)]
        token1 = one_hot_to_token_output[vector1]
        push!(pred_tokens, token1)
    end
    # Count matches; each gold token may be matched at most once.
    for pred_token in pred_tokens
        for gold_token in gold_tokens
            if pred_token == gold_token
                correct += 1
                deleteat!(gold_tokens, findfirst(x -> x == gold_token, gold_tokens))
                break
            end
        end
    end
    precision = correct / pred_length
    recall = correct / gold_length
    if precision > 0 && recall > 0
        F1_scores += 2*precision*recall / (precision + recall)
    end
end
F1_score = F1_scores / length(model_input)
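# Report the result; this println is an addition for convenience, not part of
# the original baseline.
println("Random-baseline average F1 over ", length(predictions), " examples: ", F1_score)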