-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
236 lines (207 loc) · 12.9 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
# This script is the entry point for the backend of the proactive search system (terminal interaction).
# The code works for both synthetic and real data.
# main.py controls the following process:
# 1. Load the experiment logs
# 2. Create (or load) the low dimensional representation of the data
# 3. Interaction loop:
# 3.1. Receive new snapshots (real-time documents)
# 3.2. Update the user model
# 3.3. Recommend items from the different views
# 3.4. Gather feedback for the recommended items
from DataLoader import DataLoader
from DataProjector import DataProjector
from UserModelCoupled import UserModelCoupled
import numpy as np
import re
import os
import json
#---------------Initialization of parameters and methods
# Experiment configuration. Every entry is read through params["<key>"].
params = {
    # --- Recommendation behaviour ---
    # How many entities are suggested from each view.
    "suggestion_count": 10,
    # How many of the latest online snapshots are taken into account.
    "imp_doc_to_consider": 4,
    # True: the same item may be shown again in later iterations.
    # False: an item is recommended at most once.
    "repeated_recommendation": True,
    # True: explore with Thompson Sampling; False: score with the mean of the estimate.
    "Thompson_exploration": False,
    # Heuristic shrinkage of the posterior variance (less exploration); must lie in (0,1].
    "exploration_rate": 1,  # NOT IMPLEMENTED YET

    # --- Data representation ---
    # True: TF-IDF weights are normalized to sum to 1; False: left as they are.
    # TODO: DOES THIS MAKE SENSE?
    "normalize_terms": True,
    # Dimensionality of the latent space used to represent the data.
    "num_latent_dims": 100,

    # --- Study set-up ---
    # Iterations of the simulated study.
    "num_iterations": 50,
    # Runs of the simulated study (use 1 for the real data setting).
    "num_runs": 1,  # NOT IMPLEMENTED YET
    # True: prepare the data for the UI but interact through the terminal.
    "UI_simulator": True,
    # True: user feedback is simulated from the simulated user data.
    "Simulated_user": False,

    # --- Locations ---
    # Corpus directory; expected to hold /corpus.mm, /dictionary.dict and views_ind_1.npy.
    # Alternative corpus used previously: 'corpus1_2/corpus7_sim'
    "corpus_directory": "corpus1_2/P01",
    # Directory scanned for new snapshots at the start of every iteration.
    "snapshots_directory": "user activity",
}
# Set the desired method(s) to True for the experiment.
Methods = {
    "LSA-coupled-Thompson": True,
    "LSA-coupled-UCB": False,
    "Random": False,
}
# Names of the enabled methods, and how many of them there are.
Method_list = [name for name, enabled in Methods.items() if enabled]
num_methods = len(Method_list)
# Main experiment driver: for every run, load the corpus, build the latent
# representation, then run the interaction loop once per enabled method.
# NOTE: output uses single-argument print(...) calls, which behave the same as
# the Python 2 print statement while also being valid Python 3 syntax.
for runs in range(params["num_runs"]):
    #---------------------- Phase 1: Load the experiment logs ----------------------------------------#
    # load the data from the log files
    data_dir = params["corpus_directory"]
    data = DataLoader(data_dir)
    data.print_info()
    #data.process_item_info()  # Use this to list entities for off line feedback gathering

    #---------------------- Phase 2: Create (or load) the low dimensional representation of data ------#
    projector = DataProjector(data, params)
    projector.generate_latent_space()
    projector.create_feature_matrices()

    #---------------------- Phase 3: Interaction loop ------------------------------------------------#
    for method in Method_list:
        selected_terms = []     # ID of terms that the user has given feedback to
        feedback_terms = []     # feedback value on the selected terms
        recommended_terms = []  # list of ID of terms that have been recommended to the user
        selected_docs = []      # ID of snapshots that the user has given feedback to (may not be available in practice)
        feedback_docs = []      # feedback value on the selected snapshots (may not be available in practice)

        for iteration in range(params["num_iterations"]):
            print('Iteration = %d' % iteration)

            # 3.1 check the snapshot folder and consider positive feedback for the real-time generated snapshots
            # the snapshot format is doc = [(term_idx,freq),..]
            print('Loading real-time generated snapshots...')
            snapshots_dir = params["snapshots_directory"]
            snapshot_names = [name for name in os.listdir(snapshots_dir)
                              if name != ".DS_Store" and name != "readme.txt"]
            # os.listdir() returns entries in arbitrary order, so "most recent"
            # is made explicit: newest modification time first, file name as tie-breaker.
            # NOTE(review): assumes modification time reflects snapshot age - confirm.
            snapshot_names.sort(
                key=lambda name: (os.path.getmtime(os.path.join(snapshots_dir, name)), name),
                reverse=True)
            # only load and consider the most recent snapshots
            recent_names = snapshot_names[:params["imp_doc_to_consider"]]
            # considered snapshots generated from realtime user activity
            fv_online_docs = [np.load(os.path.join(snapshots_dir, name)) for name in recent_names]
            # dummy (positive) feedback for the newly generated snapshots
            fb_online_docs = [1] * len(fv_online_docs)

            # 3.2 and 3.3: Update the user model and recommend new items based on the chosen method
            if method in ("LSA-coupled-Thompson", "LSA-coupled-UCB"):
                # initialize the user model in the projected space
                user_model = UserModelCoupled(params)
                # create the design matrices for docs and terms
                user_model.create_design_matrices(projector, selected_terms, feedback_terms,
                                                  selected_docs, feedback_docs,
                                                  fv_online_docs, fb_online_docs)
                # posterior inference
                user_model.learn()
                if method == "LSA-coupled-Thompson":
                    # Thompson sampling for coupled EVE
                    # TODO: test having K thompson sampling for the K recommendations
                    if params["Thompson_exploration"]:
                        theta = user_model.thompson_sampling()
                    else:
                        # in case of no exploration, use the mean of the posterior
                        theta = user_model.Mu
                    scored_docs = np.dot(projector.doc_f_mat, theta)
                    scored_terms = np.dot(projector.term_f_mat, theta)
                else:
                    # Upper confidence bound method
                    scored_docs = user_model.UCB(projector.doc_f_mat)
                    scored_terms = user_model.UCB(projector.term_f_mat)
            elif method == "Random":
                scored_docs = np.random.uniform(0, 1, projector.num_docs)
                scored_terms = np.random.uniform(0, 1, projector.num_terms)

            #---------------------- 3.4: gather user feedback ---------------------------#
            # rank item indices by decreasing score
            # todo: if time consuming then have k maxs instead of sort
            sorted_docs = sorted(range(len(scored_docs)), key=lambda k: scored_docs[k], reverse=True)
            # make sure the selected items are not recommended to user again
            selected_doc_ids = set(selected_docs)
            sorted_docs_valid = [doc_idx for doc_idx in sorted_docs if doc_idx not in selected_doc_ids]
            sorted_terms = sorted(range(len(scored_terms)), key=lambda k: scored_terms[k], reverse=True)

            # Terms to leave out of the ranked lists: with repeated recommendations
            # only the terms that already received feedback, otherwise every term
            # that has been recommended before. Built once instead of per element.
            if params["repeated_recommendation"]:
                excluded_terms = set(selected_terms)
            else:
                excluded_terms = set(recommended_terms)

            sorted_views_list = []   # sorted ranked list of each view
            top_terms_per_view = []  # the items of each view that are actually suggested
            for view in range(1, data.num_views):
                sorted_view = [term_idx for term_idx in sorted_terms
                               if term_idx not in excluded_terms and data.views_ind[term_idx] == view]
                sorted_views_list.append(sorted_view)
                # Slicing (rather than indexing up to a count) stays safe when the
                # filtering above leaves fewer items than the requested count.
                limit = min(params["suggestion_count"], data.num_items_per_view[view])
                top_terms_per_view.append(sorted_view[:limit])
            top_docs = sorted_docs_valid[:params["suggestion_count"]]

            # save the new recommendations in this iteration and all the recommendations till now
            if params["UI_simulator"] or params["Simulated_user"]:
                new_recommendations = [term_idx for top_terms in top_terms_per_view
                                       for term_idx in top_terms]
                already_recommended = set(recommended_terms)
                for term_idx in new_recommendations:
                    if term_idx not in already_recommended:
                        already_recommended.add(term_idx)
                        recommended_terms.append(term_idx)

            # TERMINAL USER INTERFACE
            if params["UI_simulator"]:
                for view, top_terms in enumerate(top_terms_per_view, 1):
                    print('view %d:' % view)
                    for term_idx in top_terms:
                        print(' %d,' % term_idx + ' ' + data.feature_names[term_idx])
                print('Relevant document IDs (for debugging):')
                for doc_idx in top_docs:
                    print(' %d' % doc_idx)

                # organize the recommendations in the right format:
                # one list of (term id, term name, score) per view, plus the document IDs.
                # The three labels are tied to the first three views, as in the original layout.
                data_output = {}
                for view_pos, label in enumerate(("keywords", "applications", "people")):
                    data_output[label] = [(term_idx, data.feature_names[term_idx], scored_terms[term_idx])
                                          for term_idx in top_terms_per_view[view_pos]]
                # TODO: how many document? I can also send the estimated relevance.
                data_output["document_ID"] = list(top_docs)
                # for now write everything in a file
                with open('data.txt', 'w') as outfile:
                    json.dump(data_output, outfile)

                # Terminal feedback gathering
                print('Give your feedback as "id fb_value" for terms then press Enter. Press Enter to go to the next iteration.')
                while True:
                    input_string = raw_input()
                    # The optional leading '-' keeps negative feedback values (-1: removing).
                    array_input = re.findall(r'-?\d*\.?\d+', input_string)
                    if not array_input:
                        # a line without any number ends the feedback phase
                        break
                    # save user feedbacks to update the model in the next iteration
                    # fb_Val coding: 1: pinning, 0: unpinning, -1: removing
                    try:
                        item_id = int(array_input[0])
                        fb_Val = float(array_input[1])
                    except (IndexError, ValueError):
                        # missing feedback value or non-integer id: ask again instead of crashing
                        print('Could not parse the feedback, expected "id fb_value".')
                        continue
                    if item_id < 0:
                        # a negative id would silently index from the end of the term arrays
                        print('Could not parse the feedback, expected "id fb_value".')
                        continue
                    # Based on FOCUS requirements we will remove the previous feedbacks on the same item
                    # todo: You may want to just replace the feedback or just remove it (decide this later)
                    if item_id in selected_terms:
                        previous = selected_terms.index(item_id)
                        del selected_terms[previous]
                        del feedback_terms[previous]
                    selected_terms.append(item_id)
                    feedback_terms.append(fb_Val)
                # TODO: record the interactions

            # Simulated data experiment: the recommendations of this iteration were
            # already recorded above (new_recommendations / recommended_terms);
            # nothing else is done for the simulated user in this script.