example-ffnn.py
'''
In a Contextual Multi-Armed Bandit (C-MAB), the machine learning (ML) agent
uses a table to store the values of actions for different contexts. This
context-action value table can be replaced by a feed-forward neural network
(FFNN). There are several advantages to using an FFNN instead of a table.
An FFNN can better handle a huge context space and can deal with contexts
containing continuous values. In addition, an FFNN can natively predict action
values for unseen contexts, whereas a table-driven C-MAB cannot and must use
a separate technique such as KNN to infer them. However, an FFNN requires
more data to train.

The procedure is as follows:
- For each episode:
  - a predefined number of users are served
  - for each user served:
    - the context is obtained from the environment, which is the user age
      group in our example
    - the ML agent picks an action, either (i) randomly during exploration,
      or (ii) the action whose value is the highest during exploitation
    - the picked action is executed
    - the corresponding reward is observed from the environment, which is
      1 if the user clicked the offered ad, or 0 otherwise
    - the context-action-reward outcome is kept in a memory
  - at the end of the episode, the memory is used to train the FFNN
  - the memory is cleared, and the ML agent proceeds to the next episode
'''
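
## Note (illustrative, not part of the program logic): the FFNN set up in the
## main block below plays the role of the context-action value table. It maps
## a one-hot age-group vector of length 5 to a vector of 5 estimated action
## values, one per ad type, and should ideally converge toward the true click
## probabilities, e.g. for age group 3 (the "<55" column of Ad.Type below):
##     [0, 0, 0, 1, 0]  ->  approximately [0.05, 0.15, 0.30, 0.50, 0.40]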
import random
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
######################################################################
## User behaviour matrix for the environment (static class)
######################################################################
class Ad:
    Type = { # age group
        # arm       #  <25   <35   <45   <55   >55
        # --------------------------------------------
        "toys"     : [0.80, 0.15, 0.10, 0.05, 0.05],
        "cars"     : [0.05, 0.50, 0.30, 0.15, 0.10],
        "sports"   : [0.15, 0.30, 0.40, 0.30, 0.30],
        "holidays" : [0.05, 0.20, 0.35, 0.50, 0.50],
        "foods"    : [0.05, 0.25, 0.25, 0.40, 0.60]
    }
    AllArms = list(Type.keys()) # list of all ad types
    ArmSize = len(AllArms)
    AgeGroupSize = len(list(Type.values())[0])
    AllAgeGroups = range(AgeGroupSize)

######################################################################
## Client profile
######################################################################
class Client:
    def __init__(self, age_group=None):
        if age_group is None:
            self.group = random.randint(0, Ad.AgeGroupSize-1)
        else:
            self.group = age_group

    def will_click(self, ad) -> bool:
        '''Will this client click this advert?'''
        click_prob = random.randint(0, 99)
        if click_prob < 100*Ad.Type[ad][self.group]:
            return True
        return False
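
## A minimal usage sketch (illustrative only, not part of the original flow):
## the empirical click rate of a given (age group, ad) pair should approach
## the corresponding entry of Ad.Type, e.g. for age group 0 and "toys" ads:
# clicks = sum(Client(age_group=0).will_click("toys") for _ in range(10000))
# print(clicks/10000)   # close to Ad.Type["toys"][0], i.e. about 0.80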
######################################################################
## Machine Learning Model
######################################################################
class ContextActionTable:
    '''This is a class to keep the memory during an episode. For this particular
    example, the memory is the context-action value table.'''
    def __init__(self, num_contexts, num_actions):
        self.num_contexts = num_contexts
        self.num_actions = num_actions
        self.clear()

    def clear(self):
        '''Clear all contents in the table.'''
        self.sum   = [[0 for i in range(self.num_actions)] for j in range(self.num_contexts)]
        self.count = [[0 for i in range(self.num_actions)] for j in range(self.num_contexts)]
        self.value = [[0 for i in range(self.num_actions)] for j in range(self.num_contexts)]

    def update_reward(self, context, action, value):
        '''Update the reward into the table.'''
        self.value[context][action] = value
        self.sum[context][action] += value
        self.count[context][action] += 1

    def get_mean_value(self, context, action, default=0):
        '''Return the mean value of the (context,action), or the default value if
        there is no update for this action.'''
        if self.count[context][action]==0: return default
        return self.sum[context][action]/self.count[context][action]

    def get_last_value(self, context, action, default=0):
        '''Return the last seen value of the (context,action), or the default value if
        there is no update for this action.'''
        if self.count[context][action]==0: return default
        return self.value[context][action]
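
## A minimal usage sketch (illustrative only): record two rewards for the
## pair (context=0, action=1) and read the running mean back.
# table = ContextActionTable(Ad.AgeGroupSize, Ad.ArmSize)
# table.update_reward(0, 1, 1)                      # the user clicked
# table.update_reward(0, 1, 0)                      # the user did not click
# print(table.get_mean_value(0, 1))                 # 0.5
# print(table.get_mean_value(0, 2, default=None))   # None, never updated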
####################################################################
## main loop
####################################################################
def array_to_str(array):
    '''This function converts a one-dimensional array into a formatted string for printing.'''
    array_str = "["
    for element in array:
        array_str += f"----," if element is None else f"{element:4.2f},"
    return array_str[:-1] + "]"

def context_array(context_idx:int):
    '''Convert a context into an array form for the inputs of the FFNN,
    e.g. if context=3, then array = [0,0,0,1,0].'''
    array = [0]*Ad.AgeGroupSize
    array[context_idx] = 1
    return array

if __name__ == "__main__":

    ## setup environment parameters
    num_users = 20000         # number of users to visit the website
    num_clicks = 0            # number of clicks collected
    animation = True          # True/False

    ## setup training arrangement
    episode_len = 500         # the length of an episode for each training
    exploration_round = 8000  # explore for this many rounds first
    memory = []               # the memory to keep (context,action,reward)
    alpha = 0.2               # learning rate

    ## setup context-action tables
    num_contexts = Ad.AgeGroupSize
    num_actions = Ad.ArmSize
    context_action_life = ContextActionTable(num_contexts, num_actions)
    context_action_episode = ContextActionTable(num_contexts, num_actions)

    ## setup FFNN
    model = Sequential()
    model.add(Dense(units=20, activation='relu', input_dim=num_contexts))
    model.add(Dense(units=50, activation='tanh'))
    model.add(Dense(units=num_actions))
    model.compile(loss='mean_squared_error', optimizer='adam')
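
    ## Design note (illustrative commentary): the network maps a one-hot
    ## age-group vector of length num_contexts to num_actions value estimates,
    ## one per ad type. The output layer is linear because the targets are
    ## (smoothed) mean rewards in [0, 1], fitted with the MSE loss above.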

    ## this is the main loop
    ## the objective of the ML agent is to achieve
    ## as many clicks as possible through learning
    print(f"Testing Deep Reinforcement Learning\n")
    for round in range(num_users):

        ## a user has visited the website
        user = Client()

        ## prepare an advertisement
        ## ..by exploration
        if round<exploration_round:
            offered_ad = random.choices(Ad.AllArms)[0]
            action_index = Ad.AllArms.index(offered_ad)
        ## ..by exploitation
        else:
            list_action_value = model.predict(np.array([context_array(user.group)]),verbose=0)[0]
            action_index = np.array(list_action_value).argmax()
            offered_ad = Ad.AllArms[action_index]

        ## will the user click?
        if user.will_click(offered_ad):
            num_clicks += 1
            reward = 1
        else:
            reward = 0

        ## keep the outcome in the memory
        memory.append((user.group,action_index,reward))

        ## do training when the episode ends
        if len(memory)==episode_len:

            ## summarize the memory
            context_action_episode.clear()
            for (context,action,reward) in memory:
                context_action_life.update_reward(context,action,reward)
                context_action_episode.update_reward(context,action,reward)

            ## construct the data records for training
            x_train = [] # input contexts
            y_train = [] # output action values
            for context in range(num_contexts):
                x_train.append(context_array(context))
                y_value = model.predict(np.array([context_array(context)]),verbose=0)[0]
                for action in range(num_actions):
                    ## there are many ways we can incorporate the memory into the FFNN;
                    ## here we use exponential smoothing to incorporate the memory of
                    ## each episode into the FFNN
                    y_value[action] = (1-alpha)*y_value[action] \
                                    + alpha*context_action_episode.get_mean_value(context,action,y_value[action])
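                    ## Worked example of the smoothing above (illustrative numbers,
                    ## using the configured alpha=0.2): if the FFNN currently
                    ## predicts 0.30 for this (context, action) and the episode's
                    ## mean reward is 0.50, the new training target becomes
                    ## 0.8*0.30 + 0.2*0.50 = 0.34, nudging the prediction toward
                    ## the latest observations without discarding past learning.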
                y_train.append(y_value)

            ## train the model
            model.fit(np.array(x_train), np.array(y_train), epochs=5, verbose=0)

            ## print the results
            print(" "*13,"Fact"," "*16,"This episode",
                  " "*12,"C-MAB table"," "*10,"FFNN prediction")
            for context in range(num_contexts):
                print(f"c={context}: ",end="")
                array_fact = [Ad.Type[arm][context] for arm in Ad.AllArms]
                array_episode = []
                array_life = []
                array_ffnn = []
                for action in range(num_actions):
                    array_life.append(context_action_life.get_mean_value(context,action,None))
                    array_episode.append(context_action_episode.get_mean_value(context,action,None))
                array_ffnn = model.predict(np.array([context_array(context)]),verbose=0)[0]
                print(array_to_str(array_fact),end="")
                print(array_to_str(array_episode),end="")
                print(array_to_str(array_life),end="")
                print(array_to_str(array_ffnn))

            memory.clear()