poker_agent_dqn.py (forked from claudia-maria-dudau/RL-Poker)
from poker_agent_general import PokerAgent
import tensorflow as tf
import collections
import numpy as np
import tqdm
from typing import List, Tuple
from enum import IntEnum
import random

class ReplayBuffer:
    def __init__(self, buffer_size):
        self.buffer_size = buffer_size
        self.replay_mem = collections.deque(maxlen=buffer_size)

    # add a new experience to the buffer
    def add(self, state, action, reward, next_state, done):
        self.replay_mem.append((state, action, reward, next_state, done))

    # get a sample batch from the memory
    def sample(self, batch_size):
        assert batch_size <= len(self.replay_mem), "not enough experiences stored to sample a full batch"
        return random.sample(self.replay_mem, batch_size)

    def __len__(self):
        return len(self.replay_mem)
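
# A minimal usage sketch of the buffer (illustrative names and values only,
# not part of the training loop below):
#   buffer = ReplayBuffer(buffer_size=1000)
#   buffer.add(state=s, action=a, reward=r, next_state=s2, done=0.0)
#   batch = buffer.sample(batch_size=32)   # requires len(buffer) >= 32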

class DQNModel(tf.keras.Model):
    def __init__(self, input_shape, n_action, method):
        super(DQNModel, self).__init__()

        # add an input layer of a given input shape
        self.input_layer = tf.keras.layers.InputLayer(input_shape=input_shape)

        if method == MethodToUse.DQN_BASE:
            # avg 173.6 - 1000 ep
            # add a few hidden layers
            self.hidden_layers = []
            self.hidden_layers.append(tf.keras.layers.Dense(32, activation='tanh'))
            self.hidden_layers.append(tf.keras.layers.Dense(16, activation='relu'))
            self.hidden_layers.append(tf.keras.layers.Dense(16, activation='relu'))
            self.output_layer = tf.keras.layers.Dense(units=n_action, activation='linear')
        elif method == MethodToUse.DQN_TARGET_NETWORK:
            # avg 179 - 1100 ep
            # add a few hidden layers
            self.hidden_layers = []
            self.hidden_layers.append(tf.keras.layers.Dense(32, activation='relu'))
            self.hidden_layers.append(tf.keras.layers.Dense(32, activation='selu'))
            self.hidden_layers.append(tf.keras.layers.Dense(32, activation='relu'))
            self.output_layer = tf.keras.layers.Dense(units=n_action, activation='linear')
        elif method == MethodToUse.DQN_TARGET_NETWORK_AND_EXPERIENCE_REPLAY:
            # avg 176.5 - 2700 ep
            # add a few hidden layers
            self.hidden_layers = []
            self.hidden_layers.append(tf.keras.layers.Dense(32, activation='tanh'))
            self.hidden_layers.append(tf.keras.layers.Dense(16, activation='selu'))
            self.hidden_layers.append(tf.keras.layers.Dense(8, activation='selu'))
            self.output_layer = tf.keras.layers.Dense(units=n_action, activation='linear')
        else:
            # avg 252.6 - 500 ep
            # add a few hidden layers
            self.hidden_layers = []
            self.hidden_layers.append(tf.keras.layers.Dense(32, activation='tanh'))
            self.hidden_layers.append(tf.keras.layers.Dense(16, activation='relu'))
            self.hidden_layers.append(tf.keras.layers.Dense(16, activation='relu'))
            self.output_layer = tf.keras.layers.Dense(units=n_action, activation='linear')

    @tf.function
    def call(self, inputs):
        # go through the input layer to ensure that inputs are of N x input_shape
        z = self.input_layer(inputs)
        # go over the hidden layers
        for layer in self.hidden_layers:
            z = layer(z)
        # the output layer holds the state-action values
        q_values = self.output_layer(z)
        return q_values
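
# A quick shape check (a sketch with an assumed 24-dim observation; the real
# observation size comes from the environment):
#   model = DQNModel(input_shape=[24], n_action=4, method=MethodToUse.DQN_BASE)
#   q = model(tf.zeros([1, 24]))   # -> tensor of shape (1, 4), one Q-value per action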

class MethodToUse(IntEnum):
    DQN_BASE = 0
    DQN_TARGET_NETWORK = 1
    DQN_TARGET_NETWORK_AND_EXPERIENCE_REPLAY = 2
    DDQN_AND_ALL = 3

class DQN_PokerAgent(PokerAgent):
    def __init__(self, env, seed=None, replay_buffer_size=32000, gamma=0.9,
                 batch_size=1024, lr=0.001,  # learning rate (alpha)
                 steps_until_sync=20,  # how many steps between target network weight updates
                 pre_train_steps=1,  # steps to run before starting the training process
                 start_epsilon=1, end_epsilon=0.1, final_epsilon_step=10000,
                 method=MethodToUse.DQN_TARGET_NETWORK, load_prev=False):
        super().__init__(method.name.lower(), env, gamma, lr, start_epsilon, end_epsilon,
                         (start_epsilon - end_epsilon) / (final_epsilon_step - pre_train_steps))
        self.method = method

        # model for function approximation methods
        observation_space_shape = list(env.observation_space)
        self.dqn = DQNModel(input_shape=observation_space_shape, n_action=self.env.action_space.n, method=self.method)

        # input shape = batch size (None, i.e. unknown in advance) x the environment's observation shape
        inputShape = [None]
        inputShape.extend(observation_space_shape)
        self.dqn.build(tf.TensorShape(inputShape))

        if self.method > MethodToUse.DQN_BASE:
            self.dqn_target = DQNModel(input_shape=observation_space_shape, n_action=env.action_space.n, method=self.method)
            self.dqn_target.build(tf.TensorShape(inputShape))

        # replay buffer
        self.replay_buffer = ReplayBuffer(buffer_size=replay_buffer_size)

        # all other parameters
        self.batch_size = batch_size
        self.steps_until_sync = steps_until_sync
        self.pre_train_steps = pre_train_steps
        self.final_epsilon_step = final_epsilon_step

        # loss function
        self.loss_function = tf.keras.losses.MSE

        # optimizer
        self.optimizer = tf.optimizers.Adam(learning_rate=lr)

        # seed everything for deterministic output at each run, for easier debugging
        if seed is not None:
            self.env.seed(seed)
            tf.random.set_seed(seed)
            np.random.seed(seed)

        # total steps taken by the running algorithm
        self.total_steps = 0

        if load_prev:
            self.load_model()

    # epsilon for the current step: full exploration during pre-training, then linear decay
    def get_epsilon(self, no_episode):
        if no_episode <= self.pre_train_steps:
            return 1.0  # full exploration in the beginning
        else:
            return max(1.0 - self.epsilon_decay * (no_episode - self.pre_train_steps), self.end_epsilon)
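    # Worked example with the constructor defaults (start_epsilon=1, end_epsilon=0.1,
    # final_epsilon_step=10000, pre_train_steps=1):
    #   epsilon_decay = (1 - 0.1) / (10000 - 1) ~= 9e-5 per step
    #   at step 5000: epsilon ~= 1.0 - 9e-5 * 4999 ~= 0.55
    #   from step 10000 on it stays clamped at end_epsilon = 0.1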

    # applies the action in the environment and returns (state, reward, done),
    # wrapped so it is compatible with a tf graph op
    def env_step(self, action: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        state, reward, done, _ = self.env.step(action)
        return (state.astype(np.float32),
                np.array(reward, np.float32),
                np.array(done, np.float32))

    # wraps env_step as a numpy_function so it can be called inside tensorflow graphs
    def tf_env_step(self, action: tf.Tensor) -> List[tf.Tensor]:
        return tf.numpy_function(self.env_step, [action], [tf.float32, tf.float32, tf.float32])

    # picks an action epsilon-greedily from the DQN model
    def select_action(self, states, epsilon):
        # current legal moves
        legal_moves = [x.value for x in self.env.legal_moves]

        # random action choice when in the pre-training phase, when the buffer does not
        # yet hold a full batch, or when the sampled value is less than epsilon
        if self.total_steps <= self.pre_train_steps or \
                np.random.rand() < epsilon or len(self.replay_buffer) < self.batch_size:
            actions = np.random.choice(legal_moves, size=[len(states), ])
        else:
            # run the network and get the action to take for each of the states in the batch;
            # note that the argmax is over all actions and does not mask illegal moves
            predict_q = self.dqn(states)
            # get the argmax on the columns
            actions = np.argmax(predict_q, axis=1)
        return actions

    # runs a single episode to collect training data
    # returns the episode reward, the average loss, epsilon, and the average gradient magnitude per layer
    def run_episode(self) -> Tuple[float, float, float, tf.Tensor]:
        # grab the initial state and convert it to a tensorflow tensor
        initial_state = self.env.reset()
        initial_state = tf.constant(initial_state, dtype=tf.float32)
        initial_state_shape = initial_state.shape

        current_state = initial_state
        current_action = None
        episode_reward = tf.constant(0.0, dtype=tf.float32)
        total_loss = 0.0

        # accumulate the total gradient magnitude for each trainable variable
        total_grads_magnitude = tf.constant(0.0, shape=[len(self.dqn.trainable_variables), ])
        no_losses_computed = 0
        epsilon = 1.0

        for episode_step in tf.range(self.max_steps_per_episode):
            # get epsilon for this step
            epsilon = self.get_epsilon(no_episode=self.total_steps)

            # convert the state to a batch-of-1 tensor
            current_state_batched = tf.expand_dims(current_state, 0)
            actions = self.select_action(current_state_batched, epsilon)
            current_action = tf.constant(actions[0])  # first action in the batch

            # apply the action, get the next state and reward
            new_state, reward, done = self.tf_env_step(current_action)
            new_state.set_shape(initial_state_shape)
            episode_reward += reward

            # add the experience to the replay buffer
            self.replay_buffer.add(state=current_state, action=current_action, reward=reward, next_state=new_state, done=done)

            # check if we should do a training step: have we passed the pre-training phase,
            # and does the replay buffer hold at least a full batch of experiences?
            if self.total_steps > self.pre_train_steps and len(self.replay_buffer) >= self.batch_size:
                loss, grads_magnitude = self.train_step()
                total_loss += loss
                total_grads_magnitude += grads_magnitude
                no_losses_computed += 1

            # check if we should update the target network weights
            if self.method > MethodToUse.DQN_BASE and self.total_steps % self.steps_until_sync == 0:
                self.dqn_target.set_weights(self.dqn.get_weights())

            # update the current state and the total step count
            current_state = new_state
            self.total_steps += 1

            # check if the episode ended
            if done == 1.0:
                break

        # calculate the average loss and gradient magnitude
        avg_loss = (total_loss / no_losses_computed if no_losses_computed > 0 else 0)
        avg_grads_magnitude = (total_grads_magnitude / no_losses_computed if no_losses_computed > 0 else 0)

        return episode_reward, avg_loss, epsilon, avg_grads_magnitude

    # runs a train step using the current replay buffer
    # returns the loss and the gradient magnitudes used to update the model
    def train_step(self):
        # sample a batch from the replay memory
        train_batch = self.replay_buffer.sample(batch_size=self.batch_size)

        # get separate tensors for states, actions, rewards, next states, dones
        b_states = tf.stack([x[0] for x in train_batch], axis=0)
        b_actions = tf.stack([tf.cast(x[1], tf.int32) for x in train_batch], axis=0)
        b_rewards = tf.stack([x[2] for x in train_batch], axis=0)
        b_next_states = tf.stack([x[3] for x in train_batch], axis=0)
        b_dones = tf.stack([x[4] for x in train_batch], axis=0)

        # perform the loss computation under an automatic gradient computation scope
        with tf.GradientTape() as tape:
            # get the list of trainable variables and make them watchable by the gradient tape
            dqn_variables = self.dqn.trainable_variables
            tape.watch(dqn_variables)

            # compute the estimated values Q(next_state, action) for all actions
            # (from the target network whenever one is used)
            q_next_state = self.dqn(b_next_states) if self.method == MethodToUse.DQN_BASE \
                else self.dqn_target(b_next_states)

            # select the next best action; Double DQN selects it with the online network
            # and evaluates it with the target network, the other methods use the same
            # network for both selection and evaluation
            if self.method == MethodToUse.DDQN_AND_ALL:
                next_best_actions = tf.argmax(self.dqn(b_next_states), axis=1)
            else:
                next_best_actions = tf.argmax(q_next_state, axis=1)

            # compute the targetQ value
            b_next_best_actions_one_hot_encoding = tf.one_hot(indices=next_best_actions, depth=self.n_action)
            targetQ = b_next_best_actions_one_hot_encoding * q_next_state
            targetQ = tf.reduce_sum(targetQ, axis=1)

            # for each element in the batch:
            # if done is true, then the reward is considered final
            # if not, then add the estimate from the next state too
            targetQ = b_rewards + (1.0 - b_dones) * self.gamma * targetQ

            # get the estimates for each state in the batch
            Q_states = self.dqn(b_states)

            # compute the predictedQ (Q(state, action) for each state-action pair in the training batch)
            b_actions_one_hot_encoding = tf.one_hot(indices=b_actions, depth=self.n_action)
            predictedQ = b_actions_one_hot_encoding * Q_states
            predictedQ = tf.reduce_sum(predictedQ, axis=1)

            # apply the loss function
            loss = self.loss_function(y_true=targetQ, y_pred=predictedQ)

        # backpropagation: compute the gradients from the loss and apply them using the optimizer
        grads = tape.gradient(loss, dqn_variables)
        self.optimizer.apply_gradients(zip(grads, dqn_variables))

        # compute the gradient magnitudes to check that we are still able to learn something
        grads_magnitude = tf.convert_to_tensor([tf.norm(layer) for layer in grads])

        return loss, grads_magnitude
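    # Worked example of the target computation above (illustrative numbers only):
    #   reward = 2.0, done = 0.0, gamma = 0.9, Q_target(next_state, best_action) = 10.0
    #   targetQ = 2.0 + (1.0 - 0.0) * 0.9 * 10.0 = 11.0
    # and when done = 1.0 the bootstrap term vanishes, so targetQ = reward.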

    def train(self, max_episodes, max_steps_per_episode, reward_threshold, no_episodes_for_average=10):
        episode_reward_history = []
        last_rewards = collections.deque(maxlen=no_episodes_for_average)
        self.max_episodes = max_episodes
        self.max_steps_per_episode = max_steps_per_episode

        with tqdm.trange(self.max_episodes) as t:
            for episode in t:
                # run a full episode and get the reward from it
                episode_reward, avg_loss, epsilon_used, grads_magnitude = self.run_episode()

                # update the statistics from this episode
                episode_reward_history.append(episode_reward)
                last_rewards.append(episode_reward)
                mean_episode_reward = np.mean(last_rewards)

                t.set_description(f'Episode {episode}')
                t.set_postfix(episode_reward=episode_reward, running_reward=mean_episode_reward)

                if episode % no_episodes_for_average == 0:
                    print(f'\nEpisode {episode} (total steps {self.total_steps}): average reward: {mean_episode_reward}, avg loss: {avg_loss}, epsilon used: {epsilon_used}, grads mag: {grads_magnitude}')

                if episode != 0 and mean_episode_reward > self.max_score[self.method.value]:
                    self.max_score[self.method.value] = mean_episode_reward
                    self.round_max_score[self.method.value] = episode
                    self.save_model()
                    print(self.max_score[self.method.value], self.round_max_score[self.method.value])

                if episode != 0 and mean_episode_reward > reward_threshold:
                    break

        print(f'\nSolved at episode {episode}: average reward {mean_episode_reward:.2f}!')
        print(self.max_score[self.method.value], self.round_max_score[self.method.value])
        self.save_model()

    def save_model(self):
        self.dqn.save_weights(self.name)

    def load_model(self):
        print(self.name)
        self.dqn.load_weights(self.name)
        if self.method > MethodToUse.DQN_BASE:
            self.dqn_target.set_weights(self.dqn.get_weights())

    def play(self, no_episodes):
        scores = []
        actions_per_ep = []

        for episode_index in tqdm.trange(no_episodes):
            # per-episode records
            score = 0
            no_actions = 0

            # initial state
            state = tf.constant(self.env.reset(), dtype=tf.float32)
            state_shape = state.shape

            while True:
                # convert the state to a batch-of-1 tensor
                state_batched = tf.expand_dims(state, 0)
                actions = self.select_action(state_batched, self.end_epsilon)
                current_action = tf.constant(actions[0])  # first action in the batch

                # apply the action, get the next state and reward
                next_state, reward, done = self.tf_env_step(current_action)
                next_state.set_shape(state_shape)

                # record
                score += reward
                no_actions += 1

                # move to the next state
                state = next_state
                if done:
                    break

            # record the episode results
            scores.append(score)
            actions_per_ep.append(no_actions)

        average_score = np.mean(scores)
        average_actions = np.mean(actions_per_ep)
        print(f'Average score: {average_score}, Average actions per ep: {average_actions}')
        return average_score, average_actions
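
# Example usage (a minimal sketch; `PokerEnv` is a stand-in for whichever
# gym-style poker environment this agent is paired with, and the numbers are
# illustrative, not tuned):
#   env = PokerEnv()
#   agent = DQN_PokerAgent(env, seed=0, method=MethodToUse.DQN_TARGET_NETWORK)
#   agent.train(max_episodes=1000, max_steps_per_episode=100, reward_threshold=200)
#   agent.play(no_episodes=100)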