"""This module contains the agent class
:Date: 2019-03-11
:Version: 1
:Authors:
- Janosch Moos
- Kay Hansel
- Cedric Derstroff
"""
import csv
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np

from utilities.logger import Logger


class Agent:
    """Agent class.

    Wraps the environment, policy and learning algorithm, and controls
    the training process, benchmarking and documentation of the
    results.

    Attributes
    ----------
    policy: Policy
        The decision-making policy (maps states to actions)

    env: Environment
        Contains the gym environment the simulations are performed on

    algorithm: NPG or NES
        The learning algorithm

    plot: bool
        If True, the results of training and benchmarking are plotted

    logger: Logger
        Logger recording the training data
    """
def __init__(self, env, policy, algorithm, plot: bool = True):
"""
:param env: Contains the gym environment the simulations are
performed on
:type env: Environment
:param policy: The policy to improve
:type policy: Policy
:param algorithm: The learning algorithm
:type algorithm: NES or NPG
        :param plot: If True, the results of training and benchmarking
            will be plotted
        :type plot: bool
"""
self.policy = policy
self.env = env
self.algorithm = algorithm
self.plot = plot
self.logger = Logger()
# Utility Functions
# ===============================================================
    def __print(self, i_episode: int):
        """Prints the logged results for a given episode."""
        episode = self.logger.logger[i_episode]
        s = "s" if episode["roll_outs"] > 1 else ""
        print("Episode {} with {} roll-out{}:\n"
              "finished after {} time steps and obtained a reward of {}.\n"
              .format(i_episode, episode["roll_outs"], s,
                      episode["time_mean"].squeeze(),
                      episode["reward_mean"].squeeze()))
    def __plot_results(self):
        """Generates plots after the training process containing the
        relevant information, i.e. the reward and time steps of each
        episode. If more than one roll-out was performed per episode,
        the mean and standard deviation of each are plotted.
        """
        # path of the csv file holding the training data
        csv_path = './trained_data/training_data_{}_{}.csv'\
            .format(self.env.to_string(), self.algorithm.name)

        # get the data out of the logger and write it to the csv file
        r_means = []
        r_stds = []
        t_means = []
        t_stds = []

        os.makedirs(os.path.dirname(csv_path), exist_ok=True)
        with open(csv_path, 'w', newline='') as csv_file:
            writer = csv.writer(csv_file)
            for i, e in np.ndenumerate(self.logger.logger):
                r_means.append(e["reward_mean"])
                r_stds.append(e["reward_std"])
                t_means.append(e["time_mean"])
                t_stds.append(e["time_std"])

                # one row per episode: index, reward mean -/+ std,
                # time-step mean -/+ std
                writer.writerow([i[0],
                                 e["reward_mean"].squeeze(),
                                 e["reward_mean"].squeeze()
                                 - e["reward_std"].squeeze(),
                                 e["reward_mean"].squeeze()
                                 + e["reward_std"].squeeze(),
                                 e["time_mean"].squeeze(),
                                 e["time_mean"].squeeze()
                                 - e["time_std"].squeeze(),
                                 e["time_mean"].squeeze()
                                 + e["time_std"].squeeze()])

        r_means = np.concatenate(r_means).squeeze()
        r_stds = np.concatenate(r_stds).squeeze()
        t_means = np.concatenate(t_means).squeeze()
        t_stds = np.concatenate(t_stds).squeeze()

        # number of logged episodes
        length = r_stds.size
        # upper subplot: total reward per episode
plt.subplot(2, 1, 1)
plt.title(self.env.name + "\n"
+ self.algorithm.title
+ ", Policy: {}".format(self.policy.hidden_dim))
plt.fill_between(np.arange(length),
r_means - r_stds, r_means + r_stds,
alpha=0.3, label='standard deviation',
color='green')
plt.plot(np.arange(length), r_means, label='mean',
color='green')
plt.legend()
plt.xlabel('Episodes')
plt.ylabel('Total reward')
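        # lower subplot: time steps per episode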
plt.subplot(2, 1, 2)
plt.fill_between(np.arange(length),
t_means - t_stds, t_means + t_stds,
alpha=0.3, label='standard deviation')
plt.plot(np.arange(length), t_means, label='mean')
plt.legend()
plt.xlabel('Episodes')
plt.ylabel('Time steps')
plt.show()
# Main Functions
# ===============================================================
    def train_policy(self, episodes: int, n_roll_outs: int = 1,
                     save: bool = False, path: str = "./trained_data/"):
        """Basic wrapper for training with the chosen algorithm. It
        controls the number of episodes as well as the logging and
        saving of policies and data.

        :param episodes: Number of training episodes
        :type episodes: int

        :param n_roll_outs: Number of roll-outs per learning step
        :type n_roll_outs: int

        :param save: If True, the policy is saved after every learning
            step
        :type save: bool

        :param path: The path to the folder where the policy shall be
            stored
        :type path: str
        """
for i_episode in range(episodes):
# update policy
returns, steps = self.algorithm.do(self.env, self.policy,
n_roll_outs)
# log data
self.logger.log_data(returns, steps, n_roll_outs)
# analyze episode
self.__print(i_episode)
            if save:
                print("{:-^50s}".format(' Save '))
                file_name = os.path.join(
                    path, "{}_{}.p".format(self.env.to_string(),
                                           self.algorithm.name))
                # create the target folder before opening the file
                os.makedirs(os.path.dirname(file_name), exist_ok=True)
                with open(file_name, "wb") as pickle_out:
                    pickle.dump((self.policy, self.algorithm),
                                pickle_out)
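                # (the saved policy/algorithm tuple can be restored
                # later with pickle.load on this file)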
if self.plot:
self.__plot_results()
    def run_benchmark(self, episodes: int = 100, render: bool = False):
        """Runs a benchmark test with a set number of simulations
        (episodes) and plots the results. Two plots are generated:

        1. Total reward per episode
        2. Reward per time step for all episodes

        The second plot does not take the mean but rather plots
        a curve for each episode.

        :param episodes: Number of episodes for the benchmark
        :type episodes: int

        :param render: If True, the episodes will be rendered
        :type render: bool
        """
# perform simulations
trajectories = self.env.roll_out(self.policy, n_roll_outs=episodes,
normalizer=self.algorithm.normalizer,
greedy=True, render=render)
total_rewards = []
rewards = []
time_steps = []
for i, t in np.ndenumerate(trajectories):
print("{} Reward reached: {}".format(i[0] + 1, t["total_reward"]))
total_rewards.append(t["total_reward"])
rewards.append(t["rewards"])
time_steps.append(t["time_steps"])
if not render:
print("-------------------")
print("Average reward: ", np.mean(total_rewards))
print("Min reward:", np.min(total_rewards))
print("Max reward:", np.max(total_rewards))
if self.plot:
            self.__plot_benchmark(total_rewards, rewards, time_steps)
    def __plot_benchmark(self, total_rewards, rewards, time_steps):
        """Generates plots after the benchmark process containing the
        relevant information: the total reward per episode and the
        reward per time step of each episode.
        """
# 1. Plot: Total reward
plt.plot(np.arange(len(total_rewards)), total_rewards,
label='Total reward per episode', color='darkgreen')
plt.fill_between(np.arange(len(total_rewards)),
0, total_rewards,
alpha=0.3, color='green')
plt.legend()
plt.xlabel('Trial')
plt.ylim(bottom=0)
plt.ylabel('Total reward')
plt.title("Benchmark Result for " + self.env.name + "\n"
+ "with " + self.algorithm.title
+ ", Policy: {}".format(self.policy.hidden_dim))
plt.show()
        # 2. Plot: reward per time step for all runs
for r in rewards:
plt.plot(np.arange(len(r)), r, linewidth=1)
plt.legend(["Each Trial"])
plt.xlabel('Time steps')
plt.ylabel('Reward')
plt.title("Reward per time step during benchmark of "
+ self.env.name + "\n"
+ "with " + self.algorithm.title
+ ", Policy: {}".format(self.policy.hidden_dim))
plt.show()
        # save the benchmark data in a csv file: one row per time
        # step, one column per episode (episodes that end early are
        # padded with None)
        csv_path = './trained_data/benchmark_data_{}_{}.csv' \
            .format(self.env.to_string(), self.algorithm.name)

        os.makedirs(os.path.dirname(csv_path), exist_ok=True)
        with open(csv_path, 'w', newline='') as csv_file:
            writer = csv.writer(csv_file)
            for step in range(max(time_steps)):
                step_rewards = [step]
                for r in rewards:
                    try:
                        step_rewards.append(r[step])
                    except IndexError:
                        step_rewards.append(None)
                writer.writerow(step_rewards)
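

# ---------------------------------------------------------------------
# Minimal usage sketch. The Agent methods above are real; however, the
# import paths and constructor signatures for Environment, Policy and
# NPG below are assumptions made for illustration only, which is why
# the snippet is kept commented out.
# ---------------------------------------------------------------------
# if __name__ == "__main__":
#     from utilities.environment import Environment  # assumed path
#     from models.nn_policy import Policy            # assumed path
#     from algorithms.npg import NPG                 # assumed path
#
#     env = Environment("CartPole-v0")               # hypothetical args
#     policy = Policy(env, hidden_dim=(8,))          # hypothetical args
#     algorithm = NPG(learning_rate=0.05)            # hypothetical args
#
#     agent = Agent(env, policy, algorithm, plot=True)
#     agent.train_policy(episodes=100, n_roll_outs=10, save=True)
#     agent.run_benchmark(episodes=25)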