-
Notifications
You must be signed in to change notification settings - Fork 0
/
run.py
97 lines (83 loc) · 3.96 KB
/
run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
from typing import List, Tuple
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from Agents import Bandit, EpsilonGreedyAgent, ThomspsonSamplingAgent, OTS
import pandas as pd
def compare_epsilons_and_TS(
    epsilons: List[float],
    bandits_true_means: List[float],
    timesteps: int,
    num_simulations: int) -> np.ndarray:
    """
    Compare epsilon-greedy agents against Thompson Sampling on one bandit set.

    Each entry of ``epsilons`` selects an agent: the sentinel strings "TS" and
    "OTS" pick ThomspsonSamplingAgent / OTS respectively; any numeric value is
    used as the epsilon of an EpsilonGreedyAgent. Every agent is run for
    ``timesteps`` pulls, repeated over ``num_simulations`` independent runs,
    and the per-timestep rewards are averaged.

    Parameters:
        epsilons: mix of floats and the sentinels "TS"/"OTS" (one agent each).
        bandits_true_means: true mean reward of each bandit arm.
        timesteps: number of pulls per simulation.
        num_simulations: number of independent runs to average over.

    Returns:
        np.ndarray of shape (len(epsilons), len(bandits_true_means), timesteps)
        counting, per agent / arm / timestep, how many runs chose that arm.

    Side effect: saves the averaged-reward comparison plot to
    "Comparisons_with_OTS".
    """
    bandits = [Bandit(idx, m) for idx, m in enumerate(bandits_true_means)]
    Agents_rewards = np.zeros((len(epsilons), timesteps))
    Agents_actions = np.zeros((len(epsilons), len(bandits_true_means), timesteps))
    for n in range(num_simulations):
        for ag, epsilon in enumerate(epsilons):
            if epsilon == "TS":
                print("Running TS for simulation_num = {}".format(n+1))
                agent = ThomspsonSamplingAgent(bandits=bandits)
            elif epsilon == "OTS":
                print("Running oTS for simulation_num = {}".format(n+1))
                agent = OTS(bandits=bandits)
            else:
                print("Running epsilon with epsilon = {} for simulation_num = {}".format(epsilon, n+1))
                agent = EpsilonGreedyAgent(bandits=bandits, epsilon=epsilon)
            rewards, actions = agent.actions(timesteps)
            Agents_rewards[ag] += rewards  # accumulate rewards for averaging
            actions_arr = np.asarray(actions)
            for j in range(len(bandits_true_means)):
                # per-timestep indicator of "this agent picked arm j this run"
                Agents_actions[ag][j] += (actions_arr == j)
    # Plot once, after all simulations. The original plotted (and re-built the
    # legend) inside the loop, stacking num_simulations * len(epsilons)
    # overlapping artists on the axes; only the final averaged curves matter.
    _, ax1 = plt.subplots()
    for ag, epsilon in enumerate(epsilons):
        label = str(epsilon) if isinstance(epsilon, str) else "epsilon = " + str(epsilon)
        ax1.plot(Agents_rewards[ag] / num_simulations, label=label)
    ax1.legend()
    ax1.set_xlabel("iteration")
    ax1.set_ylabel("Average_Expected_reward")
    plt.savefig("Comparisons_with_OTS")
    ax1.clear()
    return Agents_actions
# Agents to compare: three epsilon-greedy settings plus Thompson Sampling
# ("TS") and optimistic Thompson Sampling ("OTS").
epsilons = [0, 0.01, 0.1, "TS", "OTS"]
# Ten bandit arms whose true means Q*(a) are drawn from a standard normal.
bandits_means = list(np.random.normal(loc=0, scale=1, size=10))
# Report the true optimal-action ranking (best arm first) with its means.
print("True Order of Optimal Actions is")
ranked = np.argsort(bandits_means)
sort_indices = ranked[::-1]
print("Ranks -", sort_indices)
print("corresponding_values = ", np.array(bandits_means)[sort_indices])
print("-"*50)
# Visualize each arm's true reward distribution (1000 samples of
# N(Q*(a), 1) per arm) as a violin plot.
rewards = [[str(arm), np.random.randn() + mean]
           for arm, mean in enumerate(bandits_means)
           for _ in range(1000)]
data = pd.DataFrame(rewards, columns=['Actions', 'Rewards'])
sns.violinplot(data=data, x="Actions", y="Rewards")
plt.title("True Reward Distribution")
plt.savefig("True_Rewards.png")
# Entry point: run the full comparison and plot optimal-action percentages.
if __name__ == "__main__":
    timesteps = 1500
    num_simulations = 2500
    Agents_actions = compare_epsilons_and_TS(epsilons, bandits_means, timesteps, num_simulations)
    # Look up each agent's row in the result array from the epsilons list
    # instead of hard-coding 1/3/4, so the plots stay correct if the list
    # is ever reordered or extended.
    greedy_row = epsilons.index(0.01)
    ts_row = epsilons.index("TS")
    ots_row = epsilons.index("OTS")
    fig, (ax1, ax2, ax3) = plt.subplots(figsize=(15, 5), ncols=3)
    for i in range(len(bandits_means)):
        # Selection counts -> percentage of simulations picking arm i
        # at each timestep.
        ax1.plot(np.divide(Agents_actions[greedy_row][i], num_simulations)*100, label="Action = " + str(i))
        ax2.plot(np.divide(Agents_actions[ts_row][i], num_simulations)*100, label="Action = " + str(i))
        ax3.plot(np.divide(Agents_actions[ots_row][i], num_simulations)*100, label="Action = " + str(i))
    # Identical decoration for all three panels except the title.
    for ax, title in ((ax1, "For Greedy agent with epsilon = 0.01"),
                      (ax2, "For TS"),
                      (ax3, "For OTS")):
        ax.legend()
        ax.set_xlabel("Iterations")
        ax.set_ylabel("Average Optimal Action Percentage")
        ax.set_title(title)
    plt.savefig("Optimal_Actions_with_OTS.png")
    print("Ranks = ", sort_indices)  # repeat the true ranking so it is visible after the long run