bellman_equation.py
#!/usr/bin/env python
#MIT License
#Copyright (c) 2017 Massimiliano Patacchiola
#
#Permission is hereby granted, free of charge, to any person obtaining a copy
#of this software and associated documentation files (the "Software"), to deal
#in the Software without restriction, including without limitation the rights
#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#copies of the Software, and to permit persons to whom the Software is
#furnished to do so, subject to the following conditions:
#
#The above copyright notice and this permission notice shall be included in all
#copies or substantial portions of the Software.
#
#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
#SOFTWARE.
#Using the Bellman equation to estimate the utility of a state

import numpy as np

def return_state_utility(v, T, u, reward, gamma):
    """Return the utility of a state via the Bellman equation:

    U(s) = R(s) + gamma * max_a sum_s' T(s, a, s') * U(s')

    @param v state vector (one-hot, selects the current state)
    @param T transition matrix, shape (12, 12, 4)
    @param u utility vector of the 12 states
    @param reward reward of the current state
    @param gamma discount factor
    @return the utility of the state
    """
    action_array = np.zeros(4)
    for action in range(0, 4):
        #Expected utility of the successor states for this action
        action_array[action] = np.sum(np.multiply(u, np.dot(v, T[:,:,action])))
    return reward + gamma * np.max(action_array)

def main():
    #Change as you want
    state = 8 #it corresponds to (1,1) in the robot world

    #Assuming that the discount factor is equal to 1.0
    gamma = 1.0

    #Starting state vector
    #The agent starts from (1, 1)
    v = np.zeros(12)
    v[state] = 1.0

    #Transition matrix loaded from file
    #(It is too big to write here)
    T = np.load("T.npy")

    #Utility vector
    u = np.array([[0.812, 0.868, 0.918,  1.0,
                   0.762,   0.0, 0.660, -1.0,
                   0.705, 0.655, 0.611, 0.388]])

    #Reward vector
    r = np.array([-0.04, -0.04, -0.04, +1.0,
                  -0.04,   0.0, -0.04, -1.0,
                  -0.04, -0.04, -0.04, -0.04])

    #Use the Bellman equation to find the utility of state (1,1)
    utility = return_state_utility(v, T, u, r[state], gamma)
    print("Utility of the state: " + str(utility))

if __name__ == "__main__":
    main()
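Note: the script expects a pre-computed transition matrix in T.npy, which is not reproduced here. As a rough sketch only (an assumption inferred from how the code indexes the array: shape (12, 12, 4), with T[s, s_next, a] giving the probability of reaching s_next from s under action a), the snippet below writes a random but properly normalized stand-in. It is useful only to check that the script runs; the printed utility will not match the grid-world values above.

    import numpy as np

    #Hypothetical stand-in for T.npy, NOT the original transition matrix
    rng = np.random.default_rng(0)
    T = rng.random((12, 12, 4))           #T[s, s_next, a]
    T = T / T.sum(axis=1, keepdims=True)  #each row sums to 1 over s_next
    np.save("T.npy", T)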