-
Notifications
You must be signed in to change notification settings - Fork 0
/
policy_efirst.R
48 lines (48 loc) · 1.48 KB
/
policy_efirst.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# Epsilon-first policy for a single continuous action in [0, 1].
#
# The reward is modelled as a quadratic in the chosen action x, using the
# feature row (1, x, x^2). While theta$n <= epsilon the policy explores by
# drawing actions uniformly from [0, 1] and accumulates
#   b <- b + features * reward        (1 x 3)
#   A <- A + t(features) %*% features (3 x 3)
# Afterwards it exploits: it solves A %*% betas = t(b) and plays the action in
# [0, 1] that maximises the fitted quadratic.
#
# Because theta$n starts at 0 and the test is `<=`, the exploration phase lasts
# epsilon + 1 rounds.
EpsilonFirstLinearRegressionPolicy <- R6::R6Class(
  portable = FALSE,
  class = FALSE,
  inherit = Policy,
  public = list(
    class_name = "EpsilonFirstLinearRegressionPolicy",
    # Initial values copied into theta by set_parameters().
    b = NULL,
    A = NULL,
    # Exploration continues while theta$n <= epsilon.
    epsilon = NULL,

    # b:       initial 1 x 3 vector of accumulated features * reward.
    # A:       initial 3 x 3 matrix of accumulated feature cross-products
    #          (defaults to the identity matrix).
    # epsilon: round-count threshold for the exploration phase.
    initialize = function(b = matrix(0, nrow = 1, ncol = 3),
                          A = diag(3),
                          epsilon = 2000) {
      super$initialize()
      self$b <- b
      self$A <- A
      self$epsilon <- epsilon
    },

    # Resets theta to the initial b and A and a round counter n of 0.
    # `context_params` is accepted but not used.
    set_parameters = function(context_params) {
      self$theta <- list("b" = self$b, "A" = self$A, "n" = 0)
    },

    # Returns `action` with `choice` set to a value in [0, 1].
    # `t` and `context` are accepted but not used.
    # NOTE(review): `action` is not defined in this class; presumably it is a
    # field inherited from Policy that is reachable without `self$` because the
    # class is non-portable. The assignment below then works on a local copy.
    get_action = function(t, context) {
      if (self$theta$n <= self$epsilon) {
        # Exploration: uniform draw over the action range.
        action$choice <- runif(1, 0, 1)
      } else {
        # Solve the linear system directly instead of forming the inverse.
        betas <- solve(self$theta$A, matrix(self$theta$b), tol = 1e-200)
        slope <- betas[2]
        curvature <- betas[3]

        choice <- NA_real_
        if (isTRUE(curvature < 0)) {
          # Concave fit: the maximiser over [0, 1] is the vertex clamped to
          # the interval.
          choice <- min(max(-slope / (2 * curvature), 0), 1)
        }
        if (!is.finite(choice)) {
          # Linear or convex fit (or a degenerate vertex): the vertex is not a
          # maximum, so the best action is an endpoint. f(1) - f(0) equals
          # slope + curvature; ties and non-comparable values fall back to 0.
          choice <- if (isTRUE(slope + curvature > 0)) 1 else 0
        }
        action$choice <- choice
      }
      action
    },

    # Updates the regression statistics during the exploration phase, always
    # increments the round counter, and returns the updated theta.
    # `t` and `context` are accepted but not used.
    set_reward = function(t, context, action, reward) {
      if (self$theta$n <= self$epsilon) {
        y <- reward$reward
        x <- action$choice
        features <- matrix(c(1, x, x^2), nrow = 1, ncol = 3, byrow = TRUE)
        self$theta$b <- (features * y) + self$theta$b
        self$theta$A <- t(features) %*% features + self$theta$A
      }
      self$theta$n <- self$theta$n + 1
      self$theta
    }
  )
)