qlearning.py
import numpy as np


def QLearning(env, lr=1, eps=0.5, gamma=0.99, T=5000):
"""Q Learning Algorithm,
Parameters
----------
env : gym-like env,
environment with finite state and action space
lr : float/int or function of t and nv(number of visits of the state action couple),
Learning Rate
eps : float/int or function of t,
Exploration parameter,
gamma : float,
Discount factor
T : int,
Number of iteration
Yields
------
Q : array of shape (S,A),
Action Value function/matrix, shape (S,A) with S the state space dimension and A the action space dimension.
"""
    ## INIT
    S = env.observation_space.n
    A = env.action_space.n
    Q = np.random.random((S, A))  # How can we improve this initialization?
    visited_matrix = np.ones((S, A))  # Visit counts per (state, action) pair
    state = env.reset()
    # Wrap constant hyperparameters into functions. Binding the constant as a
    # default argument avoids the lambda referring to its own rebound name.
    if isinstance(lr, (int, float)):
        lr = lambda t, nv, c=lr: c
    if isinstance(eps, (int, float)):
        eps = lambda t, c=eps: c
    for t in range(T):
        if np.random.random() < eps(t):  # Exploration
            action = env.action_space.sample()
        else:  # Exploitation
            action = Q[state].argmax()
        # Take action and observe
        next_state, reward, is_terminal, _ = env.step(action)
        # Update Q; do not bootstrap from next_state when the episode ends
        target = reward + gamma * Q[next_state].max() * (not is_terminal)
        delta_t = target - Q[state, action]
        Q[state, action] += lr(t, visited_matrix[state, action]) * delta_t
        # Transition to the next iteration
        visited_matrix[state, action] += 1
        state = next_state
        # If the episode is over, reset the env
        if is_terminal:
            state = env.reset()
    return Q
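

# Usage sketch, assuming the FrozenLake-v1 environment and an old-style gym
# API (reset() returns an observation, step() returns a 4-tuple), which is
# what QLearning above expects; newer gymnasium releases differ. The decaying
# schedules below are illustrative choices, not part of the algorithm itself.
if __name__ == "__main__":
    import gym

    env = gym.make("FrozenLake-v1")
    # A 1/nv learning rate turns each Q[s, a] update into a running average,
    # and the linearly decaying epsilon fades exploration out over time.
    Q = QLearning(
        env,
        lr=lambda t, nv: 1.0 / nv,
        eps=lambda t: max(0.05, 1.0 - t / 5000),
        gamma=0.99,
        T=5000,
    )
    print(Q)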