import torch
from mushroom_rl.policy import DeterministicPolicy
from mushroom_rl.environments.segway import Segway
from mushroom_rl.core import Core, Agent
from mushroom_rl.approximators import Regressor
from mushroom_rl.approximators.parametric import LinearApproximator, TorchApproximator
from mushroom_rl.rl_utils.value_functions import compute_gae, compute_advantage_montecarlo

from mushroom_rl.utils.episodes import split_episodes, unsplit_episodes

def test_compute_advantage_montecarlo():
    def advantage_montecarlo(V, s, ss, r, absorbing, last, gamma):
        # Reference implementation: sweep the batch backwards, restarting the return
        # at every episode boundary and bootstrapping from the next-state value.
        with torch.no_grad():
            r = r.squeeze()
            q = torch.zeros(len(r))
            v = V(s).squeeze()

            for rev_k in range(len(r)):
                k = len(r) - rev_k - 1
                if last[k] or rev_k == 0:
                    # At the end of an episode (or of the batch), bootstrap from V(ss[k]);
                    # the bootstrap term is dropped below if the state is absorbing.
                    q_next = V(ss[k]).squeeze().item()
                q_next = r[k] + gamma * q_next * (1 - absorbing[k].int())
                q[k] = q_next

            adv = q - v
            return q[:, None], adv[:, None]

    torch.manual_seed(42)
    _value_functions_tester(compute_advantage_montecarlo, advantage_montecarlo, 0.99)
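
# Illustration only (not part of the test): a minimal, self-contained sketch of the same
# backward Monte Carlo recursion on a hand-made single 3-step episode with a constant
# value estimate of 0.5. All numbers below are invented for illustration, and the helper
# name _toy_montecarlo_sketch is not part of mushroom_rl.
def _toy_montecarlo_sketch():
    gamma = 0.9
    r = torch.tensor([1.0, 0.0, 2.0])             # rewards of one 3-step episode
    absorbing = torch.tensor([False, False, False])
    last = torch.tensor([False, False, True])
    v_bootstrap = 0.5                             # pretend V(s_3) = 0.5 at the truncation point

    q = torch.zeros(3)
    for rev_k in range(3):
        k = 2 - rev_k
        if last[k] or rev_k == 0:
            q_next = v_bootstrap                  # reset the return at the episode boundary
        q_next = r[k] + gamma * q_next * (1 - absorbing[k].int())
        q[k] = q_next

    # q[2] = 2 + 0.9 * 0.5   = 2.45
    # q[1] = 0 + 0.9 * 2.45  = 2.205
    # q[0] = 1 + 0.9 * 2.205 = 2.9845
    assert torch.allclose(q, torch.tensor([2.9845, 2.2050, 2.4500]))
    return q - 0.5                                # advantage under the constant value estimate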

def test_compute_gae():
    def gae(V, s, ss, r, absorbing, last, gamma, lam):
        # Reference implementation of Generalized Advantage Estimation: sweep the batch
        # backwards, accumulating discounted TD residuals within each episode.
        with torch.no_grad():
            v = V(s)
            v_next = V(ss)
            gen_adv = torch.empty_like(v)
            for rev_k in range(len(v)):
                k = len(v) - rev_k - 1
                if last[k] or rev_k == 0:
                    # Episode boundary: plain TD residual, bootstrapping from the
                    # next state only if it is not absorbing.
                    gen_adv[k] = r[k] - v[k]
                    if not absorbing[k]:
                        gen_adv[k] += gamma * v_next[k]
                else:
                    gen_adv[k] = r[k] - v[k] + gamma * v_next[k] + gamma * lam * gen_adv[k + 1]
            return gen_adv + v, gen_adv

    torch.manual_seed(42)
    _value_functions_tester(compute_gae, gae, 0.99, 0.95)
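
# Illustration only (not part of the test): the recursion above is equivalent to the
# explicit discounted sum of TD residuals, A_k = sum_i (gamma * lam)^i * delta_{k+i},
# within one episode. A minimal, self-contained sketch on a hand-made 3-step episode
# with no absorbing state; the rewards and values below are invented, and the helper
# name _toy_gae_sketch is not part of mushroom_rl.
def _toy_gae_sketch():
    gamma, lam = 0.99, 0.95
    r = torch.tensor([1.0, 0.0, 2.0])            # pretend rewards
    v = torch.tensor([0.3, 0.2, 0.1])            # pretend V(s_k)
    v_next = torch.tensor([0.2, 0.1, 0.05])      # pretend V(s_{k+1})

    # TD residuals: delta_k = r_k + gamma * V(s_{k+1}) - V(s_k)
    delta = r + gamma * v_next - v

    # Backward recursion, as in the reference implementation above
    adv = torch.empty(3)
    adv[2] = delta[2]
    for k in (1, 0):
        adv[k] = delta[k] + gamma * lam * adv[k + 1]

    # The explicit discounted sums of residuals give the same numbers
    expected = torch.stack([
        delta[0] + gamma * lam * delta[1] + (gamma * lam) ** 2 * delta[2],
        delta[1] + gamma * lam * delta[2],
        delta[2],
    ])
    assert torch.allclose(adv, expected)
    return adv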

def _value_functions_tester(test_fun, correct_fun, *args):
    mdp = Segway()
    V = Regressor(TorchApproximator,
                  input_shape=mdp.info.observation_space.shape,
                  output_shape=(1,),
                  network=Net,
                  loss=torch.nn.MSELoss(),
                  optimizer={'class': torch.optim.Adam, 'params': {'lr': 0.001}})

    state, action, reward, next_state, absorbing, last = _get_episodes(mdp, 10)

    # Compare the library implementation against the reference one, both before
    # and after fitting the value function.
    correct_v, correct_adv = correct_fun(V, state, next_state, reward, absorbing, last, *args)
    v, adv = test_fun(V, state, next_state, reward, absorbing, last, *args)

    assert torch.allclose(v, correct_v)
    assert torch.allclose(adv, correct_adv)

    V.fit(state, correct_v)

    correct_v, correct_adv = correct_fun(V, state, next_state, reward, absorbing, last, *args)
    v, adv = test_fun(V, state, next_state, reward, absorbing, last, *args)

    assert torch.allclose(v, correct_v)
    assert torch.allclose(adv, correct_adv)

def _get_episodes(mdp, n_episodes=100):
    # Collect evaluation episodes on Segway with a fixed deterministic linear policy
    mu = torch.tensor([6.31154476, 3.32346271, 0.49648221]).unsqueeze(0)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape,
                             weights=mu)

    policy = DeterministicPolicy(approximator)

    agent = Agent(mdp.info, policy)
    core = Core(agent, mdp)
    dataset = core.evaluate(n_episodes=n_episodes)

    return dataset.parse(to='torch')

class Net(torch.nn.Module):
    # Minimal linear state-value network wrapped by the TorchApproximator above
    def __init__(self, input_shape, output_shape, **kwargs):
        super().__init__()
        self._q = torch.nn.Linear(input_shape[0], output_shape[0])

    def forward(self, x):
        return self._q(x.float())
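
# Illustration only (not part of the test): a minimal shape-check sketch for the value
# network above, built with the same Regressor/TorchApproximator call used in
# _value_functions_tester. The 3-dimensional input matches Segway observations; the batch
# of random states and the helper name _toy_net_shape_sketch are invented for illustration.
def _toy_net_shape_sketch():
    V = Regressor(TorchApproximator,
                  input_shape=(3,),
                  output_shape=(1,),
                  network=Net,
                  loss=torch.nn.MSELoss(),
                  optimizer={'class': torch.optim.Adam, 'params': {'lr': 0.001}})
    values = V(torch.rand(5, 3))                  # one value estimate per state in the batch
    assert values.shape == (5, 1)
    return values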