@@ -1216,6 +1216,57 @@ def make_dreamer(
        out_features=1, depth=2, num_cells=cfg.mlp_num_units, activation_class=nn.ELU
    )

+    world_model = _dreamer_make_world_model(
+        obs_encoder, obs_decoder, rssm_prior, rssm_posterior, reward_module
+    ).to(device)
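+    # init nets: run the world model once on a short random rollout from the proof env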
+    with torch.no_grad(), set_exploration_mode("random"):
+        tensordict = proof_environment.rollout(4)
+        tensordict = tensordict.to_tensordict().to(device)
+        world_model(tensordict)
+
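+    # model-based environment: generates imagined trajectories from the learned dynamics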
+    model_based_env = _dreamer_make_mbenv(
+        reward_module,
+        rssm_prior,
+        obs_decoder,
+        proof_environment,
+        use_decoder_in_env,
+        cfg.state_dim,
+        cfg.rssm_hidden_dim,
+    )
+    model_based_env = model_based_env.to(device)
+
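+    # one policy network drives both actors: one acts in the dreamed env, one in the real env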
+    actor_simulator, actor_realworld = _dreamer_make_actors(
+        obs_encoder,
+        rssm_prior,
+        rssm_posterior,
+        cfg.mlp_num_units,
+        action_key,
+        proof_environment,
+    )
+    actor_simulator = actor_simulator.to(device)
+
+    value_model = _dreamer_make_value_model(cfg.mlp_num_units, value_key)
+    value_model = value_model.to(device)
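+    # init nets: run the actor and value model once on a short imagined rollout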
+    with torch.no_grad(), set_exploration_mode("random"):
+        tensordict = model_based_env.rollout(4)
+        tensordict = tensordict.to(device)
+        tensordict = actor_simulator(tensordict)
+        value_model(tensordict)
+
+    actor_realworld = actor_realworld.to(device)
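+    # the proof env was created locally when none was passed in; close it and free memory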
+    if proof_env_is_none:
+        proof_environment.close()
+        torch.cuda.empty_cache()
+        del proof_environment
+
+    del tensordict
+    return world_model, model_based_env, actor_simulator, value_model, actor_realworld
+
+
+def _dreamer_make_world_model(
+    obs_encoder, obs_decoder, rssm_prior, rssm_posterior, reward_module
+):
    # World Model and reward model
    rssm_rollout = RSSMRollout(
        TensorDictModule(
@@ -1261,14 +1312,38 @@ def make_dreamer(
        transition_model,
        reward_model,
    )
+    return world_model

-    # actor for simulator: interacts with states ~ prior
+
+def _dreamer_make_actors(
+    obs_encoder,
+    rssm_prior,
+    rssm_posterior,
+    mlp_num_units,
+    action_key,
+    proof_environment,
+):
    actor_module = DreamerActor(
        out_features=proof_environment.action_spec.shape[0],
        depth=3,
-        num_cells=cfg.mlp_num_units,
+        num_cells=mlp_num_units,
        activation_class=nn.ELU,
    )
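+    # the same DreamerActor backs both actors; they differ only in how states are inferred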
+    actor_simulator = _dreamer_make_actor_sim(
+        action_key, proof_environment, actor_module
+    )
+    actor_realworld = _dreamer_make_actor_real(
+        obs_encoder,
+        rssm_prior,
+        rssm_posterior,
+        actor_module,
+        action_key,
+        proof_environment,
+    )
+    return actor_simulator, actor_realworld
+
+
+def _dreamer_make_actor_sim(action_key, proof_environment, actor_module):
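+    # actor for simulator: interacts with states ~ prior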
    actor_simulator = ProbabilisticTensorDictModule(
        TensorDictModule(
            actor_module,
@@ -1293,6 +1368,12 @@ def make_dreamer(
            }
        ),
    )
+    return actor_simulator
+
+
+def _dreamer_make_actor_real(
+    obs_encoder, rssm_prior, rssm_posterior, actor_module, action_key, proof_environment
+):
    # actor for real world: interacts with states ~ posterior
    # Our actor differs from the original paper, where the prior and posterior are computed
    # first and the action follows; we found that our approach worked better.
@@ -1344,17 +1425,33 @@ def make_dreamer(
            ],
        ),
    )
+    return actor_realworld
+
+
+def _dreamer_make_value_model(mlp_num_units, value_key):
+    # value model: maps a (state, belief) pair to a scalar value estimate
    value_model = TensorDictModule(
        MLP(
            out_features=1,
            depth=3,
-            num_cells=cfg.mlp_num_units,
+            num_cells=mlp_num_units,
            activation_class=nn.ELU,
        ),
        in_keys=["state", "belief"],
        out_keys=[value_key],
    )
-
+    return value_model
+
+
+def _dreamer_make_mbenv(
+    reward_module,
+    rssm_prior,
+    obs_decoder,
+    proof_environment,
+    use_decoder_in_env,
+    state_dim,
+    rssm_hidden_dim,
+):
    # MB environment
    if use_decoder_in_env:
        mb_env_obs_decoder = TensorDictModule(
@@ -1387,49 +1484,22 @@ def make_dreamer(
            transition_model,
            reward_model,
        ),
-        prior_shape=torch.Size([cfg.state_dim]),
-        belief_shape=torch.Size([cfg.rssm_hidden_dim]),
+        prior_shape=torch.Size([state_dim]),
+        belief_shape=torch.Size([rssm_hidden_dim]),
        obs_decoder=mb_env_obs_decoder,
    )

    model_based_env.set_specs_from_env(proof_environment)
    model_based_env = TransformedEnv(model_based_env)
    default_dict = {
-        "next_state": NdUnboundedContinuousTensorSpec(cfg.state_dim),
-        "next_belief": NdUnboundedContinuousTensorSpec(cfg.rssm_hidden_dim),
+        "next_state": NdUnboundedContinuousTensorSpec(state_dim),
+        "next_belief": NdUnboundedContinuousTensorSpec(rssm_hidden_dim),
        # "action": proof_environment.action_spec,
    }
    model_based_env.append_transform(
        TensorDictPrimer(random=False, default_value=0, **default_dict)
    )
-
-    world_model = world_model.to(device)
-
-    # init nets
-    with torch.no_grad(), set_exploration_mode("random"):
-        tensordict = proof_environment.rollout(4)
-        tensordict = tensordict.to_tensordict().to(device)
-        tensordict = tensordict.to(device)
-        world_model(tensordict)
-    model_based_env = model_based_env.to(device)
-
-    actor_simulator = actor_simulator.to(device)
-    value_model = value_model.to(device)
-
-    with torch.no_grad(), set_exploration_mode("random"):
-        tensordict = model_based_env.rollout(4)
-        tensordict = tensordict.to(device)
-        tensordict = actor_simulator(tensordict)
-        value_model(tensordict)
-
-    actor_realworld = actor_realworld.to(device)
-    if proof_env_is_none:
-        proof_environment.close()
-        torch.cuda.empty_cache()
-        del proof_environment
-
-    del tensordict
-    return world_model, model_based_env, actor_simulator, value_model, actor_realworld
+    return model_based_env
@dataclass