
Commit b8a4c36

Allow warm-starting through pre-trained policies for fine-tuning (#480)
* Initial go at adding support for pre-trained policies
* Reconstruct policy for imitation learning
* Add tests for warmstart feature
* Suggested changes and fixes
* Typos and linting issues
* Included Adam suggestions
* Fix multiple linting issues
* Fix sphinx version
1 parent f3c870b commit b8a4c36
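
In short: both training scripts gain an optional `agent_path` config key. When it is set, the learner is warm-started from a previously saved policy instead of a freshly initialized one. A minimal sketch of a warm-started BC run, modeled on the tests added in this commit (paths are placeholders, not outputs of this snippet):

# Sketch: warm-start behavior cloning via the new `agent_path` key.
# Paths are placeholders; a real run also needs demonstrations configured,
# exactly as the tests added below do.
from imitation.scripts import train_imitation

run = train_imitation.train_imitation_ex.run(
    command_name="bc",
    config_updates=dict(
        demonstrations=dict(rollout_path="path/to/rollouts.pkl"),  # placeholder
        agent_path="path/to/previous_run/final.th",  # placeholder
    ),
)
assert run.status == "COMPLETED"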

File tree

7 files changed (+132 −13 lines)


src/imitation/scripts/common/common.py

Lines changed: 1 addition & 1 deletion

@@ -138,7 +138,7 @@ def make_venv(
 ) -> vec_env.VecEnv:
     """Builds the vector environment.
 
-    Args:
+    Args:
         env_name: The environment to train in.
         num_vec: Number of `gym.Env` instances to combine into a vector environment.
         parallel: Whether to use "true" parallelism. If True, then use `SubProcVecEnv`.

src/imitation/scripts/common/rl.py

Lines changed: 3 additions & 3 deletions

@@ -144,9 +144,9 @@ def load_rl_algo_from_path(
     _seed: int,
 ) -> base_class.BaseAlgorithm:
     agent = serialize.load_stable_baselines_model(
-        rl_cls,
-        agent_path,
-        venv,
+        cls=rl_cls,
+        path=agent_path,
+        venv=venv,
         seed=_seed,
         **rl_kwargs,
     )
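
For intuition, the renamed keyword arguments line up with Stable Baselines3's own loading API; roughly the following (an assumption about what `serialize.load_stable_baselines_model` wraps, which this diff does not spell out):

# Rough SB3 equivalent (assumption: the serialize helper ultimately
# delegates to the algorithm class's .load classmethod).
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

venv = DummyVecEnv([lambda: gym.make("CartPole-v1")])
agent = PPO.load("path/to/model.zip", env=venv)  # placeholder checkpoint path

Passing `cls`, `path`, and `venv` by keyword also guards against silent argument-order mistakes if the helper's signature ever changes.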

src/imitation/scripts/config/train_adversarial.py

Lines changed: 1 addition & 0 deletions

@@ -29,6 +29,7 @@ def defaults():
     algorithm_specific = {}  # algorithm_specific[algorithm] is merged with config
 
     checkpoint_interval = 0  # Num epochs between checkpoints (<0 disables)
+    agent_path = None  # Path to load agent from, optional.
 
 
 @train_adversarial_ex.config

src/imitation/scripts/config/train_imitation.py

Lines changed: 1 addition & 0 deletions

@@ -40,6 +40,7 @@ def config():
         expert_policy_type=None,  # 'ppo', 'random', or 'zero'
         total_timesteps=1e5,
     )
+    agent_path = None  # Path to load agent from, optional.
 
 
 @train_imitation_ex.config

src/imitation/scripts/train_adversarial.py

Lines changed: 12 additions & 2 deletions

@@ -3,7 +3,7 @@
 import logging
 import os
 import os.path as osp
-from typing import Any, Mapping, Type
+from typing import Any, Mapping, Optional, Type
 
 import sacred.commands
 import torch as th
@@ -72,6 +72,7 @@ def train_adversarial(
     algorithm_kwargs: Mapping[str, Any],
     total_timesteps: int,
     checkpoint_interval: int,
+    agent_path: Optional[str],
 ) -> Mapping[str, Mapping[str, float]]:
     """Train an adversarial-network-based imitation learning algorithm.
 
@@ -94,6 +95,10 @@ def train_adversarial(
             `checkpoint_interval` rounds and after training is complete. If 0,
             then only save weights after training is complete. If <0, then don't
             save weights at all.
+        agent_path: Path to a directory containing a pre-trained agent. If
+            provided, then the agent will be initialized using this stored policy
+            (warm start). If not provided, then the agent will be initialized using
+            a random policy.
 
     Returns:
         A dictionary with two keys. "imit_stats" gives the return value of
@@ -111,7 +116,12 @@ def train_adversarial(
     expert_trajs = demonstrations.load_expert_trajs()
 
     venv = common_config.make_venv()
-    gen_algo = rl.make_rl_algo(venv)
+
+    if agent_path is None:
+        gen_algo = rl.make_rl_algo(venv)
+    else:
+        gen_algo = rl.load_rl_algo_from_path(agent_path=agent_path, venv=venv)
+
     reward_net = reward.make_reward_net(venv)
 
     logger.info(f"Using '{algo_cls}' algorithm")
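
Because the generator policy is checkpointed under the run's log directory, one adversarial run can now seed the next. A sketch chaining two runs, following the `checkpoints/final/gen_policy` layout exercised by the tests in this commit (other config, e.g. demonstrations, is elided):

# Sketch: warm-start a second GAIL run from the first run's final
# generator checkpoint. Demonstration/config plumbing is elided.
import pathlib

from imitation.scripts import train_adversarial

first = train_adversarial.train_adversarial_ex.run(command_name="gail")
log_dir = pathlib.Path(first.config["common"]["log_dir"])

second = train_adversarial.train_adversarial_ex.run(
    command_name="gail",
    config_updates={
        "agent_path": str(log_dir / "checkpoints" / "final" / "gen_policy"),
    },
)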

src/imitation/scripts/train_imitation.py

Lines changed: 23 additions & 5 deletions

@@ -2,12 +2,13 @@
 
 import logging
 import os.path as osp
+import warnings
 from typing import Any, Mapping, Optional, Type
 
 from sacred.observers import FileStorageObserver
 from stable_baselines3.common import policies, utils, vec_env
 
-from imitation.algorithms.bc import BC
+from imitation.algorithms import bc as bc_algorithm
 from imitation.algorithms.dagger import SimpleDAggerTrainer
 from imitation.data import rollout
 from imitation.policies import serialize
@@ -22,13 +23,19 @@ def make_policy(
     venv: vec_env.VecEnv,
     policy_cls: Type[policies.BasePolicy],
     policy_kwargs: Mapping[str, Any],
+    agent_path: Optional[str],
 ) -> policies.BasePolicy:
     """Makes policy.
 
     Args:
         venv: Vectorized environment we will be imitating demos from.
         policy_cls: Type of a Stable Baselines3 policy architecture.
+            Specify only if agent_path is not specified.
         policy_kwargs: Keyword arguments for policy constructor.
+            Specify only if agent_path is not specified.
+        agent_path: Path to serialized policy. If provided, then load the
+            policy from this path. Otherwise, make a new policy.
+            Specify only if policy_cls and policy_kwargs are not specified.
 
     Returns:
         A Stable Baselines3 policy.
@@ -43,7 +50,14 @@ def make_policy(
             "lr_schedule": utils.get_schedule_fn(1),
         },
     )
-    policy = policy_cls(**policy_kwargs)
+    if agent_path is not None:
+        warnings.warn(
+            "When agent_path is specified, policy_cls and policy_kwargs are ignored.",
+            RuntimeWarning,
+        )
+        policy = bc_algorithm.reconstruct_policy(agent_path)
+    else:
+        policy = policy_cls(**policy_kwargs)
     logger.info(f"Policy network summary:\n {policy}")
     return policy
 
@@ -88,27 +102,31 @@ def train_imitation(
     bc_train_kwargs: Mapping[str, Any],
     dagger: Mapping[str, Any],
     use_dagger: bool,
+    agent_path: Optional[str],
 ) -> Mapping[str, Mapping[str, float]]:
     """Runs DAgger (if `use_dagger`) or BC (otherwise) training.
 
     Args:
         bc_kwargs: Keyword arguments passed through to `bc.BC` constructor.
-        bc_train_kwargs: Keyword arguments passed through to `BC.train` method.
+        bc_train_kwargs: Keyword arguments passed through to `BC.train()` method.
         dagger: Arguments for DAgger training.
        use_dagger: If True, train using DAgger; otherwise, use BC.
+        agent_path: Path to serialized policy. If provided, then load the
+            policy from this path. Otherwise, make a new policy.
+            Specify only if policy_cls and policy_kwargs are not specified.
 
     Returns:
         Statistics for rollouts from the trained policy and demonstration data.
     """
     custom_logger, log_dir = common.setup_logging()
     venv = common.make_venv()
-    imit_policy = make_policy(venv)
+    imit_policy = make_policy(venv, agent_path=agent_path)
 
     expert_trajs = None
     if not use_dagger or dagger["use_offline_rollouts"]:
         expert_trajs = demonstrations.load_expert_trajs()
 
-    bc_trainer = BC(
+    bc_trainer = bc_algorithm.BC(
         observation_space=venv.observation_space,
         action_space=venv.action_space,
         policy=imit_policy,
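
Note the asymmetry between the two scripts: `train_adversarial` reloads a full SB3 algorithm (`base_class.BaseAlgorithm`, optimizer state included) via `rl.load_rl_algo_from_path`, while `train_imitation` reloads only the policy network through `bc_algorithm.reconstruct_policy`. A minimal sketch of the latter path (the `.th` filename follows the BC test below; the path is a placeholder):

# Sketch: rebuild a policy for BC/DAgger warm-starting, as make_policy
# now does when agent_path is set.
from imitation.algorithms import bc as bc_algorithm

policy = bc_algorithm.reconstruct_policy("path/to/previous_run/final.th")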

tests/test_scripts.py

Lines changed: 91 additions & 2 deletions

@@ -88,8 +88,8 @@ def test_main_console(script_mod):
 _rl_agent_loading_configs = {
     "agent_path": CARTPOLE_TEST_POLICY_PATH,
     # FIXME(yawen): the policy we load was trained on 8 parallel environments
-    # and for some reason using it breaks if we use just 1 (like would be the
-    # default with the fast named_config)
+    # and for some reason using it breaks if we use just 1 (like would be the
+    # default with the fast named_config)
     "common": dict(num_vec=8),
 }
 
@@ -232,6 +232,40 @@ def test_train_dagger_main(tmpdir):
     assert isinstance(run.result, dict)
 
 
+def test_train_dagger_warmstart(tmpdir):
+    run = train_imitation.train_imitation_ex.run(
+        command_name="dagger",
+        named_configs=["cartpole"] + ALGO_FAST_CONFIGS["imitation"],
+        config_updates=dict(
+            common=dict(log_root=tmpdir),
+            demonstrations=dict(rollout_path=CARTPOLE_TEST_ROLLOUT_PATH),
+            dagger=dict(
+                expert_policy_type="ppo",
+                expert_policy_path=CARTPOLE_TEST_POLICY_PATH,
+            ),
+        ),
+    )
+    assert run.status == "COMPLETED"
+
+    log_dir = pathlib.Path(run.config["common"]["log_dir"])
+    policy_path = log_dir / "scratch" / "policy-latest.pt"
+    run_warmstart = train_imitation.train_imitation_ex.run(
+        command_name="dagger",
+        named_configs=["cartpole"] + ALGO_FAST_CONFIGS["imitation"],
+        config_updates=dict(
+            common=dict(log_root=tmpdir),
+            demonstrations=dict(rollout_path=CARTPOLE_TEST_ROLLOUT_PATH),
+            dagger=dict(
+                expert_policy_type="ppo",
+                expert_policy_path=CARTPOLE_TEST_POLICY_PATH,
+            ),
+            agent_path=policy_path,
+        ),
+    )
+    assert run_warmstart.status == "COMPLETED"
+    assert isinstance(run_warmstart.result, dict)
+
+
 def test_train_dagger_error_and_exceptions(tmpdir):
     with pytest.raises(Exception, match=".*expert_policy_path cannot be None.*"):
         train_imitation.train_imitation_ex.run(
@@ -261,6 +295,32 @@ def test_train_bc_main(tmpdir):
     assert isinstance(run.result, dict)
 
 
+def test_train_bc_warmstart(tmpdir):
+    run = train_imitation.train_imitation_ex.run(
+        command_name="bc",
+        named_configs=["cartpole"] + ALGO_FAST_CONFIGS["imitation"],
+        config_updates=dict(
+            common=dict(log_root=tmpdir),
+            demonstrations=dict(rollout_path=CARTPOLE_TEST_ROLLOUT_PATH),
+        ),
+    )
+    assert run.status == "COMPLETED"
+
+    policy_path = pathlib.Path(run.config["common"]["log_dir"]) / "final.th"
+    run_warmstart = train_imitation.train_imitation_ex.run(
+        command_name="bc",
+        named_configs=["cartpole"] + ALGO_FAST_CONFIGS["imitation"],
+        config_updates=dict(
+            common=dict(log_root=tmpdir),
+            demonstrations=dict(rollout_path=CARTPOLE_TEST_ROLLOUT_PATH),
+            agent_path=policy_path,
+        ),
+    )
+
+    assert run_warmstart.status == "COMPLETED"
+    assert isinstance(run_warmstart.result, dict)
+
+
 TRAIN_RL_PPO_CONFIGS = [{}, _rl_agent_loading_configs]
 
 
@@ -376,6 +436,35 @@ def test_train_adversarial(tmpdir, named_configs, command):
     _check_train_ex_result(run.result)
 
 
+@pytest.mark.parametrize("command", ("airl", "gail"))
+def test_train_adversarial_warmstart(tmpdir, command):
+    named_configs = ["cartpole"] + ALGO_FAST_CONFIGS["adversarial"]
+    config_updates = {
+        "common": dict(log_root=tmpdir),
+        "demonstrations": dict(rollout_path=CARTPOLE_TEST_ROLLOUT_PATH),
+    }
+    run = train_adversarial.train_adversarial_ex.run(
+        command_name=command,
+        named_configs=named_configs,
+        config_updates=config_updates,
+    )
+
+    log_dir = pathlib.Path(run.config["common"]["log_dir"])
+    policy_path = log_dir / "checkpoints" / "final" / "gen_policy"
+
+    run_warmstart = train_adversarial.train_adversarial_ex.run(
+        command_name=command,
+        named_configs=named_configs,
+        config_updates={
+            "agent_path": policy_path,
+            **config_updates,
+        },
+    )
+
+    assert run_warmstart.status == "COMPLETED"
+    _check_train_ex_result(run_warmstart.result)
+
+
 @pytest.mark.parametrize("command", ("airl", "gail"))
 def test_train_adversarial_sac(tmpdir, command):
     """Smoke test for imitation.scripts.train_adversarial."""
