4 changes: 2 additions & 2 deletions tests/recipes/test_ppo_full_finetune_single_device.py
@@ -358,7 +358,7 @@ def test_training_state_on_resume_with_optimizer_in_bwd(self, tmpdir, monkeypatc
--config mistral/7B_full_ppo_low_memory \
output_dir={tmpdir} \
checkpointer._component_=torchtune.training.FullModelHFCheckpointer \
checkpointer.checkpoint_dir='{policy_tmpdir}' \
checkpointer.checkpoint_dir='{ckpt_dir}' \
checkpointer.checkpoint_files=[{os.path.join(epoch_folder_minus_one, model_ckpt_fname)}]\
checkpointer.recipe_checkpoint={os.path.join(RECIPE_STATE_DIRNAME, "recipe_state.pt")}\
checkpointer.output_dir={policy_tmpdir} \
@@ -367,7 +367,7 @@ def test_training_state_on_resume_with_optimizer_in_bwd(self, tmpdir, monkeypatc
ref_policy_checkpointer.checkpoint_dir='{ckpt_dir}' \
ref_policy_checkpointer.checkpoint_files=[{policy_ckpt_path}]\

value_checkpointer.checkpoint_dir='{value_tmpdir}' \
value_checkpointer.checkpoint_dir='{ckpt_dir}' \
value_checkpointer.checkpoint_files=[{os.path.join(value_tmpdir, epoch_folder_minus_one, model_ckpt_fname)}]\
value_checkpointer.output_dir={value_tmpdir} \

71 changes: 47 additions & 24 deletions tests/torchtune/training/checkpointing/test_checkpointer.py
@@ -152,8 +152,11 @@ def llama2_hf_checkpoints(self, tmp_path, state_dict_1, state_dict_2):
* embed_dim: 64
* max_seq_len: 128
"""
checkpoint_file_1 = tmp_path / "llama2_hf_checkpoint_01.pt"
checkpoint_file_2 = tmp_path / "llama2_hf_checkpoint_02.pt"
checkpoint_dir = Path.joinpath(tmp_path, "checkpoint_dir")
checkpoint_dir.mkdir(parents=True, exist_ok=True)

checkpoint_file_1 = checkpoint_dir / "llama2_hf_checkpoint_01.pt"
checkpoint_file_2 = checkpoint_dir / "llama2_hf_checkpoint_02.pt"

torch.save(state_dict_1, checkpoint_file_1)
torch.save(state_dict_2, checkpoint_file_2)
@@ -163,7 +166,7 @@ def llama2_hf_checkpoints(self, tmp_path, state_dict_1, state_dict_2):
"num_attention_heads": 4,
"num_key_value_heads": 4,
}
config_file = Path.joinpath(tmp_path, "config.json")
config_file = Path.joinpath(checkpoint_dir, "config.json")
with config_file.open("w") as f:
json.dump(config, f)

@@ -174,23 +177,27 @@ def single_file_checkpointer(
self, llama2_hf_checkpoints, tmp_path
) -> FullModelHFCheckpointer:
checkpoint_file, _ = llama2_hf_checkpoints
checkpoint_dir = str(Path.joinpath(tmp_path, "checkpoint_dir"))
output_dir = str(Path.joinpath(tmp_path, "output_dir"))
return FullModelHFCheckpointer(
checkpoint_dir=tmp_path,
checkpoint_dir=checkpoint_dir,
checkpoint_files=[checkpoint_file],
model_type="LLAMA2",
output_dir=tmp_path,
output_dir=output_dir,
)

@pytest.fixture
def multi_file_checkpointer(
self, llama2_hf_checkpoints, tmp_path
) -> FullModelHFCheckpointer:
checkpoint_file_1, checkpoint_file_2 = llama2_hf_checkpoints
checkpoint_dir = str(Path.joinpath(tmp_path, "checkpoint_dir"))
output_dir = str(Path.joinpath(tmp_path, "output_dir"))
return FullModelHFCheckpointer(
checkpoint_dir=tmp_path,
checkpoint_dir=checkpoint_dir,
checkpoint_files=[checkpoint_file_1, checkpoint_file_2],
model_type="LLAMA2",
output_dir=tmp_path,
output_dir=output_dir,
)

def test_load_save_checkpoint_single_file(
@@ -242,7 +249,7 @@ def test_load_save_checkpoint_single_file(
# assumes we know what the name of the file is. This is fine, breaking this logic
# should be something we capture through this test
output_file = Path.joinpath(
checkpoint_file.parent,
checkpoint_file.parent.parent / "output_dir",
"epoch_1",
SHARD_FNAME.format(cpt_idx="1".zfill(5), num_shards="1".zfill(5)),
).with_suffix(".safetensors")
@@ -306,12 +313,12 @@ def test_save_load_checkpoint_multiple_file(
# assumes we know what the name of the file is. This is fine, breaking this logic
# should be something we capture through this test
output_file_1 = Path.joinpath(
checkpoint_file_1.parent,
checkpoint_file_1.parent.parent / "output_dir",
"epoch_1",
SHARD_FNAME.format(cpt_idx="1".zfill(5), num_shards="2".zfill(5)),
).with_suffix(".safetensors")
output_file_2 = Path.joinpath(
checkpoint_file_2.parent,
checkpoint_file_2.parent.parent / "output_dir",
"epoch_1",
SHARD_FNAME.format(cpt_idx="2".zfill(5), num_shards="2".zfill(5)),
).with_suffix(".safetensors")
@@ -338,12 +345,14 @@ def test_load_save_adapter_only(
single_file_checkpointer.save_checkpoint(state_dict, epoch=2, adapter_only=True)

output_file_1 = Path.joinpath(
tmp_path,
tmp_path / "output_dir",
"epoch_2",
SHARD_FNAME.format(cpt_idx="1".zfill(5), num_shards="1".zfill(5)),
)
output_file_2 = Path.joinpath(
tmp_path, "epoch_2", f"{ADAPTER_MODEL_FNAME}.safetensors"
tmp_path / "output_dir",
"epoch_2",
f"{ADAPTER_MODEL_FNAME}.safetensors",
)

with pytest.raises(ValueError, match="Unable to load checkpoint from"):
@@ -437,12 +446,16 @@ def test_save_checkpoint_in_peft_format(

# Load saved adapter weights and config from file for comparison
adapter_weights_file = Path.joinpath(
checkpoint_file.parent, "epoch_1", f"{ADAPTER_MODEL_FNAME}.safetensors"
checkpoint_file.parent.parent / "output_dir",
"epoch_1",
f"{ADAPTER_MODEL_FNAME}.safetensors",
)
actual_adapter_state_dict = safe_torch_load(adapter_weights_file)

adapter_config_file = Path.joinpath(
checkpoint_file.parent, "epoch_1", f"{ADAPTER_CONFIG_FNAME}.json"
checkpoint_file.parent.parent / "output_dir",
"epoch_1",
f"{ADAPTER_CONFIG_FNAME}.json",
)
with open(adapter_config_file, "r") as f:
adapter_config = json.load(f)
@@ -558,7 +571,10 @@ def mistral_reward_model_hf_checkpoint(self, tmp_path, state_dict):
* intermediate_dim: 256

"""
checkpoint_file = tmp_path / "mistral_reward_model_hf_checkpoint.pt"
checkpoint_dir = Path.joinpath(tmp_path, "checkpoint_dir")
checkpoint_dir.mkdir(parents=True, exist_ok=True)

checkpoint_file = checkpoint_dir / "mistral_reward_model_hf_checkpoint.pt"

torch.save(state_dict, checkpoint_file)

@@ -568,7 +584,7 @@ def mistral_reward_model_hf_checkpoint(self, tmp_path, state_dict):
"num_key_value_heads": 4,
"num_classes": 1,
}
config_file = Path.joinpath(tmp_path, "config.json")
config_file = Path.joinpath(checkpoint_dir, "config.json")
with config_file.open("w") as f:
json.dump(config, f)

@@ -579,11 +595,13 @@ def single_file_checkpointer(
self, mistral_reward_model_hf_checkpoint, tmp_path
) -> FullModelHFCheckpointer:
checkpoint_file = mistral_reward_model_hf_checkpoint
checkpoint_dir = str(Path.joinpath(tmp_path, "checkpoint_dir"))
output_dir = str(Path.joinpath(tmp_path, "output_dir"))
return FullModelHFCheckpointer(
checkpoint_dir=tmp_path,
checkpoint_dir=checkpoint_dir,
checkpoint_files=[checkpoint_file],
model_type="REWARD",
output_dir=tmp_path,
output_dir=output_dir,
)

def test_load_save_checkpoint_single_file(
@@ -636,7 +654,7 @@ def test_load_save_checkpoint_single_file(
# assumes we know what the name of the file is. This is fine, breaking this logic
# should be something we capture through this test
output_file = Path.joinpath(
checkpoint_file.parent,
checkpoint_file.parent.parent / "output_dir",
"epoch_1",
SHARD_FNAME.format(cpt_idx="1".zfill(5), num_shards="1".zfill(5)),
).with_suffix(".safetensors")
@@ -708,7 +726,10 @@ def gemma_hf_checkpoint(self, tmp_path, state_dict):
* head_dim : 16

"""
checkpoint_file = tmp_path / "gemma_hf_checkpoint.pt"
checkpoint_dir = Path.joinpath(tmp_path, "checkpoint_dir")
checkpoint_dir.mkdir(parents=True, exist_ok=True)

checkpoint_file = checkpoint_dir / "gemma_hf_checkpoint.pt"

torch.save(state_dict, checkpoint_file)

@@ -719,7 +740,7 @@ def gemma_hf_checkpoint(self, tmp_path, state_dict):
"head_dim": _HEAD_DIM,
"intermediate_size": _HIDDEN_DIM,
}
config_file = Path.joinpath(tmp_path, "config.json")
config_file = Path.joinpath(checkpoint_dir, "config.json")
with config_file.open("w") as f:
json.dump(config, f)

@@ -730,11 +751,13 @@ def single_file_checkpointer(
self, gemma_hf_checkpoint, tmp_path
) -> FullModelHFCheckpointer:
checkpoint_file = gemma_hf_checkpoint
checkpoint_dir = str(Path.joinpath(tmp_path, "checkpoint_dir"))
output_dir = str(Path.joinpath(tmp_path, "output_dir"))
return FullModelHFCheckpointer(
checkpoint_dir=tmp_path,
checkpoint_dir=checkpoint_dir,
checkpoint_files=[checkpoint_file],
model_type="GEMMA",
output_dir=tmp_path,
output_dir=output_dir,
)

def test_load_save_checkpoint_single_file(
Expand Down Expand Up @@ -788,7 +811,7 @@ def test_load_save_checkpoint_single_file(
# assumes we know what the name of the file is. This is fine, breaking this logic
# should be something we capture through this test
output_file = Path.joinpath(
checkpoint_file.parent,
checkpoint_file.parent.parent / "output_dir",
"epoch_1",
SHARD_FNAME.format(cpt_idx="1".zfill(5), num_shards="1".zfill(5)),
).with_suffix(".safetensors")
45 changes: 45 additions & 0 deletions tests/torchtune/training/checkpointing/test_checkpointer_utils.py
@@ -11,6 +11,7 @@
import torch
from torchtune.models.llama2 import llama2, llama2_classifier
from torchtune.training.checkpointing._utils import (
check_outdir_not_in_ckptdir,
FormattedCheckpointFiles,
safe_torch_load,
update_state_dict_for_classifier,
@@ -226,3 +227,47 @@ def test_build_checkpoint_filenames(self, expected_filenames):
formatted_files = FormattedCheckpointFiles.from_dict(formatted_file_dict)
actual_filenames = formatted_files.build_checkpoint_filenames()
assert actual_filenames == expected_filenames


class TestCheckOutdirNotInCkptdir:
def test_sibling_directories(self):
# Sibling directories should pass without raising an error
ckpt_dir = Path("/path/to/ckpt")
out_dir = Path("/path/to/output")
check_outdir_not_in_ckptdir(ckpt_dir, out_dir)

def test_ckpt_dir_in_output_dir(self):
# out_dir is a parent of ckpt_dir, should pass without raising an error
ckpt_dir = Path("/path/to/output/ckpt_dir")
out_dir = Path("/path/to/output")
check_outdir_not_in_ckptdir(ckpt_dir, out_dir)

def test_equal_directories(self):
# Equal directories should raise a ValueError
ckpt_dir = Path("/path/to/ckpt")
out_dir = Path("/path/to/ckpt")
with pytest.raises(
ValueError,
match="The output directory cannot be the same as or a subdirectory of the checkpoint directory.",
):
check_outdir_not_in_ckptdir(ckpt_dir, out_dir)

def test_output_dir_in_ckpt_dir(self):
# out_dir is a subdirectory of ckpt_dir, should raise a ValueError
ckpt_dir = Path("/path/to/ckpt")
out_dir = Path("/path/to/ckpt/subdir")
with pytest.raises(
ValueError,
match="The output directory cannot be the same as or a subdirectory of the checkpoint directory.",
):
check_outdir_not_in_ckptdir(ckpt_dir, out_dir)

def test_output_dir_ckpt_dir_few_levels_down(self):
# out_dir is a few levels down in ckpt_dir, should raise a ValueError
ckpt_dir = Path("/path/to/ckpt")
out_dir = Path("/path/to/ckpt/subdir/another_subdir")
with pytest.raises(
ValueError,
match="The output directory cannot be the same as or a subdirectory of the checkpoint directory.",
):
check_outdir_not_in_ckptdir(ckpt_dir, out_dir)
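
As a quick sanity check, the new tests can be run in isolation. A minimal sketch using pytest's Python entry point; the file path and class name come from this diff, while the -k/-q flags are just one convenient invocation:

import pytest

# Run only the TestCheckOutdirNotInCkptdir tests added in this diff.
pytest.main(
    [
        "tests/torchtune/training/checkpointing/test_checkpointer_utils.py",
        "-k", "TestCheckOutdirNotInCkptdir",
        "-q",
    ]
)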
14 changes: 12 additions & 2 deletions torchtune/training/checkpointing/_checkpointer.py
@@ -30,6 +30,7 @@
from torchtune.training.checkpointing._utils import (
ADAPTER_CONFIG_FNAME,
ADAPTER_MODEL_FNAME,
check_outdir_not_in_ckptdir,
copy_files,
get_adapter_checkpoint_path,
get_model_checkpoint_path,
@@ -162,7 +163,7 @@ def __init__(
# TODO: support loading more than one file
if len(checkpoint_files) != 1:
raise ValueError(
"Currently we only support reading from a single torchtune checkpoint file. "
"Currently we only support reading from a single checkpoint file. "
f"Got {len(checkpoint_files)} files instead."
)

@@ -177,6 +178,9 @@

self._model_type = ModelType[model_type]
self._output_dir = Path(output_dir)
check_outdir_not_in_ckptdir(
ckpt_dir=self._checkpoint_dir, out_dir=self._output_dir
)
self._output_dir.mkdir(parents=True, exist_ok=True)

# resume from adapter_model ckpt
@@ -422,6 +426,9 @@ def __init__(
self._checkpoint_dir = Path(checkpoint_dir)
self._model_type = ModelType[model_type]
self._output_dir = Path(output_dir)
check_outdir_not_in_ckptdir(
ckpt_dir=self._checkpoint_dir, out_dir=self._output_dir
)
self._output_dir.mkdir(parents=True, exist_ok=True)

# weight_map contains the state_dict key -> checkpoint file mapping so we can correctly
@@ -950,7 +957,7 @@ def __init__(
# TODO: support loading more than one file
if len(checkpoint_files) != 1:
raise ValueError(
"Currently we only support reading from a single torchtune checkpoint file. "
"Currently we only support reading from a single checkpoint file. "
f"Got {len(checkpoint_files)} files instead."
)

@@ -963,6 +970,9 @@
)
self._model_type = ModelType[model_type]
self._output_dir = Path(output_dir)
check_outdir_not_in_ckptdir(
ckpt_dir=self._checkpoint_dir, out_dir=self._output_dir
)
self._output_dir.mkdir(parents=True, exist_ok=True)

# resume from adapter_model ckpt
20 changes: 20 additions & 0 deletions torchtune/training/checkpointing/_utils.py
@@ -572,3 +572,23 @@ def validate_checkpoint_files(
)

return checkpoint_paths


def check_outdir_not_in_ckptdir(ckpt_dir: Path, out_dir: Path) -> bool:
"""
Checks that the output directory is not equal to or a subdirectory of the checkpoint directory.
This is necessary to avoid making copies of copies when getting config files from ckpt_dir.
"""

# Resolve the absolute paths to avoid issues with relative paths
_ckpt_dir = ckpt_dir.resolve()
_out_dir = out_dir.resolve()

# Check if out_dir is the same as ckpt_dir or a subdirectory of it
if _out_dir == _ckpt_dir or _ckpt_dir in _out_dir.parents:
raise ValueError(
"The output directory cannot be the same as or a subdirectory of the checkpoint directory. "
f"Found {ckpt_dir=} and {out_dir=}."
)

return True
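
For illustration, a minimal sketch of how the new guard behaves when called directly. The import path mirrors the test module above; the example directories are made up and only show the passing (sibling) and failing (nested) cases:

from pathlib import Path

from torchtune.training.checkpointing._utils import check_outdir_not_in_ckptdir

# Sibling directories: the check passes and returns True.
check_outdir_not_in_ckptdir(ckpt_dir=Path("/tmp/run/ckpt"), out_dir=Path("/tmp/run/output"))

# Output directory nested inside the checkpoint directory: raises ValueError,
# so config files copied from ckpt_dir are never re-copied from a previous run's output.
try:
    check_outdir_not_in_ckptdir(ckpt_dir=Path("/tmp/run/ckpt"), out_dir=Path("/tmp/run/ckpt/output"))
except ValueError as err:
    print(err)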