
Commit 5d4cc9a

unit test for flux_dataset dataloader checkpointing (#1346)
Adds a unit test for resuming the flux dataset dataloader from a checkpoint. The test creates a new dataloader from a checkpoint, then ensures that the labels and tokens generated next are the same in both dataloaders, starting from the checkpoint.
1 parent 05d3f7c commit 5d4cc9a
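
For context, the checkpoint/resume pattern this test exercises is: iterate a dataloader, capture its state_dict(), build a fresh dataloader, restore the state with load_state_dict(), and assert that both yield identical batches from that point on. Below is a minimal, self-contained sketch of that pattern; ToyStatefulLoader is a made-up stand-in used only for illustration, not the torchtitan flux dataloader (the real test uses build_flux_dataloader, and additionally re-seeds torch before each draw because batch construction there involves randomness).

# Minimal sketch of the checkpoint-and-resume check performed by the test.
# ToyStatefulLoader is a hypothetical stand-in, not the torchtitan flux dataloader.
import torch


class ToyStatefulLoader:
    """Infinite loader that yields deterministic batches and tracks its position."""

    def __init__(self, batch_size: int = 4):
        self.batch_size = batch_size
        self._step = 0

    def __iter__(self):
        while True:
            batch = torch.full((self.batch_size,), self._step)
            self._step += 1
            yield batch

    def state_dict(self):
        # Everything needed to resume from the current position.
        return {"step": self._step}

    def load_state_dict(self, state):
        self._step = state["step"]


dl = ToyStatefulLoader()
it = iter(dl)
for _ in range(10):
    next(it)  # advance the original loader by num_steps batches

state = dl.state_dict()  # "checkpoint" taken after 10 steps

dl_resumed = ToyStatefulLoader()
dl_resumed.load_state_dict(state)  # restore from the checkpoint
it_resumed = iter(dl_resumed)

# From here on, both loaders should yield identical batches, step for step.
for _ in range(10):
    assert torch.equal(next(it), next(it_resumed))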

File tree

1 file changed: +81 −79 lines changed

torchtitan/experiments/flux/tests/unit_tests/test_flux_dataloader.py

Lines changed: 81 additions & 79 deletions
@@ -4,96 +4,98 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+import torch
+
 from torchtitan.config_manager import ConfigManager
 from torchtitan.experiments.flux.dataset.flux_dataset import build_flux_dataloader
-from torchtitan.tools.profiling import (
-    maybe_enable_memory_snapshot,
-    maybe_enable_profiling,
-)
 
 
 class TestFluxDataLoader:
     def test_load_dataset(self):
-        for dataset_name in ["cc12m-test"]:
-            self._test_flux_dataloader(dataset_name)
+        # The test checks for the correct tensor shapes during the first num_steps.
+        # The next num_steps ensure that the dataloader loaded from the checkpoint generates tokens and labels correctly.
+        for world_size in [2, 4]:
+            for rank in range(world_size):
+                dataset_name = "cc12m-test"
+                batch_size = 4
+
+                num_steps = 10
+
+                path = "torchtitan.experiments.flux.job_config"
+                config_manager = ConfigManager()
+                config = config_manager.parse_args(
+                    [
+                        f"--experimental.custom_args_module={path}",
+                        "--training.img_size",
+                        str(256),
+                        "--training.dataset",
+                        dataset_name,
+                        "--training.local_batch_size",
+                        str(batch_size),
+                        "--training.seed",
+                        "0",
+                        "--training.classifer_free_guidance_prob",
+                        "0.447",
+                        "--encoder.t5_encoder",
+                        "google/t5-v1_1-xxl",
+                        "--encoder.clip_encoder",
+                        "openai/clip-vit-large-patch14",
+                        # "--encoder.max_t5_encoding_len",
+                        # "512",
+                    ]
+                )
 
-    def _test_flux_dataloader(self, dataset_name):
-        batch_size = 4
-        world_size = 4
-        rank = 0
+                dl = build_flux_dataloader(
+                    dp_world_size=world_size,
+                    dp_rank=rank,
+                    job_config=config,
+                    tokenizer=None,
+                    infinite=True,
+                )
 
-        num_steps = 10
+                it = iter(dl)
 
-        path = "torchtitan.experiments.flux.job_config"
-        config_manager = ConfigManager()
-        config = config_manager.parse_args(
-            [
-                f"--experimental.custom_args_module={path}",
-                # Profiling options
-                # "--profiling.enable_profiling",
-                # "--profiling.profile_freq",
-                # "5",
-                # "--profiling.enable_memory_snapshot",
-                # "--profiling.save_memory_snapshot_folder",
-                # "memory_snapshot_flux",
-                "--training.img_size",
-                str(256),
-                "--training.dataset",
-                dataset_name,
-                "--training.local_batch_size",
-                str(batch_size),
-                "--training.seed",
-                "0",
-                "--training.classifer_free_guidance_prob",
-                "0.447",
-                "--encoder.t5_encoder",
-                "google/t5-v1_1-small",
-                "--encoder.clip_encoder",
-                "openai/clip-vit-large-patch14",
-                "--encoder.max_t5_encoding_len",
-                "512",
-            ]
-        )
+                for i in range(0, num_steps):
+                    input_data, labels = next(it)
 
-        with maybe_enable_profiling(
-            config, global_step=0
-        ) as torch_profiler, maybe_enable_memory_snapshot(
-            config, global_step=0
-        ) as memory_profiler:
-            dl = self._build_dataloader(
-                config,
-                world_size,
-                rank,
-            )
-            dl = iter(dl)
+                    assert len(input_data) == 2  # (clip_encodings, t5_encodings)
+                    assert labels.shape == (batch_size, 3, 256, 256)
+                    assert input_data["clip_tokens"].shape == (
+                        batch_size,
+                        1,
+                        77,
+                    )
+                    assert input_data["t5_tokens"].shape == (
+                        batch_size,
+                        1,
+                        256,
+                    )
 
-            for i in range(0, num_steps):
-                input_data, labels = next(dl)
-                if torch_profiler:
-                    torch_profiler.step()
-                if memory_profiler:
-                    memory_profiler.step()
+                state = dl.state_dict()
 
-                assert len(input_data) == 2  # (clip_encodings, t5_encodings)
-                assert labels.shape == (batch_size, 3, 256, 256)
-                # assert input_data["clip_tokens"].shape[0] == batch_size
-                # assert input_data["t5_tokens"].shape == (batch_size, 512, 512)
+                # Create a new dataloader, restore the checkpoint, and check that the next data yielded is the same as above
+                dl_resumed = build_flux_dataloader(
+                    dp_world_size=world_size,
+                    dp_rank=rank,
+                    job_config=config,
+                    tokenizer=None,
+                    infinite=True,
+                )
+                dl_resumed.load_state_dict(state)
+                it_resumed = iter(dl_resumed)
 
-            if torch_profiler:
-                torch_profiler.step()
-            if memory_profiler:
-                memory_profiler.step(exit_ctx=True)
+                for i in range(num_steps):
+                    # Set the torch manual seed before each dataloader iteration to ensure consistent randomness
+                    # across dataloaders for testing purposes.
+                    torch.manual_seed(i)
+                    expected_input_ids, expected_labels = next(it)
+                    torch.manual_seed(i)
+                    input_ids, labels = next(it_resumed)
 
-    def _build_dataloader(
-        self,
-        job_config,
-        world_size,
-        rank,
-    ):
-        return build_flux_dataloader(
-            dp_world_size=world_size,
-            dp_rank=rank,
-            job_config=job_config,
-            tokenizer=None,
-            infinite=True,
-        )
+                    assert torch.equal(
+                        input_ids["clip_tokens"], expected_input_ids["clip_tokens"]
+                    )
+                    assert torch.equal(
+                        input_ids["t5_tokens"], expected_input_ids["t5_tokens"]
+                    )
+                    assert torch.equal(labels, expected_labels)
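
One detail worth calling out from the new test loop: because drawing a batch from the flux dataloader involves torch-level randomness (presumably including the classifier-free guidance dropout configured above), the comparison only holds if both dataloaders consume the same random state, so the test resets the manual seed immediately before each paired next() call. A stripped-down version of that pairing, assuming two iterators it_a and it_b that should produce the same tensor stream:

import torch


def assert_same_stream(it_a, it_b, num_steps):
    # Re-seed before each draw so any torch randomness inside the loaders
    # is identical for both, then compare the yielded batches.
    for i in range(num_steps):
        torch.manual_seed(i)
        batch_a = next(it_a)
        torch.manual_seed(i)
        batch_b = next(it_b)
        assert torch.equal(batch_a, batch_b)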
