Commit 8518306

Migrating flux checkpointing to hf api (#1377)
Migrates the checkpointing of the flux dataset to use Hugging Face's faster state_dict / load_state_dict API for IterableDatasets.
1 parent f0ce21b commit 8518306

8 files changed: +75, -44 lines

torchtitan/experiments/flux/dataset/flux_dataset.py

Lines changed: 22 additions & 12 deletions
@@ -194,13 +194,13 @@ def __init__(
         self._all_samples: list[dict[str, Any]] = []

     def _get_data_iter(self):
-        if isinstance(self._data, Dataset) and self._sample_idx == len(self._data):
-            return iter([])
+        if isinstance(self._data, Dataset):
+            if self._sample_idx == len(self._data):
+                return iter([])
+            else:
+                return iter(self._data.skip(self._sample_idx))

-        it = iter(self._data)
-        for _ in range(self._sample_idx):
-            next(it)
-        return it
+        return iter(self._data)

     def __iter__(self):
         dataset_iterator = self._get_data_iter()
@@ -223,8 +223,13 @@ def __iter__(self):
             else:
                 # Reset offset for the next iteration if infinite
                 self._sample_idx = 0
-                logger.info(f"Dataset {self.dataset_name} is being re-looped.")
+                logger.warning(f"Dataset {self.dataset_name} is being re-looped.")
                 dataset_iterator = self._get_data_iter()
+                if not isinstance(self._data, Dataset):
+                    if hasattr(self._data, "set_epoch") and hasattr(
+                        self._data, "epoch"
+                    ):
+                        self._data.set_epoch(self._data.epoch + 1)
                 continue

             # Use the dataset-specific preprocessor
@@ -244,7 +249,7 @@ def __iter__(self):

             # Classifier-free guidance: Replace some of the strings with empty strings.
             # Distinct random seed is initialized at the beginning of training for each FSDP rank.
-            dropout_prob = self.job_config.training.classifer_free_guidance_prob
+            dropout_prob = self.job_config.training.classifier_free_guidance_prob
             if dropout_prob > 0.0:
                 if torch.rand(1).item() < dropout_prob:
                     sample_dict["t5_tokens"] = self._t5_empty_token
@@ -258,12 +263,17 @@
             yield sample_dict, labels

     def load_state_dict(self, state_dict):
-        self._sample_idx = state_dict["sample_idx"]
+        if isinstance(self._data, Dataset):
+            self._sample_idx = state_dict["sample_idx"]
+        else:
+            assert "data" in state_dict
+            self._data.load_state_dict(state_dict["data"])

     def state_dict(self):
-        return {
-            "sample_idx": self._sample_idx,
-        }
+        if isinstance(self._data, Dataset):
+            return {"sample_idx": self._sample_idx}
+        else:
+            return {"data": self._data.state_dict()}


 def build_flux_dataloader(
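For reference, the state_dict() / load_state_dict() calls that the iterable branch now delegates to come from the Hugging Face datasets library (available on IterableDataset since roughly datasets v2.18); this is the "faster api" the commit message refers to, since resuming records a shard/offset instead of replaying samples one by one. A minimal sketch with a toy in-memory dataset (not the flux cc12m data):

```python
# Minimal sketch of the Hugging Face `datasets` checkpointing API the diff
# switches to.  Assumes datasets>=2.18; the toy data is illustrative only.
from datasets import Dataset

ds = Dataset.from_dict({"x": list(range(8))}).to_iterable_dataset(num_shards=4)

state = None
for idx, example in enumerate(ds):
    if idx == 2:
        state = ds.state_dict()  # cheap: records shard index + offset, no sample replay
        break

# Later (e.g. after a job restart) the same state can be loaded back and
# iteration continues from the recorded position, which is exactly what
# FluxDataset.state_dict() / load_state_dict() now wrap for iterable data.
ds.load_state_dict(state)
print([example["x"] for example in ds])  # remaining samples after the checkpoint
```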

torchtitan/experiments/flux/job_config.py

Lines changed: 2 additions & 2 deletions
@@ -9,7 +9,7 @@

 @dataclass
 class Training:
-    classifer_free_guidance_prob: float = 0.0
+    classifier_free_guidance_prob: float = 0.0
     """Classifier-free guidance with probability `p` to dropout each text encoding independently.
     If `n` text encoders are used, the unconditional model is trained in `p ^ n` of all steps.
     For example, if `n = 2` and `p = 0.447`, the unconditional model is trained in 20% of all steps"""
@@ -37,7 +37,7 @@ class Encoder:

 @dataclass
 class Eval:
-    enable_classifer_free_guidance: bool = False
+    enable_classifier_free_guidance: bool = False
     """Whether to use classifier-free guidance during sampling"""
     classifier_free_guidance_scale: float = 5.0
     """Classifier-free guidance scale when sampling"""

torchtitan/experiments/flux/sampling.py

Lines changed: 8 additions & 8 deletions
@@ -93,7 +93,7 @@ def generate_image(
     img_height = 16 * (job_config.training.img_size // 16)
     img_width = 16 * (job_config.training.img_size // 16)

-    enable_classifer_free_guidance = job_config.eval.enable_classifer_free_guidance
+    enable_classifier_free_guidance = job_config.eval.enable_classifier_free_guidance

     # Tokenize the prompt. Unsqueeze to add a batch dimension.
     clip_tokens = clip_tokenizer.encode(prompt).unsqueeze(0)
@@ -111,7 +111,7 @@ def generate_image(
         },
     )

-    if enable_classifer_free_guidance:
+    if enable_classifier_free_guidance:
         empty_clip_tokens = clip_tokenizer.encode("").unsqueeze(0)
         empty_t5_tokens = t5_tokenizer.encode("").unsqueeze(0)
         empty_batch = preprocess_data(
@@ -135,12 +135,12 @@ def generate_image(
         denoising_steps=job_config.eval.denoising_steps,
         clip_encodings=batch["clip_encodings"],
         t5_encodings=batch["t5_encodings"],
-        enable_classifer_free_guidance=enable_classifer_free_guidance,
+        enable_classifier_free_guidance=enable_classifier_free_guidance,
         empty_t5_encodings=(
-            empty_batch["t5_encodings"] if enable_classifer_free_guidance else None
+            empty_batch["t5_encodings"] if enable_classifier_free_guidance else None
         ),
         empty_clip_encodings=(
-            empty_batch["clip_encodings"] if enable_classifer_free_guidance else None
+            empty_batch["clip_encodings"] if enable_classifier_free_guidance else None
         ),
         classifier_free_guidance_scale=job_config.eval.classifier_free_guidance_scale,
     )
@@ -158,7 +158,7 @@ def denoise(
     denoising_steps: int,
     clip_encodings: torch.Tensor,
     t5_encodings: torch.Tensor,
-    enable_classifer_free_guidance: bool = False,
+    enable_classifier_free_guidance: bool = False,
     empty_t5_encodings: torch.Tensor | None = None,
     empty_clip_encodings: torch.Tensor | None = None,
     classifier_free_guidance_scale: float | None = None,
@@ -181,7 +181,7 @@ def denoise(
    ).to(latents)
    text_pos_enc = torch.zeros(bsz, t5_encodings.shape[1], POSITION_DIM).to(latents)

-    if enable_classifer_free_guidance:
+    if enable_classifier_free_guidance:
        latents = torch.cat([latents, latents], dim=0)
        t5_encodings = torch.cat([empty_t5_encodings, t5_encodings], dim=0)
        clip_encodings = torch.cat([empty_clip_encodings, clip_encodings], dim=0)
@@ -200,7 +200,7 @@ def denoise(
            y=clip_encodings,
            timesteps=t_vec,
        )
-        if enable_classifer_free_guidance:
+        if enable_classifier_free_guidance:
            pred_u, pred_c = pred.chunk(2)
            pred = pred_u + classifier_free_guidance_scale * (pred_c - pred_u)

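Beyond the rename, this is the classifier-free guidance path: unconditional and conditional inputs are batched together and the two predictions are then blended. A standalone sketch of that blending step, with made-up tensor shapes and no real flux model:

```python
# Standalone sketch of the guidance blend in `denoise` above; the shapes
# are made up and `cfg_blend` is a hypothetical helper, not torchtitan API.
import torch

def cfg_blend(pred: torch.Tensor, scale: float) -> torch.Tensor:
    # `pred` stacks the unconditional and conditional halves on dim 0,
    # mirroring the torch.cat([latents, latents], dim=0) trick in the diff.
    pred_u, pred_c = pred.chunk(2)
    return pred_u + scale * (pred_c - pred_u)

batched_pred = torch.randn(2 * 4, 64, 64)    # 2 * batch, made-up latent shape
guided = cfg_blend(batched_pred, scale=5.0)  # matches classifier_free_guidance_scale
print(guided.shape)                          # torch.Size([4, 64, 64])
```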

torchtitan/experiments/flux/tests/test_generate_image.py

Lines changed: 2 additions & 2 deletions
@@ -46,7 +46,7 @@ def test_generate_image(self):
                "./outputs",
                "--training.seed",
                "0",
-                "--training.classifer_free_guidance_prob",
+                "--training.classifier_free_guidance_prob",
                "0.447",
                "--encoder.t5_encoder",
                "google/t5-v1_1-base",
@@ -59,7 +59,7 @@ def test_generate_image(self):
                # eval params
                "--eval.denoising_steps",
                str(num_steps),
-                "--eval.enable_classifer_free_guidance",
+                "--eval.enable_classifier_free_guidance",
                "--eval.classifier_free_guidance_scale",
                str(classifier_free_guidance_scale),
                "--eval.save_img_folder",

torchtitan/experiments/flux/tests/unit_tests/test_flux_dataloader.py

Lines changed: 32 additions & 11 deletions
@@ -4,22 +4,47 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

+import unittest
+
 import torch

+from datasets import load_dataset
+
 from torchtitan.config_manager import ConfigManager
-from torchtitan.experiments.flux.dataset.flux_dataset import build_flux_dataloader
+from torchtitan.experiments.flux.dataset.flux_dataset import (
+    _cc12m_wds_data_processor,
+    build_flux_dataloader,
+    DATASETS,
+    TextToImageDatasetConfig,
+)
+

+class TestFluxDataLoader(unittest.TestCase):
+    def setUp(self):
+        DATASETS["cc12m-test-iterable"] = TextToImageDatasetConfig(
+            path="torchtitan/experiments/flux/tests/assets/cc12m_test",
+            loader=lambda path: load_dataset(
+                path, split="train", data_files={"train": "*tar"}
+            ).to_iterable_dataset(num_shards=4),
+            data_processor=_cc12m_wds_data_processor,
+        )
+
+    def tearDown(self):
+        del DATASETS["cc12m-test-iterable"]

-class TestFluxDataLoader:
     def test_load_dataset(self):
         # The test checks for the correct tensor shapes during the first num_steps
         # The next num_steps ensure the loaded from checkpoint dataloader generates tokens and labels correctly
-        for world_size in [2, 4]:
+        for world_size in [2]:
             for rank in range(world_size):
-                dataset_name = "cc12m-test"
-                batch_size = 4
+                dataset_name = "cc12m-test-iterable"
+                batch_size = 1
+
+                num_steps = 15

-                num_steps = 10
+                # TODO: if num_steps * batch_size * world_size is larger than the number of samples
+                # in the dataset, then the test will fail, due to huggingface's
+                # non-resumption when checkpointing after the first epoch

                 path = "torchtitan.experiments.flux.job_config"
                 config_manager = ConfigManager()
@@ -32,16 +57,12 @@ def test_load_dataset(self):
                        dataset_name,
                        "--training.local_batch_size",
                        str(batch_size),
-                        "--training.seed",
-                        "0",
-                        "--training.classifer_free_guidance_prob",
+                        "--training.classifier_free_guidance_prob",
                        "0.447",
                        "--encoder.t5_encoder",
                        "google/t5-v1_1-xxl",
                        "--encoder.clip_encoder",
                        "openai/clip-vit-large-patch14",
-                        # "--encoder.max_t5_encoding_len",
-                        # "512",
                    ]
                )
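The new setUp registers an iterable variant of the cc12m test set via to_iterable_dataset(num_shards=4); the shard count matters because an iterable dataset is distributed across data-parallel ranks in whole shards, so it needs at least world_size of them. A small sketch of that sharding behaviour using Hugging Face's split_dataset_by_node helper (the diff does not show which helper the flux dataloader itself uses, so treat this as illustrative):

```python
# Illustrative only: shows why num_shards must be >= world_size when an
# IterableDataset is split across ranks.  Toy data, not the cc12m test set.
from datasets import Dataset
from datasets.distributed import split_dataset_by_node

ds = Dataset.from_dict({"x": list(range(8))}).to_iterable_dataset(num_shards=4)

world_size = 2
for rank in range(world_size):
    shard = split_dataset_by_node(ds, rank=rank, world_size=world_size)
    # Each rank iterates 2 of the 4 shards, i.e. a disjoint slice of the data.
    print(rank, [ex["x"] for ex in shard])
```

The TODO in the test reflects a related constraint: checkpoints taken after the dataset re-loops into a second epoch do not resume cleanly, so num_steps * batch_size * world_size is kept below the dataset size.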

torchtitan/experiments/flux/train_configs/debug_model.toml

Lines changed: 3 additions & 3 deletions
@@ -38,7 +38,7 @@ max_norm = 2.0 # grad norm clipping
 steps = 10
 compile = false
 dataset = "cc12m-test"
-classifer_free_guidance_prob = 0.447
+classifier_free_guidance_prob = 0.447
 img_size = 256

 [encoder]
@@ -48,8 +48,8 @@ max_t5_encoding_len = 256
 autoencoder_path = "torchtitan/experiments/flux/assets/autoencoder/ae.safetensors" # Autoencoder to use for image

 [eval]
-enable_classifer_free_guidance = true
-classifer_free_guidance_scale = 5.0
+enable_classifier_free_guidance = true
+classifier_free_guidance_scale = 5.0
 denoising_steps = 4
 save_img_folder = "img"
 eval_freq = 5

torchtitan/experiments/flux/train_configs/flux_dev_model.toml

Lines changed: 3 additions & 3 deletions
@@ -37,7 +37,7 @@ max_norm = 1.0 # grad norm clipping
 steps = 30_000
 compile = false
 dataset = "cc12m-wds"
-classifer_free_guidance_prob = 0.447
+classifier_free_guidance_prob = 0.447
 img_size = 256

 [encoder]
@@ -47,8 +47,8 @@ max_t5_encoding_len = 512
 autoencoder_path = "torchtitan/experiments/flux/assets/autoencoder/ae.safetensors" # Autoencoder to use for image

 [eval]
-enable_classifer_free_guidance = true
-classifer_free_guidance_scale = 5.0
+enable_classifier_free_guidance = true
+classifier_free_guidance_scale = 5.0
 denoising_steps = 50
 save_img_folder = "img"
 eval_freq = 1000

torchtitan/experiments/flux/train_configs/flux_schnell_model.toml

Lines changed: 3 additions & 3 deletions
@@ -37,7 +37,7 @@ max_norm = 1.0 # grad norm clipping
 steps = 30_000
 compile = false
 dataset = "cc12m-wds"
-classifer_free_guidance_prob = 0.447
+classifier_free_guidance_prob = 0.447
 img_size = 256

 [encoder]
@@ -47,8 +47,8 @@ max_t5_encoding_len = 256
 autoencoder_path = "torchtitan/experiments/flux/assets/autoencoder/ae.safetensors" # Autoencoder to use for image

 [eval]
-enable_classifer_free_guidance = true
-classifer_free_guidance_scale = 5.0
+enable_classifier_free_guidance = true
+classifier_free_guidance_scale = 5.0
 denoising_steps = 50
 save_img_folder = "img"
 eval_freq = 1000
