[Flux] Reduce debug model size to speed up flux integration tests (#1295)

wwwjn · web-flow · commit bf835b59d689 · 2025-06-13T16:10:45.000-07:00
1. Reduce the frequency of saving a checkpoint
2. Reduce the hidden dimension size and num_heads to trim the size of
the debug model. Reducing the debug model size still makes sense: it has
~0.25B params now (previously 1B), and it converges and runs faster in
local testing.
3. Delete checkpoints before uploading CI results to save time
diff --git a/.github/workflows/integration_test_8gpu_flux.yaml b/.github/workflows/integration_test_8gpu_flux.yaml
@@ -31,6 +31,7 @@ jobs:
       docker-image: torchtitan-ubuntu-20.04-clang12
       repository: pytorch/torchtitan
       upload-artifact: outputs
+      # delete the checkpoints in the artifacts to save CI uploading time
       script: |
         set -eux
 
@@ -44,3 +45,4 @@ jobs:
 
         mkdir artifacts-to-be-uploaded
         python -m torchtitan.experiments.flux.tests.integration_tests artifacts-to-be-uploaded --ngpu 8
+        rm -rf artifacts-to-be-uploaded/*/checkpoint
diff --git a/torchtitan/experiments/flux/__init__.py b/torchtitan/experiments/flux/__init__.py
@@ -81,9 +81,9 @@
         out_channels=64,
         vec_in_dim=768,
         context_in_dim=4096,
-        hidden_size=3072,
+        hidden_size=1536,
         mlp_ratio=4.0,
-        num_heads=24,
+        num_heads=12,
         depth=2,
         depth_single_blocks=2,
         axes_dim=(16, 56, 56),
diff --git a/torchtitan/experiments/flux/train_configs/debug_model.toml b/torchtitan/experiments/flux/train_configs/debug_model.toml
@@ -67,7 +67,7 @@ mode = "full"
 [checkpoint]
 enable_checkpoint = false
 folder = "checkpoint"
-interval = 5
+interval = 10
 last_save_model_weights_only = false
 export_dtype = "float32"
 async_mode = "disabled"  # ["disabled", "async", "async_with_pinned_mem"]