@@ -148,8 +148,79 @@ def __init__(self, job_config: JobConfig):
        )

        def model_fn():
+            # WHC - allow auto_p to construct the model object under its own fake_mode.
+            # TODO: let us pass in meta model, and internally hook it up to the auto_p fake mode
            return model_cls.from_model_args(model_args).cuda()

+        def init_fn(model):
+            # WHC - horrible hack to make auto-parallel work. basically, create a bespoke init_fn for llama3 by copying
+            # code from the llama3 init_weights functions throughout the model components, and adjusting them to use
+            # the new FQN structures in autoparallel.
+            # TODO: make it possible to more easily reuse the existing 'init_weights' functions on the auto_p module
+            def param(name):
+                return model.get_parameter(f"params.{name}")
+
+            from torchtitan.models.llama3.model import precompute_freqs_cis
+
+            model.buffers_.get_buffer("freqs_cis").copy_(
+                precompute_freqs_cis(
+                    model_args.dim // model_args.n_heads,
+                    model_args.max_seq_len,
+                    model_args.rope_theta,
+                )
+            )
+
+            torch.nn.init.normal_(param("tok_embeddings/weight"))
+
+            def init_layer(i):
+                for norm in ("attention_norm", "ffn_norm"):
+                    torch.nn.init.ones_(param(f"layers/{i}/{norm}/weight"))
+
+                if model_args.depth_init:
+                    weight_init_std = 0.02 / (2 * (i + 1)) ** 0.5
+                else:
+                    weight_init_std = 0.02 / (2 * model_args.n_layers) ** 0.5
+
+                for linear in ("wq", "wk", "wv"):
+                    torch.nn.init.trunc_normal_(
+                        param(f"layers/{i}/attention/{linear}/weight"),
+                        mean=0.0,
+                        std=0.02,
+                    )
+                torch.nn.init.trunc_normal_(
+                    param(f"layers/{i}/attention/wo/weight"),
+                    mean=0.0,
+                    std=weight_init_std,
+                )
+
+                torch.nn.init.trunc_normal_(
+                    param(f"layers/{i}/feed_forward/w1/weight"), mean=0.0, std=0.02
+                )
+                for linear in ("w2", "w3"):
+                    torch.nn.init.trunc_normal_(
+                        param(f"layers/{i}/feed_forward/{linear}/weight"),
+                        mean=0.0,
+                        std=weight_init_std,
+                    )
+
+            for i in range(model_args.n_layers):
+                init_layer(i)
+
+            if param("norm/weight") is not None:
+                torch.nn.init.ones_(param("norm/weight"))
+
+            final_out_std = model_args.dim**-0.5
+            cutoff_factor = 3
+
+            if param("output/weight") is not None:
+                torch.nn.init.trunc_normal_(
+                    param("output/weight"),
+                    mean=0.0,
+                    std=final_out_std,
+                    a=-cutoff_factor * final_out_std,
+                    b=cutoff_factor * final_out_std,
+                )
+
        # with torch.device("meta"):
        #     model = model_fn()
        # Build the collection of model converters. No-op if `model.converters` empty
@@ -254,12 +325,12 @@ def model_fn():
        else:
            # apply PT-D Tensor Parallel, activation checkpointing, torch.compile, Data Parallel
            model = self.train_spec.parallelize_fn(
-                model_fn, world_mesh, parallel_dims, job_config
+                model_fn, init_fn, world_mesh, parallel_dims, job_config
            )

-        model.to_empty(device=init_device)
-        with torch.no_grad():
-            model.init_weights(buffer_device=buffer_device)
+        # model.to_empty(device=init_device)
+        # with torch.no_grad():
+        #     model.init_weights(buffer_device=buffer_device)
        model.train()

        self.model_parts = [model]
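
Note: the diff changes parallelize_fn to take both model_fn and init_fn and comments out the old to_empty/init_weights path. A minimal sketch of how a parallelize_fn with that new signature might consume the two callables is shown below; it is not part of this diff, and every name in it is an assumption for illustration, not the autoparallel API.

    import torch

    def parallelize_fn(model_fn, init_fn, world_mesh, parallel_dims, job_config):
        # Construct the model; autoparallel would do this under its own fake mode
        # and then materialize/shard parameters according to its chosen plan.
        model = model_fn()
        # ... autoparallel planning / sharding passes would run here ...
        # Initialize the real (sharded) parameters and buffers in place, using the
        # bespoke init_fn instead of the module's original init_weights().
        with torch.no_grad():
            init_fn(model)
        return model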