
Commit 8739c23

Drop hacky llama3_init_fn and use autop init_weights feature
Relying on pytorch-labs/autoparallel#20, this lets us automatically apply a user's init_weights fn to the autoparallel model.

Verified this works with `CONFIG_FILE="./torchtitan/models/llama3/train_configs/debug_model.toml" ./run_train.sh --model.name llama3_auto_parallel --parallelism.tensor_parallel_degree 4 --training.dataset c4`:

```
[rank0]:[titan] 2025-07-02 16:18:02,007 - root - INFO - Training starts at step 1.
[rank0]:[titan] 2025-07-02 16:18:08,224 - root - INFO - step: 1 loss: 8.1848 memory: 1.09GiB(1.14%) tps: 77 tflops: 0.01 mfu: 0.00%
[rank0]:[titan] 2025-07-02 16:18:08,224 - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40
[rank0]:[titan] 2025-07-02 16:18:08,310 - root - INFO - step: 2 loss: 8.1619 memory: 1.15GiB(1.21%) tps: 48,138 tflops: 3.46 mfu: 0.35 %
[rank0]:[titan] 2025-07-02 16:18:08,356 - root - INFO - step: 3 loss: 8.1140 memory: 1.15GiB(1.21%) tps: 88,440 tflops: 6.36 mfu: 0.64 %
[rank0]:[titan] 2025-07-02 16:18:08,406 - root - INFO - step: 4 loss: 8.0099 memory: 1.15GiB(1.21%) tps: 82,626 tflops: 5.94 mfu: 0.60 %
[rank0]:[titan] 2025-07-02 16:18:08,457 - root - INFO - step: 5 loss: 7.8928 memory: 1.15GiB(1.21%) tps: 81,594 tflops: 5.87 mfu: 0.59 %
[rank0]:[titan] 2025-07-02 16:18:08,508 - root - INFO - step: 6 loss: 7.7758 memory: 1.15GiB(1.21%) tps: 79,607 tflops: 5.72 mfu: 0.58 %
[rank0]:[titan] 2025-07-02 16:18:08,559 - root - INFO - step: 7 loss: 7.6221 memory: 1.15GiB(1.21%) tps: 81,448 tflops: 5.86 mfu: 0.59 %
[rank0]:[titan] 2025-07-02 16:18:08,611 - root - INFO - step: 8 loss: 7.5578 memory: 1.15GiB(1.21%) tps: 79,732 tflops: 5.73 mfu: 0.58 %
[rank0]:[titan] 2025-07-02 16:18:08,659 - root - INFO - step: 9 loss: 7.3851 memory: 1.15GiB(1.21%) tps: 85,655 tflops: 6.16 mfu: 0.62 %
[rank0]:[titan] 2025-07-02 16:18:08,709 - root - INFO - step: 10 loss: 7.3361 memory: 1.15GiB(1.21%) tps: 81,855 tflops: 5.89 mfu: 0.60 %
[rank0]:[titan] 2025-07-02 16:18:08,709 - root - INFO - Sleeping 2 seconds for other ranks to complete
```
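For orientation, here is a minimal sketch of the initialization path that remains in torchtitan/train.py after this change. It only restates what the diff below shows; names such as `model_cls`, `model_args`, `init_device`, and `buffer_device` are assumed to be set up by the surrounding trainer code.

```python
import torch

# Sketch of the surviving init path (see the diff below). Assumes the trainer
# has already built model_cls, model_args, init_device, and buffer_device.
with torch.device("meta"):
    # Construct the model on the meta device so no real memory is allocated yet.
    model = model_cls.from_model_args(model_args)

# (AutoParallel wrapping happens in between; with pytorch-labs/autoparallel#20
# the parallelized module keeps a usable init_weights, so no bespoke init fn
# is needed anymore.)

model.to_empty(device=init_device)  # materialize parameters on the target device
with torch.no_grad():
    # The user's own init_weights fn is applied to the autoparallel model directly.
    model.init_weights(buffer_device=buffer_device)
model.train()
```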
1 parent d8b9802 commit 8739c23

File tree

1 file changed: +1 −76 lines
torchtitan/train.py

Lines changed: 1 addition & 76 deletions
```diff
@@ -161,79 +161,6 @@ def __init__(self, job_config: JobConfig):
             f"Building {self.train_spec.name} {job_config.model.flavor} with {model_args}"
         )
 
-        def llama3_autoparallel_init_fn(model):
-            # WHC - horrible hack to make auto-parallel work. basically, create a bespoke init_fn for llama3 by copying
-            # code from the llama3 init_weights functions throughout the model components, and adjusting them to use
-            # the new FQN structures in autoparallel.
-            # TODO: make it possible to more easily reuse the existing 'init_weights' functions on the auto_p module
-            def param(name):
-                return model.get_parameter(f"params.{name}")
-
-            from torchtitan.models.llama3.model import precompute_freqs_cis
-
-            model.buffers_.get_buffer("freqs_cis").copy_(
-                DTensor.from_local(
-                    precompute_freqs_cis(
-                        model_args.dim // model_args.n_heads,
-                        model_args.max_seq_len,
-                        model_args.rope_theta,
-                    ),
-                    device_mesh=model.buffers_.get_buffer("freqs_cis").device_mesh,
-                )
-            )
-
-            torch.nn.init.normal_(param("tok_embeddings/weight"))
-
-            def init_layer(i):
-                for norm in ("attention_norm", "ffn_norm"):
-                    torch.nn.init.ones_(param(f"layers/{i}/{norm}/weight"))
-
-                if model_args.depth_init:
-                    weight_init_std = 0.02 / (2 * (i + 1)) ** 0.5
-                else:
-                    weight_init_std = 0.02 / (2 * model_args.n_layers) ** 0.5
-
-                for linear in ("wq", "wk", "wv"):
-                    torch.nn.init.trunc_normal_(
-                        param(f"layers/{i}/attention/{linear}/weight"),
-                        mean=0.0,
-                        std=0.02,
-                    )
-                torch.nn.init.trunc_normal_(
-                    param(f"layers/{i}/attention/wo/weight"),
-                    mean=0.0,
-                    std=weight_init_std,
-                )
-
-                torch.nn.init.trunc_normal_(
-                    param(f"layers/{i}/feed_forward/w1/weight"), mean=0.0, std=0.02
-                )
-                for linear in ("w2", "w3"):
-                    torch.nn.init.trunc_normal_(
-                        param(f"layers/{i}/feed_forward/{linear}/weight"),
-                        mean=0.0,
-                        std=weight_init_std,
-                    )
-
-            for i in range(model_args.n_layers):
-                init_layer(i)
-
-            if param("norm/weight") is not None:
-                torch.nn.init.ones_(param("norm/weight"))
-
-            final_out_std = model_args.dim**-0.5
-            cutoff_factor = 3
-
-            if param("output/weight") is not None:
-                torch.nn.init.trunc_normal_(
-                    param("output/weight"),
-                    mean=0.0,
-                    std=final_out_std,
-                    a=-cutoff_factor * final_out_std,
-                    b=cutoff_factor * final_out_std,
-                )
-
         with torch.device("meta"):
             model = model_cls.from_model_args(model_args)
         # Build the collection of model converters. No-op if `model.converters` empty
@@ -343,9 +270,7 @@ def init_layer(i):
 
         model.to_empty(device=init_device)
         with torch.no_grad():
-            # TODO(whc) make model.init_weights work with autoparallel
-            llama3_autoparallel_init_fn(model)
-            # model.init_weights(buffer_device=buffer_device)
+            model.init_weights(buffer_device=buffer_device)
         model.train()
 
         self.model_parts = [model]
```
