Commit c8fb6b5

Adopt new autoparallel API with meta-init model
Allows reverting many of the hacks in the original integration, which were caused by never creating a model object in train.py because a model_fn builder was passed to autoparallel instead.
1 parent 42d5da6 commit c8fb6b5
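
The gist of the change: instead of handing autoparallel a model_fn builder (which it would call under its own FakeTensorMode), train.py now constructs the model once on the meta device and passes the object in. A minimal sketch of the call-site change, using only names that appear in the diffs below (surrounding setup elided):

    # Old API: autoparallel builds the model itself from a callable
    #   autop = AutoParallel(model_fn, input_fn, world_mesh)

    # New API: pass a meta-initialized model object plus the target device type
    with torch.device("meta"):
        model = model_cls.from_model_args(model_args)
    autop = AutoParallel(model, input_fn, world_mesh, device=world_mesh.device_type)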

3 files changed: +35 −50 lines

torchtitan/components/metrics.py

Lines changed: 7 additions & 8 deletions
@@ -354,7 +354,7 @@ def log(
         global_max_loss: float,
         extra_metrics: dict[str, Any] | None = None,
     ):
-        # assert self.num_flops_per_token > 0, "num_flops_per_token must be set"
+        assert self.num_flops_per_token > 0, "num_flops_per_token must be set"
 
         time_delta = time.perf_counter() - self.time_last_log
 
@@ -365,8 +365,8 @@ def log(
         # model FLOPS utilization
         # For its definition and calculation, please refer to the PaLM paper:
         # https://arxiv.org/abs/2204.02311
-        # mfu = 100 * self.num_flops_per_token * tps / self.gpu_peak_flops
-        # tflops = self.num_flops_per_token * tps / 1e12
+        mfu = 100 * self.num_flops_per_token * tps / self.gpu_peak_flops
+        tflops = self.num_flops_per_token * tps / 1e12
 
         time_end_to_end = time_delta / self.job_config.metrics.log_freq
         time_data_loading = sum(self.data_loading_times) / len(self.data_loading_times)
@@ -378,8 +378,8 @@ def log(
             "loss_metrics/global_avg_loss": global_avg_loss,
             "loss_metrics/global_max_loss": global_max_loss,
             "throughput(tps)": tps,
-            # "tflops": tflops,
-            # "mfu(%)": mfu,
+            "tflops": tflops,
+            "mfu(%)": mfu,
             "time_metrics/end_to_end(s)": time_end_to_end,
             "time_metrics/data_loading(s)": time_data_loading,
             "time_metrics/data_loading(%)": time_data_loading_pct,
@@ -403,9 +403,8 @@ def log(
             f"{color.yellow}memory: {device_mem_stats.max_reserved_gib:5.2f}GiB"
             f"({device_mem_stats.max_reserved_pct:.2f}%) "
             f"{color.blue}tps: {round(tps):,} "
-            # f"{color.cyan}tflops: {tflops:,.2f} "
-            # f"{color.magenta}mfu: {mfu:.2f}%{color.reset}"
-            f"{color.reset}"
+            f"{color.cyan}tflops: {tflops:,.2f} "
+            f"{color.magenta}mfu: {mfu:.2f}%{color.reset}"
         )
 
         self.ntokens_since_last_log = 0
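
With the assertion and the two metric lines restored, tflops and mfu are again computed directly from the measured throughput. A small self-contained example of the same arithmetic; the numbers below (a ~48 GFLOPs-per-token model, 5,000 tps, 989 TFLOPS peak) are illustrative assumptions, not values from this commit:

    # Hypothetical inputs, for illustration only.
    num_flops_per_token = 4.8e10   # FLOPs per trained token (assumed)
    tps = 5_000                    # measured tokens/second (assumed)
    gpu_peak_flops = 989e12        # accelerator peak FLOPS (assumed)

    tflops = num_flops_per_token * tps / 1e12                # achieved TFLOPS
    mfu = 100 * num_flops_per_token * tps / gpu_peak_flops   # model FLOPS utilization (%)
    print(f"tflops: {tflops:,.2f}  mfu: {mfu:.2f}%")         # tflops: 240.00  mfu: 24.27%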

torchtitan/experiments/auto_parallel/parallelize_llama.py

Lines changed: 2 additions & 13 deletions
@@ -20,8 +20,7 @@
 
 
 def parallelize_llama(
-    model_fn,
-    init_fn,  # TODO(whc) hack to pass stuff to autoparallel
+    model,
     world_mesh: DeviceMesh,
     parallel_dims: ParallelDims,
     job_config: JobConfig,
@@ -33,15 +32,6 @@ def parallelize_llama(
     NOTE: The passed-in model preferably should be on meta device. Otherwise,
     the model must fit on GPU or CPU memory.
     """
-    # TODO: make auto-p work with already created model object?
-    # wherever the auto-parallel code that creates a FakeTensorMode is...
-    # fake_mode = ...
-    # for k, v in m.named_parameters():
-    #     # swap each param in your model with a fake tensor version
-    #     # warning - we probably need to do this before initializing the optimizer?
-    #     setattr(m, k, fake_mode.from_tensor(v))
-    # # also do the same for named_buffers
-
     def input_fn():
         global_batch_size = job_config.training.global_batch_size
         if global_batch_size < 0:
@@ -66,7 +56,7 @@ def input_fn():
     #     model = model_fn()
     #     return model
 
-    autop = AutoParallel(model_fn, input_fn, world_mesh)
+    autop = AutoParallel(model, input_fn, world_mesh, device=world_mesh.device_type)
     autop.add_parameter_memory_constraint(low=None, high=None)
 
     x_sharding = (Shard(0), Replicate())
@@ -83,5 +73,4 @@ def input_fn():
     torch._inductor.config.reorder_for_peak_memory = False
     parallel_mod = torch.compile(parallel_mod, fullgraph=True)
 
-    init_fn(parallel_mod)
     return parallel_mod
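
The NOTE in the docstring is the crux of the new API: a meta-device model carries shapes and dtypes but allocates no storage, so arbitrarily large models can be described to autoparallel without first having to fit in GPU or CPU memory. A tiny self-contained illustration of the mechanism (plain torch.nn, not torchtitan code):

    import torch
    import torch.nn as nn

    with torch.device("meta"):
        big = nn.Linear(32_768, 32_768)  # ~1B params, but nothing is allocated

    print(big.weight.device)  # meta
    # Size the weight *would* occupy once materialized: ~4 GiB in fp32.
    print(big.weight.nelement() * big.weight.element_size())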

torchtitan/train.py

Lines changed: 26 additions & 29 deletions
@@ -12,7 +12,6 @@
 
 import torch
 from torch.distributed.elastic.multiprocessing.errors import record
-
 import torchtitan.components.ft as ft
 import torchtitan.protocols.train_spec as train_spec_module
 from torchtitan.components.checkpoint import CheckpointManager
@@ -25,7 +24,7 @@
 from torchtitan.config_manager import ConfigManager, JobConfig
 from torchtitan.distributed import ParallelDims, utils as dist_utils
 
-# from torchtitan.protocols.model_converter import build_model_converters
+from torchtitan.protocols.model_converter import build_model_converters
 from torchtitan.tools import utils
 from torchtitan.tools.logging import init_logger, logger
 from torchtitan.tools.profiling import (
@@ -147,12 +146,8 @@ def __init__(self, job_config: JobConfig):
             f"Building {self.train_spec.name} {job_config.model.flavor} with {model_args}"
         )
 
-        def model_fn():
-            # WHC - allow auto_p to construct the model object under its own fake_mode.
-            # TODO: let us pass in meta model, and internally hook it up to the auto_p fake mode
-            return model_cls.from_model_args(model_args).cuda()
 
-        def init_fn(model):
+        def llama3_autoparallel_init_fn(model):
             # WHC - horrible hack to make auto-parallel work. basically, create a bespoke init_fn for llama3 by copying
             # code from the llama3 init_weights functions throughout the model components, and adjusting them to use
             # the new FQN structures in autoparallel.
@@ -221,11 +216,11 @@ def init_layer(i):
                 b=cutoff_factor * final_out_std,
             )
 
-        # with torch.device("meta"):
-        #     model = model_fn()
-        # Build the collection of model converters. No-op if `model.converters` empty
-        # model_converters = build_model_converters(job_config, parallel_dims)
-        # model_converters.convert(model)
+        with torch.device("meta"):
+            model = model_cls.from_model_args(model_args)
+        # Build the collection of model converters. No-op if `model.converters` empty
+        model_converters = build_model_converters(job_config, parallel_dims)
+        model_converters.convert(model)
 
         # metrics logging
         build_metrics_processor_fn = (
@@ -239,15 +234,15 @@ def init_layer(i):
         color = self.metrics_processor.color
 
         # calculate model size and flops per token
-        # (
-        #     model_param_count,
-        #     self.metrics_processor.num_flops_per_token,
-        # ) = model_args.get_nparams_and_flops(model, job_config.training.seq_len)
+        (
+            model_param_count,
+            self.metrics_processor.num_flops_per_token,
+        ) = model_args.get_nparams_and_flops(model, job_config.training.seq_len)
 
-        # logger.info(
-        #     f"{color.blue}Model {self.train_spec.name} {job_config.model.flavor} "
-        #     f"{color.red}size: {model_param_count:,} total parameters{color.reset}"
-        # )
+        logger.info(
+            f"{color.blue}Model {self.train_spec.name} {job_config.model.flavor} "
+            f"{color.red}size: {model_param_count:,} total parameters{color.reset}"
+        )
 
         # move sharded model to CPU/GPU and initialize weights via DTensor
         if job_config.checkpoint.create_seed_checkpoint:
@@ -325,12 +320,14 @@ def init_layer(i):
         else:
             # apply PT-D Tensor Parallel, activation checkpointing, torch.compile, Data Parallel
             model = self.train_spec.parallelize_fn(
-                model_fn, init_fn, world_mesh, parallel_dims, job_config
+                model, world_mesh, parallel_dims, job_config
            )
 
-            # model.to_empty(device=init_device)
-            # with torch.no_grad():
-            #     model.init_weights(buffer_device=buffer_device)
+            model.to_empty(device=init_device)
+            with torch.no_grad():
+                # TODO(whc) make model.init_weights work with autoparallel
+                llama3_autoparallel_init_fn(model)
+                # model.init_weights(buffer_device=buffer_device)
         model.train()
 
         self.model_parts = [model]
@@ -362,11 +359,11 @@ def init_layer(i):
         # Post optimizer step model converters hook.
         # e.g. calculate float8 dynamic amax/scale for all-parameter for FSDP2
         # where it issues a single all-reduce for all parameters at once for better performance
-        # self.optimizers.register_step_post_hook(
-        #     lambda *args, **kwargs: model_converters.post_optimizer_hook(
-        #         self.model_parts
-        #     )
-        # )
+        self.optimizers.register_step_post_hook(
+            lambda *args, **kwargs: model_converters.post_optimizer_hook(
+                self.model_parts
+            )
+        )
         self.metrics_processor.optimizers = self.optimizers
 
         # Initialize trainer states that will be saved in checkpoint.
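
The materialization pattern that replaces the old init_fn plumbing is the standard meta-device flow: parallelize the storage-free module, allocate real memory with to_empty, then fill the weights in under no_grad (here via the llama3_autoparallel_init_fn hack, until model.init_weights works with autoparallel). A minimal runnable sketch of that flow with a plain module, where the normal_ call stands in for the real init function:

    import torch
    import torch.nn as nn

    with torch.device("meta"):
        m = nn.Linear(16, 16)        # meta params: shapes only, no storage

    m.to_empty(device="cpu")         # allocate real, uninitialized storage
    with torch.no_grad():
        for p in m.parameters():
            p.normal_(mean=0.0, std=0.02)  # stand-in for llama3_autoparallel_init_fn
    m.train()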
