Commit 6c3b297

Replace nn.Linear with einsum in the model (#147)
Fixes #139. Previously, nn.Linear always flattened the batch and sequence dimensions together, losing the sharding annotations on those dimensions. This is no longer the case. As further evidence that sharding propagation became more sensible, I tried sharding the q/k/v/o projection activations explicitly:

model.layers.*.self_attn.q_proj: [fsdp, null, tensor]
model.layers.*.self_attn.k_proj: [fsdp, null, tensor]
model.layers.*.self_attn.v_proj: [fsdp, null, tensor]
model.layers.*.self_attn.o_proj: [fsdp, null, tensor]

Without replacing nn.Linear with einsum, this actually _decreases_ MFU. With the replacement, MFU stays unchanged, which is the expected behavior.
1 parent 7d19eb5 commit 6c3b297
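
For intuition on the fix, here is a minimal standalone sketch (hypothetical shapes, not code from this commit). `F.linear`, which backs `nn.Linear`, contracts over the last dimension and treats all leading dimensions as one flattened batch, whereas the einsum form names the batch and sequence dimensions explicitly, so sharding annotations attached to them can survive the lowering:

```python
import torch
import torch.nn.functional as F

# Hypothetical shapes: [batch, seq, hidden] activations and an [out, in] weight.
x = torch.randn(4, 128, 256)
w = torch.randn(512, 256)

# nn.Linear / F.linear contracts over the last dimension and treats everything
# else as one flattened leading dimension, merging batch and sequence.
y_linear = F.linear(x, w)

# The einsum form keeps batch (b) and sequence (s) as distinct named dimensions,
# so per-dimension sharding annotations on the activation are not lost.
y_einsum = torch.einsum("bsh,oh->bso", x, w)

assert torch.allclose(y_linear, y_einsum, atol=1e-4)
```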

File tree

3 files changed: +16 −2 lines

torchprime/torch_xla_models/README.md

Lines changed: 1 addition & 1 deletion
@@ -80,7 +80,7 @@ tp run torchprime/torch_xla_models/train.py \
 ### Llama 3.1 405B on v6e-256
 
 Recipe for global batch size 64, sequence length 8192.
-Expected step duration: 27.349s. MFU: 21.48%.
+Expected step duration: 19.642s. MFU: 29.91%.
 
 ```sh
 export LIBTPU_INIT_ARGS='--xla_tpu_enable_flash_attention=false --xla_tpu_enable_async_collective_fusion=true --xla_tpu_enable_async_collective_fusion_fuse_all_gather=true --xla_tpu_enable_async_collective_fusion_multiple_steps=true --xla_tpu_overlap_compute_collective_tc=true --xla_enable_async_all_gather=true --xla_tpu_scoped_vmem_limit_kib=98304'

torchprime/torch_xla_models/configs/model/scaling/llama-fsdp-tp.yaml

Lines changed: 4 additions & 1 deletion
@@ -27,7 +27,10 @@ sharding:
   lm_head.weight: [tensor, fsdp]
 
   # Activations
-  model.layers.*.self_attn: [fsdp, null, tensor]
+  model.layers.*.self_attn.q_proj: [fsdp, null, tensor]
+  model.layers.*.self_attn.k_proj: [fsdp, null, tensor]
+  model.layers.*.self_attn.v_proj: [fsdp, null, tensor]
+  model.layers.*.self_attn.o_proj: [fsdp, null, tensor]
   model.layers.*.input_layernorm: [fsdp, null, tensor]
   model.layers.*.post_attention_layernorm: [fsdp, null, tensor]
   model.layers.*.mlp: [fsdp, null, tensor]
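
For reference, an annotation such as `[fsdp, null, tensor]` on a `[batch, sequence, hidden]` activation corresponds roughly to the following `mark_sharding` call. This is a sketch, not code from this commit; the mesh shape and tensor sizes are assumptions, and the device count is assumed to be divisible by 4:

```python
import numpy as np
import torch
import torch_xla.core.xla_model as xm
import torch_xla.runtime as xr
import torch_xla.distributed.spmd as xs

xr.use_spmd()

# Assumed 2D device mesh for illustration: most chips on the `fsdp` axis,
# a small `tensor` axis for tensor parallelism.
num_devices = xr.global_runtime_device_count()
mesh = xs.Mesh(np.arange(num_devices), (num_devices // 4, 4), ("fsdp", "tensor"))

# A [batch, sequence, hidden] activation, e.g. the q_proj output (sizes assumed).
activation = torch.zeros(8, 8192, 4096, device=xm.xla_device())

# [fsdp, null, tensor]: shard batch over the `fsdp` axis, replicate the
# sequence dimension, and shard hidden over the `tensor` axis.
xs.mark_sharding(activation, mesh, ("fsdp", None, "tensor"))
```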

torchprime/torch_xla_models/train.py

Lines changed: 11 additions & 0 deletions
@@ -24,6 +24,7 @@
 from torch import nn
 from torch.utils.data import DataLoader, Dataset, IterableDataset
 from torch_xla.distributed.fsdp import checkpoint_module
+from torch_xla.distributed.spmd.xla_sharding import apply_xla_patch_to_nn_linear
 
 # Transformers imports
 from transformers import (
@@ -81,13 +82,23 @@ def __init__(
     self.input_sharding_spec = xs.ShardingSpec(
       mesh, (("data", "fsdp"), None), minibatch=minibatch
     )
+
+    # Recursively replace `nn.Linear` layers with einsum operations in the model.
+    # Without this patch, an `nn.Linear` module will flatten non-contracting dimensions
+    # (e.g. batch and sequence), thus destroying the sharding constraints on those dimensions.
+    model = apply_xla_patch_to_nn_linear(model)
+
+    # Annotate model weights and activations with sharding constraints to distribute
+    # the training across devices following the SPMD paradigm.
     sharding_config = OmegaConf.to_container(
       self.config.model.scaling.sharding, resolve=True
     )
     assert isinstance(
       sharding_config, dict
     ), f"Sharding config {sharding_config} must be a dict"
     model = shard_torch_xla_model_from_config(model, config=sharding_config)
+
+    # Rematerialize forward computation during the backward pass if requested.
     model = self._checkpoint_model(model)
     model = self._add_optimization_barrier_model(model)
     self.model = model
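
The patch is applied before the sharding annotations so that the constraints propagate through the einsum-based linears. As a standalone sketch of the call on a toy module (the module and its sizes are hypothetical, only the import and the one-argument call mirror the diff above):

```python
import torch.nn as nn
from torch_xla.distributed.spmd.xla_sharding import apply_xla_patch_to_nn_linear

# Toy stand-in for the transformer; sizes are hypothetical.
toy = nn.Sequential(nn.Linear(256, 1024), nn.SiLU(), nn.Linear(1024, 256))

# After the patch, each nn.Linear forward runs through an einsum-based op
# instead of flattening the leading dimensions, so activation sharding
# annotations on [batch, seq, hidden] tensors are preserved.
toy = apply_xla_patch_to_nn_linear(toy)
```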
