Add option for selective op AC to filter mm shapes based on fqn

soulitzer · soulitzer · commit 3c4d97dba547 · 2025-07-11T07:38:47.000-07:00
diff --git a/torchtitan/config_manager.py b/torchtitan/config_manager.py
@@ -487,6 +487,20 @@ class ActivationCheckpoint:
     'int' (e.g., 2) for every nth layer, or 'op' for op level ac.
     """
 
+    selective_op_ac_force_recompute_mm_shapes_by_fqns: list[str] = field(
+        default_factory=lambda: []
+    )
+    """
+    When per-op selective ac is used, this list of fully qualified names (relative
+    to the module at which AC is applied) is used to determine which mm shapes to
+    force recompute, rather than being considered by the rest of the sac policy,
+    e.g. save every other mm. Only nn.Linear modules are supported today.
+
+    Note: this config applies to mms not limited to those matching the specified
+    fqns, e.g. if "moe.router.gate", corresponding to Linear(in, out), is specified,
+    ANY mm with shape matching (*, in) x (in, out) will be force recomputed.
+    """
+
 
 @dataclass
 class Float8:
diff --git a/torchtitan/models/llama3/infra/parallelize.py b/torchtitan/models/llama3/infra/parallelize.py
@@ -261,11 +261,32 @@ def _apply_ac_to_transformer_block(module: nn.Module, ac_config):
             create_selective_checkpoint_contexts,
         )
 
+        mm_recompute_shapes = set()
+        if len(ac_config.selective_op_ac_force_recompute_mm_shapes_by_fqns) > 0:
+            for fqn, submod in module.named_modules():
+                if (
+                    fqn
+                    not in ac_config.selective_op_ac_force_recompute_mm_shapes_by_fqns
+                ):
+                    continue
+                if not isinstance(submod, nn.Linear):
+                    raise ValueError(
+                        "selective_op_ac_force_recompute_mm_shapes_by_fqns expected to match "
+                        f"a nn.Linear, but got: {submod}"
+                    )
+                out_f, in_f = submod.weight.shape
+                mm_recompute_shapes.add((in_f, out_f))
+            logger.debug(
+                f"Selective op AC force recomputing mms with rhs shapes {mm_recompute_shapes}"
+            )
+
         def _get_custom_policy(meta):
             def _custom_policy(ctx, func, *args, **kwargs):
                 mode = "recompute" if ctx.is_recompute else "forward"
                 mm_count_key = f"{mode}_mm_count"
                 if func == torch.ops.aten.mm.default:
+                    if args[1].shape in mm_recompute_shapes:
+                        return CheckpointPolicy.PREFER_RECOMPUTE
                     meta[mm_count_key] += 1
                 # Saves output of all compute ops, except every second mm
                 to_save = func in _save_list and not (