Add option to exclude low flop mms from every-other-mm sac policy

soulitzer · soulitzer · commit 7325e54b5b07 · 2025-07-08T12:13:15.000-07:00
ghstack-source-id: fabf2e0 Pull-Request-resolved: #1372
diff --git a/torchtitan/config_manager.py b/torchtitan/config_manager.py
@@ -486,6 +486,12 @@ class ActivationCheckpoint:
     Selective activation checkpointing options ['int', 'op'].
     'int' (e.g., 2) for every nth layer, or 'op' for op level ac.
     """
+    selective_op_ac_mm_flops_threshold: int = 0
+    """
+    When selective_ac_option is 'op', this threshold is used to determine whether to
+    save a given mm, e.g. 1e5 means that mms with fewer than 1e5 flops are excluded
+    (recompute is preferred for them), and every other mm among the remaining mms
+    is saved.
+    """
 
 
 @dataclass
diff --git a/torchtitan/models/llama3/infra/parallelize.py b/torchtitan/models/llama3/infra/parallelize.py
@@ -265,6 +265,15 @@ def _get_custom_policy(meta):
             def _custom_policy(ctx, func, *args, **kwargs):
                 mode = "recompute" if ctx.is_recompute else "forward"
                 mm_count_key = f"{mode}_mm_count"
+
+                if func == torch.ops.aten.mm.default:
+                    m, k = args[0].shape
+                    k2, n = args[1].shape
+                    assert k == k2
+                    flops = m * n * 2 * k
+                    if flops < ac_config.selective_op_ac_mm_flops_threshold:
+                        return CheckpointPolicy.PREFER_RECOMPUTE
+
                 if func == torch.ops.aten.mm.default:
                     meta[mm_count_key] += 1
                 # Saves output of all compute ops, except every second mm