Add option to exclude low flop mms from every-other-mm sac policy

soulitzer · soulitzer · commit 948a807dc447 · 2025-07-08T11:59:03.000-07:00
ghstack-source-id: 9c99bff Pull-Request-resolved: #1372
diff --git a/torchtitan/config_manager.py b/torchtitan/config_manager.py
@@ -486,6 +486,15 @@ class ActivationCheckpoint:
     Selective activation checkpointing options ['int', 'op'].
     'int' (e.g., 2) for every nth layer, or 'op' for op level ac.
     """
+    selective_op_ac_mm_flops_threshold: int = 0
+    """
+    When selective_ac_option is 'op', this threshold is used to determine whether to
+    save a given mm.
+
+    For example:
+    - 0 means no threshold; every other mm is saved
+    - 1e5 means every other mm with flops >= 1e5 is saved; smaller mms are recomputed.
+    """
 
 
 @dataclass
diff --git a/torchtitan/models/llama3/infra/parallelize.py b/torchtitan/models/llama3/infra/parallelize.py
@@ -265,6 +265,15 @@ def _get_custom_policy(meta):
             def _custom_policy(ctx, func, *args, **kwargs):
                 mode = "recompute" if ctx.is_recompute else "forward"
                 mm_count_key = f"{mode}_mm_count"
+
+                if func == torch.ops.aten.mm.default:
+                    m, k = args[0].shape
+                    k2, n = args[1].shape
+                    assert k == k2
+                    flops = m * n * 2 * k
+                    if flops < ac_config.selective_op_ac_mm_flops_threshold:
+                        return CheckpointPolicy.PREFER_RECOMPUTE
+
                 if func == torch.ops.aten.mm.default:
                     meta[mm_count_key] += 1
                 # Saves output of all compute ops, except every second mm