import torch
import torch.nn as nn
from torch.distributed._composable.replicate import replicate
+
from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
-    checkpoint_wrapper as ptd_checkpoint_wrapper,
+    ActivationWrapper,
+    # checkpoint_wrapper as ptd_checkpoint_wrapper,
)

+
+class CheckpointWrapper(ActivationWrapper):
+    def __init__(self, mod: torch.nn.Module, **kwargs):
+        super().__init__(mod)
+        self._checkpoint_wrapped_module = mod
+        self._policy_fn = kwargs.get("policy_fn", None)
+
+    def forward(self, *args, **kwargs):
+        from ac_experimental import apply_ac_policy
+
+        policy = self._policy_fn if self._policy_fn is not None else "recompute_all"
+
+        with apply_ac_policy(policy_fn=policy):
+            return self._checkpoint_wrapped_module(*args, **kwargs)
+        # return apply_ac_policy_fn(
+        #     self._checkpoint_wrapped_module, *args, **kwargs, policy_fn="recompute_all"
+        # )
+
+
+def ptd_checkpoint_wrapper(mod, **kwargs):
+    return CheckpointWrapper(mod, **kwargs)
+
+
from torch.distributed.device_mesh import DeviceMesh
from torch.distributed.fsdp import CPUOffloadPolicy, fully_shard, MixedPrecisionPolicy
from torch.distributed.tensor import Replicate, Shard
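The hunk above swaps the stock checkpoint_wrapper for a local CheckpointWrapper shim that routes checkpointing through ac_experimental's apply_ac_policy context manager while keeping the existing ptd_checkpoint_wrapper call sites. A minimal usage sketch, assuming ac_experimental is importable; the toy module, shapes, and variable names are illustrative and not part of the patch:

# Hypothetical driver code exercising the shim defined in the hunk above.
import torch
import torch.nn as nn

block = nn.Sequential(nn.Linear(64, 256), nn.GELU(), nn.Linear(256, 64))
wrapped = ptd_checkpoint_wrapper(block)  # no policy_fn -> falls back to "recompute_all"

x = torch.randn(8, 64, requires_grad=True)
out = wrapped(x)      # forward runs under apply_ac_policy(policy_fn="recompute_all")
out.sum().backward()  # saved activations are recomputed during the backward pass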
@@ -251,9 +276,39 @@ def _apply_ac_to_transformer_block(module: nn.Module, ac_config):
        create_selective_checkpoint_contexts,
    )

-        def _get_custom_policy(meta):
-            def _custom_policy(ctx, func, *args, **kwargs):
-                mode = "recompute" if ctx.is_recompute else "forward"
+        # def _get_custom_policy(meta):
+        #     def _custom_policy(ctx, func, *args, **kwargs):
+        #         mode = "recompute" if ctx.is_recompute else "forward"
+        #         mm_count_key = f"{mode}_mm_count"
+        #         if func == torch.ops.aten.mm.default:
+        #             meta[mm_count_key] += 1
+        #         # Saves output of all compute ops, except every second mm
+        #         # to_save = func in _save_list and not (
+        #         #     func == torch.ops.aten.mm.default and meta[mm_count_key] % 2 == 0
+        #         # )
+        #         return (
+        #             CheckpointPolicy.MUST_SAVE
+        #             if func in _save_list
+        #             else CheckpointPolicy.PREFER_RECOMPUTE
+        #         )
+
+        #     return _custom_policy
+
+        # def selective_checkpointing_context_fn():
+        #     meta = defaultdict(int)
+        #     return create_selective_checkpoint_contexts(_get_custom_policy(meta))
+
+        # return ptd_checkpoint_wrapper(
+        #     module,
+        #     context_fn=selective_checkpointing_context_fn,
+        #     preserve_rng_state=False,
+        # )
+
+        def _get_custom_policy():
+            meta = defaultdict(int)
+
+            def _custom_policy(ctx, out, func, *args, **kwargs):
+                mode = "forward"  # "recompute" if ctx.is_recompute else "forward"
                mm_count_key = f"{mode}_mm_count"
                if func == torch.ops.aten.mm.default:
                    meta[mm_count_key] += 1
@@ -263,21 +318,17 @@ def _custom_policy(ctx, func, *args, **kwargs):
                )
                return (
                    CheckpointPolicy.MUST_SAVE
-                    if to_save
+                    if func in _save_list
                    else CheckpointPolicy.PREFER_RECOMPUTE
                )

            return _custom_policy

-        def selective_checkpointing_context_fn():
-            meta = defaultdict(int)
-            return create_selective_checkpoint_contexts(_get_custom_policy(meta))
-
        return ptd_checkpoint_wrapper(
            module,
-            context_fn=selective_checkpointing_context_fn,
-            preserve_rng_state=False,
+            policy_fn=_get_custom_policy(),
        )
+
    elif use_layer_sac:
        # Checkpoint every `ac_freq` of the modules passed to this function
        ac_freq = int(ac_config.selective_ac_option)
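The rewritten _custom_policy above saves aten.mm outputs and marks everything else for recomputation, and is handed to the shim via policy_fn. For comparison, the same decision can be expressed through the stock torch.utils.checkpoint selective-checkpointing API, which is the path this diff comments out. A self-contained sketch under that assumption; the save set and toy function are illustrative, not torchtitan's actual _save_list:

# Stand-alone sketch of the same save/recompute policy via the upstream
# torch.utils.checkpoint API (the code path commented out in the diff).
import torch
from torch.utils.checkpoint import (
    checkpoint,
    CheckpointPolicy,
    create_selective_checkpoint_contexts,
)

_save_list = {torch.ops.aten.mm.default}  # illustrative save set

def _policy(ctx, func, *args, **kwargs):
    # Save matmul outputs during forward; recompute every other op in backward.
    return (
        CheckpointPolicy.MUST_SAVE
        if func in _save_list
        else CheckpointPolicy.PREFER_RECOMPUTE
    )

def _context_fn():
    return create_selective_checkpoint_contexts(_policy)

def fn(x, w):
    return torch.relu(x @ w) @ w

x = torch.randn(4, 8, requires_grad=True)
w = torch.randn(8, 8, requires_grad=True)
out = checkpoint(fn, x, w, use_reentrant=False, context_fn=_context_fn)
out.sum().backward()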