
Commit efd993f

make offs optional for scaled grouped mm
1 parent 2898903 commit efd993f

File tree: 2 files changed (+22, -8 lines)


torchao/prototype/moe_training/scaled_grouped_mm.py

Lines changed: 6 additions & 4 deletions
@@ -4,6 +4,7 @@
 # This source code is licensed under the BSD 3-Clause license found in the
 # LICENSE file in the root directory of this source tree.
 
+import logging
 from typing import Optional
 
 import torch
@@ -18,11 +19,13 @@
     _is_column_major,
 )
 
+logger: logging.Logger = logging.getLogger(__name__)
+
 
 def _scaled_grouped_mm(
     A: torch.Tensor,
     B_t: torch.Tensor,
-    offs: torch.Tensor,
+    offs: Optional[torch.Tensor] = None,
     out_dtype: Optional[torch.dtype] = torch.bfloat16,
 ) -> torch.Tensor:
     """
@@ -38,6 +41,7 @@ def _scaled_grouped_mm(
         out_dtype (Optional[torch.dtype]): The dtype of the output tensor. Currently only torch.bfloat16 is supported.
         use_triton_for_per_group_scales (bool): Whether to use custom triton kernels to compute per-group scales. Default is True.
     """
+    logger.info("Using differentiable _scaled_grouped_mm")
     return _Float8GroupedMM.apply(
         A,
         B_t,
@@ -54,9 +58,8 @@ def forward(
         ctx,
         A: torch.Tensor,
         B_t: torch.Tensor,
-        offs: torch.Tensor,
+        offs: Optional[torch.Tensor] = None,
         out_dtype: Optional[torch.dtype] = torch.bfloat16,
-        use_triton_for_per_group_scales: bool = True,
     ) -> torch.Tensor:
         # torchao _scaled_grouped_mm only supports A=2D, B=3D.
         assert A.ndim == 2, "A must be 2D"
@@ -76,7 +79,6 @@ def forward(
         assert B_t.dtype == torch.float32 or B_t.dtype == torch.bfloat16, (
            "B must be float32 or bfloat16"
        )
-        assert offs.dtype == torch.int32, "offs must be int32"
 
         # Assert A and B dims are compatible for a scaled grouped GEMM.
         assert A.size(-1) == B_t.size(-2), (
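
For context, a minimal usage sketch of the changed signature (not part of this commit). The import path follows the tensor.py diff below, but the shapes, device, and offsets are hypothetical, and actually running it assumes a PyTorch build and GPU where torch._scaled_grouped_mm is supported.

import torch

from torchao.prototype.moe_training import _scaled_grouped_mm

# Hypothetical sizes: 8 experts with square dim-512 weights, 1024 routed tokens.
num_experts, dim, total_tokens = 8, 512, 1024
A = torch.randn(total_tokens, dim, dtype=torch.bfloat16, device="cuda", requires_grad=True)
B_t = torch.randn(num_experts, dim, dim, dtype=torch.bfloat16, device="cuda", requires_grad=True)

# Group-end offsets along the token dimension for the routed-experts case (int32).
offs = torch.arange(
    total_tokens // num_experts,
    total_tokens + 1,
    total_tokens // num_experts,
    dtype=torch.int32,
    device="cuda",
)

# offs is still passed here as a keyword; after this commit it is declared
# Optional, so callers without offsets no longer need a positional placeholder.
out = _scaled_grouped_mm(A, B_t, offs=offs, out_dtype=torch.bfloat16)
out.sum().backward()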

torchao/prototype/moe_training/tensor.py

Lines changed: 16 additions & 4 deletions
@@ -1,3 +1,10 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD 3-Clause license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
 from typing import Any, Optional, Tuple
 
 import torch
@@ -6,6 +13,8 @@
 
 from torchao.prototype.moe_training import _scaled_grouped_mm
 
+logger: logging.Logger = logging.getLogger(__name__)
+
 _ops_to_preserve_subclass = {
     torch.ops.aten.empty_like.default,
     torch.ops.aten.new_zeros.default,
@@ -27,7 +36,7 @@ class ScaledGroupedMMTensor(torch.Tensor):
     differentiable _scaled_grouped_mm autograd function.
     """
 
-    grouped_mm_func_name = "_grouped_mm"
+    grouped_mm_func_names = {"_grouped_mm", "_grouped_mm.default"}
     offs_arg_name = "offs"
 
     @staticmethod
@@ -57,7 +66,7 @@ def __init__(
     @classmethod
     def __torch_function__(cls, func, types, args, kwargs={}):
         # override the grouped mm op to use the differentiable _scaled_grouped_mm
-        if func.__name__ == cls.grouped_mm_func_name:
+        if func.__name__ in cls.grouped_mm_func_names:
             # Use torchao scaled grouped mm with dynamic quant for
             # "2d x 3d with offsets" case (used for routed experts).
             # Otherwise, fall back to regular grouped mm.
@@ -69,7 +78,9 @@ def __torch_function__(cls, func, types, args, kwargs={}):
             A_is_2d = A.dim() == 2
             B_is_3d = B.dim() == 3
             has_offs = kwargs.get(cls.offs_arg_name) is not None
-            if A_is_2d and B_is_3d and has_offs:
+            logger.info(f"A.shape={A.shape}, B.shape={B.shape}, has_offs={has_offs}")
+
+            if A_is_2d and B_is_3d:
                 return _scaled_grouped_mm(
                     *args,
                     **kwargs,
@@ -107,7 +118,8 @@ def __torch_dispatch__(cls, func, types, args, kwargs={}):
         )
 
     def fsdp_pre_all_gather(self, mesh):
-        return (self._data,), ()
+        metadata = ()
+        return (self._data,), metadata
 
     def fsdp_post_all_gather(
         self,
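
To make the dispatch change concrete, here is a standalone sketch of the routing condition from __torch_function__ above; _route_to_scaled_grouped_mm is a hypothetical helper written for illustration only and is not part of torchao.

import torch


def _route_to_scaled_grouped_mm(A: torch.Tensor, B: torch.Tensor, offs=None) -> bool:
    # Mirrors the check above: before this commit it was
    # `A_is_2d and B_is_3d and has_offs`; offs is now optional.
    return A.dim() == 2 and B.dim() == 3


A = torch.randn(16, 32)       # 2D activations (tokens, dim)
B = torch.randn(4, 32, 64)    # 3D stacked expert weights (experts, dim, out_dim)
offs = torch.tensor([4, 8, 12, 16], dtype=torch.int32)

assert _route_to_scaled_grouped_mm(A, B)             # now routed even without offs
assert _route_to_scaled_grouped_mm(A, B, offs=offs)  # still routed when offs is given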
