Commit 02f061c

set dtype
1 parent ac14d92 commit 02f061c

2 files changed: +31 additions, -6 deletions

torchao/prototype/moe_training/scaled_grouped_mm.py

Lines changed: 5 additions & 1 deletion
@@ -4,6 +4,7 @@
 # This source code is licensed under the BSD 3-Clause license found in the
 # LICENSE file in the root directory of this source tree.
 
+import logging
 from typing import Optional
 
 import torch
@@ -19,6 +20,9 @@
 )
 
 
+logger: logging.Logger = logging.getLogger(__name__)
+
+
 def _scaled_grouped_mm(
     A: torch.Tensor,
     B_t: torch.Tensor,
@@ -36,8 +40,8 @@ def _scaled_grouped_mm(
         and in column-major memory layout.
         offs (int32 torch.Tensor): The offsets to use to mark the starting index of each group along dim0 of the A tensor.
         out_dtype (Optional[torch.dtype]): The dtype of the output tensor. Currently only torch.bfloat16 is supported.
-        use_triton_for_per_group_scales (bool): Whether to use custom triton kernels to compute per-group scales. Default is True.
     """
+    logger.info("Using scaled_grouped_mm")
     return _Float8GroupedMM.apply(
         A,
         B_t,
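
For context, a minimal usage sketch of _scaled_grouped_mm (not part of this commit); the shapes, device, and offset values are illustrative assumptions based on the docstring above.

import torch

from torchao.prototype.moe_training import _scaled_grouped_mm

# Illustrative sizes: 4 experts, 128 routed tokens, dim 256 -> hidden 512.
num_experts, total_tokens, dim, hidden = 4, 128, 256, 512

# A: all routed tokens stacked along dim0, bf16 (assumes a float8-capable GPU).
A = torch.randn(total_tokens, dim, dtype=torch.bfloat16, device="cuda", requires_grad=True)

# B_t: per-expert weights, transposed so each expert's matrix is column-major.
B = torch.randn(num_experts, hidden, dim, dtype=torch.bfloat16, device="cuda", requires_grad=True)
B_t = B.transpose(-2, -1)

# offs: int32 end offsets marking each expert's token group along dim0 of A.
offs = torch.tensor([32, 64, 96, 128], dtype=torch.int32, device="cuda")

# Expected result: a (total_tokens, hidden) bf16 tensor.
out = _scaled_grouped_mm(A, B_t, offs=offs, out_dtype=torch.bfloat16)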

torchao/prototype/moe_training/tensor.py

Lines changed: 26 additions & 5 deletions
@@ -9,7 +9,11 @@
 
 import torch
 import torch.utils._pytree as pytree
+from torch import nn
 from torch._prims_common import suggest_memory_format
+from torch.distributed.device_mesh import DeviceMesh
+from torch.distributed.fsdp import MixedPrecisionPolicy
+from torch.autograd.grad_mode import _unsafe_preserve_version_counter
 
 from torchao.prototype.moe_training import _scaled_grouped_mm
 
@@ -69,7 +73,6 @@ def __init__(
 
     @classmethod
     def __torch_function__(cls, func, types, args, kwargs={}):
-        logger.info(f"{func.__name__}, args: {args}, kwargs: {kwargs}")
         # override the grouped mm op to use the differentiable _scaled_grouped_mm
         if func.__name__ == cls.grouped_mm_func_name:
             # Use torchao scaled grouped mm with dynamic quant for
@@ -142,9 +145,18 @@ def __tensor_unflatten__(inner_tensors, flatten_spec, outer_size, outer_stride):
             flatten_spec["_dtype"],
         )
 
-    def fsdp_pre_all_gather(self, mesh):
-        all_gather_inputs = (self._data,)
+    # fsdp hooks based on https://github.com/pytorch/pytorch/blob/20e40492b046b9287726d3ec656117e4dc38f0e2/test/distributed/_composable/fsdp/test_fully_shard_extensions.py#L81
+    def fsdp_pre_all_gather(
+        self,
+        mesh: DeviceMesh,
+        outer_size: torch.Size,
+        outer_stride: tuple[int, ...],
+        module: nn.Module,
+        mp_policy: MixedPrecisionPolicy,
+    ):
+        all_gather_inputs = (self._data.to(mp_policy.param_dtype),)
         all_gather_metadata = ()
+        logger.debug(f"fsdp_pre_all_gather: self._data.dtype={self._data.dtype}, param_dtype: {mp_policy.param_dtype}")
         return all_gather_inputs, all_gather_metadata
 
     def fsdp_post_all_gather(
@@ -156,6 +168,15 @@ def fsdp_post_all_gather(
         out: Optional[torch.Tensor] = None,
     ):
         (data,) = all_gather_outputs
-        output = ScaledGroupedMMTensor(data, param_dtype)
-        inner_tensors = (data,)
+        logger.debug(f"fsdp_post_all_gather: data.dtype={data.dtype}, param_dtype: {param_dtype}")
+
+        if out is not None:
+            #with _unsafe_preserve_version_counter(out):
+            with torch.no_grad():
+                out.copy_(data)
+            return
+
+        upcast_data = data.to(param_dtype)
+        output = ScaledGroupedMMTensor(upcast_data, param_dtype)
+        inner_tensors = (upcast_data,)
         return output, inner_tensors
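
For context, a hedged sketch of how these hooks are driven (not part of this commit): FSDP2 invokes fsdp_pre_all_gather with the MixedPrecisionPolicy, so the shard is cast to mp_policy.param_dtype before communication, and fsdp_post_all_gather either copies into the preallocated out buffer or wraps the gathered data. The stand-in module and dtypes below are assumptions; fully_shard and MixedPrecisionPolicy are imported from torch.distributed.fsdp as in the diff above (older PyTorch versions exposed them under torch.distributed._composable.fsdp).

import torch
from torch import nn
from torch.distributed.fsdp import MixedPrecisionPolicy, fully_shard

# Stand-in module; in real use its expert weights would wrap ScaledGroupedMMTensor,
# so FSDP2 would route all-gathers through the hooks defined above.
model = nn.Linear(256, 512)

mp_policy = MixedPrecisionPolicy(
    param_dtype=torch.bfloat16,   # dtype fsdp_pre_all_gather now casts shards to
    reduce_dtype=torch.float32,   # dtype used for gradient reduction
)

# Requires an initialized process group (e.g. launched via torchrun).
fully_shard(model, mp_policy=mp_policy)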
