Commit 34dffa5: handle out != None
Parent: cb1eae9

2 files changed: +1, -8 lines

torchao/prototype/moe_training/scaled_grouped_mm.py

Lines changed: 0 additions, 8 deletions

```diff
@@ -40,16 +40,8 @@ def _scaled_grouped_mm(
         offs (int32 torch.Tensor): The offsets to use to mark the starting index of each group along dim0 of the A tensor.
         out_dtype (Optional[torch.dtype]): The dtype of the output tensor. Currently only torch.bfloat16 is supported.
     """
-<<<<<<< HEAD
-<<<<<<< HEAD
-    # logger.info("Using scaled_grouped_mm")
-=======
-    #logger.info("Using scaled_grouped_mm")
->>>>>>> 6ca070de (handle out != None)
-=======
     # TODO: Remove once prototype is more mature. This is currently very useful for development and debugging.
     logger.info("Using scaled_grouped_mm")
->>>>>>> 2f3bb137 (add tp support for fp8 moe training)
     return _Float8GroupedMM.apply(
         A,
         B_t,
```

This hunk removes leftover merge-conflict markers (a nested conflict from the `6ca070de` and `2f3bb137` rebases) that would have been a `SyntaxError` at import time, keeping the resolved `logger.info` call.
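
For context, a minimal usage sketch of the API described by the hunk above. Only the names `_scaled_grouped_mm`, `A`, `B_t`, `offs`, and `out_dtype` come from the diff; the shapes, device, and offset values below are illustrative assumptions, not the prototype's documented contract.

```python
import torch

from torchao.prototype.moe_training.scaled_grouped_mm import _scaled_grouped_mm

# Assumed shapes for illustration: 6 tokens routed across 2 expert groups,
# with an int32 `offs` tensor marking group boundaries along dim0 of A.
device = "cuda"  # fp8 grouped GEMM assumes a supported GPU
A = torch.randn(6, 16, dtype=torch.bfloat16, device=device)        # (tokens, K)
B_t = torch.randn(2, 16, 32, dtype=torch.bfloat16, device=device)  # (groups, K, N)
offs = torch.tensor([3, 6], dtype=torch.int32, device=device)      # values illustrative

# Per the docstring in the hunk, only torch.bfloat16 is currently supported.
out = _scaled_grouped_mm(A, B_t, offs=offs, out_dtype=torch.bfloat16)
print(out.shape)  # expected (6, 32) under these assumed shapes
```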

torchao/prototype/moe_training/tensor.py

Lines changed: 1 addition, 0 deletions

```diff
@@ -183,6 +183,7 @@ def fsdp_post_all_gather(
                 f"{out_data.dtype} {param_dtype}"
             )
             out_data.copy_(data)
+
             return

         # For training step 0, out=None, so we need to return a new ScaledGroupedMMTensor.
```
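
For context on the commit message: FSDP2's `fsdp_post_all_gather` extension point is called with a preallocated `out` on training steps after the first, and with `out=None` on step 0. The hunk above sits inside the `out is not None` branch, where the gathered data is copied into `out`'s storage before returning early. A minimal sketch of that control flow follows; everything outside the lines shown in the diff (the `_data` accessor, the assert, the constructor arguments) is an assumption for illustration, not the file's actual code.

```python
from typing import Any, Optional, Tuple

import torch

# Assumed import; the class name comes from the diff's trailing comment.
from torchao.prototype.moe_training.tensor import ScaledGroupedMMTensor


def fsdp_post_all_gather(
    self,
    all_gather_outputs: Tuple[torch.Tensor, ...],
    metadata: Any,
    param_dtype: torch.dtype,
    *,
    out: Optional[torch.Tensor] = None,
):
    (data,) = all_gather_outputs

    # Training steps 1+: FSDP passes the previously returned tensor as `out`,
    # so copy the freshly gathered data into it in place and return early.
    if out is not None:
        out_data = out._data  # assumed accessor for the subclass payload
        assert out_data.dtype == param_dtype, f"{out_data.dtype} {param_dtype}"
        out_data.copy_(data)
        return

    # Training step 0: out=None, so return a new ScaledGroupedMMTensor
    # (per the comment in the diff); constructor arguments are assumed here.
    output = ScaledGroupedMMTensor(data, param_dtype)
    return output, (data,)
```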
