
Commit 65829a2

cthi authored and facebook-github-bot committed
Fix scaled_mm_rowwise in quantize_bench (pytorch#4551)
Summary:
X-link: facebookresearch/FBGEMM#1594

When looking at this with samanamp recently, I noticed that scaled_mm was not actually using rowwise scaling; I think this dates from before it was properly supported. We also add support for torch.compile, which will be useful for testing.

Reviewed By: jianyuh

Differential Revision: D78844879
1 parent a7774fe commit 65829a2
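
For context, here is a minimal sketch of the rowwise-scaling contract that torch._scaled_mm expects, assuming a recent PyTorch build and an fp8-capable GPU; the tensor names and sizes are illustrative and not taken from this change. The key point is that scale_a carries one scale per row of the left operand and scale_b one scale per column of the right operand, instead of a single dummy scalar.

import torch

M, N, K = 16, 32, 64
# fp8 operands: A is row-major (M, K); B must be column-major, hence the transpose of an (N, K) tensor.
xq = torch.randn(M, K, device="cuda").to(torch.float8_e4m3fn)
wq = torch.randn(N, K, device="cuda").to(torch.float8_e4m3fn)
# Rowwise scales in float32: (M, 1) for A, (1, N) for B.
x_scale = torch.rand(M, 1, device="cuda", dtype=torch.float32)
w_scale = torch.rand(1, N, device="cuda", dtype=torch.float32)
out = torch._scaled_mm(
    xq,
    wq.t(),
    scale_a=x_scale,
    scale_b=w_scale,
    out_dtype=torch.bfloat16,
)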

2 files changed: +30 −9 lines changed


fbgemm_gpu/experimental/gen_ai/bench/quantize_bench.py

Lines changed: 13 additions & 0 deletions
@@ -169,6 +169,7 @@ def benchmark_grouped(
     trace: bool = False,
     num_iters: int = 1,
     fast_accum: bool = True,
+    torch_compile: bool = False,
 ) -> Dict[str, Any]:
     num_groups = len(m)
     # Create input tensors.
@@ -197,6 +198,8 @@ def benchmark_grouped(
     # Set fast accum mode if applicable.
     if hasattr(quantize_op, "fast_accum"):
         quantize_op.fast_accum = fast_accum
+    if hasattr(quantize_op, "torch_compile"):
+        quantize_op.torch_compile = torch_compile
     # Get the quantized tensors for this operator.
     preprocessed_args = quantize_op.preprocess(A, B)
     quantized_vals = quantize_op.quantize(*preprocessed_args)
@@ -282,6 +285,7 @@ def benchmark(
     trace: bool = False,
     num_iters: int = 1,
     fast_accum: bool = True,
+    torch_compile: bool = False,
 ) -> Dict[str, Any]:
     # Create input tensors.
     if b > 1:
@@ -301,6 +305,8 @@ def benchmark(
     # Set fast accum mode if applicable.
     if hasattr(quantize_op, "fast_accum"):
         quantize_op.fast_accum = fast_accum
+    if hasattr(quantize_op, "torch_compile"):
+        quantize_op.torch_compile = torch_compile
     # Preprocess data if needed.
     preprocessed_args = quantize_op.preprocess(A, B)
     # Get the quantized tensors for this operator.
@@ -495,6 +501,7 @@ def main(args: Any):
                 args.trace,
                 args.num_iters,
                 not args.disable_fast_accum,
+                args.torch_compile,
             )
             benchmark_results.append(quantize_measurements)
             if args.export_csv or args.plot:
@@ -625,6 +632,12 @@ def invoke_main() -> None:
         action="store_true",
         help="If set, disable fast accumulation for FP8 implementations.",
     )
+    parser.add_argument(
+        "--torch_compile",
+        default=False,
+        action="store_true",
+        help="If set, torch.compile will be used for scaled_mm backed ops.",
+    )
 
     args = parser.parse_args()
     main(args)
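
To make the plumbing above concrete, here is a small self-contained sketch of the pattern the harness uses to forward the new flag; QuantizeOp below is a stand-in written for illustration, not the real QuantizeOpBase.

import argparse


class QuantizeOp:
    # Stand-in for a quantize op that opts into both knobs.
    def __init__(self):
        self.fast_accum = True
        self.torch_compile = False


parser = argparse.ArgumentParser()
parser.add_argument(
    "--torch_compile",
    default=False,
    action="store_true",
    help="If set, torch.compile will be used for scaled_mm backed ops.",
)
args = parser.parse_args(["--torch_compile"])  # simulates passing the flag

quantize_op = QuantizeOp()
# Mirrors the existing fast_accum handling: only ops exposing the attribute opt in.
if hasattr(quantize_op, "torch_compile"):
    quantize_op.torch_compile = args.torch_compile
print(quantize_op.torch_compile)  # True

On the command line the new option is passed as --torch_compile alongside the existing flags such as --disable_fast_accum.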

fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py

Lines changed: 17 additions & 9 deletions
@@ -345,27 +345,35 @@ def cuda(self) -> bool:
 class ScaledMMRowwise(QuantizeOpBase):
     def __init__(self):
         self.fast_accum = True
+        self.torch_compile = False
 
     def quantize(self, x, w):
         xq, x_scale = quantize_fp8_row(x)
         wq, w_scale = quantize_fp8_row(w)
-        dummy_scale = torch.tensor([1.0], device=x.device, dtype=torch.float32)
-        return xq, wq.t(), x_scale, w_scale, dummy_scale
+        return xq, wq.t(), x_scale.unsqueeze(1), w_scale.unsqueeze(0)
 
-    def compute(self, xq, wq, x_scale, w_scale, dummy_scale):
-        output = torch._scaled_mm(
+    def compute(self, xq, wq, x_scale, w_scale):
+        if self.torch_compile:
+            f = torch.compile(
+                torch._scaled_mm,
+                options={
+                    "max_autotune": True,
+                    "max_autotune_gemm_backends": "TRITON,CK,CUTLASS,ATEN",
+                },
+            )
+        else:
+            f = torch._scaled_mm
+
+        return f(
             xq,
             wq,
             bias=None,
             out_dtype=torch.bfloat16,
-            scale_a=dummy_scale,
-            scale_b=dummy_scale,
+            scale_a=x_scale,
+            scale_b=w_scale,
             scale_result=None,
             use_fast_accum=self.fast_accum,
         )
-        # Apply separate rowwise scaling.
-        output = scale_fp8_row(output, x_scale, w_scale)
-        return output
 
     def quantize_and_compute(self, x, w):
         return self.compute(*self.quantize(x, w))