
Commit 80ed942

cthi authored and facebook-github-bot committed
Fixing test_quantize_fp8_matmul for CUDA graph (pytorch#4425)
Summary:
Pull Request resolved: pytorch#4425

X-link: facebookresearch/FBGEMM#1492

With CUDA graph enabled, this test sporadically fails with errors like P1856907021. The confusing part is that the RNG errors are thrown outside of the CUDA graph (when we call `torch.randn`). I'm not sure whether the tests run in parallel under hypothesis/buck, but that could potentially be the cause.

The change is to always warm up before CUDA graph capture, even for the non-Triton paths. This is good practice anyway, since some initialization can occur beneath us in ATen.

After adding CUDA graph back, the test now runs reliably.

Reviewed By: jwfromm

Differential Revision: D77596554

fbshipit-source-id: 6a65ba530bbac5d1357ac24ca9638e28e33369c8
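For context, the pattern the diff below applies to every quantization mode is: wrap the op sequence in a small local function, run it once eagerly as a warm-up, then capture and replay it under a CUDA graph. A minimal, self-contained sketch of that pattern (the `f` here is a hypothetical stand-in for the per-mode body, not the FBGEMM ops used in the test; it requires a CUDA device):

import torch

def f(x: torch.Tensor, w: torch.Tensor) -> torch.Tensor:
    # Stand-in for one per-mode body (quantize inputs, run the FP8 matmul,
    # optionally add bias, return the result).
    return x @ w.t()

x = torch.randn(128, 256, device="cuda", dtype=torch.bfloat16)
w = torch.randn(512, 256, device="cuda", dtype=torch.bfloat16)

# Warm-up outside the graph so lazy initialization (ATen state, kernel
# autotuning, workspace allocation) happens before capture.
f(x, w)
torch.cuda.synchronize()

g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
    z = f(x, w)

# Replay re-executes the captured kernels on the same input/output buffers.
g.replay()

The capture itself is unchanged by this commit; the behavioral difference is the eager warm-up call before `torch.cuda.graph(g)`.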
1 parent 8325430 commit 80ed942

File tree: 1 file changed (+65, -104 lines)

fbgemm_gpu/experimental/gen_ai/test/quantize/quantize_test.py

Lines changed: 65 additions & 104 deletions
@@ -278,7 +278,7 @@ def test_f8f8bf16(self, kernel: str, use_fast_accum: bool) -> None:
         ),
         QType=st.sampled_from([fp8_e4m3, fp8_e5m2]),
         Bias=st.sampled_from([True, False]),
-        CudaGraph=st.sampled_from([False]),
+        CudaGraph=st.sampled_from([True, False]),
         UseTriton=st.sampled_from([False] + ([True] if torch.version.cuda else [])),
         UseFastAccum=st.booleans(),
         InputMultiDim=st.booleans(),
@@ -337,78 +337,62 @@ def test_quantize_fp8_matmul(
         )

         if Mode == "tensorwise":
-            if CudaGraph:
-                g = torch.cuda.CUDAGraph()
-                with torch.cuda.graph(g):
-                    xq, x_scale = torch.ops.fbgemm.quantize_fp8_per_tensor(x)
-                    wq, w_scale = torch.ops.fbgemm.quantize_fp8_per_tensor(w)
-                    zq = torch.ops.fbgemm.f8f8bf16(xq, wq, x_scale * w_scale)
-                    if bias is not None:
-                        zq += bias
-                g.replay()
-            else:
+
+            def f(
+                x: torch.Tensor, w: torch.Tensor, bias: Optional[torch.Tensor]
+            ) -> torch.Tensor:
                 xq, x_scale = torch.ops.fbgemm.quantize_fp8_per_tensor(x)
                 wq, w_scale = torch.ops.fbgemm.quantize_fp8_per_tensor(w)
                 zq = torch.ops.fbgemm.f8f8bf16(xq, wq, x_scale * w_scale)
                 if bias is not None:
                     zq += bias
-        elif Mode == "tensorwise_broadcast":
-            xq, x_scale = torch.ops.fbgemm.quantize_fp8_per_tensor(x)
-            wq, w_scale = torch.ops.fbgemm.quantize_fp8_per_tensor(w)
-            x_scale = x_scale.item()
-            w_scale = w_scale.item()
+                return zq
+
             if CudaGraph:
+                # Warm-up to avoid capture issues
+                f(x, w, bias)
+
                 g = torch.cuda.CUDAGraph()
                 with torch.cuda.graph(g):
-                    zq = torch.ops.fbgemm.f8f8bf16_tensorwise(
-                        xq, wq, x_scale * w_scale, use_fast_accum=UseFastAccum
-                    )
-                    if bias is not None:
-                        zq += bias
+                    zq = f(x, w, bias)
                 g.replay()
             else:
+                zq = f(x, w, bias)
+        elif Mode == "tensorwise_broadcast":
+
+            def f(
+                xq: torch.Tensor,
+                wq: torch.Tensor,
+                scale: float,
+                bias: Optional[torch.Tensor],
+            ) -> torch.Tensor:
                 zq = torch.ops.fbgemm.f8f8bf16_tensorwise(
-                    xq, wq, x_scale * w_scale, use_fast_accum=UseFastAccum
+                    xq, wq, scale, use_fast_accum=UseFastAccum
                 )
                 if bias is not None:
                     zq += bias
-        elif Mode == "rowwise":
+                return zq
+
+            xq, x_scale = torch.ops.fbgemm.quantize_fp8_per_tensor(x)
+            wq, w_scale = torch.ops.fbgemm.quantize_fp8_per_tensor(w)
+            x_scale = x_scale.item()
+            w_scale = w_scale.item()
+
             if CudaGraph:
-                # Warm up triton functions before cuda graph.
-                xq, x_scale = quantize_fp8_row(x)
-                wq, w_scale = quantize_fp8_row(w)
-                if UseTriton and torch.version.cuda:
-                    zq = matmul_fp8_row(
-                        xq, wq, x_scale, w_scale, fp8_fast_accum=UseFastAccum
-                    )
+                # Warm-up to avoid capture issues
+                f(xq, wq, x_scale * w_scale, bias)
+
                 g = torch.cuda.CUDAGraph()
                 with torch.cuda.graph(g):
-                    if torch.version.cuda:
-                        xq, x_scale = torch.ops.fbgemm.quantize_fp8_per_row(
-                            x, output_dtype=QType
-                        )
-                        wq, w_scale = torch.ops.fbgemm.quantize_fp8_per_row(w)
-                    else:
-                        xq, x_scale = quantize_fp8_row(x)
-                        wq, w_scale = quantize_fp8_row(w)
-                    if UseTriton and torch.version.cuda:
-                        zq = matmul_fp8_row(xq, wq, x_scale, w_scale)
-                        if bias is not None:
-                            zq += bias
-                    else:
-                        zq = torch.ops.fbgemm.f8f8bf16_rowwise(
-                            xq,
-                            wq,
-                            x_scale,
-                            w_scale,
-                            bias=bias if torch.version.cuda else None,
-                            use_fast_accum=UseFastAccum,
-                        )
-                        # Bias fusion not yet supported on AMD.
-                        if bias is not None and torch.version.hip:
-                            zq += bias
+                    zq = f(xq, wq, x_scale * w_scale, bias)
                 g.replay()
             else:
+                zq = f(xq, wq, x_scale * w_scale, bias)
+        elif Mode == "rowwise":
+
+            def f(
+                x: torch.Tensor, w: torch.Tensor, bias: Optional[torch.Tensor]
+            ) -> torch.Tensor:
                 if torch.version.cuda:
                     xq, x_scale = torch.ops.fbgemm.quantize_fp8_per_row(
                         x, output_dtype=QType
@@ -418,9 +402,7 @@ def test_quantize_fp8_matmul(
                     xq, x_scale = quantize_fp8_row(x)
                     wq, w_scale = quantize_fp8_row(w)
                 if UseTriton and torch.version.cuda:
-                    zq = matmul_fp8_row(
-                        xq, wq, x_scale, w_scale, fp8_fast_accum=UseFastAccum
-                    )
+                    zq = matmul_fp8_row(xq, wq, x_scale, w_scale)
                     if bias is not None:
                         zq += bias
                 else:
@@ -435,14 +417,27 @@ def test_quantize_fp8_matmul(
                     # Bias fusion not yet supported on AMD.
                     if bias is not None and torch.version.hip:
                         zq += bias
-        elif Mode == "blockwise":
-            block_m = block_n = block_k = 128
-            output_device = torch.device(self.device)
+
+                return zq
+
             if CudaGraph:
-                # Need a warmup to compile the Triton kernel before cuda graph
+                # Warm-up to avoid capture issues
+                f(x, w, bias)
+
+                g = torch.cuda.CUDAGraph()
+                with torch.cuda.graph(g):
+                    zq = f(x, w, bias)
+                g.replay()
+            else:
+                zq = f(x, w, bias)
+        elif Mode == "blockwise":

+            def f(
+                x: torch.Tensor, w: torch.Tensor, bias: Optional[torch.Tensor]
+            ) -> torch.Tensor:
+                block_m = block_n = block_k = 128
                 wq, w_scale = quantize_fp8_block(
-                    w, block_n, block_k, output_device=output_device
+                    w, block_n, block_k, output_device=torch.device(self.device)
                 )
                 xq, x_scale = quantize_fp8_block(x, block_m, block_k)
                 if UseTriton:
@@ -463,52 +458,18 @@ def test_quantize_fp8_matmul(
                 if bias is not None:
                     zq += bias

+                return zq
+
+            if CudaGraph:
+                # Warm-up to avoid capture issues
+                f(x, w, bias)
+
                 g = torch.cuda.CUDAGraph()
                 with torch.cuda.graph(g):
-                    wq, w_scale = quantize_fp8_block(
-                        w, block_n, block_k, output_device=output_device
-                    )
-                    xq, x_scale = quantize_fp8_block(x, block_m, block_k)
-                    if UseTriton:
-                        zq = matmul_fp8_block(
-                            xq,
-                            wq,
-                            x_scale,
-                            w_scale,
-                            block_m,
-                            block_n,
-                            block_k,
-                            fp8_fast_accum=UseFastAccum,
-                        )
-                    else:
-                        zq = torch.ops.fbgemm.f8f8bf16_blockwise(
-                            xq, wq, x_scale, w_scale, block_m, block_n, block_k
-                        )
-                    if bias is not None:
-                        zq += bias
+                    zq = f(x, w, bias)
                 g.replay()
             else:
-                wq, w_scale = quantize_fp8_block(
-                    w, block_n, block_k, output_device=output_device
-                )
-                xq, x_scale = quantize_fp8_block(x, block_m, block_k)
-                if UseTriton:
-                    zq = matmul_fp8_block(
-                        xq,
-                        wq,
-                        x_scale,
-                        w_scale,
-                        block_m,
-                        block_n,
-                        block_k,
-                        fp8_fast_accum=UseFastAccum,
-                    )
-                else:
-                    zq = torch.ops.fbgemm.f8f8bf16_blockwise(
-                        xq, wq, x_scale, w_scale, block_m, block_n, block_k
-                    )
-                if bias is not None:
-                    zq += bias
+                zq = f(x, w, bias)
         else:
             raise ValueError(f"Invalid mode {Mode}")

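The warm-up / capture / replay scaffolding is now repeated verbatim in each mode branch, so it could in principle be factored into one helper. A hedged sketch under that assumption (the `run_maybe_graphed` helper below is hypothetical and is not part of this PR):

from typing import Callable

import torch

def run_maybe_graphed(fn: Callable[[], torch.Tensor], use_cuda_graph: bool) -> torch.Tensor:
    # Eager path: just run the function.
    if not use_cuda_graph:
        return fn()
    # Warm-up to avoid capture issues, mirroring the change in this commit.
    fn()
    torch.cuda.synchronize()
    g = torch.cuda.CUDAGraph()
    with torch.cuda.graph(g):
        out = fn()
    g.replay()
    return out

Each branch above would then reduce to something like `zq = run_maybe_graphed(lambda: f(x, w, bias), CudaGraph)`.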