Add accuracy check and fixes for fp8_attention Triton kernels (#276)

yf225 · facebook-github-bot · commit e3f6db64afa3 · 2025-07-12T19:59:23.000-07:00
Summary: Stacked PRs: * #281 * __->__#276 --- --- --- ### Add accuracy check and fixes for fp8_attention Triton kernels Pull Request resolved: #276 Reviewed By: xuzhao9 Differential Revision: D78183079 Pulled By: yf225 fbshipit-source-id: 0a2a46a120163cc0bcfc03a6a879129c028b1f2e
diff --git a/run.py b/run.py
@@ -10,7 +10,28 @@
 import sys
 from typing import List
 
+# Apply async_task patch for Triton 3.4+ compatibility
+import triton.language as tl
+
 from tritonbench.operator_loader import get_op_loader_bench_cls_by_name, is_loader_op
+
+if not hasattr(tl, "async_task"):
+
+    class _AsyncTaskContext:
+        """A no-op context manager to replace tl.async_task"""
+
+        def __init__(self, task_ids):
+            self.task_ids = task_ids
+
+        def __enter__(self):
+            return self
+
+        def __exit__(self, exc_type, exc_val, exc_tb):
+            return False
+
+    # Add async_task to triton.language
+    tl.async_task = lambda task_ids: _AsyncTaskContext(task_ids)
+
 from tritonbench.operators import load_opbench_by_name
 from tritonbench.operators_collection import list_operators_by_collection
 from tritonbench.utils.env_utils import is_fbcode
diff --git a/tritonbench/operators/fp8_attention/operator.py b/tritonbench/operators/fp8_attention/operator.py
@@ -70,7 +70,7 @@ def __init__(
         if self.mode == BenchmarkMode.BWD or self.mode == BenchmarkMode.FWD_BWD:
             self.causal = True
         self.requires_grad = not self.tb_args.mode == "fwd_no_grad"
-        self.sm_scale = 1.3
+        self.sm_scale = 1.0 / math.sqrt(float(self.D_HEAD))
 
         if self.embedding_dim and self.H != self.embedding_dim // self.D_HEAD:
             raise ValueError(
@@ -119,7 +119,7 @@ def triton_preprocess(self, q, k, v):
             v,
         )
 
-    @register_benchmark()
+    @register_benchmark(baseline=True)
     def triton_flash_v2(
         self,
         q: torch.Tensor,
@@ -129,7 +129,7 @@ def triton_flash_v2(
         triton_q, triton_k, triton_v = self.triton_preprocess(q, k, v)
         # full fp8 will be enabled if type of q,k,v is fp8
         return lambda: triton_attention(
-            triton_q, triton_k, triton_v, self.causal, self.sm_scale, "base"
+            triton_q, triton_k, triton_v, self.causal, self.sm_scale, "base_opt"
         )
 
     @register_benchmark()
@@ -189,12 +189,14 @@ def get_ctx_vals():
                 device=self.device,
                 requires_grad=self.requires_grad,
             )
+
             k = torch.randn(
                 (BATCH, H, N_CTX, D_HEAD),
                 dtype=torch.float16,
                 device=self.device,
                 requires_grad=self.requires_grad,
             )
+
             v = torch.randn(
                 (BATCH, H, N_CTX, D_HEAD),
                 dtype=torch.float16,
@@ -203,6 +205,42 @@ def get_ctx_vals():
             )
             yield (q, k, v)
 
+    def accuracy(self, fn: Callable, baseline_fn: Callable) -> bool:
+        """
+        Check accuracy of FP8 attention implementation against baseline.
+
+        FP8 operations have inherently lower precision, so we use relaxed tolerances.
+        Based on empirical testing, FP8 can introduce differences up to ~2.0.
+        """
+        try:
+            output = fn()
+            baseline_output = baseline_fn()
+
+            # Convert FP8 outputs to FP16 for comparison
+            if output.dtype in [torch.float8_e5m2, torch.float8_e4m3fn]:
+                output = output.to(torch.float16)
+            if baseline_output.dtype in [torch.float8_e5m2, torch.float8_e4m3fn]:
+                baseline_output = baseline_output.to(torch.float16)
+
+            # Validate outputs
+            if torch.isnan(output).any() or torch.isinf(output).any():
+                return False
+            if torch.isnan(baseline_output).any() or torch.isinf(baseline_output).any():
+                return False
+            if output.shape != baseline_output.shape:
+                return False
+
+            # FP8 attention uses relaxed tolerances due to:
+            # 1. FP8 quantization of Q, K, V inputs
+            # 2. FP8 quantization of attention weights (doesn't sum to exactly 1.0)
+            # 3. Accumulation differences in FP8 GEMM operations
+            result = torch.allclose(output, baseline_output, atol=2.0, rtol=0.2)
+
+            return result
+
+        except Exception:
+            return False
+
     @register_metric()
     def flops(
         self, fn_name: str, example_inputs: Any, metrics: BenchmarkOperatorMetrics