
Commit 78b71eb

yf225 authored and facebook-github-bot committed
Fix accuracy check for flash_attention kernels (#280)
Summary:

Stacked PRs:
* #276
* __->__ #280

### Fix accuracy check for flash_attention kernels

Pull Request resolved: #280

Reviewed By: oulgen

Differential Revision: D78222794

Pulled By: yf225

fbshipit-source-id: 4c104b1defc84a5ffecbe274ea00c590f9463362
1 parent 5a03f18 commit 78b71eb

File tree

2 files changed: +35, -5 lines changed


tritonbench/kernels/triton_fused_attention.py

Lines changed: 8 additions & 2 deletions
@@ -1432,7 +1432,7 @@ def _attn_fwd_base_opt(
     tl.assume(H >= 0)
 
     tl.static_assert(BLOCK_N <= HEAD_DIM)
-    pid = tl.program_id(0)
+    start_m = tl.program_id(0)
     off_hz = tl.program_id(1)
 
     # Both base and opt use the same compute function
@@ -1464,7 +1464,7 @@ def _attn_fwd_base_opt(
         stride_om,
         stride_on,
         off_hz,
-        pid,
+        start_m,
         Z,
         H,
         N_CTX,
@@ -1753,6 +1753,12 @@ def _attn_fwd_tma_ws_persistent(  # Q, V, desc_k, desc_v, sm_scale, M, Out, #
 
     tile_idx = prog_id
 
+    # Initialize descriptors as None
+    desc_q = None
+    desc_k = None
+    desc_v = None
+    desc_o = None
+
     if ENABLE_TMA:
         desc_k = tl.make_tensor_descriptor(
             K,
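
The descriptor change above follows the usual pattern of binding names before a conditional branch so that later code can reference them unconditionally. A minimal plain-Python sketch of that pattern (not Triton, and not part of this commit; `make_descriptor` is a hypothetical stand-in for `tl.make_tensor_descriptor`):

```python
def make_descriptor(tensor):
    # Hypothetical placeholder for a TMA tensor descriptor.
    return {"base": tensor}


def attention_entry(K, ENABLE_TMA: bool):
    # Initialize descriptors as None so they are always bound,
    # even when the TMA branch is skipped.
    desc_q = desc_k = desc_v = desc_o = None

    if ENABLE_TMA:
        desc_k = make_descriptor(K)

    # Later code can use desc_* regardless of ENABLE_TMA.
    return desc_q, desc_k, desc_v, desc_o
```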

tritonbench/operators/flash_attention/operator.py

Lines changed: 27 additions & 3 deletions
@@ -201,17 +201,19 @@ def __init__(
         self.pt2_sdpa = args.pt2_sdpa
         self.additional_inputs = args.additional_inputs
         self.ragged_shapes = args.ragged_shapes
-        self.sm_scale = 1.3
+        # Use standard scale factor: 1/sqrt(head_dim)
+        self.sm_scale = 1.0 / (self.D_HEAD**0.5)
 
-    @register_benchmark()
+    @register_benchmark(baseline=True)
     def aten(
         self,
         q: torch.Tensor,
         k: torch.Tensor,
         v: torch.Tensor,
     ) -> Callable:
         def _inner():
-            M = torch.tril(torch.ones((self.N_CTX, self.N_CTX), device=self.device))
+            seq_len = q.shape[2]
+            M = torch.tril(torch.ones((seq_len, seq_len), device=self.device))
             p = torch.matmul(q, k.transpose(2, 3)) * self.sm_scale
             if self.causal:
                 p[:, :, M == 0] = float("-inf")
@@ -524,6 +526,28 @@ def causal_mask(b, h, q_idx, kv_idx):
 
         return lambda: flex_attention(q, k, v, block_mask=block_mask)
 
+    def accuracy(self, fn, baseline_fn):
+        """Override accuracy to use relaxed tolerance for bfloat16."""
+        output = fn()
+        baseline_output = baseline_fn()
+
+        # Check for NaN values
+        if torch.isnan(output).any():
+            return False
+
+        try:
+            # Use relaxed tolerance for bfloat16/float16
+            # Using atol=2e-2 and rtol=1e-2 to provide some margin
+            if output.dtype in [torch.bfloat16, torch.float16]:
+                torch.testing.assert_close(
+                    output, baseline_output, rtol=1e-2, atol=2e-2
+                )
+            else:
+                torch.testing.assert_close(output, baseline_output)
+            return True
+        except Exception:
+            return False
+
     @register_metric(x_only=True)
     def flops(
         self, fn_name: str, example_inputs: Any, metrics: BenchmarkOperatorMetrics
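
For context, a minimal standalone sketch (not tritonbench code, not part of this commit) of what the corrected check exercises: a reference attention using the standard `1/sqrt(head_dim)` scale and a causal mask sized from `q.shape[2]`, compared against `torch.nn.functional.scaled_dot_product_attention` under the relaxed bfloat16 tolerances used above. The `reference_attention` helper and the shapes are illustrative only:

```python
import math

import torch
import torch.nn.functional as F


def reference_attention(q, k, v, causal=True):
    # Standard scale factor: 1/sqrt(head_dim), matching the sm_scale fix above.
    head_dim = q.shape[-1]
    seq_len = q.shape[2]
    sm_scale = 1.0 / math.sqrt(head_dim)
    p = torch.matmul(q, k.transpose(2, 3)) * sm_scale
    if causal:
        # Causal mask sized by the actual sequence length (q.shape[2]).
        mask = torch.tril(torch.ones((seq_len, seq_len), device=q.device))
        p = p.masked_fill(mask == 0, float("-inf"))
    p = torch.softmax(p.float(), dim=-1).to(q.dtype)
    return torch.matmul(p, v)


if __name__ == "__main__":
    torch.manual_seed(0)
    q, k, v = (torch.randn(1, 2, 128, 64, dtype=torch.bfloat16) for _ in range(3))
    out = reference_attention(q, k, v)
    expected = F.scaled_dot_product_attention(q, k, v, is_causal=True)
    # Relaxed tolerances for bfloat16, mirroring the accuracy override above.
    torch.testing.assert_close(out, expected, rtol=1e-2, atol=2e-2)
    print("accuracy check passed")
```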
