@@ -1644,7 +1644,7 @@ def keep2(conf):
     for s in [3]  # , 4, 7]
     for w in [4]  # , 8]
 ]
-configsBwd2 = [
+configsBwdWs = [
     (
         triton.Config(
             {
@@ -1655,8 +1655,8 @@ def keep2(conf):
             },
             num_stages=s,
             num_warps=w,
-            num_buffers_warp_spec=0,
-            num_consumer_groups=0,
+            num_buffers_warp_spec=2,
+            num_consumer_groups=2,
         )
         if has_warp_spec
         else triton.Config(
@@ -1670,8 +1670,8 @@ def keep2(conf):
             num_warps=w,
         )
     )
-    for BM in [32]  # 32, 64]  # BLOCK_N1 % BLOCK_M1 == 0
-    for BN in [64]  # 64, 128]
+    for BM in [64]  # 32, 64]  # BLOCK_N1 % BLOCK_M1 == 0
+    for BN in [128]  # [64]  # 64, 128]
     for s in [3]  # , 4, 7]
     for w in [4]  # , 8]
 ]
@@ -1922,9 +1922,9 @@ def _attn_bwd(
     )


-@triton.autotune(list(filter(keep2, configsBwd2)), key=["N_CTX"])
+@triton.autotune(list(filter(keep2, configsBwdWs)), key=["N_CTX"])
 @triton.jit
-def _attn_bwd2(
+def _attn_bwd_ws(
     Q,
     K,
     V,
@@ -1978,7 +1978,7 @@ def _attn_bwd2(

 class _attention_opt(torch.autograd.Function):
     @staticmethod
-    def forward(ctx, q, k, v, causal, sm_scale, baseVariant):  # , bwdVariant):
+    def forward(ctx, q, k, v, causal, sm_scale, baseVariant, bwdVariant):
         # shape constraints
         HEAD_DIM_Q, HEAD_DIM_K = q.shape[-1], k.shape[-1]
         # when v is in float8_e5m2 it is transposed.
@@ -2366,7 +2366,7 @@ def grid_tma_persistent(META):
         ctx.sm_scale = sm_scale
         ctx.HEAD_DIM = HEAD_DIM_K
         ctx.causal = causal
-        # ctx.bwdVariant = bwdVariant
+        ctx.bwdVariant = bwdVariant
         # If we want to use different variants for bwd, save bwd mode here.
         return o

@@ -2385,8 +2385,6 @@ def backward(ctx, do):

         #NUM_WARPS, NUM_STAGES = 4, 5
         #BLOCK_M1, BLOCK_N1, BLOCK_M2, BLOCK_N2 = 32, 128, 128, 32
-        NUM_WARPS, NUM_STAGES = 4, 3
-        BLOCK_M1, BLOCK_N1, BLOCK_M2, BLOCK_N2 = 64, 128, 128, 64

         BLK_SLICE_FACTOR = 2
         RCP_LN2 = 1.4426950408889634  # = 1.0 / ln(2)
@@ -2409,7 +2407,7 @@ def backward(ctx, do):
         grid = lambda args: (N_CTX // args["BLOCK_N1"], 1, BATCH * N_HEAD)
         #grid = (N_CTX // BLOCK_N1, 1, BATCH * N_HEAD)
         print(q.stride(0), q.stride(1), q.stride(2), q.stride(3))
-        if True:  # ctx.bwdVariant == "base":
+        if ctx.bwdVariant == "base":
             _attn_bwd[grid](
                 q,
                 arg_k,
@@ -2436,8 +2434,8 @@ def backward(ctx, do):
                 #num_warps=NUM_WARPS, #
                 #num_stages=NUM_STAGES, #
             )
-        else:  # if ctx.bwdVariant == "base2":
-            _attn_bwd2[grid](
+        elif ctx.bwdVariant == "ws":
+            _attn_bwd_ws[grid](
                 q,
                 arg_k,
                 v,
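
A minimal usage sketch (not part of this commit) of how the backward-variant selection wired up above could be exercised. Only the forward argument order (q, k, v, causal, sm_scale, baseVariant, bwdVariant) and the "base"/"ws" dispatch in backward come from the diff; the tensor shapes, sm_scale value, the baseVariant value, and the direct call through _attention_opt.apply are illustrative assumptions:

    import torch

    # Illustrative (BATCH, N_HEAD, N_CTX, HEAD_DIM) tensors; shapes are assumptions.
    q = torch.randn(4, 16, 1024, 64, device="cuda", dtype=torch.float16, requires_grad=True)
    k = torch.randn_like(q, requires_grad=True)
    v = torch.randn_like(q, requires_grad=True)

    # forward() now takes an explicit bwdVariant; backward() dispatches on ctx.bwdVariant:
    # "base" launches _attn_bwd, "ws" launches the warp-specialized _attn_bwd_ws.
    o = _attention_opt.apply(q, k, v, True, 0.5, "base", "ws")
    o.sum().backward()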