
Commit 509724d

Randy Shuai authored and facebook-github-bot committed
Enrich auto-tune shapes for OC OBA model (#4368)
Summary:
Pull Request resolved: #4368
X-link: facebookresearch/FBGEMM#1436

Adds two shapes for the OC OBA model to improve the performance of the Triton fp8 non-persistent kernel, as suggested by the [log](https://www.internalfb.com/intern/paste/P1843910441/). With the added shapes, the Triton kernel is nearly on par with the PyTorch rowwise kernel:

| fp8 kernel | Flops | Time per iter | QPS |
|---|---|---|---|
| pytorch rowwise | 304.07 | 35.23 ms | 87205.84 |
| triton (without added shapes) | 292.83 | 36.58 ms | 83982.15 |
| triton (with added shapes) | 302.63 | 35.39 ms | 86793.45 |

Reviewed By: njriasan, karthik-man

Differential Revision: D76631650

fbshipit-source-id: 4a1324302f5ca635d801242d0cca205a33f41c94
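A quick sanity check on the table (my own arithmetic, not part of the commit): QPS should scale inversely with time per iteration, and the two columns agree to within a fraction of a percent:

```python
# Sanity check on the benchmark table: QPS scales inversely with
# time per iteration, so both columns should imply the same speedup.
times_ms = {"pytorch rowwise": 35.23, "triton (old)": 36.58, "triton (new)": 35.39}
qps = {"pytorch rowwise": 87205.84, "triton (old)": 83982.15, "triton (new)": 86793.45}

speedup_time = times_ms["triton (old)"] / times_ms["triton (new)"]  # ~1.034
speedup_qps = qps["triton (new)"] / qps["triton (old)"]             # ~1.033
print(f"time-based speedup: {speedup_time:.3f}, QPS-based speedup: {speedup_qps:.3f}")
# The added shapes recover ~3.3%, closing most of the gap to pytorch
# rowwise (35.39 ms vs 35.23 ms per iteration).
```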
1 parent: d04b7fd

1 file changed: +28 −0 lines

fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py

```diff
@@ -3418,6 +3418,34 @@ def get_full_non_persistent_tuning_space():
             num_warps=8,
             num_stages=2,
         ),
+        triton.Config(
+            {
+                "BLOCK_M": 128,
+                "BLOCK_N": 128,
+                "BLOCK_K": 64,
+                "GROUP_M": 4,
+                "SPLIT_K": 1,
+                "waves_per_eu": 2,
+                "matrix_instr_nonkdim": 16,
+                "kpack": 2,
+            },
+            num_warps=4,
+            num_stages=2,
+        ),
+        triton.Config(
+            {
+                "BLOCK_M": 128,
+                "BLOCK_N": 64,
+                "BLOCK_K": 64,
+                "GROUP_M": 4,
+                "SPLIT_K": 1,
+                "waves_per_eu": 0,
+                "matrix_instr_nonkdim": 16,
+                "kpack": 2,
+            },
+            num_warps=4,
+            num_stages=2,
+        ),
     ]

     # Set this to enable full autotuning for proper benchmarking.
```
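For context, below is a minimal, hypothetical sketch (not FBGEMM's actual kernel) of how `triton.Config` entries like the two added above are consumed. `@triton.autotune` benchmarks each candidate config the first time a new autotune key (here the GEMM shape) is seen and caches the winner; the kernel body is a placeholder, and the ROCm-specific hints (`waves_per_eu`, `matrix_instr_nonkdim`, `kpack`) are omitted for portability.

```python
# Hypothetical sketch of Triton autotuning over a config list like the
# one extended in this commit. The kernel body is a placeholder, not
# FBGEMM's fp8 GEMM.
import triton
import triton.language as tl

_SKETCH_CONFIGS = [
    triton.Config(
        {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 64, "GROUP_M": 4, "SPLIT_K": 1},
        num_warps=4,
        num_stages=2,
    ),
    triton.Config(
        {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 4, "SPLIT_K": 1},
        num_warps=4,
        num_stages=2,
    ),
]

# The autotuner times every config for each new (M, N, K) it sees and
# caches the fastest; the chosen config's kwargs are bound to the
# kernel's constexpr meta-parameters below.
@triton.autotune(configs=_SKETCH_CONFIGS, key=["M", "N", "K"])
@triton.jit
def _sketch_kernel(
    a_ptr, b_ptr, c_ptr, M, N, K,
    BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
    GROUP_M: tl.constexpr, SPLIT_K: tl.constexpr,
):
    # Placeholder body: a real kernel would tile the GEMM using the
    # BLOCK_* meta-parameters selected by the autotuner.
    pass
```

Enriching the config list only widens this search space: problem shapes that previously settled on a suboptimal tiling can now pick the new 128×128×64 or 128×64×64 blocks.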
