Fix potential out-of-bounds access in int8_mm.py (#1751)

mark14wu · web-flow · commit 98c4e2e06d7f · 2025-02-25T11:46:19.000-08:00
* fix potential out-of-bounds access

* remove unused EVEN_K

* refactor fix with triton.heuristics

* restore EVEN_K as an input

* fix typo

* fix another typo

* ruff reformatted
diff --git a/torchao/prototype/quantized_training/int8_mm.py b/torchao/prototype/quantized_training/int8_mm.py
@@ -54,6 +54,7 @@
 
 
 @triton.autotune(configs=configs, key=["M", "N", "K", "stride_ak", "stride_bk"])
+@triton.heuristics({"EVEN_K": lambda args: args["K"] % args["BLOCK_K"] == 0})
 @triton.jit
 def _scaled_int8_mm_kernel(
     A_ptr,
@@ -176,7 +177,6 @@ def scaled_int8_mm_cuda(A: Tensor, B: Tensor, row_scale: Tensor, col_scale: Tens
         *A.stride(),
         *B.stride(),
         *C.stride(),
-        EVEN_K=K % 2 == 0,
         COL_SCALE_SCALAR=col_scale.numel() == 1,
     )
     return C