Optimize half-precision operations in sparse Marlin MMA

petrex · petrex · commit cf7903976596 · 2025-03-10T22:46:02.000-07:00
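In the non-NVIDIA branch of `dequant_4bit` (sparse Marlin `mma.h`), replace the scalar half-precision intrinsics `__hsub` and `__hfma` with their packed counterparts `__hsub2` and `__hfma2`. The operands there are accessed through `__half2` pointers, so the packed intrinsics match the operand type and mirror the NVIDIA branch, which already uses `__hsub2`. The change is intended to improve performance and precision of the half-precision dequantization feeding the sparse matrix multiply-accumulate (MMA) computations.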
diff --git a/torchao/csrc/cuda/sparse_marlin/mma.h b/torchao/csrc/cuda/sparse_marlin/mma.h
@@ -206,8 +206,8 @@ __device__ inline FragB dequant_4bit(int q) {
   const __half2* MUL_ptr = reinterpret_cast<const __half2*>(&MUL);
   const __half2* ADD_ptr = reinterpret_cast<const __half2*>(&ADD);
 
-  frag_b[0] = __hsub(*lo_ptr, *SUB_ptr);
-  frag_b[1] = __hfma(*hi_ptr, *MUL_ptr, *ADD_ptr);
+  frag_b[0] = __hsub2(*lo_ptr, *SUB_ptr);
+  frag_b[1] = __hfma2(*hi_ptr, *MUL_ptr, *ADD_ptr);
   #else
   // NVIDIA implementation
   frag_b[0] = __hsub2(*reinterpret_cast<half2*>(&lo),