Skip to content

Commit 72c2642

Browse files
committed
Add ROCm-specific inline assembly for sparse Marlin MMA operations
Add conditional compilation for ROCm platforms in the sparse Marlin matrix multiply-accumulate (MMA) function. This provides a proper inline assembly implementation for both CUDA and ROCm environments, using platform-specific register constraints and instruction handling.
1 parent 94d1fb4 commit 72c2642

File tree

1 file changed

+34
-0
lines changed
  • torchao/csrc/cuda/sparse_marlin

1 file changed

+34
-0
lines changed

torchao/csrc/cuda/sparse_marlin/mma.h

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,22 @@ __device__ inline void mma_sp(const FragB& a_frag0, const FragB& a_frag1,
5353

5454
float* c = reinterpret_cast<float*>(&frag_c);
5555
if (psel == 0) {
56+
#ifdef USE_ROCM
57+
asm volatile(MMA_SP_INST
58+
"{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
59+
"{%12,%13,%14,%15}, %16, 0x0;\n"
60+
: "=v"(c[0]), "=v"(c[1]), "=v"(c[2]), "=v"(c[3])
61+
: "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[0]),
62+
"r"(b[2]), "r"(b[4]), "r"(b[6]), "v"(c[0]), "v"(c[1]),
63+
"v"(c[2]), "v"(c[3]), "r"(e[0]));
64+
asm volatile(MMA_SP_INST
65+
"{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
66+
"{%12,%13,%14,%15}, %16, 0x0;\n"
67+
: "=v"(c[4]), "=v"(c[5]), "=v"(c[6]), "=v"(c[7])
68+
: "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[1]),
69+
"r"(b[3]), "r"(b[5]), "r"(b[7]), "v"(c[4]), "v"(c[5]),
70+
"v"(c[6]), "v"(c[7]), "r"(e[0]));
71+
#else
5672
asm volatile(MMA_SP_INST
5773
"{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
5874
"{%12,%13,%14,%15}, %16, 0x0;\n"
@@ -67,7 +83,24 @@ __device__ inline void mma_sp(const FragB& a_frag0, const FragB& a_frag1,
6783
: "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[1]),
6884
"r"(b[3]), "r"(b[5]), "r"(b[7]), "f"(c[4]), "f"(c[5]),
6985
"f"(c[6]), "f"(c[7]), "r"(e[0]));
86+
#endif
7087
} else {
88+
#ifdef USE_ROCM
89+
asm volatile(MMA_SP_INST
90+
"{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
91+
"{%12,%13,%14,%15}, %16, 0x1;\n"
92+
: "=v"(c[0]), "=v"(c[1]), "=v"(c[2]), "=v"(c[3])
93+
: "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[0]),
94+
"r"(b[2]), "r"(b[4]), "r"(b[6]), "v"(c[0]), "v"(c[1]),
95+
"v"(c[2]), "v"(c[3]), "r"(e[0]));
96+
asm volatile(MMA_SP_INST
97+
"{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
98+
"{%12,%13,%14,%15}, %16, 0x1;\n"
99+
: "=v"(c[4]), "=v"(c[5]), "=v"(c[6]), "=v"(c[7])
100+
: "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[1]),
101+
"r"(b[3]), "r"(b[5]), "r"(b[7]), "v"(c[4]), "v"(c[5]),
102+
"v"(c[6]), "v"(c[7]), "r"(e[0]));
103+
#else
71104
asm volatile(MMA_SP_INST
72105
"{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
73106
"{%12,%13,%14,%15}, %16, 0x1;\n"
@@ -82,6 +115,7 @@ __device__ inline void mma_sp(const FragB& a_frag0, const FragB& a_frag1,
82115
: "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[1]),
83116
"r"(b[3]), "r"(b[5]), "r"(b[7]), "f"(c[4]), "f"(c[5]),
84117
"f"(c[6]), "f"(c[7]), "r"(e[0]));
118+
#endif
85119
}
86120
}
87121

0 commit comments

Comments
 (0)