Added support for stmatrix migration

TejaX-Alaghari · TejaX-Alaghari · commit 68b7d31d5b66 · 2025-05-15T20:51:27.000+08:00
diff --git a/clang/lib/DPCT/RulesAsm/AsmMigration.cpp b/clang/lib/DPCT/RulesAsm/AsmMigration.cpp
@@ -602,7 +602,8 @@ bool SYCLGenBase::emitAddressExpr(const InlineAsmAddressExpr *Dst) {
   // Address expression only support ld/st/red & atom instructions.
   if (!CurrInst || !CurrInst->is(asmtok::op_st, asmtok::op_ld, asmtok::op_atom,
                                  asmtok::op_prefetch, asmtok::op_red,
-                                 asmtok::op_cp, asmtok::op_ldmatrix)) {
+                                 asmtok::op_cp, asmtok::op_ldmatrix,
+                                 asmtok::op_stmatrix)) {
     return SYCLGenError();
   }
   std::string Type;
@@ -635,7 +636,8 @@ bool SYCLGenBase::emitAddressExpr(const InlineAsmAddressExpr *Dst) {
     if (CurrInst->is(asmtok::op_prefetch, asmtok::op_red) ||
         CanSuppressCast(Dst->getSymbol()))
       OS() << llvm::formatv("{0}", Reg);
-    else if (CurrInst->is(asmtok::op_ldmatrix))
+    else if (CurrInst->is(asmtok::op_ldmatrix) ||
+             CurrInst->is(asmtok::op_stmatrix))
       OS() << llvm::formatv("(uintptr_t){0}", Reg);
     else
       OS() << llvm::formatv("(({0} *)(uintptr_t){1})", Type, Reg);
@@ -1376,6 +1378,66 @@ class SYCLGen : public SYCLGenBase {
     return SYCLGenSuccess();
   }
 
+  /// Migrates `stmatrix` to a dpct::experimental::matrix::stmatrix() call.
+  /// Accepts only the .b16 form with one vector input operand whose element
+  /// count matches the .x1/.x2/.x4 qualifier; also requests a sub-group size
+  /// of 32. \returns SYCLGenError() when the instruction cannot be migrated,
+  /// SYCLGenSuccess() otherwise.
+  bool handle_stmatrix(const InlineAsmInstruction *Inst) override {
+    if (Inst->getNumInputOperands() != 1)
+      return SYCLGenError();
+
+    const auto *Type = dyn_cast<InlineAsmBuiltinType>(Inst->getType(0));
+    if (!Type || Type->getKind() != InlineAsmBuiltinType::b16)
+      return SYCLGenError();
+    const auto *VE = dyn_cast<InlineAsmVectorExpr>(Inst->getInputOperand(0));
+    if (!VE)
+      return SYCLGenError();
+
+    // The number of source registers must agree with the .num qualifier.
+    unsigned ExpectedElems = 0;
+    if (Inst->hasAttr(InstAttr::x1))
+      ExpectedElems = 1;
+    else if (Inst->hasAttr(InstAttr::x2))
+      ExpectedElems = 2;
+    else if (Inst->hasAttr(InstAttr::x4))
+      ExpectedElems = 4;
+    if (ExpectedElems != 0 && VE->getNumElements() != ExpectedElems)
+      return SYCLGenError();
+
+    llvm::SaveAndRestore<const InlineAsmInstruction *> Store(CurrInst);
+    CurrInst = Inst;
+    const auto *Dst =
+        dyn_cast_or_null<InlineAsmAddressExpr>(Inst->getOutputOperand());
+    // A missing or non-address destination must be reported as an error.
+    if (!Dst)
+      return SYCLGenError();
+
+    OS() << MapNames::getDpctNamespace() << "experimental::matrix::stmatrix(";
+    if (emitStmt(Dst))
+      return SYCLGenError();
+    OS() << ", ";
+    for (unsigned Idx = 0, E = VE->getNumElements(); Idx != E; ++Idx) {
+      // Source elements are stored, so a discard placeholder is invalid here.
+      if (isa<InlineAsmDiscardExpr>(VE->getElement(Idx)) ||
+          emitStmt(VE->getElement(Idx)))
+        return SYCLGenError();
+      OS() << ", ";
+    }
+    OS() << DpctGlobalInfo::getItem(GAS);
+    if (Inst->hasAttr(InstAttr::trans))
+      OS() << ", true";
+    OS() << ");";
+
+    // The runtime helper relies on a sub-group size of 32.
+    if (const auto *KernelDecl = getImmediateOuterFuncDecl(GAS))
+      if (auto FuncInfo = DeviceFunctionDecl::LinkRedecls(KernelDecl))
+        FuncInfo->addSubGroupSizeRequest(32, GAS->getBeginLoc(),
+                                         DpctGlobalInfo::getSubGroup(GAS));
+
+    return SYCLGenSuccess();
+  }
+
   bool handle_mma(const InlineAsmInstruction *Inst) override {
     if (Inst->getNumInputOperands() != 3)
       return SYCLGenError();
diff --git a/clang/lib/DPCT/RulesAsm/Parser/AsmNodes.h b/clang/lib/DPCT/RulesAsm/Parser/AsmNodes.h
@@ -340,6 +340,8 @@ class InlineAsmInstruction : public InlineAsmStmt {
   /// therest are input operands.
   SmallVector<InlineAsmExpr *, 4> InputOps;
 
+  SmallVector<InlineAsmExpr *, 4> OutputOps;
+
 public:
   InlineAsmInstruction(InlineAsmIdentifierInfo *Op,
                        SmallVector<AsmStateSpace, 4> AsmStateSpaces,
diff --git a/clang/lib/DPCT/RulesAsm/Parser/AsmParser.cpp b/clang/lib/DPCT/RulesAsm/Parser/AsmParser.cpp
@@ -736,10 +736,11 @@ InlineAsmExprResult InlineAsmParser::ActOnParenExpr(InlineAsmExpr *SubExpr) {
 InlineAsmExprResult
 InlineAsmParser::ActOnVectorExpr(ArrayRef<InlineAsmExpr *> Vec) {
 
-  // Vector size for ldmatrix are 1, 2, 4
+  // Valid vector sizes for ldmatrix/stmatrix are 1, 2, 4
   // size(x) = 2 * sizeof(v).
   InlineAsmVectorType::VecKind Kind;
-  if (Opcode->getTokenID() == asmtok::op_ldmatrix) {
+  if (Opcode->getTokenID() == asmtok::op_ldmatrix ||
+      Opcode->getTokenID() == asmtok::op_stmatrix) {
     switch (Vec.size()) {
     case 1:
       Kind = InlineAsmVectorType::x1;
diff --git a/clang/lib/DPCT/SrcAPI/APINames_ASM.inc b/clang/lib/DPCT/SrcAPI/APINames_ASM.inc
@@ -123,7 +123,7 @@ ENTRY("sqrt", "sqrt", true, NO_FLAG, P1, "Successful")
 ENTRY("st", "st", true, NO_FLAG, P1, "Partial")
 ENTRY("stackrestore", "stackrestore", false, NO_FLAG, P1, "Comment")
 ENTRY("stacksave", "stacksave", false, NO_FLAG, P1, "Comment")
-ENTRY("stmatrix", "stmatrix", false, NO_FLAG, P1, "Comment")
+ENTRY("stmatrix", "stmatrix", true, NO_FLAG, P1, "Successful")
 ENTRY("sub", "sub", true, NO_FLAG, P1, "Partial")
 ENTRY("subc", "subc", false, NO_FLAG, P1, "Comment")
 ENTRY("suld", "suld", false, NO_FLAG, P1, "Comment")
diff --git a/clang/runtime/dpct-rt/include/dpct/math.hpp b/clang/runtime/dpct-rt/include/dpct/math.hpp
@@ -425,7 +425,7 @@ max(T1 a, T2 b) {
   return sycl::fmax(static_cast<common_t>(a), static_cast<common_t>(b));
 }
 
-// pow functions overload.
+// pow functions overload.
 inline float pow(const float a, const int b) { return sycl::pown(a, b); }
 inline double pow(const double a, const int b) { return sycl::pown(a, b); }
 inline float pow(const float a, const float b) { return sycl::pow(a, b); }
@@ -2218,6 +2218,101 @@ void ldmatrix(uintptr_t addr, T *m1, T *m2, T *m3, T *m4, bool trans = false) {
   ldmatrix(addr, m4, trans, 3);
 }
 
+/// Stores 1 8x8 b16 matrix from per-work-item values to shared memory; each
+/// work-item contributes one 32-bit value (two b16 elements).
+/// Requires the sub-group size of kernel calling this function to be 32
+/// \tparam [in] T The 32-bit type holding two packed b16 matrix elements
+/// \tparam [in] ItemT The type of the work-item index space object
+/// \param [in] addr The address of the matrix in shared memory
+/// \param [in] m The value holding this work-item's data of the matrix
+/// \param [in] item The sycl::nd_item index space class
+/// \param [in] trans Indicates whether the matrix is stored transposed
+/// \param [in] mat The matrix index to be stored
+template <typename T, typename ItemT>
+void stmatrix(uintptr_t addr, T m, const ItemT &item, bool trans = false,
+              unsigned mat = 0) {
+  static_assert(sizeof(T) == 4,
+                "stmatrix expects one 32-bit value (two b16) per work-item");
+
+  int lane = item.get_sub_group().get_local_linear_id();
+
+  if (!trans) {
+    // Lanes 4r..4r+3 write through the address held by lane (mat * 8 + r)
+    int src_lane = lane / 4;
+
+    // Fetch the address from the source lane
+    auto recv_addr_uintp = dpct::select_from_sub_group(
+        item.get_sub_group(), addr, mat * 8 + src_lane);
+
+    // Cast the received address from uintptr_t to the type of 'm'
+    auto recv_addr = reinterpret_cast<T *>(recv_addr_uintp);
+
+    // Non-transposed store: lane % 4 selects the 32-bit slot at that address
+    recv_addr[lane % 4] = m;
+  } else {
+    // The two halves of 'm' go through the addresses of two adjacent lanes
+    int src_lane = (lane % 4) * 2;
+
+    // Fetch the addresses from the two source lanes
+    auto recv_addr_uintp_1 = dpct::select_from_sub_group(
+        item.get_sub_group(), addr, mat * 8 + src_lane);
+    auto recv_addr_uintp_2 = dpct::select_from_sub_group(
+        item.get_sub_group(), addr, mat * 8 + src_lane + 1);
+
+    // Cast the received address from uintptr_t to 'half *'
+    auto recv_addr_1 = reinterpret_cast<sycl::half *>(recv_addr_uintp_1);
+    auto recv_addr_2 = reinterpret_cast<sycl::half *>(recv_addr_uintp_2);
+
+    // Split the 32-bit value of 'm' into two 16-bits
+    sycl::half *val = reinterpret_cast<sycl::half *>(&m);
+
+    // Transposed store
+    int index = lane / 4;
+    recv_addr_1[index] = val[0];
+    recv_addr_2[index] = val[1];
+  }
+}
+
+/// Stores 2 8x8 b16 matrices from per-work-item values to shared memory, one
+/// 32-bit value per matrix. Requires the kernel sub-group size to be 32
+/// \tparam [in] T The type of the per-work-item matrix values
+/// \tparam [in] ItemT The type of the work-item index space object
+/// \param [in] addr The address of the matrix in shared memory
+/// \param [in] m1 The value containing data of 1st matrix
+/// \param [in] m2 The value containing data of 2nd matrix
+/// \param [in] item The sycl::nd_item index space class
+/// \param [in] trans Indicates whether the matrices are stored transposed
+template <typename T, typename ItemT>
+void stmatrix(uintptr_t addr, T m1, T m2, const ItemT &item,
+              bool trans = false) {
+  const T frags[] = {m1, m2};
+  for (unsigned mat = 0; mat != 2; ++mat)
+    stmatrix(addr, frags[mat], item, trans, mat);
+}
+
+/// Stores 4 8x8 b16 matrices from per-work-item values to shared memory, one
+/// 32-bit value per matrix. Requires the kernel sub-group size to be 32
+/// \tparam [in] T The type of the per-work-item matrix values
+/// \tparam [in] ItemT The type of the work-item index space object
+/// \param [in] addr The address of the matrix in shared memory
+/// \param [in] m1 The value containing data of 1st matrix
+/// \param [in] m2 The value containing data of 2nd matrix
+/// \param [in] m3 The value containing data of 3rd matrix
+/// \param [in] m4 The value containing data of 4th matrix
+/// \param [in] item The sycl::nd_item index space class
+/// \param [in] trans Indicates whether the matrices are stored transposed
+template <typename T, typename ItemT>
+void stmatrix(uintptr_t addr, T m1, T m2, T m3, T m4, const ItemT &item,
+              bool trans = false) {
+  // Index the values so that the array position is the matrix index
+  const T frags[] = {m1, m2, m3, m4};
+  constexpr unsigned num_mats = sizeof(frags) / sizeof(frags[0]);
+
+  // Store each matrix through the single-matrix overload
+  for (unsigned mat = 0; mat != num_mats; ++mat)
+    stmatrix(addr, frags[mat], item, trans, mat);
+}
+
 /// A helper struct that defines the pack type for the input matrix fragments
 /// of mma() function based on the type of input matrix fragments.
 /// The MMAType struct is specialized for different types of input matrices.
diff --git a/clang/test/dpct/asm/stmatrix.cu b/clang/test/dpct/asm/stmatrix.cu
@@ -0,0 +1,119 @@
+// UNSUPPORTED: cuda-8.0, cuda-9.0, cuda-9.1, cuda-9.2, cuda-10.0, cuda-10.1, cuda-10.2
+// UNSUPPORTED: v8.0, v9.0, v9.1, v9.2, v10.0, v10.1, v10.2
+// RUN: dpct --format-range=none -out-root %T/stmatrix %s --cuda-include-path="%cuda-path/include" -- -std=c++14 -x cuda --cuda-host-only
+// RUN: FileCheck %s --match-full-lines --input-file %T/stmatrix/stmatrix.dp.cpp
+// RUN: %if build_lit %{icpx -c -DNO_BUILD_TEST -fsycl %T/stmatrix/stmatrix.dp.cpp -o %T/stmatrix/stmatrix.dp.o %}
+
+// clang-format off
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+/*
+stmatrix.sync.aligned.shape.num{.trans}{.ss}.type [p], r;
+
+Below are the currently supported configurations:
+.shape = {.m8n8};
+.num   = {.x1, .x2, .x4};
+.ss    = {.shared{::cta}};
+.type  = {.b16};
+*/
+
+__device__ void store_matrix_x1(void *sh_r_addr, int *r) {
+  // CHECK: auto addr = sh_r_addr;
+  uint32_t addr = static_cast<uint32_t>(__cvta_generic_to_shared(sh_r_addr));
+  // .x1 form without .trans: one source register is passed to stmatrix().
+  // CHECK: dpct::experimental::matrix::stmatrix((uintptr_t)addr, r[0], item_ct1);
+  asm volatile("stmatrix.sync.aligned.m8n8.x1.shared.b16 [%0], {%1};\n"
+                : 
+                : "r"(addr), "r"(r[0]));
+}
+
+__device__ void store_matrix_x2(void *sh_r_addr, int *r) {
+  // CHECK: auto addr = sh_r_addr;
+  uint32_t addr = static_cast<uint32_t>(__cvta_generic_to_shared(sh_r_addr));
+  // .x2 form without .trans: two source registers are passed to stmatrix().
+  // CHECK: dpct::experimental::matrix::stmatrix((uintptr_t)addr, r[0], r[1], item_ct1);
+  asm volatile("stmatrix.sync.aligned.m8n8.x2.shared.b16 [%0], {%1, %2};\n"
+                : 
+                : "r"(addr), "r"(r[0]), "r"(r[1]));
+}
+
+__device__ void store_matrix_x4(void *sh_r_addr, int *r) {
+  // CHECK: auto addr = sh_r_addr;
+  uint32_t addr = static_cast<uint32_t>(__cvta_generic_to_shared(sh_r_addr));
+  // .x4 form without .trans: four source registers are passed to stmatrix().
+  // CHECK: dpct::experimental::matrix::stmatrix((uintptr_t)addr, r[0], r[1], r[2], r[3], item_ct1);
+  asm volatile("stmatrix.sync.aligned.m8n8.x4.shared.b16 [%0], {%1, %2, %3, %4};\n"
+                : 
+                : "r"(addr), "r"(r[0]), "r"(r[1]), "r"(r[2]), "r"(r[3]));
+}
+
+__device__ void store_matrix_x1_trans(void *sh_r_addr, int *r) {
+  // CHECK: auto addr = sh_r_addr;
+  uint32_t addr = static_cast<uint32_t>(__cvta_generic_to_shared(sh_r_addr));
+  // .x1 form with .trans: the expected call gains a trailing 'true' argument.
+  // CHECK: dpct::experimental::matrix::stmatrix((uintptr_t)addr, r[0], item_ct1, true);
+  asm volatile("stmatrix.sync.aligned.m8n8.x1.trans.shared.b16 [%0], {%1};\n"
+                : 
+                : "r"(addr), "r"(r[0]));
+}
+
+__device__ void store_matrix_x2_trans(void *sh_r_addr, int *r) {
+  // CHECK: auto addr = sh_r_addr;
+  uint32_t addr = static_cast<uint32_t>(__cvta_generic_to_shared(sh_r_addr));
+  // .x2 form with .trans: the expected call gains a trailing 'true' argument.
+  // CHECK: dpct::experimental::matrix::stmatrix((uintptr_t)addr, r[0], r[1], item_ct1, true);
+  asm volatile("stmatrix.sync.aligned.m8n8.x2.trans.shared.b16 [%0], {%1, %2};\n"
+                : 
+                : "r"(addr), "r"(r[0]), "r"(r[1]));
+}
+
+__device__ void store_matrix_x4_trans(void *sh_r_addr, int *r) {
+  // CHECK: auto addr = sh_r_addr;
+  uint32_t addr = static_cast<uint32_t>(__cvta_generic_to_shared(sh_r_addr));
+  // .x4 form with .trans: the expected call gains a trailing 'true' argument.
+  // CHECK: dpct::experimental::matrix::stmatrix((uintptr_t)addr, r[0], r[1], r[2], r[3], item_ct1, true);
+  asm volatile("stmatrix.sync.aligned.m8n8.x4.trans.shared.b16 [%0], {%1, %2, %3, %4};\n"
+                : 
+                : "r"(addr), "r"(r[0]), "r"(r[1]), "r"(r[2]), "r"(r[3]));
+}
+
+__global__ void store_kernel() {
+  __shared__ half s_data[1024];
+  int r[4];
+  // Call each helper once so every stmatrix variant is used from the kernel.
+  store_matrix_x1(s_data, r);
+  store_matrix_x2(s_data, r);
+  store_matrix_x4(s_data, r);
+  store_matrix_x1_trans(s_data, r);
+  store_matrix_x2_trans(s_data, r);
+  store_matrix_x4_trans(s_data, r);
+}
+
+int main () {
+  // CHECK: [=](sycl::nd_item<3> item_ct1) {{\[\[}}sycl::reqd_sub_group_size(32){{\]\]}} {
+  store_kernel<<<1, 32>>>();
+  // The CHECK above expects the migrated kernel to require sub-group size 32.
+  return 0;
+}
+
+#ifndef NO_BUILD_TEST
+__device__ void test_xn(uint32_t addr, int *r) {
+  // CHECK: DPCT1053:{{.*}}: Migration of device assembly code is not supported.
+  asm volatile("stmatrix.sync.aligned.m8n8.x1.shared.b16 [%0], {%1, %2};\n"
+                : 
+                : "r"(addr), "r"(r[0]), "r"(r[1]));
+  // Negative case: .x2 is given only one source register.
+  // CHECK: DPCT1053:{{.*}}: Migration of device assembly code is not supported.
+  asm volatile("stmatrix.sync.aligned.m8n8.x2.shared.b16 [%0], {%0};\n"
+                :
+                : "r"(addr));
+  // Negative case: the vector and address operands are in swapped positions.
+  // CHECK: DPCT1053:{{.*}}: Migration of device assembly code is not supported.
+  asm volatile("stmatrix.sync.aligned.m8n8.x4.shared.b16 {%0, %1, %2}, [%3];\n"
+                : 
+                : "r"(addr), "r"(r[0]), "r"(r[1]), "r"(r[2]));
+}
+#endif // NO_BUILD_TEST
+
+// clang-format on
diff --git a/docs/dev_guide/api-mapping-status/ASM_API_migration_status.csv b/docs/dev_guide/api-mapping-status/ASM_API_migration_status.csv
@@ -41,7 +41,7 @@ griddepcontrol,NO,
 isspacep,NO,
 istypep,NO,
 ld,YES, Partial
-ldmatrix,YES,
+ldmatrix,YES,Partial
 ldu,NO,
 lg2,YES,
 lop3,YES,
@@ -89,7 +89,7 @@ sqrt,YES,
 st,YES, Partial
 stackrestore,NO,
 stacksave,NO,
-stmatrix,NO,
+stmatrix,YES,Partial
 sub,YES, Partial
 subc,NO,
 suld,NO,