Skip to content

Commit 0e3bc4e

Browse files
authored
[SYCLomatic][PTX] Support migration of PTX cvta.to.shared.u64 (#2782)
Signed-off-by: chenwei.sun <chenwei.sun@intel.com>
1 parent 754ae47 commit 0e3bc4e

File tree

3 files changed

+120
-0
lines changed

3 files changed

+120
-0
lines changed

clang/lib/DPCT/RulesAsm/AsmMigration.cpp

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2607,6 +2607,34 @@ class SYCLGen : public SYCLGenBase {
26072607
return SYCLGenSuccess();
26082608
}
26092609

2610+
bool handle_cvta(const InlineAsmInstruction *Inst) override {
2611+
if (Inst->getNumInputOperands() != 1)
2612+
return SYCLGenError();
2613+
llvm::SaveAndRestore<const InlineAsmInstruction *> Store(CurrInst);
2614+
CurrInst = Inst;
2615+
2616+
std::string Op;
2617+
if (tryEmitStmt(Op, Inst->getInputOperand(0)))
2618+
return SYCLGenError();
2619+
2620+
const auto *Dst = Inst->getOutputOperand();
2621+
if (!Dst)
2622+
return SYCLGenError();
2623+
2624+
std::string Type;
2625+
if (tryEmitType(Type, Inst->getType(0)))
2626+
return SYCLGenError();
2627+
2628+
if (emitStmt(Dst))
2629+
return SYCLGenError();
2630+
OS() << " = ";
2631+
2632+
std::string FormatTemp = "({0})({1})";
2633+
OS() << llvm::formatv(FormatTemp.c_str(), Type, Op);
2634+
endstmt();
2635+
return SYCLGenSuccess();
2636+
}
2637+
26102638
// Handle fma instruction.
26112639
// .sat/.ftz/.oob/.relu attributes was ignored.
26122640
bool handle_fma(const InlineAsmInstruction *Inst) override {

clang/lib/DPCT/RulesAsm/Parser/AsmTokenKinds.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -419,6 +419,7 @@ MODIFIER(ecl, ".ecl")
419419
MODIFIER(ecr, ".ecr")
420420
MODIFIER(rc16, ".rc16")
421421
MODIFIER(cs, ".cs")
422+
MODIFIER(to, ".to")
422423

423424
#undef LINKAGE
424425
#undef TARGET

clang/test/dpct/asm/cvta.cu

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
// UNSUPPORTED: cuda-8.0, cuda-9.0, cuda-9.1, cuda-9.2, cuda-10.0, cuda-10.1, cuda-10.2
2+
// UNSUPPORTED: v8.0, v9.0, v9.1, v9.2, v10.0, v10.1, v10.2
3+
// RUN: dpct --format-range=none -out-root %T/cvta %s --cuda-include-path="%cuda-path/include" -- -std=c++14 -x cuda --cuda-host-only
4+
// RUN: FileCheck %s --match-full-lines --input-file %T/cvta/cvta.dp.cpp
5+
// RUN: %if build_lit %{icpx -c -fsycl %T/cvta/cvta.dp.cpp -o %T/cvta/cvta.dp.o %}
6+
7+
// clang-format off
8+
#include <cstdint>
9+
#include <cuda_runtime.h>
10+
11+
12+
// CHECK: void test_cvta_to_shared_u64(uint64_t* output, const sycl::nd_item<3> &item_ct1,
13+
// CHECK-NEXT: int *shared_data) {
14+
// CHECK-NEXT: // Shared memory
15+
// CHECK-NEXT: shared_data[0] = 0;
16+
// CHECK-NEXT: uint64_t shared_addr = 0;
17+
// CHECK-NEXT: shared_addr = (uint64_t)(&shared_data[0]);
18+
// CHECK-NEXT: output[item_ct1.get_local_id(2)] = shared_addr;
19+
// CHECK-NEXT:}
20+
__global__ void test_cvta_to_shared_u64(uint64_t* output) {
21+
__shared__ int shared_data[1]; // Shared memory
22+
shared_data[0] = 0;
23+
uint64_t shared_addr = 0;
24+
asm volatile("cvta.to.shared.u64 %0, %1;" : "=l"(shared_addr) : "l"(&shared_data[0]));
25+
output[threadIdx.x] = shared_addr;
26+
}
27+
28+
29+
#define N 128
30+
// CHECK: void testKernel(unsigned int *addr_out, const sycl::nd_item<3> &item_ct1,
31+
// CHECK-NEXT: int *B_shared) {
32+
// CHECK-NEXT: // Shared memory
33+
// CHECK-NEXT: unsigned int addr1;
34+
// CHECK-NEXT: int k_0_1 = item_ct1.get_group(2);
35+
// CHECK-NEXT: int ax1_0 = item_ct1.get_local_id(2);
36+
// CHECK-NEXT: {
37+
// CHECK-NEXT: uint64_t addr;
38+
// CHECK-NEXT: addr = (uint64_t)((void *)((&(B_shared[(((k_0_1 * (N * 16 + 128)) + (((int)item_ct1.get_local_id(1)) * (N / 2))) + (ax1_0 * 16))])) +
39+
// CHECK-NEXT: (((((int)item_ct1.get_local_id(2)) & 15) * (N + 8)) + ((((int)item_ct1.get_local_id(2)) >> 4) * 8))));
40+
// CHECK-NEXT: addr1 = static_cast<uint32_t>(addr);
41+
// CHECK-NEXT: }
42+
// CHECK-NEXT: addr_out[item_ct1.get_local_id(2)] = addr1;
43+
// CHECK-NEXT: }
44+
__global__ void testKernel(unsigned int *addr_out) {
45+
__shared__ int B_shared[N * 16 + 128]; // Shared memory
46+
unsigned int addr1;
47+
int k_0_1 = blockIdx.x;
48+
int ax1_0 = threadIdx.x;
49+
__asm__ __volatile__(
50+
"{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n"
51+
: "=r"(addr1)
52+
: "l"((void *)((&(B_shared[(((k_0_1 * (N * 16 + 128)) + (((int)threadIdx.y) * (N / 2))) + (ax1_0 * 16))])) +
53+
(((((int)threadIdx.x) & 15) * (N + 8)) + ((((int)threadIdx.x) >> 4) * 8)))));
54+
addr_out[threadIdx.x] = addr1;
55+
}
56+
57+
58+
// CHECK: void read_shared_value(int *output, const sycl::nd_item<3> &item_ct1,
59+
// CHECK-NEXT: int *shared_data) {
60+
// CHECK-NEXT: // Shared memory allocation
61+
// CHECK-NEXT: if (item_ct1.get_local_id(2) == 0) {
62+
// CHECK-NEXT: shared_data[0] = 42;
63+
// CHECK-NEXT: }
64+
// CHECK-NEXT: item_ct1.barrier(sycl::access::fence_space::local_space);
65+
// CHECK-NEXT: unsigned long long shared_addr_u64;
66+
// CHECK-NEXT: int value;
67+
// CHECK-NEXT: shared_addr_u64 = (uint64_t)(shared_data);
68+
// CHECK-NEXT: value = *((uint32_t *)(uintptr_t)shared_addr_u64);
69+
// CHECK-NEXT: if (item_ct1.get_local_id(2) == 0) {
70+
// CHECK-NEXT: output[0] = value;
71+
// CHECK-NEXT: }
72+
// CHECK-NEXT:}
73+
__global__ void read_shared_value(int *output) {
74+
__shared__ int shared_data[1]; // Shared memory allocation
75+
if (threadIdx.x == 0) {
76+
shared_data[0] = 42;
77+
}
78+
__syncthreads();
79+
unsigned long long shared_addr_u64;
80+
int value;
81+
asm volatile(
82+
"cvta.to.shared.u64 %0, %2;\n\t" // Properly uses input operand %2
83+
"ld.shared.u32 %1, [%0];\n\t" // Correctly assigns to output %1
84+
: "=l"(shared_addr_u64), "=r"(value)
85+
: "l"(shared_data));
86+
if (threadIdx.x == 0) {
87+
output[0] = value;
88+
}
89+
}
90+
91+
// clang-format on

0 commit comments

Comments
 (0)