|
| 1 | +// UNSUPPORTED: cuda-8.0, cuda-9.0, cuda-9.1, cuda-9.2, cuda-10.0, cuda-10.1, cuda-10.2 |
| 2 | +// UNSUPPORTED: v8.0, v9.0, v9.1, v9.2, v10.0, v10.1, v10.2 |
| 3 | +// RUN: dpct --format-range=none -out-root %T/cvta %s --cuda-include-path="%cuda-path/include" -- -std=c++14 -x cuda --cuda-host-only |
| 4 | +// RUN: FileCheck %s --match-full-lines --input-file %T/cvta/cvta.dp.cpp |
| 5 | +// RUN: %if build_lit %{icpx -c -fsycl %T/cvta/cvta.dp.cpp -o %T/cvta/cvta.dp.o %} |
| 6 | + |
| 7 | +// clang-format off |
| 8 | +#include <cstdint> |
| 9 | +#include <cuda_runtime.h> |
| 10 | + |
| 11 | + |
| 12 | +// CHECK: void test_cvta_to_shared_u64(uint64_t* output, const sycl::nd_item<3> &item_ct1, |
| 13 | +// CHECK-NEXT: int *shared_data) { |
| 14 | +// CHECK-NEXT: // Shared memory |
| 15 | +// CHECK-NEXT: shared_data[0] = 0; |
| 16 | +// CHECK-NEXT: uint64_t shared_addr = 0; |
| 17 | +// CHECK-NEXT: shared_addr = (uint64_t)(&shared_data[0]); |
| 18 | +// CHECK-NEXT: output[item_ct1.get_local_id(2)] = shared_addr; |
| 19 | +// CHECK-NEXT:} |
| 20 | +__global__ void test_cvta_to_shared_u64(uint64_t* output) { |
| 21 | + __shared__ int shared_data[1]; // Shared memory |
| 22 | + shared_data[0] = 0; |
| 23 | + uint64_t shared_addr = 0; |
| 24 | + asm volatile("cvta.to.shared.u64 %0, %1;" : "=l"(shared_addr) : "l"(&shared_data[0])); |
| 25 | + output[threadIdx.x] = shared_addr; |
| 26 | +} |
| 27 | + |
| 28 | + |
| 29 | +#define N 128 |
| 30 | +// CHECK: void testKernel(unsigned int *addr_out, const sycl::nd_item<3> &item_ct1, |
| 31 | +// CHECK-NEXT: int *B_shared) { |
| 32 | +// CHECK-NEXT: // Shared memory |
| 33 | +// CHECK-NEXT: unsigned int addr1; |
| 34 | +// CHECK-NEXT: int k_0_1 = item_ct1.get_group(2); |
| 35 | +// CHECK-NEXT: int ax1_0 = item_ct1.get_local_id(2); |
| 36 | +// CHECK-NEXT: { |
| 37 | +// CHECK-NEXT: uint64_t addr; |
| 38 | +// CHECK-NEXT: addr = (uint64_t)((void *)((&(B_shared[(((k_0_1 * (N * 16 + 128)) + (((int)item_ct1.get_local_id(1)) * (N / 2))) + (ax1_0 * 16))])) + |
| 39 | +// CHECK-NEXT: (((((int)item_ct1.get_local_id(2)) & 15) * (N + 8)) + ((((int)item_ct1.get_local_id(2)) >> 4) * 8)))); |
| 40 | +// CHECK-NEXT: addr1 = static_cast<uint32_t>(addr); |
| 41 | +// CHECK-NEXT: } |
| 42 | +// CHECK-NEXT: addr_out[item_ct1.get_local_id(2)] = addr1; |
| 43 | +// CHECK-NEXT: } |
| 44 | +__global__ void testKernel(unsigned int *addr_out) { |
| 45 | + __shared__ int B_shared[N * 16 + 128]; // Shared memory |
| 46 | + unsigned int addr1; |
| 47 | + int k_0_1 = blockIdx.x; |
| 48 | + int ax1_0 = threadIdx.x; |
| 49 | + __asm__ __volatile__( |
| 50 | + "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr; }\n" |
| 51 | + : "=r"(addr1) |
| 52 | + : "l"((void *)((&(B_shared[(((k_0_1 * (N * 16 + 128)) + (((int)threadIdx.y) * (N / 2))) + (ax1_0 * 16))])) + |
| 53 | + (((((int)threadIdx.x) & 15) * (N + 8)) + ((((int)threadIdx.x) >> 4) * 8))))); |
| 54 | + addr_out[threadIdx.x] = addr1; |
| 55 | +} |
| 56 | + |
| 57 | + |
| 58 | +// CHECK: void read_shared_value(int *output, const sycl::nd_item<3> &item_ct1, |
| 59 | +// CHECK-NEXT: int *shared_data) { |
| 60 | +// CHECK-NEXT: // Shared memory allocation |
| 61 | +// CHECK-NEXT: if (item_ct1.get_local_id(2) == 0) { |
| 62 | +// CHECK-NEXT: shared_data[0] = 42; |
| 63 | +// CHECK-NEXT: } |
| 64 | +// CHECK-NEXT: item_ct1.barrier(sycl::access::fence_space::local_space); |
| 65 | +// CHECK-NEXT: unsigned long long shared_addr_u64; |
| 66 | +// CHECK-NEXT: int value; |
| 67 | +// CHECK-NEXT: shared_addr_u64 = (uint64_t)(shared_data); |
| 68 | +// CHECK-NEXT: value = *((uint32_t *)(uintptr_t)shared_addr_u64); |
| 69 | +// CHECK-NEXT: if (item_ct1.get_local_id(2) == 0) { |
| 70 | +// CHECK-NEXT: output[0] = value; |
| 71 | +// CHECK-NEXT: } |
| 72 | +// CHECK-NEXT:} |
| 73 | +__global__ void read_shared_value(int *output) { |
| 74 | + __shared__ int shared_data[1]; // Shared memory allocation |
| 75 | + if (threadIdx.x == 0) { |
| 76 | + shared_data[0] = 42; |
| 77 | + } |
| 78 | + __syncthreads(); |
| 79 | + unsigned long long shared_addr_u64; |
| 80 | + int value; |
| 81 | + asm volatile( |
| 82 | + "cvta.to.shared.u64 %0, %2;\n\t" // Properly uses input operand %2 |
| 83 | + "ld.shared.u32 %1, [%0];\n\t" // Correctly assigns to output %1 |
| 84 | + : "=l"(shared_addr_u64), "=r"(value) |
| 85 | + : "l"(shared_data)); |
| 86 | + if (threadIdx.x == 0) { |
| 87 | + output[0] = value; |
| 88 | + } |
| 89 | +} |
| 90 | + |
| 91 | +// clang-format on |
0 commit comments