Skip to content

Commit 4a7017d

Browse files
[SYCLomatic][Query API Mapping][nvcuda::wmma] Added QAM support for 4 nvcuda wmma APIs (#2928)
1 parent c725bae commit 4a7017d

File tree

7 files changed

+112
-0
lines changed

7 files changed

+112
-0
lines changed
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
// Option: --use-experimental-features=matrix
2+
#include <mma.h>
3+
4+
template <typename T> __global__ void test(T val) {
5+
// Start
6+
nvcuda::wmma::fragment<nvcuda::wmma::accumulator, 16, 16, 16, float> acc_frag;
7+
nvcuda::wmma::fill_fragment(acc_frag, val /*const T&*/);
8+
// End
9+
}
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
// Option: --use-experimental-features=matrix
2+
#include <mma.h>
3+
4+
template <typename T>
5+
__global__ void test(const T *a, int row, int col, unsigned lda) {
6+
// Start
7+
nvcuda::wmma::fragment<nvcuda::wmma::matrix_a, 16, 16, 16, half,
8+
nvcuda::wmma::row_major>
9+
a_frag;
10+
nvcuda::wmma::load_matrix_sync(a_frag, a + col + row * lda /*const T **/,
11+
lda /*unsigned*/);
12+
// End
13+
}
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
// Option: --use-experimental-features=matrix
2+
#include <mma.h>
3+
4+
__global__ void test() {
5+
// Start
6+
nvcuda::wmma::fragment<nvcuda::wmma::matrix_a, 16, 16, 16, half,
7+
nvcuda::wmma::row_major>
8+
a_frag;
9+
nvcuda::wmma::fragment<nvcuda::wmma::matrix_b, 16, 16, 16, half,
10+
nvcuda::wmma::col_major>
11+
b_frag;
12+
nvcuda::wmma::fragment<nvcuda::wmma::accumulator, 16, 16, 16, float> acc_frag;
13+
nvcuda::wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag);
14+
// End
15+
}
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
// Option: --use-experimental-features=matrix
2+
#include <mma.h>
3+
4+
template <typename T>
5+
__global__ void test(const T *c, int row, int col, unsigned ldc) {
6+
// Start
7+
nvcuda::wmma::fragment<nvcuda::wmma::accumulator, 16, 16, 16, float> acc_frag;
8+
nvcuda::wmma::store_matrix_sync(
9+
c + col + row * ldc /*const T **/, acc_frag, ldc /*unsigned*/,
10+
nvcuda::wmma::mem_col_major /*nvcuda::wmma::layout_t*/);
11+
nvcuda::wmma::store_matrix_sync(
12+
c + row + col * ldc /*const T **/, acc_frag, ldc /*unsigned*/,
13+
nvcuda::wmma::mem_row_major /*nvcuda::wmma::layout_t*/);
14+
// End
15+
}

clang/lib/DPCT/DPCT.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1063,6 +1063,8 @@ int runDPCT(int argc, const char **argv) {
10631063
Experimentals.addValue(ExperimentalFeatures::Exp_LevelZero);
10641064
else if (Option.ends_with("non-uniform-groups"))
10651065
Experimentals.addValue(ExperimentalFeatures::Exp_NonUniformGroups);
1066+
else if (Option.ends_with("matrix"))
1067+
Experimentals.addValue(ExperimentalFeatures::Exp_Matrix);
10661068
} else if (Option == "--no-dry-pattern") {
10671069
NoDRYPattern.setValue(true);
10681070
} else if (Option == "--enable-profiling") {
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
// UNSUPPORTED: cuda-8.0, cuda-9.0, cuda-9.1, cuda-9.2, cuda-10.0
2+
// UNSUPPORTED: v8.0, v9.0, v9.1, v9.2, v10.0
3+
4+
// RUN: dpct --cuda-include-path="%cuda-path/include" --query-api-mapping=nvcuda::wmma::fill_fragment | FileCheck %s -check-prefix=NVCUDA_WMMA_FILL_FRAGMENT
5+
// NVCUDA_WMMA_FILL_FRAGMENT: CUDA API:
6+
// NVCUDA_WMMA_FILL_FRAGMENT-NEXT: nvcuda::wmma::fragment<nvcuda::wmma::accumulator, 16, 16, 16, float> acc_frag;
7+
// NVCUDA_WMMA_FILL_FRAGMENT-NEXT: nvcuda::wmma::fill_fragment(acc_frag, val /*const T&*/);
8+
// NVCUDA_WMMA_FILL_FRAGMENT-NEXT: Is migrated to (with the option --use-experimental-features=matrix):
9+
// NVCUDA_WMMA_FILL_FRAGMENT-NEXT: dpct::experimental::matrix::joint_matrix<dpct::experimental::matrix::accumulator, 16, 16, 16, float> acc_frag;
10+
// NVCUDA_WMMA_FILL_FRAGMENT-NEXT: sycl::ext::oneapi::experimental::matrix::joint_matrix_fill(sycl::ext::oneapi::this_work_item::get_sub_group(), acc_frag.get(), val);
11+
12+
// RUN: dpct --cuda-include-path="%cuda-path/include" --query-api-mapping=nvcuda::wmma::load_matrix_sync | FileCheck %s -check-prefix=NVCUDA_WMMA_LOAD_MATRIX_SYNC
13+
// NVCUDA_WMMA_LOAD_MATRIX_SYNC: CUDA API:
14+
// NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: nvcuda::wmma::fragment<nvcuda::wmma::matrix_a, 16, 16, 16, half,
15+
// NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: nvcuda::wmma::row_major>
16+
// NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: a_frag;
17+
// NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: nvcuda::wmma::load_matrix_sync(a_frag, a + col + row * lda /*const T **/,
18+
// NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: lda /*unsigned*/);
19+
// NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: Is migrated to (with the option --use-experimental-features=matrix):
20+
// NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: dpct::experimental::matrix::joint_matrix<dpct::experimental::matrix::a, 16, 16, 16, sycl::half, dpct::experimental::matrix::row_major>
21+
// NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: a_frag;
22+
// NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: sycl::ext::oneapi::experimental::matrix::joint_matrix_load(sycl::ext::oneapi::this_work_item::get_sub_group(), a_frag.get(), sycl::address_space_cast<sycl::access::address_space::generic_space, sycl::access::decorated::no, typename std::remove_pointer<decltype(a + col + row * lda)>::type>(a + col + row * lda), lda);
23+
24+
// RUN: dpct --cuda-include-path="%cuda-path/include" --query-api-mapping=nvcuda::wmma::store_matrix_sync | FileCheck %s -check-prefix=NVCUDA_WMMA_STORE_MATRIX_SYNC
25+
// NVCUDA_WMMA_STORE_MATRIX_SYNC: CUDA API:
26+
// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: nvcuda::wmma::fragment<nvcuda::wmma::accumulator, 16, 16, 16, float> acc_frag;
27+
// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: nvcuda::wmma::store_matrix_sync(
28+
// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: c + col + row * ldc /*const T **/, acc_frag, ldc /*unsigned*/,
29+
// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: nvcuda::wmma::mem_col_major /*nvcuda::wmma::layout_t*/);
30+
// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: nvcuda::wmma::store_matrix_sync(
31+
// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: c + row + col * ldc /*const T **/, acc_frag, ldc /*unsigned*/,
32+
// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: nvcuda::wmma::mem_row_major /*nvcuda::wmma::layout_t*/);
33+
// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: Is migrated to (with the option --use-experimental-features=matrix):
34+
// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: dpct::experimental::matrix::joint_matrix<dpct::experimental::matrix::accumulator, 16, 16, 16, float> acc_frag;
35+
// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: sycl::ext::oneapi::experimental::matrix::joint_matrix_store(sycl::ext::oneapi::this_work_item::get_sub_group(), acc_frag.get(), sycl::address_space_cast<sycl::access::address_space::generic_space, sycl::access::decorated::no, typename std::remove_pointer<decltype(c + col + row * ldc)>::type>(c + col + row * ldc), ldc, sycl::ext::oneapi::experimental::matrix::layout::col_major);
36+
// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: sycl::ext::oneapi::experimental::matrix::joint_matrix_store(sycl::ext::oneapi::this_work_item::get_sub_group(), acc_frag.get(), sycl::address_space_cast<sycl::access::address_space::generic_space, sycl::access::decorated::no, typename std::remove_pointer<decltype(c + row + col * ldc)>::type>(c + row + col * ldc), ldc, sycl::ext::oneapi::experimental::matrix::layout::row_major);
37+
38+
// RUN: dpct --cuda-include-path="%cuda-path/include" --query-api-mapping=nvcuda::wmma::mma_sync | FileCheck %s -check-prefix=NVCUDA_WMMA_MMA_SYNC
39+
// NVCUDA_WMMA_MMA_SYNC: CUDA API:
40+
// NVCUDA_WMMA_MMA_SYNC-NEXT: nvcuda::wmma::fragment<nvcuda::wmma::matrix_a, 16, 16, 16, half,
41+
// NVCUDA_WMMA_MMA_SYNC-NEXT: nvcuda::wmma::row_major>
42+
// NVCUDA_WMMA_MMA_SYNC-NEXT: a_frag;
43+
// NVCUDA_WMMA_MMA_SYNC-NEXT: nvcuda::wmma::fragment<nvcuda::wmma::matrix_b, 16, 16, 16, half,
44+
// NVCUDA_WMMA_MMA_SYNC-NEXT: nvcuda::wmma::col_major>
45+
// NVCUDA_WMMA_MMA_SYNC-NEXT: b_frag;
46+
// NVCUDA_WMMA_MMA_SYNC-NEXT: nvcuda::wmma::fragment<nvcuda::wmma::accumulator, 16, 16, 16, float> acc_frag;
47+
// NVCUDA_WMMA_MMA_SYNC-NEXT: nvcuda::wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag);
48+
// NVCUDA_WMMA_MMA_SYNC-NEXT: Is migrated to (with the option --use-experimental-features=matrix):
49+
// NVCUDA_WMMA_MMA_SYNC-NEXT: dpct::experimental::matrix::joint_matrix<dpct::experimental::matrix::a, 16, 16, 16, sycl::half, dpct::experimental::matrix::row_major>
50+
// NVCUDA_WMMA_MMA_SYNC-NEXT: a_frag;
51+
// NVCUDA_WMMA_MMA_SYNC-NEXT: dpct::experimental::matrix::joint_matrix<dpct::experimental::matrix::b, 16, 16, 16, sycl::half, dpct::experimental::matrix::col_major>
52+
// NVCUDA_WMMA_MMA_SYNC-NEXT: b_frag;
53+
// NVCUDA_WMMA_MMA_SYNC-NEXT: dpct::experimental::matrix::joint_matrix<dpct::experimental::matrix::accumulator, 16, 16, 16, float> acc_frag;
54+
// NVCUDA_WMMA_MMA_SYNC-NEXT: sycl::ext::oneapi::experimental::matrix::joint_matrix_mad(sycl::ext::oneapi::this_work_item::get_sub_group(), acc_frag.get(), a_frag.get(), b_frag.get(), acc_frag.get());

clang/test/dpct/query_api_mapping/test_all.cu

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2339,6 +2339,10 @@
23392339
// CHECK-NEXT: normcdfinv
23402340
// CHECK-NEXT: normcdfinvf
23412341
// CHECK-NEXT: normf
2342+
// CHECK-NEXT: nvcuda::wmma::fill_fragment
2343+
// CHECK-NEXT: nvcuda::wmma::load_matrix_sync
2344+
// CHECK-NEXT: nvcuda::wmma::mma_sync
2345+
// CHECK-NEXT: nvcuda::wmma::store_matrix_sync
23422346
// CHECK-NEXT: nvshmem_align
23432347
// CHECK-NEXT: nvshmem_calloc
23442348
// CHECK-NEXT: nvshmem_finalize

0 commit comments

Comments
 (0)