Commit 6bf6411
Updating SYCL For CUDA examples to CUDA 12 and latest intel llvm tip (#27)
* Ignoring VIM temporary files.
* Vector Addition example update: works with the latest DPC++ and SYCL 2020 features; the README is updated to reflect that the CUDA backend now has USM support.
* Removing unnecessary CUDA Driver types: since intel/llvm#8197 the SYCL CUDA backend uses the CUDA primary context by default, so setting an individual context is no longer required.
* Using modern queue construction: SYCL 1.2.1 device selectors have been deprecated in favour of a simplified form using lambdas (a minimal sketch follows this list).
* Format files: run clang-format on the files, done as a separate commit to avoid noise.
* Explicitly setting the CUDA context on the host task: because of the changes to the SYCL context, the active CUDA context must now be set manually inside the host task. Note there was some clang-formatting here as well.
* Addressing feedback from Gordon.
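As a quick illustration of the "modern queue construction" point above, here is a minimal, self-contained sketch (not part of the commit; the program is illustrative) of the SYCL 2020 callable selector that replaces the deprecated sycl::device_selector subclass. A negative score rejects a device, so only devices exposed by the CUDA backend are accepted; construction throws if no such device is available.

#include <sycl/sycl.hpp>
#include <iostream>

int main() {
  // SYCL 2020: any callable taking const sycl::device& and returning an int
  // can act as a selector. Negative scores reject the device.
  sycl::queue q{[](const sycl::device &d) {
    return d.get_platform().get_backend() == sycl::backend::ext_oneapi_cuda ? 1
                                                                            : -1;
  }};
  std::cout << "Selected device: "
            << q.get_device().get_info<sycl::info::device::name>() << "\n";
}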
1 parent 4d59331 commit 6bf6411

5 files changed: +41 −77 lines

.gitignore

Lines changed: 4 additions & 1 deletion
@@ -34,4 +34,7 @@
 # Temporaries
 *~
 *#
-*/build
+*/build
+]
+# vim
+*.swp

examples/sgemm_interop/sycl_sgemm.cpp

Lines changed: 12 additions & 26 deletions
@@ -25,8 +25,7 @@
 #include <iostream>
 #include <vector>
 
-#include <CL/sycl.hpp>
-#include <CL/sycl/backend/cuda.hpp>
+#include <sycl/sycl.hpp>
 
 #include <cublas_v2.h>
 #include <cuda.h>
@@ -47,25 +46,6 @@ void inline checkCudaErrorMsg(cudaError status, const char *msg) {
   }
 }
 
-void inline checkCudaErrorMsg(CUresult status, const char *msg) {
-  if (status != CUDA_SUCCESS) {
-    std::cout << "ERROR CUDA: " << msg << " - " << status << std::endl;
-    exit(EXIT_FAILURE);
-  }
-}
-
-class CUDASelector : public sycl::device_selector {
-public:
-  int operator()(const sycl::device &device) const override {
-    if(device.get_platform().get_backend() == sycl::backend::ext_oneapi_cuda){
-      std::cout << " CUDA device found " << std::endl;
-      return 1;
-    } else{
-      return -1;
-    }
-  }
-};
-
 int main() {
   using namespace sycl;
 
@@ -88,7 +68,9 @@ int main() {
   // B is a matrix fill with 1
   std::fill(std::begin(h_B), std::end(h_B), 1.0f);
 
-  sycl::queue q{CUDASelector()};
+  sycl::queue q{[](auto &d) {
+    return (d.get_platform().get_backend() == sycl::backend::ext_oneapi_cuda);
+  }};
 
   cublasHandle_t handle;
   CHECK_ERROR(cublasCreate(&handle));
@@ -104,12 +86,16 @@ int main() {
     auto d_C = b_C.get_access<sycl::access::mode::write>(h);
 
     h.host_task([=](sycl::interop_handle ih) {
-      cuCtxSetCurrent(ih.get_native_context<backend::ext_oneapi_cuda>());
+      // Set the correct cuda context & stream
+      cuCtxSetCurrent(ih.get_native_context<backend::ext_oneapi_cuda>());
       auto cuStream = ih.get_native_queue<backend::ext_oneapi_cuda>();
       cublasSetStream(handle, cuStream);
-      auto cuA = reinterpret_cast<float *>(ih.get_native_mem<backend::ext_oneapi_cuda>(d_A));
-      auto cuB = reinterpret_cast<float *>(ih.get_native_mem<backend::ext_oneapi_cuda>(d_B));
-      auto cuC = reinterpret_cast<float *>(ih.get_native_mem<backend::ext_oneapi_cuda>(d_C));
+      auto cuA = reinterpret_cast<float *>(
+          ih.get_native_mem<backend::ext_oneapi_cuda>(d_A));
+      auto cuB = reinterpret_cast<float *>(
+          ih.get_native_mem<backend::ext_oneapi_cuda>(d_B));
+      auto cuC = reinterpret_cast<float *>(
+          ih.get_native_mem<backend::ext_oneapi_cuda>(d_C));
 
       CHECK_ERROR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WIDTH, HEIGHT,
                               WIDTH, &ALPHA, cuA, WIDTH, cuB, WIDTH, &BETA,

examples/sgemm_interop/sycl_sgemm_usm.cpp

Lines changed: 20 additions & 40 deletions
@@ -25,8 +25,7 @@
 #include <iostream>
 #include <vector>
 
-#include <CL/sycl.hpp>
-#include <CL/sycl/backend/cuda.hpp>
+#include <sycl/sycl.hpp>
 
 #include <cublas_v2.h>
 #include <cuda.h>
@@ -47,25 +46,6 @@ void inline checkCudaErrorMsg(cudaError status, const char *msg) {
   }
 }
 
-void inline checkCudaErrorMsg(CUresult status, const char *msg) {
-  if (status != CUDA_SUCCESS) {
-    std::cout << "ERROR CUDA: " << msg << " - " << status << std::endl;
-    exit(EXIT_FAILURE);
-  }
-}
-
-class CUDASelector : public sycl::device_selector {
-public:
-  int operator()(const sycl::device &device) const override {
-    if(device.get_platform().get_backend() == sycl::backend::ext_oneapi_cuda){
-      std::cout << " CUDA device found " << std::endl;
-      return 1;
-    } else{
-      return -1;
-    }
-  }
-};
-
 int main() {
   using namespace sycl;
 
@@ -88,12 +68,14 @@ int main() {
   // B is a matrix fill with 1
   std::fill(std::begin(h_B), std::end(h_B), 1.0f);
 
-  sycl::queue q{CUDASelector()};
+  sycl::queue q{[](auto &d) {
+    return (d.get_platform().get_backend() == sycl::backend::ext_oneapi_cuda);
+  }};
 
   // Allocate memory on the device
-  float* d_A = sycl::malloc_device<float>(WIDTH*HEIGHT,q);
-  float* d_B = sycl::malloc_device<float>(WIDTH*HEIGHT,q);
-  float* d_C = sycl::malloc_device<float>(WIDTH*HEIGHT,q);
+  float *d_A = sycl::malloc_device<float>(WIDTH * HEIGHT, q);
+  float *d_B = sycl::malloc_device<float>(WIDTH * HEIGHT, q);
+  float *d_C = sycl::malloc_device<float>(WIDTH * HEIGHT, q);
 
   // Copy matrices A & B to device from host vectors
  const size_t numBytes = WIDTH * HEIGHT * sizeof(float);
@@ -105,21 +87,19 @@ int main() {
   CHECK_ERROR(cublasCreate(&handle));
 
   q.submit([&](handler &h) {
-
-    h.host_task([=](sycl::interop_handle ih) {
-
-      // Set the correct cuda context & stream
-      cuCtxSetCurrent(ih.get_native_context<backend::ext_oneapi_cuda>());
-      auto cuStream = ih.get_native_queue<backend::ext_oneapi_cuda>();
-      cublasSetStream(handle, cuStream);
-
-      // Call generalised matrix-matrix multiply
-      CHECK_ERROR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WIDTH, HEIGHT,
-                              WIDTH, &ALPHA, d_A, WIDTH, d_B, WIDTH, &BETA,
-                              d_C, WIDTH));
-      cuStreamSynchronize(cuStream);
-    });
-  }).wait();
+     h.host_task([=](sycl::interop_handle ih) {
+       // Set the correct cuda context & stream
+       cuCtxSetCurrent(ih.get_native_context<backend::ext_oneapi_cuda>());
+       auto cuStream = ih.get_native_queue<backend::ext_oneapi_cuda>();
+       cublasSetStream(handle, cuStream);
+
+       // Call generalised matrix-matrix multiply
+       CHECK_ERROR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WIDTH, HEIGHT,
+                               WIDTH, &ALPHA, d_A, WIDTH, d_B, WIDTH, &BETA,
+                               d_C, WIDTH));
+       cuStreamSynchronize(cuStream);
+     });
+   }).wait();
 
   // Copy the result back to host
   q.memcpy(h_C.data(), d_C, numBytes).wait();
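Both sgemm diffs above rely on the same interop pattern the commit message describes: inside a host_task, bind the CUDA primary context to the host thread, fetch the queue's native stream, and hand native pointers to cuBLAS. The following is a minimal, self-contained sketch of that pattern in isolation (not part of the commit; the buffer, the cublasSscal call, and variable names are illustrative, and error checking is trimmed). It assumes DPC++ with the CUDA backend and a CUDA-capable device.

#include <sycl/sycl.hpp>

#include <cublas_v2.h>
#include <cuda.h>

#include <vector>

int main() {
  // Pick a device exposed by the CUDA backend (SYCL 2020 callable selector).
  sycl::queue q{[](const sycl::device &d) {
    return d.get_platform().get_backend() == sycl::backend::ext_oneapi_cuda ? 1
                                                                            : -1;
  }};

  constexpr int N = 1024;
  std::vector<float> h_x(N, 1.0f);
  sycl::buffer<float, 1> b_x{h_x.data(), sycl::range<1>{N}};

  cublasHandle_t handle;
  cublasCreate(&handle);

  q.submit([&](sycl::handler &h) {
     auto d_x = b_x.get_access<sycl::access::mode::read_write>(h);
     h.host_task([=](sycl::interop_handle ih) {
       // The CUDA backend now uses the primary context, so make it current on
       // this host thread before any driver or cuBLAS call.
       cuCtxSetCurrent(ih.get_native_context<sycl::backend::ext_oneapi_cuda>());
       // Run the library call on the queue's native stream.
       CUstream stream = ih.get_native_queue<sycl::backend::ext_oneapi_cuda>();
       cublasSetStream(handle, stream);
       auto x = reinterpret_cast<float *>(
           ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(d_x));
       const float alpha = 2.0f;
       cublasSscal(handle, N, &alpha, x, 1); // x *= 2 on the native stream
       cuStreamSynchronize(stream);
     });
   }).wait();

  cublasDestroy(handle);
  return 0;
}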

examples/vector_addition/README.md

Lines changed: 0 additions & 3 deletions
@@ -7,9 +7,6 @@ to highlight how to build an application with SYCL for CUDA using DPC++ support,
 for which an example CMakefile is provided. For detailed documentation on how to
 migrate from CUDA to SYCL, see [SYCL For CUDA Developers](https://developer.codeplay.com/products/computecpp/ce/guides/sycl-for-cuda-developers).
 
-Note currently the CUDA backend does not support the [USM](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc) extension, so we use
-`sycl::buffer` and `sycl::accessors` instead.
-
 Pre-requisites
 ---------------
 

examples/vector_addition/vector_addition.cpp

Lines changed: 5 additions & 7 deletions
@@ -34,10 +34,9 @@ int main(int argc, char *argv[]) {
 
   // Initialize input data
   {
-    const auto dwrite_t = sycl::access::mode::discard_write;
+    sycl::host_accessor h_a{bufA, sycl::write_only};
+    sycl::host_accessor h_b{bufB, sycl::write_only};
 
-    auto h_a = bufA.get_access<dwrite_t>();
-    auto h_b = bufB.get_access<dwrite_t>();
     for (int i = 0; i < N; i++) {
       h_a[i] = sin(i) * sin(i);
       h_b[i] = cos(i) * cos(i);
@@ -63,15 +62,14 @@ int main(int argc, char *argv[]) {
     auto b = bufB.get_access<read_t>(h);
     auto c = bufC.get_access<write_t>(h);
 
-    h.parallel_for(VecSize,
-                   [=](sycl::id<1> i) { c[i] = a[i] + b[i]; });
+    h.parallel_for(VecSize, [=](sycl::id<1> i) { c[i] = a[i] + b[i]; });
   };
 
   myQueue.submit(cg);
 
   {
-    const auto read_t = sycl::access::mode::read;
-    auto h_c = bufC.get_access<read_t>();
+    sycl::host_accessor h_c{bufC, sycl::read_only};
+
     double sum = 0.0f;
     for (int i = 0; i < N; i++) {
       sum += h_c[i];
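The vector addition diff above switches from access::mode template arguments to SYCL 2020 host accessors with access tags. As a small, self-contained illustration of that pattern (not taken from the repo; the buffer contents and names are made up), a write_only host_accessor fills a buffer, a kernel updates it, and a read_only host_accessor reads the result back, synchronizing implicitly:

#include <sycl/sycl.hpp>
#include <iostream>

int main() {
  constexpr int N = 8;
  sycl::buffer<int, 1> buf{sycl::range<1>{N}};

  {
    // Host-side write access; the scope ends before the kernel runs.
    sycl::host_accessor h{buf, sycl::write_only};
    for (int i = 0; i < N; ++i)
      h[i] = i;
  }

  sycl::queue q;
  q.submit([&](sycl::handler &cgh) {
    sycl::accessor d{buf, cgh, sycl::read_write};
    cgh.parallel_for(sycl::range<1>{N}, [=](sycl::id<1> i) { d[i] *= 2; });
  });

  {
    // Constructing a host_accessor waits for the kernel and makes the
    // up-to-date data visible on the host.
    sycl::host_accessor h{buf, sycl::read_only};
    for (int i = 0; i < N; ++i)
      std::cout << h[i] << " ";
    std::cout << "\n";
  }
}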

0 commit comments