
Commit ff43eb0

Merge pull request #4 from codeplaysoftware/example-03
[SYCL] Example of pure CUDA SYCL application
2 parents da2fa7d + e628024 commit ff43eb0

File tree

5 files changed: +287 −0 lines changed


README.md

Lines changed: 6 additions & 0 deletions
@@ -9,3 +9,9 @@ experimental support for CUDA in the DPC++ SYCL implementation.

CUDA is a registered trademark of NVIDIA Corporation
SYCL is a trademark of the Khronos Group Inc

Docker Image
------------

There is a Docker image available with all the examples and the required
environment set up; see https://hub.docker.com/r/ruyman/dpcpp_cuda_examples.
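
For a quick start, pulling and running the image might look like the sketch
below. The exact `docker run` flags (GPU passthrough in particular) depend on
your Docker and NVIDIA container runtime setup, so treat them as an assumption
rather than the documented invocation:

```sh
# Hypothetical invocation; adjust the GPU flags for your Docker/NVIDIA setup.
docker pull ruyman/dpcpp_cuda_examples
docker run --gpus all -it ruyman/dpcpp_cuda_examples
```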

example-03/Makefile

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
CUDACXX=${SYCL_ROOT}/bin/clang++

SYCL_INCLUDE=${SYCL_ROOT}/include/sycl/

CUDAFLAGS=--cuda-gpu-arch=sm_30

CXXFLAGS=-std=c++17 ${CUDAFLAGS} -I${SYCL_INCLUDE} -g

CUDA_ROOT=/usr/local/cuda/

LIBS=-L${SYCL_ROOT}/lib -lOpenCL -lsycl -L${CUDA_ROOT}/lib64 -lcudart

default: vec_add.exe usm_vec_add.exe

vec_add.exe: vec_add.cu
	${CUDACXX} ${CXXFLAGS} $< ${LIBS} -o $@

usm_vec_add.exe: vec_add_usm.cu
	${CUDACXX} ${CXXFLAGS} $< ${LIBS} -o $@

clean:
	rm -f vec_add.exe usm_vec_add.exe

example-03/README.md

Lines changed: 68 additions & 0 deletions
@@ -0,0 +1,68 @@
Example 03: Calling CUDA kernels from SYCL
==========================================

In this example, we reuse the trivial SYCL kernel from Example 1, but instead
of writing a SYCL variant we keep the original CUDA kernel and only replace
the CUDA Runtime calls with the SYCL API.

This variant uses the buffer and accessor syntax, which is more verbose but
allows the runtime to build the implicit dependency DAG.
A USM variant is presented for exposition only; support for USM on the CUDA
backend is unstable at the time of writing.

Pre-requisites
--------------

You will need an installation of DPC++ with CUDA support; see the
[Getting Started Guide](https://github.com/intel/llvm/blob/sycl/sycl/doc/GetStartedGuide.md#build-dpc-toolchain-with-support-for-nvidia-cuda)
for details on how to build it.

The example is built using Makefiles, since no CMake release yet supports
changing the CUDA compiler from nvcc.

Building the example
--------------------

```sh
$ SYCL_ROOT=/path/to/dpcpp make
```

This compiles the SYCL code with the LLVM CUDA support and generates two
binaries.
NVCC is not used, but the CUDA device libraries need to be available in
/usr/local/cuda/lib64/ for linking against the device code.

The NVCC compiler does not support some of the advanced C++17 syntax used in
the SYCL runtime headers.
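
If the CUDA toolkit lives somewhere other than /usr/local/cuda/, the
Makefile's `CUDA_ROOT` variable can be overridden on the `make` command line
(standard `make` behaviour); the path below is only a placeholder:

```sh
$ SYCL_ROOT=/path/to/dpcpp make CUDA_ROOT=/opt/cuda/
```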

Running the example
-------------------

The path to `libsycl.so` and the PI plugins must be in `LD_LIBRARY_PATH`.
A simple way of running the example is as follows:

```
$ LD_LIBRARY_PATH=/path/to/dpcpp/lib:$LD_LIBRARY_PATH ./vec_add.exe
```

Calling CUDA kernels from SYCL
------------------------------

Using Codeplay's `interop_task` extension, the example calls a CUDA kernel from
a SYCL application.
Note that the example is compiled with the LLVM CUDA compiler, not with the
SYCL compiler, since it contains no SYCL kernels. It only needs to link against
the SYCL runtime library so that the application can use the SYCL runtime.

At the time of writing, it is not possible to have both CUDA and SYCL kernels
in the same file.
It is possible, however, to keep CUDA and SYCL kernels in separate files and
call them together from a main application at runtime.

The example uses an extension to the SYCL interface to interact with the
CUDA Runtime API.
At the time of writing the extension is not public, so only a boolean flag
is passed to the `sycl::context` constructor.
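As a condensed illustration of the pattern described above (abbreviated from
`vec_add.cu` below, with the buffer setup, kernel definition, and launch-size
computation omitted), the CUDA kernel is launched from inside an
`interop_task`:

```c++
// Abbreviated from vec_add.cu: launching an existing CUDA kernel from a SYCL
// command group via Codeplay's interop_task extension.
myQueue.submit([&](sycl::handler &h) {
  // Accessors declare the dependencies the SYCL runtime must satisfy.
  auto accA = bA.get_access<sycl::access::mode::read>(h);
  auto accB = bB.get_access<sycl::access::mode::read>(h);
  auto accC = bC.get_access<sycl::access::mode::write>(h);

  h.interop_task([=](sycl::interop_handler ih) {
    // get_mem<backend::cuda> yields the CUDA device pointer behind each accessor.
    auto dA = reinterpret_cast<double *>(ih.get_mem<sycl::backend::cuda>(accA));
    auto dB = reinterpret_cast<double *>(ih.get_mem<sycl::backend::cuda>(accB));
    auto dC = reinterpret_cast<double *>(ih.get_mem<sycl::backend::cuda>(accC));

    // A plain CUDA launch; gridSize/blockSize are computed as in the full source.
    vecAdd<<<gridSize, blockSize>>>(dA, dB, dC, n);
  });
});
```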

example-03/vec_add.cu

Lines changed: 100 additions & 0 deletions
@@ -0,0 +1,100 @@
// Original source reproduced unmodified here from:
// https://github.com/olcf/vector_addition_tutorials/blob/master/CUDA/vecAdd.cu

#include <algorithm>
#include <cmath>
#include <iostream>
#include <vector>

#include <CL/sycl.hpp>
#include <CL/sycl/backend/cuda.hpp>

class CUDASelector : public sycl::device_selector {
public:
  int operator()(const sycl::device &Device) const override {
    using namespace sycl::info;

    const std::string DriverVersion = Device.get_info<device::driver_version>();

    if (Device.is_gpu() && (DriverVersion.find("CUDA") != std::string::npos)) {
      std::cout << " CUDA device found \n";
      return 1;
    }
    return -1;
  }
};

// CUDA kernel. Each thread takes care of one element of c
__global__ void vecAdd(double *a, double *b, double *c, int n) {
  // Get our global thread ID
  int id = blockIdx.x * blockDim.x + threadIdx.x;

  // Make sure we do not go out of bounds
  if (id < n) {
    c[id] = a[id] + b[id];
  }
}

int main(int argc, char *argv[]) {
  using namespace sycl;
  // Size of vectors
  int n = 100000;

  // Create a SYCL context for interoperability with the CUDA Runtime API
  // This is temporary until the property extension is implemented
  const bool UsePrimaryContext = true;
  device dev{CUDASelector().select_device()};
  context myContext{dev, {}, UsePrimaryContext};
  queue myQueue{myContext, dev};

  {
    buffer<double> bA{range<1>(n)};
    buffer<double> bB{range<1>(n)};
    buffer<double> bC{range<1>(n)};

    {
      auto hA = bA.get_access<access::mode::write>();
      auto hB = bB.get_access<access::mode::write>();

      // Initialize vectors on host
      for (int i = 0; i < n; i++) {
        hA[i] = sin(i) * sin(i);
        hB[i] = cos(i) * cos(i);
      }
    }

    // Dispatch a command group with all the dependencies
    myQueue.submit([&](handler &h) {
      auto accA = bA.get_access<access::mode::read>(h);
      auto accB = bB.get_access<access::mode::read>(h);
      auto accC = bC.get_access<access::mode::write>(h);

      h.interop_task([=](interop_handler ih) {
        auto dA = reinterpret_cast<double *>(ih.get_mem<backend::cuda>(accA));
        auto dB = reinterpret_cast<double *>(ih.get_mem<backend::cuda>(accB));
        auto dC = reinterpret_cast<double *>(ih.get_mem<backend::cuda>(accC));

        int blockSize, gridSize;
        // Number of threads in each thread block
        blockSize = 1024;
        // Number of thread blocks in grid
        gridSize = static_cast<int>(ceil(static_cast<float>(n) / blockSize));
        // Call the CUDA kernel directly from SYCL
        vecAdd<<<gridSize, blockSize>>>(dA, dB, dC, n);
      });
    });

    {
      auto hC = bC.get_access<access::mode::read>();
      // Sum up vector c and print result divided by n; this should equal 1
      // within error
      double sum = 0;
      for (int i = 0; i < n; i++) {
        sum += hC[i];
      }
      std::cout << "Final result " << sum / n << std::endl;
    }
  }

  return 0;
}

example-03/vec_add_usm.cu

Lines changed: 88 additions & 0 deletions
@@ -0,0 +1,88 @@
// Original source reproduced unmodified here from:
// https://github.com/olcf/vector_addition_tutorials/blob/master/CUDA/vecAdd.cu

#include <cmath>
#include <iostream>

#include <CL/sycl.hpp>
#include <CL/sycl/backend/cuda.hpp>

class CUDASelector : public sycl::device_selector {
public:
  int operator()(const sycl::device &Device) const override {
    using namespace sycl::info;

    const std::string DriverVersion = Device.get_info<device::driver_version>();

    if (Device.is_gpu() && (DriverVersion.find("CUDA") != std::string::npos)) {
      std::cout << " CUDA device found \n";
      return 1;
    }
    return -1;
  }
};

// CUDA kernel. Each thread takes care of one element of c
__global__ void vecAdd(double *a, double *b, double *c, int n) {
  // Get our global thread ID
  int id = blockIdx.x * blockDim.x + threadIdx.x;

  // Make sure we do not go out of bounds
  if (id < n) {
    c[id] = a[id] + b[id];
  }
}

int main(int argc, char *argv[]) {
  using namespace sycl;
  // Size of vectors
  int n = 100000;

  // Size, in bytes, of each vector
  size_t bytes = n * sizeof(double);

  // Create a SYCL context for interoperability with the CUDA Runtime API
  // This is temporary until the property extension is implemented
  const bool UsePrimaryContext = true;
  device dev{CUDASelector().select_device()};
  context myContext{dev, {}, UsePrimaryContext};
  queue myQueue{myContext, dev};

  // Allocate USM shared memory for each vector, accessible on host and device
  auto d_A = reinterpret_cast<double *>(malloc_shared(bytes, myQueue));
  auto d_B = reinterpret_cast<double *>(malloc_shared(bytes, myQueue));
  auto d_C = reinterpret_cast<double *>(malloc_shared(bytes, myQueue));

  // Initialize vectors on host
  for (int i = 0; i < n; i++) {
    d_A[i] = sin(i) * sin(i);
    d_B[i] = cos(i) * cos(i);
  }

  myQueue.submit([&](handler &h) {
    h.interop_task([=](interop_handler ih) {
      // Number of threads in each thread block
      int blockSize = 1024;

      // Number of thread blocks in grid
      int gridSize = static_cast<int>(ceil(static_cast<float>(n) / blockSize));

      // Execute the kernel
      vecAdd<<<gridSize, blockSize>>>(d_A, d_B, d_C, n);
    });
  });

  myQueue.wait();

  // Sum up vector c and print result divided by n; this should equal 1
  // within error
  double sum = 0;
  for (int i = 0; i < n; i++) {
    sum += d_C[i];
  }
  std::cout << "Final result " << sum / n << std::endl;

  free(d_A, myContext);
  free(d_B, myContext);
  free(d_C, myContext);

  return 0;
}
