Skip to content

Commit 0bef3dc

Browse files
authored
Combining CUDA and SYCL on the same program (#14)
1 parent 130a487 commit 0bef3dc

File tree

4 files changed

+157
-2
lines changed

4 files changed

+157
-2
lines changed

example-05/Makefile

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,21 @@
1+
# Build a single program from SYCL (.cpp) and CUDA (.cu) translation units,
# compiling both with the same clang++ driver and linking against the static
# CUDA runtime.
SYCLCXX=clang++
SYCLFLAGS=-O2 -fsycl -fsycl-targets=nvptx64-nvidia-cuda-sycldevice -fsycl-unnamed-lambda
OBJS=main.o vadd_sycl.o vadd_cuda.o
CUFLAGS=--cuda-gpu-arch=sm_80 -std=c++11

# SYCL sources are compiled with the SYCL offload flags.
%.o: %.cpp
	${SYCLCXX} ${SYCLFLAGS} -c -o $@ $<

# CUDA sources are compiled by the same driver in CUDA mode.
%.o: %.cu
	${SYCLCXX} ${CUFLAGS} -c -o $@ $<

# Link step needs both flag sets plus the CUDA runtime library.
main.exe: ${OBJS}
	${SYCLCXX} ${SYCLFLAGS} ${CUFLAGS} ${OBJS} -L/usr/local/cuda/lib64 -lcudart_static -ldl -lrt -pthread -o $@

# BUG FIX: clean previously removed only the object files, leaving the
# main.exe binary behind; remove it as well.
clean:
	rm -f ${OBJS} main.exe
18+
119
MPICOMP = mpicxx -I$(HOME)/sycl_workspace/build_dpcpp/install/include/sycl/ -I$(HOME)/sycl-blas/include -I$(HOME)/sycl-blas/external/computecpp-sdk/include/ -L$(HOME)/sycl-blas/build -O3 -fsycl-unnamed-lambda -std=c++17 -fsycl -fsycl-targets=nvptx64-nvidia-cuda-sycldevice -lsycl_blas
220

321
distributed-batch-gemm: distributed-batch-gemm.o
@@ -11,5 +29,4 @@ run: distributed-batch-gemm
1129

1230
.PHONY: clean
1331

14-
clean:
15-
rm -f distributed-batch-gemm *.o
32+

example-05/main.cpp

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
#include <array>
2+
#include <iostream>
3+
4+
template <typename T, size_t N>
5+
void simple_vadd_sycl(const std::array<T, N>& VA, const std::array<T, N>& VB,
6+
std::array<T, N>& VC);
7+
8+
template <typename T, size_t N>
9+
void simple_vadd_cuda(const std::array<T, N>& VA, const std::array<T, N>& VB,
10+
std::array<T, N>& VC);
11+
12+
// Entry point: runs the same element-wise vector addition through the SYCL
// back end (int data) and the CUDA back end (float data), then verifies both
// results on the host. Returns 0 on success, 1 on the first mismatch.
int main() {
  const size_t array_size = 4;
  std::array<int, array_size> A = {{1, 2, 3, 4}},
                              B = {{1, 2, 3, 4}}, C;
  std::array<float, array_size> D = {{1.f, 2.f, 3.f, 4.f}},
                                E = {{1.f, 2.f, 3.f, 4.f}}, F;
  simple_vadd_sycl(A, B, C);
  simple_vadd_cuda(D, E, F);
  for (unsigned int i = 0; i < array_size; i++) {
    if (C[i] != A[i] + B[i]) {
      // BUG FIX: the original message opened "(element" but never emitted the
      // closing ')', producing unbalanced output.
      std::cout << "The results are incorrect (element " << i << " is " << C[i]
                << ")!\n";
      return 1;
    }
    if (F[i] != D[i] + E[i]) {
      // Same fix as above for the CUDA result check.
      std::cout << "The results are incorrect (element " << i << " is " << F[i]
                << ")!\n";
      return 1;
    }
  }
  std::cout << "The results are correct!\n";
  return 0;
}

example-05/vadd_cuda.cu

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
#include <array>
2+
3+
// CUDA kernel: element-wise vector addition, c = a + b.
// Launch layout: 1-D grid of 1-D blocks; each thread owns one output element.
template<class T>
__global__ void vecAdd(T *a, T *b, T *c, int n)
{
    // Flat global index of this thread within the whole grid.
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard against the partial block at the tail of the grid.
    if (gid < n) {
        c[gid] = a[gid] + b[gid];
    }
}
14+
15+
// Host wrapper: adds two N-element host arrays on the GPU and copies the
// result back into VC. Uses the default stream; the final blocking
// cudaMemcpy orders the copy-back after the kernel, so no explicit
// synchronization is required before returning.
template <typename T, size_t N>
void simple_vadd_cuda(const std::array<T, N>& VA, const std::array<T, N>& VB,
                      std::array<T, N>& VC) {
  // Device input vectors
  T *d_a;
  T *d_b;
  // Device output vector
  T *d_c;

  // Size, in bytes, of each vector
  const size_t bytes = N * sizeof(T);

  // Allocate memory for each vector on GPU.
  // NOTE(review): no CUDA API return code is checked anywhere in this
  // function; an allocation or copy failure is silently ignored. Consider
  // wrapping each call in an error-checking macro.
  cudaMalloc(&d_a, bytes);
  cudaMalloc(&d_b, bytes);
  cudaMalloc(&d_c, bytes);

  // Copy host vectors to device
  cudaMemcpy(d_a, VA.data(), bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, VB.data(), bytes, cudaMemcpyHostToDevice);

  int blockSize, gridSize;

  // Number of threads in each thread block
  blockSize = 1024;

  // Number of thread blocks in grid.
  // BUG FIX: was (int)ceil((float)N / blockSize); float has only 24 bits of
  // mantissa, so the result can round down for large N and leave tail
  // elements unprocessed. Integer ceiling division is exact.
  gridSize = (int)((N + blockSize - 1) / blockSize);

  // Execute the kernel
  vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, N);

  // Copy array back to host (blocking; also orders after the kernel above)
  cudaMemcpy(VC.data(), d_c, bytes, cudaMemcpyDeviceToHost);

  // Release device memory
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
}
56+
57+
58+
// Explicit instantiations for the two element types used by main.cpp; the
// template definition lives only in this translation unit.
template void simple_vadd_cuda<float, 4>(const std::array<float, 4>& VA,
                                         const std::array<float, 4>& VB,
                                         std::array<float, 4>& VC);
template void simple_vadd_cuda<int, 4>(const std::array<int, 4>& VA,
                                       const std::array<int, 4>& VB,
                                       std::array<int, 4>& VC);
62+

example-05/vadd_sycl.cpp

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
2+
/* This example is a very small one designed to show how compact SYCL code
3+
* can be. That said, it includes no error checking and is rather terse. */
4+
#include <CL/sycl.hpp>
5+
6+
#include <array>
7+
#include <iostream>
8+
9+
constexpr cl::sycl::access::mode sycl_read = cl::sycl::access::mode::read;
10+
constexpr cl::sycl::access::mode sycl_write = cl::sycl::access::mode::write;
11+
12+
/* This is the class used to name the kernel for the runtime.
13+
* This must be done when the kernel is expressed as a lambda. */
14+
template <typename T>
15+
class SimpleVadd;
16+
17+
template <typename T, size_t N>
18+
void simple_vadd_sycl(const std::array<T, N>& VA, const std::array<T, N>& VB,
19+
std::array<T, N>& VC) {
20+
cl::sycl::queue deviceQueue;
21+
cl::sycl::range<1> numOfItems{N};
22+
cl::sycl::buffer<T, 1> bufferA(VA.data(), numOfItems);
23+
cl::sycl::buffer<T, 1> bufferB(VB.data(), numOfItems);
24+
cl::sycl::buffer<T, 1> bufferC(VC.data(), numOfItems);
25+
26+
deviceQueue.submit([&](cl::sycl::handler& cgh) {
27+
auto accessorA = bufferA.template get_access<sycl_read>(cgh);
28+
auto accessorB = bufferB.template get_access<sycl_read>(cgh);
29+
auto accessorC = bufferC.template get_access<sycl_write>(cgh);
30+
31+
auto kern = [=](cl::sycl::id<1> wiID) {
32+
accessorC[wiID] = accessorA[wiID] + accessorB[wiID];
33+
};
34+
cgh.parallel_for<class SimpleVadd<T>>(numOfItems, kern);
35+
});
36+
}
37+
38+
// Explicit instantiations for the element types main.cpp links against; the
// template definition lives only in this translation unit.
template void simple_vadd_sycl<float, 4>(const std::array<float, 4>& VA,
                                         const std::array<float, 4>& VB,
                                         std::array<float, 4>& VC);
template void simple_vadd_sycl<int, 4>(const std::array<int, 4>& VA,
                                       const std::array<int, 4>& VB,
                                       std::array<int, 4>& VC);
42+

0 commit comments

Comments
 (0)