Skip to content

Commit 0bef3dc

Browse files
authored
Combining CUDA and SYCL on the same program (#14)
1 parent 130a487 commit 0bef3dc

File tree

4 files changed

+157
-2
lines changed

4 files changed

+157
-2
lines changed

example-05/Makefile

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,21 @@
1+
# Build a single program from SYCL (.cpp) and CUDA (.cu) translation units,
# compiling both with the same clang++ driver and linking against the static
# CUDA runtime.
SYCLCXX=clang++
SYCLFLAGS=-O2 -fsycl -fsycl-targets=nvptx64-nvidia-cuda-sycldevice -fsycl-unnamed-lambda
OBJS=main.o vadd_sycl.o vadd_cuda.o
CUFLAGS=--cuda-gpu-arch=sm_80 -std=c++11

# SYCL sources are compiled with the SYCL offload flags.
%.o: %.cpp
	${SYCLCXX} ${SYCLFLAGS} -c -o $@ $<

# CUDA sources are compiled by the same driver in CUDA mode.
%.o: %.cu
	${SYCLCXX} ${CUFLAGS} -c -o $@ $<

# Link step needs both flag sets plus the CUDA runtime library.
main.exe: ${OBJS}
	${SYCLCXX} ${SYCLFLAGS} ${CUFLAGS} ${OBJS} -L/usr/local/cuda/lib64 -lcudart_static -ldl -lrt -pthread -o $@

# BUG FIX: clean previously removed only the object files, leaving the
# main.exe binary behind; remove it as well.
clean:
	rm -f ${OBJS} main.exe
18+
119
MPICOMP = mpicxx -I$(HOME)/sycl_workspace/build_dpcpp/install/include/sycl/ -I$(HOME)/sycl-blas/include -I$(HOME)/sycl-blas/external/computecpp-sdk/include/ -L$(HOME)/sycl-blas/build -O3 -fsycl-unnamed-lambda -std=c++17 -fsycl -fsycl-targets=nvptx64-nvidia-cuda-sycldevice -lsycl_blas
220

321
distributed-batch-gemm: distributed-batch-gemm.o
@@ -11,5 +29,4 @@ run: distributed-batch-gemm
1129

1230
.PHONY: clean
1331

14-
clean:
15-
rm -f distributed-batch-gemm *.o
32+

example-05/main.cpp

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
#include <array>
2+
#include <iostream>
3+
4+
template <typename T, size_t N>
5+
void simple_vadd_sycl(const std::array<T, N>& VA, const std::array<T, N>& VB,
6+
std::array<T, N>& VC);
7+
8+
template <typename T, size_t N>
9+
void simple_vadd_cuda(const std::array<T, N>& VA, const std::array<T, N>& VB,
10+
std::array<T, N>& VC);
11+
12+
// Entry point: runs the same element-wise vector addition through the SYCL
// back end (int data) and the CUDA back end (float data), then verifies both
// results on the host. Returns 0 on success, 1 on the first mismatch.
int main() {
  const size_t array_size = 4;
  std::array<int, array_size> A = {{1, 2, 3, 4}},
                              B = {{1, 2, 3, 4}}, C;
  std::array<float, array_size> D = {{1.f, 2.f, 3.f, 4.f}},
                                E = {{1.f, 2.f, 3.f, 4.f}}, F;
  simple_vadd_sycl(A, B, C);
  simple_vadd_cuda(D, E, F);
  for (unsigned int i = 0; i < array_size; i++) {
    if (C[i] != A[i] + B[i]) {
      // BUG FIX: the original message opened "(element" but never emitted the
      // closing ')', producing unbalanced output.
      std::cout << "The results are incorrect (element " << i << " is " << C[i]
                << ")!\n";
      return 1;
    }
    if (F[i] != D[i] + E[i]) {
      // Same fix as above for the CUDA result check.
      std::cout << "The results are incorrect (element " << i << " is " << F[i]
                << ")!\n";
      return 1;
    }
  }
  std::cout << "The results are correct!\n";
  return 0;
}

example-05/vadd_cuda.cu

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
#include <array>
2+
3+
// CUDA kernel: element-wise vector addition, c = a + b.
// Launch layout: 1-D grid of 1-D blocks; each thread owns one output element.
template<class T>
__global__ void vecAdd(T *a, T *b, T *c, int n)
{
    // Flat global index of this thread within the whole grid.
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard against the partial block at the tail of the grid.
    if (gid < n) {
        c[gid] = a[gid] + b[gid];
    }
}
14+
15+
// Host wrapper: adds two N-element host arrays on the GPU and copies the
// result back into VC. Uses the default stream; the final blocking
// cudaMemcpy orders the copy-back after the kernel, so no explicit
// synchronization is required before returning.
template <typename T, size_t N>
void simple_vadd_cuda(const std::array<T, N>& VA, const std::array<T, N>& VB,
                      std::array<T, N>& VC) {
  // Device input vectors
  T *d_a;
  T *d_b;
  // Device output vector
  T *d_c;

  // Size, in bytes, of each vector
  const size_t bytes = N * sizeof(T);

  // Allocate memory for each vector on GPU.
  // NOTE(review): no CUDA API return code is checked anywhere in this
  // function; an allocation or copy failure is silently ignored. Consider
  // wrapping each call in an error-checking macro.
  cudaMalloc(&d_a, bytes);
  cudaMalloc(&d_b, bytes);
  cudaMalloc(&d_c, bytes);

  // Copy host vectors to device
  cudaMemcpy(d_a, VA.data(), bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, VB.data(), bytes, cudaMemcpyHostToDevice);

  int blockSize, gridSize;

  // Number of threads in each thread block
  blockSize = 1024;

  // Number of thread blocks in grid.
  // BUG FIX: was (int)ceil((float)N / blockSize); float has only 24 bits of
  // mantissa, so the result can round down for large N and leave tail
  // elements unprocessed. Integer ceiling division is exact.
  gridSize = (int)((N + blockSize - 1) / blockSize);

  // Execute the kernel
  vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, N);

  // Copy array back to host (blocking; also orders after the kernel above)
  cudaMemcpy(VC.data(), d_c, bytes, cudaMemcpyDeviceToHost);

  // Release device memory
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
}
56+
57+
58+
// Explicit instantiations for the two element types used by main.cpp; the
// template definition lives only in this translation unit.
template void simple_vadd_cuda<float, 4>(const std::array<float, 4>& VA,
                                         const std::array<float, 4>& VB,
                                         std::array<float, 4>& VC);
template void simple_vadd_cuda<int, 4>(const std::array<int, 4>& VA,
                                       const std::array<int, 4>& VB,
                                       std::array<int, 4>& VC);
62+

example-05/vadd_sycl.cpp

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
2+
/* This example is a very small one designed to show how compact SYCL code
3+
* can be. That said, it includes no error checking and is rather terse. */
4+
#include <CL/sycl.hpp>
5+
6+
#include <array>
7+
#include <iostream>
8+
9+
constexpr cl::sycl::access::mode sycl_read = cl::sycl::access::mode::read;
10+
constexpr cl::sycl::access::mode sycl_write = cl::sycl::access::mode::write;
11+
12+
/* This is the class used to name the kernel for the runtime.
13+
* This must be done when the kernel is expressed as a lambda. */
14+
template <typename T>
15+
class SimpleVadd;
16+
17+
template <typename T, size_t N>
18+
void simple_vadd_sycl(const std::array<T, N>& VA, const std::array<T, N>& VB,
19+
std::array<T, N>& VC) {
20+
cl::sycl::queue deviceQueue;
21+
cl::sycl::range<1> numOfItems{N};
22+
cl::sycl::buffer<T, 1> bufferA(VA.data(), numOfItems);
23+
cl::sycl::buffer<T, 1> bufferB(VB.data(), numOfItems);
24+
cl::sycl::buffer<T, 1> bufferC(VC.data(), numOfItems);
25+
26+
deviceQueue.submit([&](cl::sycl::handler& cgh) {
27+
auto accessorA = bufferA.template get_access<sycl_read>(cgh);
28+
auto accessorB = bufferB.template get_access<sycl_read>(cgh);
29+
auto accessorC = bufferC.template get_access<sycl_write>(cgh);
30+
31+
auto kern = [=](cl::sycl::id<1> wiID) {
32+
accessorC[wiID] = accessorA[wiID] + accessorB[wiID];
33+
};
34+
cgh.parallel_for<class SimpleVadd<T>>(numOfItems, kern);
35+
});
36+
}
37+
38+
// Explicit instantiations for the element types main.cpp links against; the
// template definition lives only in this translation unit.
template void simple_vadd_sycl<float, 4>(const std::array<float, 4>& VA,
                                         const std::array<float, 4>& VB,
                                         std::array<float, 4>& VC);
template void simple_vadd_sycl<int, 4>(const std::array<int, 4>& VA,
                                       const std::array<int, 4>& VB,
                                       std::array<int, 4>& VC);
42+

0 commit comments

Comments
 (0)