
Commit ff43eb0

Merge pull request #4 from codeplaysoftware/example-03
[SYCL] Example of pure CUDA SYCL application
2 parents da2fa7d + e628024 commit ff43eb0

File tree

5 files changed: +287 −0 lines changed


README.md

Lines changed: 6 additions & 0 deletions
@@ -9,3 +9,9 @@ experimental support for CUDA in the DPC++ SYCL implementation.

CUDA is a registered trademark of NVIDIA Corporation
SYCL is a trademark of the Khronos Group Inc

Docker Image
------------

There is a Docker image available with all the examples and the required
environment set up; see https://hub.docker.com/r/ruyman/dpcpp_cuda_examples.
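
For a quick start, pulling and running the image might look like the sketch
below. The exact `docker run` flags (GPU passthrough in particular) depend on
your Docker and NVIDIA container runtime setup, so treat them as an assumption
rather than the documented invocation:

```sh
# Hypothetical invocation; adjust the GPU flags for your Docker/NVIDIA setup.
docker pull ruyman/dpcpp_cuda_examples
docker run --gpus all -it ruyman/dpcpp_cuda_examples
```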

example-03/Makefile

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
CUDACXX=${SYCL_ROOT}/bin/clang++

SYCL_INCLUDE=${SYCL_ROOT}/include/sycl/

CUDAFLAGS=--cuda-gpu-arch=sm_30

CXXFLAGS=-std=c++17 ${CUDAFLAGS} -I${SYCL_INCLUDE} -g

CUDA_ROOT=/usr/local/cuda/

LIBS=-L${SYCL_ROOT}/lib -lOpenCL -lsycl -L${CUDA_ROOT}/lib64 -lcudart

default: vec_add.exe usm_vec_add.exe

vec_add.exe: vec_add.cu
	${CUDACXX} ${CXXFLAGS} $< ${LIBS} -o $@

usm_vec_add.exe: vec_add_usm.cu
	${CUDACXX} ${CXXFLAGS} $< ${LIBS} -o $@

clean:
	rm -f vec_add.exe usm_vec_add.exe

example-03/README.md

Lines changed: 68 additions & 0 deletions
@@ -0,0 +1,68 @@
Example 03: Calling CUDA kernels from SYCL
==========================================

In this example, we reuse the trivial SYCL kernel from Example 1, but instead
of writing a SYCL variant we keep the original CUDA kernel and only replace
the CUDA Runtime calls with the SYCL API.

This variant uses the buffer and accessor syntax, which is more verbose but
allows the runtime to build the implicit dependency DAG.
A USM variant is presented for exposition only; support for USM on the CUDA
backend is unstable at the time of writing.

Pre-requisites
--------------

You will need an installation of DPC++ with CUDA support; see the
[Getting Started Guide](https://github.com/intel/llvm/blob/sycl/sycl/doc/GetStartedGuide.md#build-dpc-toolchain-with-support-for-nvidia-cuda)
for details on how to build it.

The example is built using Makefiles, since no CMake release yet supports
changing the CUDA compiler from nvcc.

Building the example
--------------------

```sh
$ SYCL_ROOT=/path/to/dpcpp make
```

This compiles the SYCL code with the LLVM CUDA support and generates two
binaries.
NVCC is not used, but the CUDA device libraries need to be available in
/usr/local/cuda/lib64/ for linking against the device code.

The NVCC compiler does not support some of the advanced C++17 syntax used in
the SYCL runtime headers.
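
If the CUDA toolkit lives somewhere other than /usr/local/cuda/, the
Makefile's `CUDA_ROOT` variable can be overridden on the `make` command line
(standard `make` behaviour); the path below is only a placeholder:

```sh
$ SYCL_ROOT=/path/to/dpcpp make CUDA_ROOT=/opt/cuda/
```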

Running the example
-------------------

The path to `libsycl.so` and the PI plugins must be in `LD_LIBRARY_PATH`.
A simple way of running the example is as follows:

```
$ LD_LIBRARY_PATH=/path/to/dpcpp/lib:$LD_LIBRARY_PATH ./vec_add.exe
```

Calling CUDA kernels from SYCL
------------------------------

Using Codeplay's `interop_task` extension, the example calls a CUDA kernel from
a SYCL application.
Note that the example is compiled with the LLVM CUDA compiler, not with the
SYCL compiler, since it contains no SYCL kernels. It only needs to link against
the SYCL runtime library so that the application can use the SYCL runtime.

At the time of writing, it is not possible to have both CUDA and SYCL kernels
in the same file.
It is possible, however, to keep CUDA and SYCL kernels in separate files and
call them together from a main application at runtime.

The example uses an extension to the SYCL interface to interact with the
CUDA Runtime API.
At the time of writing the extension is not public, so only a boolean flag
is passed to the `sycl::context` constructor.
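As a condensed illustration of the pattern described above (abbreviated from
`vec_add.cu` below, with the buffer setup, kernel definition, and launch-size
computation omitted), the CUDA kernel is launched from inside an
`interop_task`:

```c++
// Abbreviated from vec_add.cu: launching an existing CUDA kernel from a SYCL
// command group via Codeplay's interop_task extension.
myQueue.submit([&](sycl::handler &h) {
  // Accessors declare the dependencies the SYCL runtime must satisfy.
  auto accA = bA.get_access<sycl::access::mode::read>(h);
  auto accB = bB.get_access<sycl::access::mode::read>(h);
  auto accC = bC.get_access<sycl::access::mode::write>(h);

  h.interop_task([=](sycl::interop_handler ih) {
    // get_mem<backend::cuda> yields the CUDA device pointer behind each accessor.
    auto dA = reinterpret_cast<double *>(ih.get_mem<sycl::backend::cuda>(accA));
    auto dB = reinterpret_cast<double *>(ih.get_mem<sycl::backend::cuda>(accB));
    auto dC = reinterpret_cast<double *>(ih.get_mem<sycl::backend::cuda>(accC));

    // A plain CUDA launch; gridSize/blockSize are computed as in the full source.
    vecAdd<<<gridSize, blockSize>>>(dA, dB, dC, n);
  });
});
```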

example-03/vec_add.cu

Lines changed: 100 additions & 0 deletions
@@ -0,0 +1,100 @@
// Original source reproduced unmodified here from:
// https://github.com/olcf/vector_addition_tutorials/blob/master/CUDA/vecAdd.cu

#include <algorithm>
#include <cmath>
#include <iostream>
#include <vector>

#include <CL/sycl.hpp>
#include <CL/sycl/backend/cuda.hpp>

class CUDASelector : public sycl::device_selector {
public:
  int operator()(const sycl::device &Device) const override {
    using namespace sycl::info;

    const std::string DriverVersion = Device.get_info<device::driver_version>();

    if (Device.is_gpu() && (DriverVersion.find("CUDA") != std::string::npos)) {
      std::cout << " CUDA device found \n";
      return 1;
    }
    return -1;
  }
};

// CUDA kernel. Each thread takes care of one element of c
__global__ void vecAdd(double *a, double *b, double *c, int n) {
  // Get our global thread ID
  int id = blockIdx.x * blockDim.x + threadIdx.x;

  // Make sure we do not go out of bounds
  if (id < n) {
    c[id] = a[id] + b[id];
  }
}

int main(int argc, char *argv[]) {
  using namespace sycl;
  // Size of vectors
  int n = 100000;

  // Create a SYCL context for interoperability with the CUDA Runtime API
  // This is temporary until the property extension is implemented
  const bool UsePrimaryContext = true;
  device dev{CUDASelector().select_device()};
  context myContext{dev, {}, UsePrimaryContext};
  queue myQueue{myContext, dev};

  {
    buffer<double> bA{range<1>(n)};
    buffer<double> bB{range<1>(n)};
    buffer<double> bC{range<1>(n)};

    {
      auto hA = bA.get_access<access::mode::write>();
      auto hB = bB.get_access<access::mode::write>();

      // Initialize vectors on host
      for (int i = 0; i < n; i++) {
        hA[i] = sin(i) * sin(i);
        hB[i] = cos(i) * cos(i);
      }
    }

    // Dispatch a command group with all the dependencies
    myQueue.submit([&](handler &h) {
      auto accA = bA.get_access<access::mode::read>(h);
      auto accB = bB.get_access<access::mode::read>(h);
      auto accC = bC.get_access<access::mode::write>(h);

      h.interop_task([=](interop_handler ih) {
        auto dA = reinterpret_cast<double *>(ih.get_mem<backend::cuda>(accA));
        auto dB = reinterpret_cast<double *>(ih.get_mem<backend::cuda>(accB));
        auto dC = reinterpret_cast<double *>(ih.get_mem<backend::cuda>(accC));

        int blockSize, gridSize;
        // Number of threads in each thread block
        blockSize = 1024;
        // Number of thread blocks in grid
        gridSize = static_cast<int>(ceil(static_cast<float>(n) / blockSize));
        // Call the CUDA kernel directly from SYCL
        vecAdd<<<gridSize, blockSize>>>(dA, dB, dC, n);
      });
    });

    {
      auto hC = bC.get_access<access::mode::read>();
      // Sum up vector c and print result divided by n; this should equal 1
      // within error
      double sum = 0;
      for (int i = 0; i < n; i++) {
        sum += hC[i];
      }
      std::cout << "Final result " << sum / n << std::endl;
    }
  }

  return 0;
}

example-03/vec_add_usm.cu

Lines changed: 88 additions & 0 deletions
@@ -0,0 +1,88 @@
// Original source reproduced unmodified here from:
// https://github.com/olcf/vector_addition_tutorials/blob/master/CUDA/vecAdd.cu

#include <cmath>
#include <iostream>

#include <CL/sycl.hpp>
#include <CL/sycl/backend/cuda.hpp>

class CUDASelector : public sycl::device_selector {
public:
  int operator()(const sycl::device &Device) const override {
    using namespace sycl::info;

    const std::string DriverVersion = Device.get_info<device::driver_version>();

    if (Device.is_gpu() && (DriverVersion.find("CUDA") != std::string::npos)) {
      std::cout << " CUDA device found \n";
      return 1;
    }
    return -1;
  }
};

// CUDA kernel. Each thread takes care of one element of c
__global__ void vecAdd(double *a, double *b, double *c, int n) {
  // Get our global thread ID
  int id = blockIdx.x * blockDim.x + threadIdx.x;

  // Make sure we do not go out of bounds
  if (id < n) {
    c[id] = a[id] + b[id];
  }
}

int main(int argc, char *argv[]) {
  using namespace sycl;
  // Size of vectors
  int n = 100000;

  // Size, in bytes, of each vector
  size_t bytes = n * sizeof(double);

  // Create a SYCL context for interoperability with the CUDA Runtime API
  // This is temporary until the property extension is implemented
  const bool UsePrimaryContext = true;
  device dev{CUDASelector().select_device()};
  context myContext{dev, {}, UsePrimaryContext};
  queue myQueue{myContext, dev};

  // Allocate USM shared memory for each vector, accessible on host and device
  auto d_A = reinterpret_cast<double *>(malloc_shared(bytes, myQueue));
  auto d_B = reinterpret_cast<double *>(malloc_shared(bytes, myQueue));
  auto d_C = reinterpret_cast<double *>(malloc_shared(bytes, myQueue));

  // Initialize vectors on host
  for (int i = 0; i < n; i++) {
    d_A[i] = sin(i) * sin(i);
    d_B[i] = cos(i) * cos(i);
  }

  myQueue.submit([&](handler &h) {
    h.interop_task([=](interop_handler ih) {
      // Number of threads in each thread block
      int blockSize = 1024;

      // Number of thread blocks in grid
      int gridSize = static_cast<int>(ceil(static_cast<float>(n) / blockSize));

      // Execute the kernel
      vecAdd<<<gridSize, blockSize>>>(d_A, d_B, d_C, n);
    });
  });

  myQueue.wait();

  // Sum up vector c and print result divided by n; this should equal 1
  // within error
  double sum = 0;
  for (int i = 0; i < n; i++) {
    sum += d_C[i];
  }
  std::cout << "Final result " << sum / n << std::endl;

  free(d_A, myContext);
  free(d_B, myContext);
  free(d_C, myContext);

  return 0;
}
