
Commit 1416d9e

Update CUDA examples to work with recent DPC++ (#12)
* Update CUDA examples to work with recent DPC++
* Addressing feedback from reviews
1 parent 92e5194 commit 1416d9e

File tree

7 files changed (+131, −19 lines)

example-02/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -29,4 +29,4 @@ target_compile_options(sycl_sgemm PUBLIC ${SYCL_FLAGS})
 target_compile_definitions(sycl_sgemm PUBLIC CUDA_NO_HALF)
 target_link_libraries(sycl_sgemm PUBLIC ${SYCL_FLAGS})
 target_include_directories(sycl_sgemm PUBLIC ${SYCL_INCLUDE_DIR} ${CUDA_INCLUDE_DIRS})
-target_link_libraries(sycl_sgemm PUBLIC CUDA::toolkit CUDA::cublas)
+target_link_libraries(sycl_sgemm PUBLIC CUDA::toolkit CUDA::cuda_driver CUDA::cublas)
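
The added CUDA::cuda_driver link target tracks the switch in sycl_sgemm.cpp to CUDA Driver API calls (cuCtxSetCurrent), which resolve from libcuda rather than the runtime library. As a rough illustration of what that dependency means, here is a minimal, hypothetical stand-alone snippet (not part of the repository) that only links when the driver library is available:

// Hypothetical stand-alone check (not in the repo): the Driver API call used
// by the updated host task, cuCtxSetCurrent, lives in libcuda, which is what
// the CUDA::cuda_driver imported target links against.
#include <cuda.h>
#include <cstdio>

int main() {
  cuInit(0);                            // Driver API needs explicit init
  CUdevice dev;
  cuDeviceGet(&dev, 0);                 // first visible CUDA device
  CUcontext ctx;
  cuDevicePrimaryCtxRetain(&ctx, dev);  // reuse the device's primary context
  cuCtxSetCurrent(ctx);                 // same call the host task now makes
  std::printf("CUDA driver context bound\n");
  cuDevicePrimaryCtxRelease(dev);
  return 0;
}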

example-02/sgemm.cu

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ void inline checkCudaErrorMsg(cublasStatus_t status, const char *msg) {
 }
 
 void inline checkCudaErrorMsg(cudaError status, const char *msg) {
-  if (status != CUDA_SUCCESS) {
+  if (status != cudaSuccess) {
     std::cout << msg << " - " << status << std::endl;
     exit(EXIT_FAILURE);
   }
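
This one-line fix matters because cudaError (Runtime API) and CUresult (Driver API) are distinct enums: the Runtime API's success value is cudaSuccess, while CUDA_SUCCESS belongs to CUresult. Comparing a cudaError against CUDA_SUCCESS happens to compile (both success values are 0) but mixes the two APIs. A small self-contained sketch of the pattern, assuming both headers are available:

// Sketch only: overloads that keep the two CUDA error domains separate.
#include <cstdlib>
#include <iostream>
#include <cuda.h>          // Driver API: CUresult, CUDA_SUCCESS
#include <cuda_runtime.h>  // Runtime API: cudaError_t, cudaSuccess

void inline checkCudaErrorMsg(cudaError_t status, const char *msg) {
  if (status != cudaSuccess) {   // Runtime API success enumerator
    std::cout << msg << " - " << status << std::endl;
    exit(EXIT_FAILURE);
  }
}

void inline checkCudaErrorMsg(CUresult status, const char *msg) {
  if (status != CUDA_SUCCESS) {  // Driver API success enumerator
    std::cout << msg << " - " << status << std::endl;
    exit(EXIT_FAILURE);
  }
}

int main() {
  // Each call picks the matching overload; exits early with a message
  // if no CUDA driver/device is present.
  checkCudaErrorMsg(cudaFree(nullptr), "cudaFree");  // Runtime API overload
  checkCudaErrorMsg(cuInit(0), "cuInit");            // Driver API overload
  return 0;
}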

example-02/sycl_sgemm.cpp

Lines changed: 15 additions & 8 deletions
@@ -12,14 +12,21 @@
 
 void inline checkCudaErrorMsg(cublasStatus_t status, const char *msg) {
   if (status != CUBLAS_STATUS_SUCCESS) {
-    std::cout << msg << " - " << status << std::endl;
+    std::cout << "ERROR CUBLAS:" << msg << " - " << status << std::endl;
     exit(EXIT_FAILURE);
   }
 }
 
 void inline checkCudaErrorMsg(cudaError status, const char *msg) {
+  if (status != cudaSuccess) {
+    std::cout << "ERROR CUDA: " << msg << " - " << status << std::endl;
+    exit(EXIT_FAILURE);
+  }
+}
+
+void inline checkCudaErrorMsg(CUresult status, const char *msg) {
   if (status != CUDA_SUCCESS) {
-    std::cout << msg << " - " << status << std::endl;
+    std::cout << "ERROR CUDA: " << msg << " - " << status << std::endl;
     exit(EXIT_FAILURE);
   }
 }
@@ -76,12 +83,12 @@ int main() {
       auto d_B = b_B.get_access<sycl::access::mode::read>(h);
       auto d_C = b_C.get_access<sycl::access::mode::write>(h);
 
-      h.interop_task([=](sycl::interop_handler ih) {
-        cublasSetStream(handle, ih.get_queue<backend::cuda>());
-
-        auto cuA = reinterpret_cast<float *>(ih.get_mem<backend::cuda>(d_A));
-        auto cuB = reinterpret_cast<float *>(ih.get_mem<backend::cuda>(d_B));
-        auto cuC = reinterpret_cast<float *>(ih.get_mem<backend::cuda>(d_C));
+      h.codeplay_host_task([=](sycl::interop_handle ih) {
+        cuCtxSetCurrent(ih.get_native_context<backend::cuda>());
+        cublasSetStream(handle, ih.get_native_queue<backend::cuda>());
+        auto cuA = reinterpret_cast<float *>(ih.get_native_mem<backend::cuda>(d_A));
+        auto cuB = reinterpret_cast<float *>(ih.get_native_mem<backend::cuda>(d_B));
+        auto cuC = reinterpret_cast<float *>(ih.get_native_mem<backend::cuda>(d_C));
 
       CHECK_ERROR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WIDTH, HEIGHT,
                               WIDTH, &ALPHA, cuA, WIDTH, cuB, WIDTH, &BETA,
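
Read together, the two hunks move the example from the removed interop_task/interop_handler mechanism to DPC++'s host-task interop: the cuBLAS call now runs inside codeplay_host_task, which receives an interop_handle exposing the native CUDA context, stream, and memory objects. A condensed sketch of the resulting submission (a fragment, reusing the buffers, handle, and constants declared earlier in the file; the extension spellings follow the DPC++ revision this commit targets):

// Fragment (relies on q, b_A/b_B/b_C, handle, WIDTH, HEIGHT, ALPHA, BETA
// declared earlier in sycl_sgemm.cpp).
q.submit([&](sycl::handler &h) {
  auto d_A = b_A.get_access<sycl::access::mode::read>(h);
  auto d_B = b_B.get_access<sycl::access::mode::read>(h);
  auto d_C = b_C.get_access<sycl::access::mode::write>(h);

  h.codeplay_host_task([=](sycl::interop_handle ih) {
    // Bind the queue's CUDA context to this host thread, then point cuBLAS
    // at the queue's native stream so the GEMM is ordered with SYCL work.
    cuCtxSetCurrent(ih.get_native_context<backend::cuda>());
    cublasSetStream(handle, ih.get_native_queue<backend::cuda>());

    // Unwrap the SYCL accessors into raw device pointers for cuBLAS.
    auto cuA = reinterpret_cast<float *>(ih.get_native_mem<backend::cuda>(d_A));
    auto cuB = reinterpret_cast<float *>(ih.get_native_mem<backend::cuda>(d_B));
    auto cuC = reinterpret_cast<float *>(ih.get_native_mem<backend::cuda>(d_C));

    // C = ALPHA * A * B + BETA * C (column-major, WIDTH x HEIGHT).
    CHECK_ERROR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WIDTH, HEIGHT,
                            WIDTH, &ALPHA, cuA, WIDTH, cuB, WIDTH, &BETA,
                            cuC, WIDTH));
  });
});

Apart from the renamed entry points, the main behavioural difference is that the body executes on a host thread once the accessor dependencies are satisfied, which is why the driver context must be made current explicitly before any CUDA call.
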
Lines changed: 111 additions & 0 deletions
@@ -0,0 +1,111 @@
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include <CL/sycl/backend/cuda.hpp>
+
+#include <cublas_v2.h>
+#include <cuda.h>
+
+#define CHECK_ERROR(FUNC) checkCudaErrorMsg(FUNC, " " #FUNC)
+
+void inline checkCudaErrorMsg(cublasStatus_t status, const char *msg) {
+  if (status != CUBLAS_STATUS_SUCCESS) {
+    std::cout << msg << " - " << status << std::endl;
+    exit(EXIT_FAILURE);
+  }
+}
+
+void inline checkCudaErrorMsg(cudaError status, const char *msg) {
+  if (status != CUDA_SUCCESS) {
+    std::cout << msg << " - " << status << std::endl;
+    exit(EXIT_FAILURE);
+  }
+}
+
+class CUDASelector : public sycl::device_selector {
+public:
+  int operator()(const sycl::device &Device) const override {
+    using namespace sycl::info;
+
+    const std::string DriverVersion = Device.get_info<device::driver_version>();
+
+    if (Device.is_gpu() && (DriverVersion.find("CUDA") != std::string::npos)) {
+      std::cout << " CUDA device found " << std::endl;
+      return 1;
+    };
+    return -1;
+  }
+};
+
+int main() {
+  constexpr size_t WIDTH = 1024;
+  constexpr size_t HEIGHT = 1024;
+  constexpr float ALPHA = 1.0f;
+  constexpr float BETA = 0.0f;
+
+  std::vector<float> h_A(WIDTH * HEIGHT), h_B(WIDTH * HEIGHT),
+      h_C(WIDTH * HEIGHT);
+
+  std::cout << "Size: " << h_C.size() << std::endl;
+
+  // A is an identity matrix
+  std::fill(std::begin(h_A), std::end(h_A), 0.0f);
+  for (size_t i = 0; i < WIDTH; i++) {
+    h_A[i * WIDTH + i] = 1.0f;
+  }
+
+  // B is a matrix fill with 1
+  std::fill(std::begin(h_B), std::end(h_B), 1.0f);
+
+  sycl::queue q{CUDASelector()};
+
+  cublasHandle_t handle;
+  CHECK_ERROR(cublasCreate(&handle));
+
+  {
+    sycl::buffer<float, 2> b_A{h_A.data(), range<2>{WIDTH, HEIGHT}};
+    sycl::buffer<float, 2> b_B{h_B.data(), range<2>{WIDTH, HEIGHT}};
+    sycl::buffer<float, 2> b_C{h_C.data(), range<2>{WIDTH, HEIGHT}};
+
+    q.submit([&](sycl::handler &h) {
+      auto d_A = b_A.get_access<sycl::access::mode::read>(h);
+      auto d_B = b_B.get_access<sycl::access::mode::read>(h);
+      auto d_C = b_C.get_access<sycl::access::mode::write>(h);
+
+      h.interop_task([=](sycl::interop_handler ih) {
+        cublasSetStream(handle, ih.get_queue<backend::cuda>());
+
+        auto cuA = reinterpret_cast<float *>(ih.get_mem<backend::cuda>(d_A));
+        auto cuB = reinterpret_cast<float *>(ih.get_mem<backend::cuda>(d_B));
+        auto cuC = reinterpret_cast<float *>(ih.get_mem<backend::cuda>(d_C));
+
+        CHECK_ERROR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WIDTH, HEIGHT,
+                                WIDTH, &ALPHA, cuA, WIDTH, cuB, WIDTH, &BETA,
+                                cuC, WIDTH));
+      });
+    });
+  }
+
+  // C must be all ones
+  int i = 0;
+  const bool allEqual =
+      std::all_of(std::begin(h_C), std::end(h_C), [&i](float num) {
+        ++i;
+        if (num != 1) {
+          std::cout << i << " Not one : " << num << std::endl;
+        }
+        return num == 1;
+      });
+
+  if (!allEqual) {
+    std::cout << " Incorrect result " << std::endl;
+  } else {
+    std::cout << " Correct! " << std::endl;
+  }
+
+  CHECK_ERROR(cublasDestroy(handle));
+
+  return allEqual ? EXIT_SUCCESS : EXIT_FAILURE;
+}

example-03/Makefile

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@ CUDAFLAGS=--cuda-gpu-arch=sm_30
 
 CXXFLAGS=-std=c++17 ${CUDAFLAGS} -I${SYCL_INCLUDE} -g
 
-LIBS=-L${SYCL_ROOT_DIR}/include/lib -lOpenCL -lsycl -L${CUDA_ROOT_DIR}/lib64 -lcudart
+LIBS=-L${SYCL_ROOT_DIR}/lib -lOpenCL -lsycl -L${CUDA_ROOT_DIR}/lib64 -lcudart
 
 default: vec_add.exe usm_vec_add.exe
 
example-03/vec_add.cu

Lines changed: 1 addition & 4 deletions
@@ -39,11 +39,8 @@ int main(int argc, char *argv[]) {
   // Size of vectors
   int n = 100000;
 
-  // Create a SYCL context for interoperability with CUDA Runtime API
-  // This is temporary until the property extension is implemented
-  const bool UsePrimaryContext = true;
   device dev{CUDASelector().select_device()};
-  context myContext{dev, {}, UsePrimaryContext};
+  context myContext{dev};
   queue myQueue{myContext, dev};
 
   {

example-03/vec_add_usm.cu

Lines changed: 1 addition & 4 deletions
@@ -39,11 +39,8 @@ int main(int argc, char *argv[]) {
   // Size, in bytes, of each vector
   size_t bytes = n * sizeof(double);
 
-  // Create a SYCL context for interoperability with CUDA Runtime API
-  // This is temporary until the property extension is implemented
-  const bool UsePrimaryContext = true;
   device dev{CUDASelector().select_device()};
-  context myContext{dev, {}, UsePrimaryContext};
+  context myContext{dev};
   queue myQueue{myContext, dev};
 
   // Allocate memory for each vector on host
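
Both vector-add examples drop the UsePrimaryContext flag, which the removed comment itself described as a temporary workaround; with the DPC++ revision targeted here, the plain device constructor is enough, so the context and queue setup reduces to the sketch below (CUDASelector is the selector already defined in these examples):

// Condensed from vec_add.cu / vec_add_usm.cu after the change.
device dev{CUDASelector().select_device()};
context myContext{dev};            // no extra property/flag needed any more
queue myQueue{myContext, dev};     // queue bound to that context and device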
