Skip to content

Commit c278fcc

Browse files
authored
[SYCL] Initial commit of interop example (#2)
[SYCL] Initial commit of interop example. Adds an example that uses the interop task on the Command Group handler to interoperate with CUBLAS, alongside the original CUBLAS example.
1 parent d0e6be7 commit c278fcc

File tree

5 files changed

+273
-1
lines changed

5 files changed

+273
-1
lines changed

example-01/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ endif(NOT SYCL_ROOT)
1111
set(SYCL_INCLUDE_DIR "${SYCL_ROOT}/lib/clang/11.0.0/include/")
1212
set(SYCL_LIB "${SYCL_ROOT}/lib/libsycl.so")
1313
set(SYCL_FLAGS "-fsycl"
14-
"-fsycl-targets=nvptx64-nvidia-cuda-sycldevice,spir64-unknown-linux-sycldevice"
14+
"-fsycl-targets=nvptx64-nvidia-cuda-sycldevice"
1515
"-fsycl-unnamed-lambda")
1616

1717
# Build the CUDA code

example-02/CMakeLists.txt

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
cmake_minimum_required(VERSION 3.17 FATAL_ERROR)
2+
project(sycl_cuda_interop LANGUAGES CXX CUDA)
3+
4+
find_package(CUDAToolkit)
5+
6+
# SYCL installation
7+
if (NOT SYCL_ROOT)
8+
message(FATAL_ERROR "No SYCL installation detected")
9+
endif(NOT SYCL_ROOT)
10+
11+
set(SYCL_INCLUDE_DIR "${SYCL_ROOT}/lib/clang/11.0.0/include/")
12+
set(SYCL_LIB "${SYCL_ROOT}/lib/libsycl.so")
13+
set(SYCL_FLAGS "-fsycl"
14+
"-fsycl-targets=nvptx64-nvidia-cuda-sycldevice"
15+
"-fsycl-unnamed-lambda")
16+
17+
18+
# Build the CUDA code
19+
add_executable(sgemm_cuda sgemm.cu)
20+
target_compile_features(sgemm_cuda PUBLIC cxx_std_11)
21+
set_target_properties(sgemm_cuda PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
22+
set_property(TARGET sgemm_cuda PROPERTY BUILD_RPATH "${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}")
23+
target_link_libraries(sgemm_cuda CUDA::toolkit CUDA::cublas)
24+
25+
# Build the SYCL code
26+
add_executable (sycl_sgemm sycl_sgemm.cpp)
27+
target_compile_features(sycl_sgemm PUBLIC cxx_std_17)
28+
target_compile_options(sycl_sgemm PUBLIC ${SYCL_FLAGS})
29+
target_compile_definitions(sycl_sgemm PUBLIC CUDA_NO_HALF)
30+
target_link_libraries(sycl_sgemm PUBLIC ${SYCL_FLAGS})
31+
target_include_directories(sycl_sgemm PUBLIC ${SYCL_INCLUDE_DIR} ${CUDA_INCLUDE_DIRS})
32+
target_link_libraries(sycl_sgemm PUBLIC CUDA::toolkit CUDA::cublas)

example-02/README.md

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
SYCL interop with CUDA library
2+
-------------------------------
3+
4+
The example shows how to interop with CUBLAS from a SYCL for CUDA application.
5+
The example uses Codeplay's extension *interop_task* to call the **SGEMM**
6+
routine in CUBLAS. Parameters are extracted using the interop handler conversion.
7+
8+
Requirements
9+
==============
10+
11+
Requires CMake 3.17 to configure (makes use of FindCUDAToolkit for simplicity)
12+
The example is meant to be built and executed with the DPC++ compiler.
13+
14+
15+
Building the example
16+
=====================
17+
18+
19+
Create a build directory and run the following command:
20+
21+
```
22+
CXX=/path/to/dpc++/bin/clang++ cmake build/
23+
```
24+
25+
If NVIDIA CUDA is installed in your system, CMake should be able to generate
26+
the configuration files.
27+
28+
Then run
29+
30+
```
31+
make
32+
```
33+
34+
to build the example
35+
36+
Example
37+
=========
38+
39+
Two source codes are provided. `sgemm.cu` is the original CUDA code calling
40+
CUBLAS library to perform the matrix multiplication.
41+
`sycl_sgemm.cpp` is the SYCL variant that calls CUBLAS underneath.
42+
43+
Both implementations perform the multiplication of square matrices A and B,
44+
where A is an identity matrix and B is a matrix full of ones.
45+
The expected output on C is a matrix full of ones.
46+

example-02/sgemm.cu

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
#include <algorithm>
2+
#include <iostream>
3+
#include <vector>
4+
5+
#include <cublas_v2.h>
6+
#include <cuda.h>
7+
8+
#define CHECK_ERROR(FUNC) checkCudaErrorMsg(FUNC, " " #FUNC)
9+
10+
void inline checkCudaErrorMsg(cublasStatus_t status, const char *msg) {
11+
if (status != CUBLAS_STATUS_SUCCESS) {
12+
std::cout << msg << " - " << status << std::endl;
13+
exit(EXIT_FAILURE);
14+
}
15+
}
16+
17+
void inline checkCudaErrorMsg(cudaError status, const char *msg) {
18+
if (status != CUDA_SUCCESS) {
19+
std::cout << msg << " - " << status << std::endl;
20+
exit(EXIT_FAILURE);
21+
}
22+
}
23+
24+
int main() {
25+
constexpr size_t WIDTH = 1024;
26+
constexpr size_t HEIGHT = 1024;
27+
constexpr float ALPHA = 1.0f;
28+
constexpr float BETA = 0.0f;
29+
30+
std::vector<float> h_A(WIDTH * HEIGHT), h_B(WIDTH * HEIGHT),
31+
h_C(WIDTH * HEIGHT);
32+
33+
std::cout << "Size: " << h_C.size() << std::endl;
34+
float *d_A, *d_B, *d_C;
35+
36+
// A is an identity matrix
37+
std::fill(std::begin(h_A), std::end(h_A), 0.0f);
38+
for (size_t i = 0; i < WIDTH; i++) {
39+
h_A[i * WIDTH + i] = 1.0f;
40+
}
41+
42+
// B is a matrix fill with 1
43+
std::fill(std::begin(h_B), std::end(h_B), 1.0f);
44+
45+
const size_t numBytes = WIDTH * HEIGHT * sizeof(float);
46+
47+
CHECK_ERROR(cudaMalloc((void **)&d_A, numBytes));
48+
CHECK_ERROR(cudaMalloc((void **)&d_B, numBytes));
49+
CHECK_ERROR(cudaMalloc((void **)&d_C, numBytes));
50+
51+
CHECK_ERROR(cudaMemcpy(d_A, h_A.data(), numBytes, cudaMemcpyHostToDevice));
52+
CHECK_ERROR(cudaMemcpy(d_B, h_B.data(), numBytes, cudaMemcpyHostToDevice));
53+
54+
cublasHandle_t handle;
55+
CHECK_ERROR(cublasCreate(&handle));
56+
57+
// C = A * B
58+
CHECK_ERROR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WIDTH, HEIGHT,
59+
WIDTH, &ALPHA, d_A, WIDTH, d_B, WIDTH, &BETA, d_C,
60+
WIDTH));
61+
62+
CHECK_ERROR(cudaMemcpy(h_C.data(), d_C, numBytes, cudaMemcpyDeviceToHost));
63+
64+
// C must be all ones
65+
const bool allEqual = std::all_of(std::begin(h_C), std::end(h_C),
66+
[](float num) { return num == 1; });
67+
68+
if (!allEqual) {
69+
std::cout << " Incorrect result " << std::endl;
70+
} else {
71+
std::cout << " Correct! " << std::endl;
72+
}
73+
74+
CHECK_ERROR(cublasDestroy(handle));
75+
CHECK_ERROR(cudaFree(d_A));
76+
CHECK_ERROR(cudaFree(d_B));
77+
CHECK_ERROR(cudaFree(d_C));
78+
79+
return allEqual ? EXIT_SUCCESS : EXIT_FAILURE;
80+
}

example-02/sycl_sgemm.cpp

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
#include <algorithm>
2+
#include <iostream>
3+
#include <vector>
4+
5+
#include <CL/sycl.hpp>
6+
#include <CL/sycl/backend/cuda.hpp>
7+
8+
#include <cublas_v2.h>
9+
#include <cuda.h>
10+
11+
#define CHECK_ERROR(FUNC) checkCudaErrorMsg(FUNC, " " #FUNC)
12+
13+
void inline checkCudaErrorMsg(cublasStatus_t status, const char *msg) {
14+
if (status != CUBLAS_STATUS_SUCCESS) {
15+
std::cout << msg << " - " << status << std::endl;
16+
exit(EXIT_FAILURE);
17+
}
18+
}
19+
20+
void inline checkCudaErrorMsg(cudaError status, const char *msg) {
21+
if (status != CUDA_SUCCESS) {
22+
std::cout << msg << " - " << status << std::endl;
23+
exit(EXIT_FAILURE);
24+
}
25+
}
26+
27+
class CUDASelector : public sycl::device_selector {
28+
public:
29+
int operator()(const sycl::device &Device) const override {
30+
using namespace sycl::info;
31+
32+
const std::string DriverVersion = Device.get_info<device::driver_version>();
33+
34+
if (Device.is_gpu() && (DriverVersion.find("CUDA") != std::string::npos)) {
35+
std::cout << " CUDA device found " << std::endl;
36+
return 1;
37+
};
38+
return -1;
39+
}
40+
};
41+
42+
int main() {
43+
using namespace sycl;
44+
45+
constexpr size_t WIDTH = 1024;
46+
constexpr size_t HEIGHT = 1024;
47+
constexpr float ALPHA = 1.0f;
48+
constexpr float BETA = 0.0f;
49+
50+
std::vector<float> h_A(WIDTH * HEIGHT), h_B(WIDTH * HEIGHT),
51+
h_C(WIDTH * HEIGHT);
52+
53+
std::cout << "Size: " << h_C.size() << std::endl;
54+
float *d_A, *d_B, *d_C;
55+
56+
// A is an identity matrix
57+
std::fill(std::begin(h_A), std::end(h_A), 0.0f);
58+
for (size_t i = 0; i < WIDTH; i++) {
59+
h_A[i * WIDTH + i] = 1.0f;
60+
}
61+
62+
// B is a matrix fill with 1
63+
std::fill(std::begin(h_B), std::end(h_B), 1.0f);
64+
65+
sycl::queue q{CUDASelector()};
66+
67+
cublasHandle_t handle;
68+
CHECK_ERROR(cublasCreate(&handle));
69+
70+
{
71+
buffer<float, 2> b_A{h_A.data(), range<2>{WIDTH, HEIGHT}};
72+
buffer<float, 2> b_B{h_B.data(), range<2>{WIDTH, HEIGHT}};
73+
buffer<float, 2> b_C{h_C.data(), range<2>{WIDTH, HEIGHT}};
74+
75+
q.submit([&](handler &h) {
76+
auto d_A = b_A.get_access<sycl::access::mode::read>(h);
77+
auto d_B = b_B.get_access<sycl::access::mode::read>(h);
78+
auto d_C = b_C.get_access<sycl::access::mode::write>(h);
79+
80+
h.interop_task([=](sycl::interop_handler ih) {
81+
cublasSetStream(handle, ih.get_queue<backend::cuda>());
82+
83+
auto cuA = reinterpret_cast<float *>(ih.get_mem<backend::cuda>(d_A));
84+
auto cuB = reinterpret_cast<float *>(ih.get_mem<backend::cuda>(d_B));
85+
auto cuC = reinterpret_cast<float *>(ih.get_mem<backend::cuda>(d_C));
86+
87+
CHECK_ERROR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WIDTH, HEIGHT,
88+
WIDTH, &ALPHA, cuA, WIDTH, cuB, WIDTH, &BETA,
89+
cuC, WIDTH));
90+
});
91+
});
92+
}
93+
94+
// C must be all ones
95+
int i = 0;
96+
const bool allEqual =
97+
std::all_of(std::begin(h_C), std::end(h_C), [&i](float num) {
98+
++i;
99+
if (num != 1) {
100+
std::cout << i << " Not one : " << num << std::endl;
101+
}
102+
return num == 1;
103+
});
104+
105+
if (!allEqual) {
106+
std::cout << " Incorrect result " << std::endl;
107+
} else {
108+
std::cout << " Correct! " << std::endl;
109+
}
110+
111+
CHECK_ERROR(cublasDestroy(handle));
112+
113+
return allEqual ? EXIT_SUCCESS : EXIT_FAILURE;
114+
}

0 commit comments

Comments
 (0)