Skip to content

Commit da2fa7d

Browse files
authored
Merge pull request #5 from codeplaysoftware/stuart/exercise-01
CUDA Interop Exercise
2 parents c278fcc + 4b3ed86 commit da2fa7d

File tree

7 files changed

+237
-4
lines changed

7 files changed

+237
-4
lines changed

example-01/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ The path to `libsycl.so` and the PI plugins must be in `LD_LIBRARY_PATH`.
4343
A simple way of running the app is as follows:
4444

4545
```
46-
$ LD_LIBRARY_PATH=$HOME/open-source/sycl4cuda/lib ./sycl_vector_addition
46+
$ LD_LIBRARY_PATH=/path/to/dpc++/install/lib ./sycl_vector_addition
4747
```
4848
4949
Note the `SYCL_BE` env variable is not required, since we use a custom

example-02/README.md

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,7 @@ Requirements
99
==============
1010

1111
Requires CMake 3.17 to configure (makes use of FindCUDAToolkit for simplicity)
12-
Example is meant to be build and executed with DPC++ compiler.
13-
12+
This example must be compiled and executed with the DPC++ compiler.
1413

1514
Building the example
1615
=====================

example-02/sycl_sgemm.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,6 @@ int main() {
5151
h_C(WIDTH * HEIGHT);
5252

5353
std::cout << "Size: " << h_C.size() << std::endl;
54-
float *d_A, *d_B, *d_C;
5554

5655
// A is an identity matrix
5756
std::fill(std::begin(h_A), std::end(h_A), 0.0f);

exercise-01/CMakeLists.txt

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
cmake_minimum_required(VERSION 3.17 FATAL_ERROR)
project(sycl_cuda_interop LANGUAGES CXX CUDA)

# The CUDA::toolkit and CUDA::cublas imported targets are used
# unconditionally below, so fail at configure time if the CUDA
# toolkit cannot be located instead of erroring later at generate time.
find_package(CUDAToolkit REQUIRED)

# SYCL installation
if (NOT SYCL_ROOT)
  message(FATAL_ERROR "No SYCL installation detected")
endif(NOT SYCL_ROOT)

# NOTE(review): the clang resource directory below is tied to one specific
# DPC++ release (11.0.0) — confirm it matches the installed compiler.
set(SYCL_INCLUDE_DIR "${SYCL_ROOT}/lib/clang/11.0.0/include/")
set(SYCL_LIB "${SYCL_ROOT}/lib/libsycl.so")
set(SYCL_FLAGS "-fsycl"
    "-fsycl-targets=nvptx64-nvidia-cuda-sycldevice"
    "-fsycl-unnamed-lambda")

# Build the CUDA code
add_executable(cuda_sgemv sgemv.cu)
target_compile_features(cuda_sgemv PUBLIC cxx_std_11)
set_target_properties(cuda_sgemv PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
set_property(TARGET cuda_sgemv PROPERTY BUILD_RPATH "${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}")
target_link_libraries(cuda_sgemv CUDA::toolkit CUDA::cublas)

# Build the SYCL code
add_executable (sycl_sgemv sycl_sgemv.cpp)
target_compile_features(sycl_sgemv PUBLIC cxx_std_17)
target_compile_options(sycl_sgemv PUBLIC ${SYCL_FLAGS})
target_compile_definitions(sycl_sgemv PUBLIC CUDA_NO_HALF)
target_link_libraries(sycl_sgemv PUBLIC ${SYCL_FLAGS})
# FindCUDAToolkit exports CUDAToolkit_INCLUDE_DIRS; the legacy
# CUDA_INCLUDE_DIRS variable (from the removed FindCUDA module) is never set
# in this configuration, so the original reference expanded to nothing.
target_include_directories(sycl_sgemv PUBLIC ${SYCL_INCLUDE_DIR} ${CUDAToolkit_INCLUDE_DIRS})
target_link_libraries(sycl_sgemv PUBLIC CUDA::toolkit CUDA::cublas)

exercise-01/README.md

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
Exercise 01: SYCL interop
2+
-------------------------------
3+
4+
In this exercise, you must implement an `interop_task` to let a `SYCL` application call `cuBLAS`.
5+
This application will perform a vector/matrix multiplication using the `cublasSgemv` routine in `cuBLAS`.
6+
A CUDA version of the application is provided, demonstrating how to call `cublasSgemv`.
7+
8+
Requirements
9+
==============
10+
11+
Requires CMake 3.17 to configure (makes use of FindCUDAToolkit for simplicity)
12+
This exercise must be compiled and executed with the DPC++ compiler.
13+
It is expected that you have read at least example-02 before attempting this exercise.
14+
15+
16+
Building the exercise
17+
=====================
18+
19+
20+
Create a build directory, change into it, and run the following command
(the final argument is the path to this exercise's source directory):

```
CXX=/path/to/dpc++/bin/clang++ cmake ..
```
25+
26+
If NVIDIA CUDA is installed in your system, CMake should be able to generate
27+
the configuration files.
28+
29+
Then run
30+
31+
```
32+
make
33+
```
34+
35+
to build the exercise.
36+
37+
Exercise
38+
=========
39+
40+
Two source codes are provided. `sgemv.cu` is the original CUDA code calling
41+
CUBLAS library to perform the vector/matrix multiplication.
42+
`sycl_sgemv.cpp` is the unfinished SYCL variant that you must complete.
43+
Running the `sycl_sgemv.cpp` executable at this stage will result in a runtime error.
44+
45+
Both implementations set up the same input data and expect the same output.
46+
47+
Familiarise yourself with the `interop_task` by reading through the SYCL source in example-02.

exercise-01/sgemv.cu

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
#include "cublas_v2.h"
#include <cuda_runtime.h>

#include <algorithm>
#include <cassert>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <vector>
8+
9+
int main() {
10+
constexpr size_t ROWS = 6;
11+
constexpr size_t COLUMNS = 5;
12+
constexpr float ALPHA = 1.0f;
13+
constexpr float BETA = 0.0f;
14+
15+
cublasHandle_t handle;
16+
17+
std::vector<float> hostA(ROWS * COLUMNS);
18+
std::vector<float> hostB(COLUMNS);
19+
std::vector<float> hostC(ROWS);
20+
21+
int index = 11;
22+
for (size_t i = 0; i < COLUMNS; i++) {
23+
for (size_t j = 0; j < ROWS; j++) {
24+
hostA[(i * ROWS) + j] = static_cast<float>(index++);
25+
}
26+
}
27+
28+
std::fill(std::begin(hostB), std::end(hostB), 1.0f);
29+
30+
// hostA:
31+
// [11, 17, 23, 29, 35]
32+
// [12, 18, 24, 30, 36]
33+
// [13, 19, 25, 31, 37]
34+
// [14, 20, 26, 32, 38]
35+
// [15, 21, 27, 33, 39]
36+
// [16, 22, 28, 34, 40]
37+
38+
// hostB:
39+
// [1, 1, 1, 1, 1]
40+
41+
// hostC:
42+
// [0, 0, 0, 0, 0, 0]
43+
44+
float *deviceA = nullptr;
45+
float *deviceB = nullptr;
46+
float *deviceC = nullptr;
47+
48+
cudaMalloc((void **)&deviceA, ROWS * COLUMNS * sizeof(float));
49+
cudaMalloc((void **)&deviceB, COLUMNS * sizeof(float));
50+
cudaMalloc((void **)&deviceC, ROWS * sizeof(float));
51+
52+
cublasCreate(&handle);
53+
54+
cublasSetMatrix(ROWS, COLUMNS, sizeof(float), hostA.data(), ROWS, deviceA,
55+
ROWS);
56+
cublasSetVector(COLUMNS, sizeof(float), hostB.data(), 1, deviceB, 1);
57+
cublasSetVector(ROWS, sizeof(float), hostC.data(), 1, deviceC, 1);
58+
cublasSgemv(handle, CUBLAS_OP_N, ROWS, COLUMNS, &ALPHA, deviceA, ROWS,
59+
deviceB, 1, &BETA, deviceC, 1);
60+
cublasGetVector(ROWS, sizeof(float), deviceC, 1, hostC.data(), 1);
61+
62+
cudaFree(deviceA);
63+
cudaFree(deviceB);
64+
cudaFree(deviceC);
65+
66+
assert(hostC[0] == 115); // [11, 17, 23, 29, 35] [1]
67+
assert(hostC[1] == 120); // [12, 18, 24, 30, 36] [1]
68+
assert(hostC[2] == 125); // [13, 19, 25, 31, 37] * [1]
69+
assert(hostC[3] == 130); // [14, 20, 26, 32, 38] [1]
70+
assert(hostC[4] == 135); // [15, 21, 27, 33, 39] [1]
71+
assert(hostC[5] == 140); // [16, 22, 28, 34, 40]
72+
73+
cublasDestroy(handle);
74+
}

exercise-01/sycl_sgemv.cpp

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
#include <CL/sycl.hpp>
2+
#include <CL/sycl/backend/cuda.hpp>
3+
#include <algorithm>
4+
#include <cassert>
5+
#include <cublas_v2.h>
6+
#include <cuda.h>
7+
#include <iostream>
8+
#include <vector>
9+
10+
// Device selector that scores CUDA-backend GPUs positively and rejects
// every other device (negative score excludes a device from selection).
class CUDASelector : public sycl::device_selector {
public:
  int operator()(const sycl::device &Device) const override {
    // The CUDA plugin reports a driver version string containing "CUDA",
    // which distinguishes it from OpenCL-backed GPUs.
    const auto driver =
        Device.get_info<sycl::info::device::driver_version>();
    const bool cuda_gpu =
        Device.is_gpu() && driver.find("CUDA") != std::string::npos;
    return cuda_gpu ? 1 : -1;
  }
};
23+
24+
int main() {
25+
using namespace sycl;
26+
27+
constexpr size_t ROWS = 6;
28+
constexpr size_t COLUMNS = 5;
29+
constexpr float ALPHA = 1.0f;
30+
constexpr float BETA = 0.0f;
31+
32+
std::vector<float> hostA(ROWS * COLUMNS);
33+
std::vector<float> hostB(COLUMNS);
34+
std::vector<float> hostC(ROWS);
35+
36+
int index = 11;
37+
for (size_t i = 0; i < COLUMNS; i++) {
38+
for (size_t j = 0; j < ROWS; j++) {
39+
hostA[(i * ROWS) + j] = static_cast<float>(index++);
40+
}
41+
}
42+
43+
std::fill(std::begin(hostB), std::end(hostB), 1.0f);
44+
45+
// hostA:
46+
// [11, 17, 23, 29, 35]
47+
// [12, 18, 24, 30, 36]
48+
// [13, 19, 25, 31, 37]
49+
// [14, 20, 26, 32, 38]
50+
// [15, 21, 27, 33, 39]
51+
// [16, 22, 28, 34, 40]
52+
53+
// hostB:
54+
// [1, 1, 1, 1, 1]
55+
56+
// hostC:
57+
// [0, 0, 0, 0, 0, 0]
58+
59+
queue q{CUDASelector()};
60+
61+
cublasHandle_t handle;
62+
cublasCreate(&handle);
63+
64+
{
65+
buffer<float, 2> bufferA{hostA.data(), range<2>{ROWS, COLUMNS}};
66+
buffer<float, 1> bufferB{hostB.data(), range<1>{COLUMNS}};
67+
buffer<float, 1> bufferC{hostC.data(), range<1>{ROWS}};
68+
69+
q.submit([&](handler &h) {
70+
// exercise-01
71+
});
72+
}
73+
74+
assert(hostC[0] == 115); // [11, 17, 23, 29, 35] [1]
75+
assert(hostC[1] == 120); // [12, 18, 24, 30, 36] [1]
76+
assert(hostC[2] == 125); // [13, 19, 25, 31, 37] * [1]
77+
assert(hostC[3] == 130); // [14, 20, 26, 32, 38] [1]
78+
assert(hostC[4] == 135); // [15, 21, 27, 33, 39] [1]
79+
assert(hostC[5] == 140); // [16, 22, 28, 34, 40]
80+
81+
cublasDestroy(handle);
82+
83+
return EXIT_SUCCESS;
84+
}

0 commit comments

Comments
 (0)