Skip to content

Commit da2fa7d

Browse files
authored
Merge pull request #5 from codeplaysoftware/stuart/exercise-01
CUDA Interop Exercise
2 parents c278fcc + 4b3ed86 commit da2fa7d

File tree

7 files changed

+237
-4
lines changed

7 files changed

+237
-4
lines changed

example-01/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ The path to `libsycl.so` and the PI plugins must be in `LD_LIBRARY_PATH`.
4343
A simple way of running the app is as follows:
4444

4545
```
46-
$ LD_LIBRARY_PATH=$HOME/open-source/sycl4cuda/lib ./sycl_vector_addition
46+
$ LD_LIBRARY_PATH=/path/to/dpc++/install/lib ./sycl_vector_addition
4747
```
4848
4949
Note the `SYCL_BE` env variable is not required, since we use a custom

example-02/README.md

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,7 @@ Requirements
99
==============
1010

1111
Requires CMake 3.17 to configure (makes use of FindCUDAToolkit for simplicity)
12-
Example is meant to be build and executed with DPC++ compiler.
13-
12+
This example must be compiled and executed with the DPC++ compiler.
1413

1514
Building the example
1615
=====================

example-02/sycl_sgemm.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,6 @@ int main() {
5151
h_C(WIDTH * HEIGHT);
5252

5353
std::cout << "Size: " << h_C.size() << std::endl;
54-
float *d_A, *d_B, *d_C;
5554

5655
// A is an identity matrix
5756
std::fill(std::begin(h_A), std::end(h_A), 0.0f);

exercise-01/CMakeLists.txt

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
cmake_minimum_required(VERSION 3.17 FATAL_ERROR)
project(sycl_cuda_interop LANGUAGES CXX CUDA)

# The CUDA::toolkit and CUDA::cublas imported targets are used
# unconditionally below, so fail at configure time if the CUDA
# toolkit cannot be located instead of erroring later at generate time.
find_package(CUDAToolkit REQUIRED)

# SYCL installation
if (NOT SYCL_ROOT)
  message(FATAL_ERROR "No SYCL installation detected")
endif(NOT SYCL_ROOT)

# NOTE(review): the clang resource directory below is tied to one specific
# DPC++ release (11.0.0) — confirm it matches the installed compiler.
set(SYCL_INCLUDE_DIR "${SYCL_ROOT}/lib/clang/11.0.0/include/")
set(SYCL_LIB "${SYCL_ROOT}/lib/libsycl.so")
set(SYCL_FLAGS "-fsycl"
    "-fsycl-targets=nvptx64-nvidia-cuda-sycldevice"
    "-fsycl-unnamed-lambda")

# Build the CUDA code
add_executable(cuda_sgemv sgemv.cu)
target_compile_features(cuda_sgemv PUBLIC cxx_std_11)
set_target_properties(cuda_sgemv PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
set_property(TARGET cuda_sgemv PROPERTY BUILD_RPATH "${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}")
target_link_libraries(cuda_sgemv CUDA::toolkit CUDA::cublas)

# Build the SYCL code
add_executable (sycl_sgemv sycl_sgemv.cpp)
target_compile_features(sycl_sgemv PUBLIC cxx_std_17)
target_compile_options(sycl_sgemv PUBLIC ${SYCL_FLAGS})
target_compile_definitions(sycl_sgemv PUBLIC CUDA_NO_HALF)
target_link_libraries(sycl_sgemv PUBLIC ${SYCL_FLAGS})
# FindCUDAToolkit exports CUDAToolkit_INCLUDE_DIRS; the legacy
# CUDA_INCLUDE_DIRS variable (from the removed FindCUDA module) is never set
# in this configuration, so the original reference expanded to nothing.
target_include_directories(sycl_sgemv PUBLIC ${SYCL_INCLUDE_DIR} ${CUDAToolkit_INCLUDE_DIRS})
target_link_libraries(sycl_sgemv PUBLIC CUDA::toolkit CUDA::cublas)

exercise-01/README.md

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
Exercise 01: SYCL interop
2+
-------------------------------
3+
4+
In this exercise, you must implement an `interop_task` to let a `SYCL` application call `cuBLAS`.
5+
This application will perform a vector/matrix multiplication using the `cublasSgemv` routine in `cuBLAS`.
6+
A CUDA version of the application is provided, demonstrating how to call `cublasSgemv`.
7+
8+
Requirements
9+
==============
10+
11+
Requires CMake 3.17 to configure (makes use of FindCUDAToolkit for simplicity)
12+
This exercise must be compiled and executed with the DPC++ compiler.
13+
It is expected that you have read at least example-02 before attempting this exercise.
14+
15+
16+
Building the exercise
17+
=====================
18+
19+
20+
Create a build directory, change into it, and run the following command
(the final argument is the path to this exercise's source directory):

```
CXX=/path/to/dpc++/bin/clang++ cmake ..
```
25+
26+
If NVIDIA CUDA is installed in your system, CMake should be able to generate
27+
the configuration files.
28+
29+
Then run
30+
31+
```
32+
make
33+
```
34+
35+
to build the exercise.
36+
37+
Exercise
38+
=========
39+
40+
Two source codes are provided. `sgemv.cu` is the original CUDA code calling
41+
CUBLAS library to perform the vector/matrix multiplication.
42+
`sycl_sgemv.cpp` is the unfinished SYCL variant that you must complete.
43+
Running the `sycl_sgemv.cpp` executable at this stage will result in a runtime error.
44+
45+
Both implementations set up the same input data and expect the same output.
46+
47+
Familiarise yourself with the `interop_task` by reading through the SYCL source in example-02.

exercise-01/sgemv.cu

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
#include "cublas_v2.h"
#include <cuda_runtime.h>

#include <algorithm>
#include <cassert>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <vector>
8+
9+
int main() {
10+
constexpr size_t ROWS = 6;
11+
constexpr size_t COLUMNS = 5;
12+
constexpr float ALPHA = 1.0f;
13+
constexpr float BETA = 0.0f;
14+
15+
cublasHandle_t handle;
16+
17+
std::vector<float> hostA(ROWS * COLUMNS);
18+
std::vector<float> hostB(COLUMNS);
19+
std::vector<float> hostC(ROWS);
20+
21+
int index = 11;
22+
for (size_t i = 0; i < COLUMNS; i++) {
23+
for (size_t j = 0; j < ROWS; j++) {
24+
hostA[(i * ROWS) + j] = static_cast<float>(index++);
25+
}
26+
}
27+
28+
std::fill(std::begin(hostB), std::end(hostB), 1.0f);
29+
30+
// hostA:
31+
// [11, 17, 23, 29, 35]
32+
// [12, 18, 24, 30, 36]
33+
// [13, 19, 25, 31, 37]
34+
// [14, 20, 26, 32, 38]
35+
// [15, 21, 27, 33, 39]
36+
// [16, 22, 28, 34, 40]
37+
38+
// hostB:
39+
// [1, 1, 1, 1, 1]
40+
41+
// hostC:
42+
// [0, 0, 0, 0, 0, 0]
43+
44+
float *deviceA = nullptr;
45+
float *deviceB = nullptr;
46+
float *deviceC = nullptr;
47+
48+
cudaMalloc((void **)&deviceA, ROWS * COLUMNS * sizeof(float));
49+
cudaMalloc((void **)&deviceB, COLUMNS * sizeof(float));
50+
cudaMalloc((void **)&deviceC, ROWS * sizeof(float));
51+
52+
cublasCreate(&handle);
53+
54+
cublasSetMatrix(ROWS, COLUMNS, sizeof(float), hostA.data(), ROWS, deviceA,
55+
ROWS);
56+
cublasSetVector(COLUMNS, sizeof(float), hostB.data(), 1, deviceB, 1);
57+
cublasSetVector(ROWS, sizeof(float), hostC.data(), 1, deviceC, 1);
58+
cublasSgemv(handle, CUBLAS_OP_N, ROWS, COLUMNS, &ALPHA, deviceA, ROWS,
59+
deviceB, 1, &BETA, deviceC, 1);
60+
cublasGetVector(ROWS, sizeof(float), deviceC, 1, hostC.data(), 1);
61+
62+
cudaFree(deviceA);
63+
cudaFree(deviceB);
64+
cudaFree(deviceC);
65+
66+
assert(hostC[0] == 115); // [11, 17, 23, 29, 35] [1]
67+
assert(hostC[1] == 120); // [12, 18, 24, 30, 36] [1]
68+
assert(hostC[2] == 125); // [13, 19, 25, 31, 37] * [1]
69+
assert(hostC[3] == 130); // [14, 20, 26, 32, 38] [1]
70+
assert(hostC[4] == 135); // [15, 21, 27, 33, 39] [1]
71+
assert(hostC[5] == 140); // [16, 22, 28, 34, 40]
72+
73+
cublasDestroy(handle);
74+
}

exercise-01/sycl_sgemv.cpp

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
#include <CL/sycl.hpp>
2+
#include <CL/sycl/backend/cuda.hpp>
3+
#include <algorithm>
4+
#include <cassert>
5+
#include <cublas_v2.h>
6+
#include <cuda.h>
7+
#include <iostream>
8+
#include <vector>
9+
10+
// Device selector that scores CUDA-backend GPUs positively and rejects
// every other device (negative score excludes a device from selection).
class CUDASelector : public sycl::device_selector {
public:
  int operator()(const sycl::device &Device) const override {
    // The CUDA plugin reports a driver version string containing "CUDA",
    // which distinguishes it from OpenCL-backed GPUs.
    const auto driver =
        Device.get_info<sycl::info::device::driver_version>();
    const bool cuda_gpu =
        Device.is_gpu() && driver.find("CUDA") != std::string::npos;
    return cuda_gpu ? 1 : -1;
  }
};
23+
24+
int main() {
25+
using namespace sycl;
26+
27+
constexpr size_t ROWS = 6;
28+
constexpr size_t COLUMNS = 5;
29+
constexpr float ALPHA = 1.0f;
30+
constexpr float BETA = 0.0f;
31+
32+
std::vector<float> hostA(ROWS * COLUMNS);
33+
std::vector<float> hostB(COLUMNS);
34+
std::vector<float> hostC(ROWS);
35+
36+
int index = 11;
37+
for (size_t i = 0; i < COLUMNS; i++) {
38+
for (size_t j = 0; j < ROWS; j++) {
39+
hostA[(i * ROWS) + j] = static_cast<float>(index++);
40+
}
41+
}
42+
43+
std::fill(std::begin(hostB), std::end(hostB), 1.0f);
44+
45+
// hostA:
46+
// [11, 17, 23, 29, 35]
47+
// [12, 18, 24, 30, 36]
48+
// [13, 19, 25, 31, 37]
49+
// [14, 20, 26, 32, 38]
50+
// [15, 21, 27, 33, 39]
51+
// [16, 22, 28, 34, 40]
52+
53+
// hostB:
54+
// [1, 1, 1, 1, 1]
55+
56+
// hostC:
57+
// [0, 0, 0, 0, 0, 0]
58+
59+
queue q{CUDASelector()};
60+
61+
cublasHandle_t handle;
62+
cublasCreate(&handle);
63+
64+
{
65+
buffer<float, 2> bufferA{hostA.data(), range<2>{ROWS, COLUMNS}};
66+
buffer<float, 1> bufferB{hostB.data(), range<1>{COLUMNS}};
67+
buffer<float, 1> bufferC{hostC.data(), range<1>{ROWS}};
68+
69+
q.submit([&](handler &h) {
70+
// exercise-01
71+
});
72+
}
73+
74+
assert(hostC[0] == 115); // [11, 17, 23, 29, 35] [1]
75+
assert(hostC[1] == 120); // [12, 18, 24, 30, 36] [1]
76+
assert(hostC[2] == 125); // [13, 19, 25, 31, 37] * [1]
77+
assert(hostC[3] == 130); // [14, 20, 26, 32, 38] [1]
78+
assert(hostC[4] == 135); // [15, 21, 27, 33, 39] [1]
79+
assert(hostC[5] == 140); // [16, 22, 28, 34, 40]
80+
81+
cublasDestroy(handle);
82+
83+
return EXIT_SUCCESS;
84+
}

0 commit comments

Comments
 (0)