Skip to content

Commit c278fcc

Browse files
authored
[SYCL] Initial commit of interop example (#2)
[SYCL] Initial commit of interop example. Adds an example that uses the interop task on the Command Group handler to interoperate with CUBLAS, alongside the original CUBLAS example.
1 parent d0e6be7 commit c278fcc

File tree

5 files changed

+273
-1
lines changed

5 files changed

+273
-1
lines changed

example-01/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ endif(NOT SYCL_ROOT)
1111
set(SYCL_INCLUDE_DIR "${SYCL_ROOT}/lib/clang/11.0.0/include/")
1212
set(SYCL_LIB "${SYCL_ROOT}/lib/libsycl.so")
1313
set(SYCL_FLAGS "-fsycl"
14-
"-fsycl-targets=nvptx64-nvidia-cuda-sycldevice,spir64-unknown-linux-sycldevice"
14+
"-fsycl-targets=nvptx64-nvidia-cuda-sycldevice"
1515
"-fsycl-unnamed-lambda")
1616

1717
# Build the CUDA code

example-02/CMakeLists.txt

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
cmake_minimum_required(VERSION 3.17 FATAL_ERROR)
2+
project(sycl_cuda_interop LANGUAGES CXX CUDA)
3+
4+
find_package(CUDAToolkit)
5+
6+
# SYCL installation
7+
if (NOT SYCL_ROOT)
8+
message(FATAL_ERROR "No SYCL installation detected")
9+
endif(NOT SYCL_ROOT)
10+
11+
set(SYCL_INCLUDE_DIR "${SYCL_ROOT}/lib/clang/11.0.0/include/")
12+
set(SYCL_LIB "${SYCL_ROOT}/lib/libsycl.so")
13+
set(SYCL_FLAGS "-fsycl"
14+
"-fsycl-targets=nvptx64-nvidia-cuda-sycldevice"
15+
"-fsycl-unnamed-lambda")
16+
17+
18+
# Build the CUDA code
19+
add_executable(sgemm_cuda sgemm.cu)
20+
target_compile_features(sgemm_cuda PUBLIC cxx_std_11)
21+
set_target_properties(sgemm_cuda PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
22+
set_property(TARGET sgemm_cuda PROPERTY BUILD_RPATH "${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}")
23+
target_link_libraries(sgemm_cuda CUDA::toolkit CUDA::cublas)
24+
25+
# Build the SYCL code
26+
add_executable (sycl_sgemm sycl_sgemm.cpp)
27+
target_compile_features(sycl_sgemm PUBLIC cxx_std_17)
28+
target_compile_options(sycl_sgemm PUBLIC ${SYCL_FLAGS})
29+
target_compile_definitions(sycl_sgemm PUBLIC CUDA_NO_HALF)
30+
target_link_libraries(sycl_sgemm PUBLIC ${SYCL_FLAGS})
31+
target_include_directories(sycl_sgemm PUBLIC ${SYCL_INCLUDE_DIR} ${CUDA_INCLUDE_DIRS})
32+
target_link_libraries(sycl_sgemm PUBLIC CUDA::toolkit CUDA::cublas)

example-02/README.md

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
SYCL interop with CUDA library
2+
-------------------------------
3+
4+
The example shows how to interop with CUBLAS from a SYCL for CUDA application.
5+
The example uses Codeplay's extension *interop_task* to call the **SGEMM**
6+
routine in CUBLAS. Parameters are extracted using the interop handler conversion.
7+
8+
Requirements
9+
==============
10+
11+
Requires CMake 3.17 to configure (makes use of FindCUDAToolkit for simplicity)
12+
The example is meant to be built and executed with the DPC++ compiler.
13+
14+
15+
Building the example
16+
=====================
17+
18+
19+
Create a build directory and run the following command:
20+
21+
```
22+
CXX=/path/to/dpc++/bin/clang++ cmake build/
23+
```
24+
25+
If NVIDIA CUDA is installed in your system, CMake should be able to generate
26+
the configuration files.
27+
28+
Then run
29+
30+
```
31+
make
32+
```
33+
34+
to build the example
35+
36+
Example
37+
=========
38+
39+
Two source codes are provided. `sgemm.cu` is the original CUDA code calling
40+
CUBLAS library to perform the matrix multiplication.
41+
`sycl_sgemm.cpp` is the SYCL variant that calls CUBLAS underneath.
42+
43+
Both implementations perform the multiplication of square matrices A and B,
44+
where A is an identity matrix and B is a matrix full of ones.
45+
The expected output on C is a matrix full of ones.
46+

example-02/sgemm.cu

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
#include <algorithm>
2+
#include <iostream>
3+
#include <vector>
4+
5+
#include <cublas_v2.h>
6+
#include <cuda.h>
7+
8+
#define CHECK_ERROR(FUNC) checkCudaErrorMsg(FUNC, " " #FUNC)
9+
10+
void inline checkCudaErrorMsg(cublasStatus_t status, const char *msg) {
11+
if (status != CUBLAS_STATUS_SUCCESS) {
12+
std::cout << msg << " - " << status << std::endl;
13+
exit(EXIT_FAILURE);
14+
}
15+
}
16+
17+
void inline checkCudaErrorMsg(cudaError status, const char *msg) {
18+
if (status != CUDA_SUCCESS) {
19+
std::cout << msg << " - " << status << std::endl;
20+
exit(EXIT_FAILURE);
21+
}
22+
}
23+
24+
int main() {
25+
constexpr size_t WIDTH = 1024;
26+
constexpr size_t HEIGHT = 1024;
27+
constexpr float ALPHA = 1.0f;
28+
constexpr float BETA = 0.0f;
29+
30+
std::vector<float> h_A(WIDTH * HEIGHT), h_B(WIDTH * HEIGHT),
31+
h_C(WIDTH * HEIGHT);
32+
33+
std::cout << "Size: " << h_C.size() << std::endl;
34+
float *d_A, *d_B, *d_C;
35+
36+
// A is an identity matrix
37+
std::fill(std::begin(h_A), std::end(h_A), 0.0f);
38+
for (size_t i = 0; i < WIDTH; i++) {
39+
h_A[i * WIDTH + i] = 1.0f;
40+
}
41+
42+
// B is a matrix fill with 1
43+
std::fill(std::begin(h_B), std::end(h_B), 1.0f);
44+
45+
const size_t numBytes = WIDTH * HEIGHT * sizeof(float);
46+
47+
CHECK_ERROR(cudaMalloc((void **)&d_A, numBytes));
48+
CHECK_ERROR(cudaMalloc((void **)&d_B, numBytes));
49+
CHECK_ERROR(cudaMalloc((void **)&d_C, numBytes));
50+
51+
CHECK_ERROR(cudaMemcpy(d_A, h_A.data(), numBytes, cudaMemcpyHostToDevice));
52+
CHECK_ERROR(cudaMemcpy(d_B, h_B.data(), numBytes, cudaMemcpyHostToDevice));
53+
54+
cublasHandle_t handle;
55+
CHECK_ERROR(cublasCreate(&handle));
56+
57+
// C = A * B
58+
CHECK_ERROR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WIDTH, HEIGHT,
59+
WIDTH, &ALPHA, d_A, WIDTH, d_B, WIDTH, &BETA, d_C,
60+
WIDTH));
61+
62+
CHECK_ERROR(cudaMemcpy(h_C.data(), d_C, numBytes, cudaMemcpyDeviceToHost));
63+
64+
// C must be all ones
65+
const bool allEqual = std::all_of(std::begin(h_C), std::end(h_C),
66+
[](float num) { return num == 1; });
67+
68+
if (!allEqual) {
69+
std::cout << " Incorrect result " << std::endl;
70+
} else {
71+
std::cout << " Correct! " << std::endl;
72+
}
73+
74+
CHECK_ERROR(cublasDestroy(handle));
75+
CHECK_ERROR(cudaFree(d_A));
76+
CHECK_ERROR(cudaFree(d_B));
77+
CHECK_ERROR(cudaFree(d_C));
78+
79+
return allEqual ? EXIT_SUCCESS : EXIT_FAILURE;
80+
}

example-02/sycl_sgemm.cpp

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
#include <algorithm>
2+
#include <iostream>
3+
#include <vector>
4+
5+
#include <CL/sycl.hpp>
6+
#include <CL/sycl/backend/cuda.hpp>
7+
8+
#include <cublas_v2.h>
9+
#include <cuda.h>
10+
11+
#define CHECK_ERROR(FUNC) checkCudaErrorMsg(FUNC, " " #FUNC)
12+
13+
void inline checkCudaErrorMsg(cublasStatus_t status, const char *msg) {
14+
if (status != CUBLAS_STATUS_SUCCESS) {
15+
std::cout << msg << " - " << status << std::endl;
16+
exit(EXIT_FAILURE);
17+
}
18+
}
19+
20+
void inline checkCudaErrorMsg(cudaError status, const char *msg) {
21+
if (status != CUDA_SUCCESS) {
22+
std::cout << msg << " - " << status << std::endl;
23+
exit(EXIT_FAILURE);
24+
}
25+
}
26+
27+
class CUDASelector : public sycl::device_selector {
28+
public:
29+
int operator()(const sycl::device &Device) const override {
30+
using namespace sycl::info;
31+
32+
const std::string DriverVersion = Device.get_info<device::driver_version>();
33+
34+
if (Device.is_gpu() && (DriverVersion.find("CUDA") != std::string::npos)) {
35+
std::cout << " CUDA device found " << std::endl;
36+
return 1;
37+
};
38+
return -1;
39+
}
40+
};
41+
42+
int main() {
43+
using namespace sycl;
44+
45+
constexpr size_t WIDTH = 1024;
46+
constexpr size_t HEIGHT = 1024;
47+
constexpr float ALPHA = 1.0f;
48+
constexpr float BETA = 0.0f;
49+
50+
std::vector<float> h_A(WIDTH * HEIGHT), h_B(WIDTH * HEIGHT),
51+
h_C(WIDTH * HEIGHT);
52+
53+
std::cout << "Size: " << h_C.size() << std::endl;
54+
float *d_A, *d_B, *d_C;
55+
56+
// A is an identity matrix
57+
std::fill(std::begin(h_A), std::end(h_A), 0.0f);
58+
for (size_t i = 0; i < WIDTH; i++) {
59+
h_A[i * WIDTH + i] = 1.0f;
60+
}
61+
62+
// B is a matrix fill with 1
63+
std::fill(std::begin(h_B), std::end(h_B), 1.0f);
64+
65+
sycl::queue q{CUDASelector()};
66+
67+
cublasHandle_t handle;
68+
CHECK_ERROR(cublasCreate(&handle));
69+
70+
{
71+
buffer<float, 2> b_A{h_A.data(), range<2>{WIDTH, HEIGHT}};
72+
buffer<float, 2> b_B{h_B.data(), range<2>{WIDTH, HEIGHT}};
73+
buffer<float, 2> b_C{h_C.data(), range<2>{WIDTH, HEIGHT}};
74+
75+
q.submit([&](handler &h) {
76+
auto d_A = b_A.get_access<sycl::access::mode::read>(h);
77+
auto d_B = b_B.get_access<sycl::access::mode::read>(h);
78+
auto d_C = b_C.get_access<sycl::access::mode::write>(h);
79+
80+
h.interop_task([=](sycl::interop_handler ih) {
81+
cublasSetStream(handle, ih.get_queue<backend::cuda>());
82+
83+
auto cuA = reinterpret_cast<float *>(ih.get_mem<backend::cuda>(d_A));
84+
auto cuB = reinterpret_cast<float *>(ih.get_mem<backend::cuda>(d_B));
85+
auto cuC = reinterpret_cast<float *>(ih.get_mem<backend::cuda>(d_C));
86+
87+
CHECK_ERROR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WIDTH, HEIGHT,
88+
WIDTH, &ALPHA, cuA, WIDTH, cuB, WIDTH, &BETA,
89+
cuC, WIDTH));
90+
});
91+
});
92+
}
93+
94+
// C must be all ones
95+
int i = 0;
96+
const bool allEqual =
97+
std::all_of(std::begin(h_C), std::end(h_C), [&i](float num) {
98+
++i;
99+
if (num != 1) {
100+
std::cout << i << " Not one : " << num << std::endl;
101+
}
102+
return num == 1;
103+
});
104+
105+
if (!allEqual) {
106+
std::cout << " Incorrect result " << std::endl;
107+
} else {
108+
std::cout << " Correct! " << std::endl;
109+
}
110+
111+
CHECK_ERROR(cublasDestroy(handle));
112+
113+
return allEqual ? EXIT_SUCCESS : EXIT_FAILURE;
114+
}

0 commit comments

Comments
 (0)