Skip to content

Commit 5e0e3f4

Browse files
authored
Add USM cublasSgemm (#21)
1 parent aad6720 commit 5e0e3f4

File tree

2 files changed

+131
-0
lines changed

2 files changed

+131
-0
lines changed

example-02/CMakeLists.txt

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,3 +35,12 @@ target_compile_definitions(sycl_sgemm PUBLIC CUDA_NO_HALF)
3535
target_link_libraries(sycl_sgemm PUBLIC ${SYCL_FLAGS})
3636
target_include_directories(sycl_sgemm PUBLIC ${SYCL_INCLUDE_DIR} ${CUDA_INCLUDE_DIRS})
3737
target_link_libraries(sycl_sgemm PUBLIC CUDA::toolkit CUDA::cuda_driver CUDA::cublas)
38+
39+
# Build the SYCL USM code
40+
add_executable (sycl_sgemm_usm sycl_sgemm_usm.cpp)
41+
target_compile_features(sycl_sgemm_usm PUBLIC cxx_std_17)
42+
target_compile_options(sycl_sgemm_usm PUBLIC ${SYCL_FLAGS})
43+
target_compile_definitions(sycl_sgemm_usm PUBLIC CUDA_NO_HALF)
44+
target_link_libraries(sycl_sgemm_usm PUBLIC ${SYCL_FLAGS})
45+
target_include_directories(sycl_sgemm_usm PUBLIC ${SYCL_INCLUDE_DIR} ${CUDA_INCLUDE_DIRS})
46+
target_link_libraries(sycl_sgemm_usm PUBLIC CUDA::toolkit CUDA::cuda_driver CUDA::cublas)

example-02/sycl_sgemm_usm.cpp

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
#include <algorithm>
2+
#include <iostream>
3+
#include <vector>
4+
5+
#include <CL/sycl.hpp>
6+
#include <CL/sycl/backend/cuda.hpp>
7+
8+
#include <cublas_v2.h>
9+
#include <cuda.h>
10+
11+
#define CHECK_ERROR(FUNC) checkCudaErrorMsg(FUNC, " " #FUNC)
12+
13+
void inline checkCudaErrorMsg(cublasStatus_t status, const char *msg) {
14+
if (status != CUBLAS_STATUS_SUCCESS) {
15+
std::cout << "ERROR CUBLAS:" << msg << " - " << status << std::endl;
16+
exit(EXIT_FAILURE);
17+
}
18+
}
19+
20+
void inline checkCudaErrorMsg(cudaError status, const char *msg) {
21+
if (status != cudaSuccess) {
22+
std::cout << "ERROR CUDA: " << msg << " - " << status << std::endl;
23+
exit(EXIT_FAILURE);
24+
}
25+
}
26+
27+
void inline checkCudaErrorMsg(CUresult status, const char *msg) {
28+
if (status != CUDA_SUCCESS) {
29+
std::cout << "ERROR CUDA: " << msg << " - " << status << std::endl;
30+
exit(EXIT_FAILURE);
31+
}
32+
}
33+
34+
class CUDASelector : public sycl::device_selector {
35+
public:
36+
int operator()(const sycl::device &device) const override {
37+
if(device.get_platform().get_backend() == sycl::backend::cuda){
38+
std::cout << " CUDA device found " << std::endl;
39+
return 1;
40+
} else{
41+
return -1;
42+
}
43+
}
44+
};
45+
46+
int main() {
47+
using namespace sycl;
48+
49+
constexpr size_t WIDTH = 1024;
50+
constexpr size_t HEIGHT = 1024;
51+
constexpr float ALPHA = 1.0f;
52+
constexpr float BETA = 0.0f;
53+
54+
std::vector<float> h_A(WIDTH * HEIGHT), h_B(WIDTH * HEIGHT),
55+
h_C(WIDTH * HEIGHT);
56+
57+
std::cout << "Size: " << h_C.size() << std::endl;
58+
59+
// A is an identity matrix
60+
std::fill(std::begin(h_A), std::end(h_A), 0.0f);
61+
for (size_t i = 0; i < WIDTH; i++) {
62+
h_A[i * WIDTH + i] = 1.0f;
63+
}
64+
65+
// B is a matrix fill with 1
66+
std::fill(std::begin(h_B), std::end(h_B), 1.0f);
67+
68+
sycl::queue q{CUDASelector()};
69+
70+
// Allocate memory on the device
71+
float* d_A = sycl::malloc_device<float>(WIDTH*HEIGHT,q);
72+
float* d_B = sycl::malloc_device<float>(WIDTH*HEIGHT,q);
73+
float* d_C = sycl::malloc_device<float>(WIDTH*HEIGHT,q);
74+
75+
// Copy matrices A & B to device from host vectors
76+
const size_t numBytes = WIDTH * HEIGHT * sizeof(float);
77+
q.memcpy(d_A, h_A.data(), numBytes).wait();
78+
q.memcpy(d_B, h_B.data(), numBytes).wait();
79+
80+
// Create cublas handle
81+
cublasHandle_t handle;
82+
CHECK_ERROR(cublasCreate(&handle));
83+
84+
q.submit([&](handler &h) {
85+
86+
h.host_task([=](sycl::interop_handle ih) {
87+
88+
// Set the correct cuda context & stream
89+
cuCtxSetCurrent(ih.get_native_context<backend::cuda>());
90+
cublasSetStream(handle, ih.get_native_queue<backend::cuda>());
91+
92+
// Call generalised matrix-matrix multiply
93+
CHECK_ERROR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WIDTH, HEIGHT,
94+
WIDTH, &ALPHA, d_A, WIDTH, d_B, WIDTH, &BETA,
95+
d_C, WIDTH));
96+
});
97+
}).wait();
98+
99+
// Copy the result back to host
100+
q.memcpy(h_C.data(), d_C, numBytes).wait();
101+
102+
// C must be all ones
103+
int i = 0;
104+
const bool allEqual =
105+
std::all_of(std::begin(h_C), std::end(h_C), [&i](float num) {
106+
++i;
107+
if (num != 1) {
108+
std::cout << i << " Not one : " << num << std::endl;
109+
}
110+
return num == 1;
111+
});
112+
113+
if (!allEqual) {
114+
std::cout << " Incorrect result " << std::endl;
115+
} else {
116+
std::cout << " Correct! " << std::endl;
117+
}
118+
119+
CHECK_ERROR(cublasDestroy(handle));
120+
121+
return allEqual ? EXIT_SUCCESS : EXIT_FAILURE;
122+
}

0 commit comments

Comments
 (0)