
Commit 1416d9e

Update CUDA examples to work with recent DPC++ (#12)
* Update CUDA examples to work with recent DPC++
* Addressing feedback from reviews
1 parent 92e5194 commit 1416d9e

File tree

7 files changed (+131, −19 lines)

example-02/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -29,4 +29,4 @@ target_compile_options(sycl_sgemm PUBLIC ${SYCL_FLAGS})
 target_compile_definitions(sycl_sgemm PUBLIC CUDA_NO_HALF)
 target_link_libraries(sycl_sgemm PUBLIC ${SYCL_FLAGS})
 target_include_directories(sycl_sgemm PUBLIC ${SYCL_INCLUDE_DIR} ${CUDA_INCLUDE_DIRS})
-target_link_libraries(sycl_sgemm PUBLIC CUDA::toolkit CUDA::cublas)
+target_link_libraries(sycl_sgemm PUBLIC CUDA::toolkit CUDA::cuda_driver CUDA::cublas)
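
The added CUDA::cuda_driver link target tracks the switch in sycl_sgemm.cpp to CUDA Driver API calls (cuCtxSetCurrent), which resolve from libcuda rather than the runtime library. As a rough illustration of what that dependency means, here is a minimal, hypothetical stand-alone snippet (not part of the repository) that only links when the driver library is available:

// Hypothetical stand-alone check (not in the repo): the Driver API call used
// by the updated host task, cuCtxSetCurrent, lives in libcuda, which is what
// the CUDA::cuda_driver imported target links against.
#include <cuda.h>
#include <cstdio>

int main() {
  cuInit(0);                            // Driver API needs explicit init
  CUdevice dev;
  cuDeviceGet(&dev, 0);                 // first visible CUDA device
  CUcontext ctx;
  cuDevicePrimaryCtxRetain(&ctx, dev);  // reuse the device's primary context
  cuCtxSetCurrent(ctx);                 // same call the host task now makes
  std::printf("CUDA driver context bound\n");
  cuDevicePrimaryCtxRelease(dev);
  return 0;
}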

example-02/sgemm.cu

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ void inline checkCudaErrorMsg(cublasStatus_t status, const char *msg) {
 }
 
 void inline checkCudaErrorMsg(cudaError status, const char *msg) {
-  if (status != CUDA_SUCCESS) {
+  if (status != cudaSuccess) {
     std::cout << msg << " - " << status << std::endl;
     exit(EXIT_FAILURE);
   }
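
This one-line fix matters because cudaError (Runtime API) and CUresult (Driver API) are distinct enums: the Runtime API's success value is cudaSuccess, while CUDA_SUCCESS belongs to CUresult. Comparing a cudaError against CUDA_SUCCESS happens to compile (both success values are 0) but mixes the two APIs. A small self-contained sketch of the pattern, assuming both headers are available:

// Sketch only: overloads that keep the two CUDA error domains separate.
#include <cstdlib>
#include <iostream>
#include <cuda.h>          // Driver API: CUresult, CUDA_SUCCESS
#include <cuda_runtime.h>  // Runtime API: cudaError_t, cudaSuccess

void inline checkCudaErrorMsg(cudaError_t status, const char *msg) {
  if (status != cudaSuccess) {   // Runtime API success enumerator
    std::cout << msg << " - " << status << std::endl;
    exit(EXIT_FAILURE);
  }
}

void inline checkCudaErrorMsg(CUresult status, const char *msg) {
  if (status != CUDA_SUCCESS) {  // Driver API success enumerator
    std::cout << msg << " - " << status << std::endl;
    exit(EXIT_FAILURE);
  }
}

int main() {
  // Each call picks the matching overload; exits early with a message
  // if no CUDA driver/device is present.
  checkCudaErrorMsg(cudaFree(nullptr), "cudaFree");  // Runtime API overload
  checkCudaErrorMsg(cuInit(0), "cuInit");            // Driver API overload
  return 0;
}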

example-02/sycl_sgemm.cpp

Lines changed: 15 additions & 8 deletions
@@ -12,14 +12,21 @@
 
 void inline checkCudaErrorMsg(cublasStatus_t status, const char *msg) {
   if (status != CUBLAS_STATUS_SUCCESS) {
-    std::cout << msg << " - " << status << std::endl;
+    std::cout << "ERROR CUBLAS:" << msg << " - " << status << std::endl;
     exit(EXIT_FAILURE);
   }
 }
 
 void inline checkCudaErrorMsg(cudaError status, const char *msg) {
+  if (status != cudaSuccess) {
+    std::cout << "ERROR CUDA: " << msg << " - " << status << std::endl;
+    exit(EXIT_FAILURE);
+  }
+}
+
+void inline checkCudaErrorMsg(CUresult status, const char *msg) {
   if (status != CUDA_SUCCESS) {
-    std::cout << msg << " - " << status << std::endl;
+    std::cout << "ERROR CUDA: " << msg << " - " << status << std::endl;
     exit(EXIT_FAILURE);
   }
 }
@@ -76,12 +83,12 @@ int main() {
       auto d_B = b_B.get_access<sycl::access::mode::read>(h);
       auto d_C = b_C.get_access<sycl::access::mode::write>(h);
 
-      h.interop_task([=](sycl::interop_handler ih) {
-        cublasSetStream(handle, ih.get_queue<backend::cuda>());
-
-        auto cuA = reinterpret_cast<float *>(ih.get_mem<backend::cuda>(d_A));
-        auto cuB = reinterpret_cast<float *>(ih.get_mem<backend::cuda>(d_B));
-        auto cuC = reinterpret_cast<float *>(ih.get_mem<backend::cuda>(d_C));
+      h.codeplay_host_task([=](sycl::interop_handle ih) {
+        cuCtxSetCurrent(ih.get_native_context<backend::cuda>());
+        cublasSetStream(handle, ih.get_native_queue<backend::cuda>());
+        auto cuA = reinterpret_cast<float *>(ih.get_native_mem<backend::cuda>(d_A));
+        auto cuB = reinterpret_cast<float *>(ih.get_native_mem<backend::cuda>(d_B));
+        auto cuC = reinterpret_cast<float *>(ih.get_native_mem<backend::cuda>(d_C));
 
       CHECK_ERROR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WIDTH, HEIGHT,
                               WIDTH, &ALPHA, cuA, WIDTH, cuB, WIDTH, &BETA,
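
Read together, the two hunks move the example from the removed interop_task/interop_handler mechanism to DPC++'s host-task interop: the cuBLAS call now runs inside codeplay_host_task, which receives an interop_handle exposing the native CUDA context, stream, and memory objects. A condensed sketch of the resulting submission (a fragment, reusing the buffers, handle, and constants declared earlier in the file; the extension spellings follow the DPC++ revision this commit targets):

// Fragment (relies on q, b_A/b_B/b_C, handle, WIDTH, HEIGHT, ALPHA, BETA
// declared earlier in sycl_sgemm.cpp).
q.submit([&](sycl::handler &h) {
  auto d_A = b_A.get_access<sycl::access::mode::read>(h);
  auto d_B = b_B.get_access<sycl::access::mode::read>(h);
  auto d_C = b_C.get_access<sycl::access::mode::write>(h);

  h.codeplay_host_task([=](sycl::interop_handle ih) {
    // Bind the queue's CUDA context to this host thread, then point cuBLAS
    // at the queue's native stream so the GEMM is ordered with SYCL work.
    cuCtxSetCurrent(ih.get_native_context<backend::cuda>());
    cublasSetStream(handle, ih.get_native_queue<backend::cuda>());

    // Unwrap the SYCL accessors into raw device pointers for cuBLAS.
    auto cuA = reinterpret_cast<float *>(ih.get_native_mem<backend::cuda>(d_A));
    auto cuB = reinterpret_cast<float *>(ih.get_native_mem<backend::cuda>(d_B));
    auto cuC = reinterpret_cast<float *>(ih.get_native_mem<backend::cuda>(d_C));

    // C = ALPHA * A * B + BETA * C (column-major, WIDTH x HEIGHT).
    CHECK_ERROR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WIDTH, HEIGHT,
                            WIDTH, &ALPHA, cuA, WIDTH, cuB, WIDTH, &BETA,
                            cuC, WIDTH));
  });
});

Apart from the renamed entry points, the main behavioural difference is that the body executes on a host thread once the accessor dependencies are satisfied, which is why the driver context must be made current explicitly before any CUDA call.
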
Lines changed: 111 additions & 0 deletions
@@ -0,0 +1,111 @@
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include <CL/sycl/backend/cuda.hpp>
+
+#include <cublas_v2.h>
+#include <cuda.h>
+
+#define CHECK_ERROR(FUNC) checkCudaErrorMsg(FUNC, " " #FUNC)
+
+void inline checkCudaErrorMsg(cublasStatus_t status, const char *msg) {
+  if (status != CUBLAS_STATUS_SUCCESS) {
+    std::cout << msg << " - " << status << std::endl;
+    exit(EXIT_FAILURE);
+  }
+}
+
+void inline checkCudaErrorMsg(cudaError status, const char *msg) {
+  if (status != CUDA_SUCCESS) {
+    std::cout << msg << " - " << status << std::endl;
+    exit(EXIT_FAILURE);
+  }
+}
+
+class CUDASelector : public sycl::device_selector {
+public:
+  int operator()(const sycl::device &Device) const override {
+    using namespace sycl::info;
+
+    const std::string DriverVersion = Device.get_info<device::driver_version>();
+
+    if (Device.is_gpu() && (DriverVersion.find("CUDA") != std::string::npos)) {
+      std::cout << " CUDA device found " << std::endl;
+      return 1;
+    };
+    return -1;
+  }
+};
+
+int main() {
+  constexpr size_t WIDTH = 1024;
+  constexpr size_t HEIGHT = 1024;
+  constexpr float ALPHA = 1.0f;
+  constexpr float BETA = 0.0f;
+
+  std::vector<float> h_A(WIDTH * HEIGHT), h_B(WIDTH * HEIGHT),
+      h_C(WIDTH * HEIGHT);
+
+  std::cout << "Size: " << h_C.size() << std::endl;
+
+  // A is an identity matrix
+  std::fill(std::begin(h_A), std::end(h_A), 0.0f);
+  for (size_t i = 0; i < WIDTH; i++) {
+    h_A[i * WIDTH + i] = 1.0f;
+  }
+
+  // B is a matrix fill with 1
+  std::fill(std::begin(h_B), std::end(h_B), 1.0f);
+
+  sycl::queue q{CUDASelector()};
+
+  cublasHandle_t handle;
+  CHECK_ERROR(cublasCreate(&handle));
+
+  {
+    sycl::buffer<float, 2> b_A{h_A.data(), range<2>{WIDTH, HEIGHT}};
+    sycl::buffer<float, 2> b_B{h_B.data(), range<2>{WIDTH, HEIGHT}};
+    sycl::buffer<float, 2> b_C{h_C.data(), range<2>{WIDTH, HEIGHT}};
+
+    q.submit([&](sycl::handler &h) {
+      auto d_A = b_A.get_access<sycl::access::mode::read>(h);
+      auto d_B = b_B.get_access<sycl::access::mode::read>(h);
+      auto d_C = b_C.get_access<sycl::access::mode::write>(h);
+
+      h.interop_task([=](sycl::interop_handler ih) {
+        cublasSetStream(handle, ih.get_queue<backend::cuda>());
+
+        auto cuA = reinterpret_cast<float *>(ih.get_mem<backend::cuda>(d_A));
+        auto cuB = reinterpret_cast<float *>(ih.get_mem<backend::cuda>(d_B));
+        auto cuC = reinterpret_cast<float *>(ih.get_mem<backend::cuda>(d_C));
+
+        CHECK_ERROR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WIDTH, HEIGHT,
+                                WIDTH, &ALPHA, cuA, WIDTH, cuB, WIDTH, &BETA,
+                                cuC, WIDTH));
+      });
+    });
+  }
+
+  // C must be all ones
+  int i = 0;
+  const bool allEqual =
+      std::all_of(std::begin(h_C), std::end(h_C), [&i](float num) {
+        ++i;
+        if (num != 1) {
+          std::cout << i << " Not one : " << num << std::endl;
+        }
+        return num == 1;
+      });
+
+  if (!allEqual) {
+    std::cout << " Incorrect result " << std::endl;
+  } else {
+    std::cout << " Correct! " << std::endl;
+  }
+
+  CHECK_ERROR(cublasDestroy(handle));
+
+  return allEqual ? EXIT_SUCCESS : EXIT_FAILURE;
+}

example-03/Makefile

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@ CUDAFLAGS=--cuda-gpu-arch=sm_30
 
 CXXFLAGS=-std=c++17 ${CUDAFLAGS} -I${SYCL_INCLUDE} -g
 
-LIBS=-L${SYCL_ROOT_DIR}/include/lib -lOpenCL -lsycl -L${CUDA_ROOT_DIR}/lib64 -lcudart
+LIBS=-L${SYCL_ROOT_DIR}/lib -lOpenCL -lsycl -L${CUDA_ROOT_DIR}/lib64 -lcudart
 
 default: vec_add.exe usm_vec_add.exe
 
example-03/vec_add.cu

Lines changed: 1 addition & 4 deletions
@@ -39,11 +39,8 @@ int main(int argc, char *argv[]) {
   // Size of vectors
   int n = 100000;
 
-  // Create a SYCL context for interoperability with CUDA Runtime API
-  // This is temporary until the property extension is implemented
-  const bool UsePrimaryContext = true;
   device dev{CUDASelector().select_device()};
-  context myContext{dev, {}, UsePrimaryContext};
+  context myContext{dev};
   queue myQueue{myContext, dev};
 
   {

example-03/vec_add_usm.cu

Lines changed: 1 addition & 4 deletions
@@ -39,11 +39,8 @@ int main(int argc, char *argv[]) {
   // Size, in bytes, of each vector
   size_t bytes = n * sizeof(double);
 
-  // Create a SYCL context for interoperability with CUDA Runtime API
-  // This is temporary until the property extension is implemented
-  const bool UsePrimaryContext = true;
   device dev{CUDASelector().select_device()};
-  context myContext{dev, {}, UsePrimaryContext};
+  context myContext{dev};
   queue myQueue{myContext, dev};
 
   // Allocate memory for each vector on host
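
Both vector-add examples drop the UsePrimaryContext flag, which the removed comment itself described as a temporary workaround; with the DPC++ revision targeted here, the plain device constructor is enough, so the context and queue setup reduces to the sketch below (CUDASelector is the selector already defined in these examples):

// Condensed from vec_add.cu / vec_add_usm.cu after the change.
device dev{CUDASelector().select_device()};
context myContext{dev};            // no extra property/flag needed any more
queue myQueue{myContext, dev};     // queue bound to that context and device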
