Commit aad6720

Authored by joeatodd and Ruyk
Update examples for sycl-nightly/20211108 (#20)
* Ignore temporary files
* Update CUDASelector w/ backend query
* s/codeplay_host_task/interop_task/
* Match std::cout
* USM version of example-01
* Add a couple of build.sh scripts
* s/interop_task/host_task/ in example-04
* Use updated nvidia triple in example-04
* Update include dir for docker repo in example-04
* example-02 - suppress CMake warning, consistent naming, host_task
* Add manual build command to README.md for example-02
* Update example-04. Note: I found that it's necessary to cudaDeviceSynchronize() within the host_task or else data is read before it's written. This was true only for the USM example, but I'm not sure if, strictly speaking, it ought to be required for both cases. (See the sketch below.)
* Don't complain about empty CMAKE_CUDA_ARCHITECTURES
* s|clang/11.0.0|clang/14.0.0|
* build.sh for exercise-01
* s/nvptx64-nvidia-cuda-sycldevice/nvptx64-nvidia-cuda/
* Update example-02/build.sh (Co-authored-by: Ruyman <ruyman@codeplay.com>)
* Update exercise-01/build.sh (Co-authored-by: Ruyman <ruyman@codeplay.com>)
* Suppress mismatched triple warning
* Comment on cudaDeviceSynchronize()
* Remove unneeded dummy template class, and lay out lambda clearly

Co-authored-by: Ruyman <ruyman@codeplay.com>
1 parent 9ac73b7 commit aad6720

25 files changed (+224 / -202 lines)
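The cudaDeviceSynchronize() note in the message is the subtle part of this commit: work enqueued on the native CUDA stream inside a host_task is asynchronous, so without an explicit synchronization the host can read results before the device has written them. A minimal sketch of the pattern, assuming a queue on the CUDA backend and USM device pointers; the names `d_dst`, `d_src`, and `bytes` are placeholders, since the example-04 source is not part of this page:

```cpp
// Sketch only, not the example-04 source. Assumes myQueue is a CUDA-backend
// SYCL queue and d_dst/d_src/bytes describe existing USM device allocations.
myQueue.submit([&](sycl::handler &h) {
  h.host_task([=](sycl::interop_handle ih) {
    cuCtxSetCurrent(ih.get_native_context<sycl::backend::cuda>());
    auto stream = ih.get_native_queue<sycl::backend::cuda>();
    // Native work placed on the stream is asynchronous...
    cudaMemcpyAsync(d_dst, d_src, bytes, cudaMemcpyDeviceToDevice, stream);
    // ...so synchronize before the host_task returns; otherwise a later
    // host read of d_dst can observe stale data, which is exactly the
    // failure mode the commit message describes.
    cudaDeviceSynchronize();
  });
}).wait();
```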

.gitignore

Lines changed: 5 additions & 0 deletions
```diff
@@ -30,3 +30,8 @@
 *.exe
 *.out
 *.app
+
+# Temporaries
+*~
+*#
+*/build
```

example-01/CMakeLists.txt

Lines changed: 17 additions & 4 deletions
```diff
@@ -1,4 +1,7 @@
 cmake_minimum_required(VERSION 3.10 FATAL_ERROR)
+# Don't complain about empty CMAKE_CUDA_ARCHITECTURES
+cmake_policy(SET CMP0104 OLD)
+
 project(cmake_and_cuda LANGUAGES CXX CUDA)
 
 include(CTest)
@@ -8,11 +11,12 @@ if (NOT SYCL_ROOT)
   message(FATAL_ERROR "No SYCL installation detected")
 endif(NOT SYCL_ROOT)
 
-set(SYCL_INCLUDE_DIR "${SYCL_ROOT}/lib/clang/11.0.0/include/")
+set(SYCL_INCLUDE_DIR "${SYCL_ROOT}/lib/clang/14.0.0/include/")
 set(SYCL_LIB "${SYCL_ROOT}/lib/libsycl.so")
-set(SYCL_FLAGS "-fsycl"
-    "-fsycl-targets=nvptx64-nvidia-cuda-sycldevice"
-    "-fsycl-unnamed-lambda")
+set(SYCL_FLAGS "-fsycl"
+    "-fsycl-targets=nvptx64-nvidia-cuda"
+    "-fsycl-unnamed-lambda"
+    "-Wno-linker-warnings")
 
 # Build the CUDA code
 add_executable(vector_addition vector_addition.cu)
@@ -28,3 +32,12 @@ target_link_libraries(sycl_vector_addition PUBLIC ${SYCL_FLAGS})
 target_include_directories(sycl_vector_addition PUBLIC ${SYCL_INCLUDE_DIR})
 target_link_libraries(sycl_vector_addition PUBLIC ${SYCL_LIB})
 
+
+# Build the SYCL (USM) code
+add_executable (sycl_vector_addition_usm vector_addition_usm.cpp)
+target_compile_features(sycl_vector_addition_usm PUBLIC cxx_std_17)
+target_compile_options(sycl_vector_addition_usm PUBLIC ${SYCL_FLAGS})
+target_link_libraries(sycl_vector_addition_usm PUBLIC ${SYCL_FLAGS})
+target_include_directories(sycl_vector_addition_usm PUBLIC ${SYCL_INCLUDE_DIR})
+target_link_libraries(sycl_vector_addition_usm PUBLIC ${SYCL_LIB})
+
```

example-01/build.sh

Lines changed: 3 additions & 0 deletions
```diff
@@ -0,0 +1,3 @@
+rm -rf build && mkdir build && cd build
+cmake ../ -DSYCL_ROOT=${SYCL_ROOT_DIR} -DCMAKE_CXX_COMPILER=${SYCL_ROOT_DIR}/bin/clang++ -DCMAKE_EXPORT_COMPILE_COMMANDS=yes
+make -j 8
```

example-01/vector_addition.cpp

Lines changed: 7 additions & 11 deletions
```diff
@@ -26,20 +26,16 @@
 
 class CUDASelector : public sycl::device_selector {
 public:
-  int operator()(const sycl::device &Device) const override {
-    using namespace sycl::info;
-
-    const std::string DriverVersion = Device.get_info<device::driver_version>();
-
-    if (Device.is_gpu() && (DriverVersion.find("CUDA") != std::string::npos)) {
+  int operator()(const sycl::device &device) const override {
+    if(device.get_platform().get_backend() == sycl::backend::cuda){
       std::cout << " CUDA device found " << std::endl;
       return 1;
-    };
-    return -1;
+    } else{
+      return -1;
+    }
   }
 };
 
-class vec_add;
 int main(int argc, char *argv[]) {
   constexpr const size_t N = 100000;
   const sycl::range VecSize{N};
@@ -71,8 +67,8 @@ int main(int argc, char *argv[]) {
     auto b = bufB.get_access<read_t>(h);
     auto c = bufC.get_access<write_t>(h);
 
-    h.parallel_for<vec_add>(VecSize,
-                            [=](sycl::id<1> i) { c[i] = a[i] + b[i]; });
+    h.parallel_for(VecSize,
+                   [=](sycl::id<1> i) { c[i] = a[i] + b[i]; });
   };
 
   myQueue.submit(cg);
```
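The selector rewrite above swaps fragile string-matching on the driver version for a direct backend query, which also works when a GPU's driver string doesn't mention CUDA. Usage is unchanged, as a reminder:

```cpp
// A device_selector returning a negative score marks a device as never
// selectable, so this queue construction throws if no CUDA-backend
// device is present.
sycl::queue myQueue{CUDASelector()};
```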

example-01/vector_addition.cu

Lines changed: 4 additions & 4 deletions
```diff
@@ -4,6 +4,7 @@
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <iostream>
 
 // CUDA kernel. Each thread takes care of one element of c
 __global__ void vecAdd(double *a, double *b, double *c, int n) {
@@ -44,9 +45,8 @@ int main(int argc, char *argv[]) {
   cudaMalloc(&d_b, bytes);
   cudaMalloc(&d_c, bytes);
 
-  int i;
   // Initialize vectors on host
-  for (i = 0; i < n; i++) {
+  for (int i = 0; i < n; i++) {
     h_a[i] = sin(i) * sin(i);
     h_b[i] = cos(i) * cos(i);
   }
@@ -72,9 +72,9 @@ int main(int argc, char *argv[]) {
   // Sum up vector c and print result divided by n, this should equal 1 within
   // error
   double sum = 0;
-  for (i = 0; i < n; i++)
+  for (int i = 0; i < n; i++)
     sum += h_c[i];
-  printf("final result: %f\n", sum / n);
+  std::cout << "Sum is : " << sum << std::endl;
 
   // Release device memory
   cudaFree(d_a);
```

example-01/vector_addition_usm.cpp

Lines changed: 100 additions & 0 deletions
```diff
@@ -0,0 +1,100 @@
+/**
+ * SYCL FOR CUDA : Vector Addition Example
+ *
+ * Copyright 2020 Codeplay Software Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * @File: vector_addition.cpp
+ */
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include <CL/sycl.hpp>
+
+class CUDASelector : public sycl::device_selector {
+public:
+  int operator()(const sycl::device &device) const override {
+    if(device.get_platform().get_backend() == sycl::backend::cuda){
+      std::cout << " CUDA device found " << std::endl;
+      return 1;
+    } else{
+      return -1;
+    }
+  }
+};
+
+int main(int argc, char *argv[]) {
+  constexpr const size_t n = 100000;
+
+  // Create a sycl queue with our CUDASelector
+  sycl::queue myQueue{CUDASelector()};
+
+  // Host input vectors
+  double *h_a;
+  double *h_b;
+  // Host output vector
+  double *h_c;
+
+  // Device input vectors
+  double *d_a;
+  double *d_b;
+  // Device output vector
+  double *d_c;
+
+  // Size, in bytes, of each vector
+  size_t bytes = n * sizeof(double);
+
+  // Allocate memory for each vector on host
+  h_a = (double *)malloc(bytes);
+  h_b = (double *)malloc(bytes);
+  h_c = (double *)malloc(bytes);
+
+  // Allocate memory for each vector on GPU
+  d_a = sycl::malloc_device<double>(n, myQueue);
+  d_b = sycl::malloc_device<double>(n, myQueue);
+  d_c = sycl::malloc_device<double>(n, myQueue);
+
+  // Initialize vectors on host
+  for (int i = 0; i < n; i++) {
+    h_a[i] = sin(i) * sin(i);
+    h_b[i] = cos(i) * cos(i);
+  }
+
+  myQueue.memcpy(d_a, h_a, bytes).wait();
+  myQueue.memcpy(d_b, h_b, bytes).wait();
+
+  // Command Group creation
+  auto cg = [&](sycl::handler &h) {
+    h.parallel_for(sycl::range(n),
+                   [=](sycl::id<1> i) {
+                     d_c[i] = d_a[i] + d_b[i];
+                   });
+  };
+
+  // Run the kernel defined above
+  myQueue.submit(cg).wait();
+
+  // Copy the result back to host
+  myQueue.memcpy(h_c, d_c, bytes).wait();
+
+  double sum = 0.0f;
+  for (int i = 0; i < n; i++) {
+    sum += h_c[i];
+  }
+  std::cout << "Sum is : " << sum << std::endl;
+
+  return 0;
+}
```
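One gap worth noting in the new USM example: the device and host allocations are never released. That is harmless for a demo that exits immediately, but for completeness, a sketch of the matching cleanup using the standard SYCL USM API would be:

```cpp
// Hypothetical cleanup for the allocations above (not in the committed file).
// USM device memory is freed through sycl::free with the queue that made it;
// the malloc'd host buffers use the C runtime's free.
sycl::free(d_a, myQueue);
sycl::free(d_b, myQueue);
sycl::free(d_c, myQueue);
free(h_a);
free(h_b);
free(h_c);
```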

example-02/CMakeLists.txt

Lines changed: 13 additions & 8 deletions
```diff
@@ -1,4 +1,8 @@
 cmake_minimum_required(VERSION 3.17 FATAL_ERROR)
+
+# Don't complain about empty CMAKE_CUDA_ARCHITECTURES
+cmake_policy(SET CMP0104 OLD)
+
 project(sycl_cuda_interop LANGUAGES CXX CUDA)
 
 find_package(CUDAToolkit)
@@ -8,19 +12,20 @@ if (NOT SYCL_ROOT)
   message(FATAL_ERROR "No SYCL installation detected")
 endif(NOT SYCL_ROOT)
 
-set(SYCL_INCLUDE_DIR "${SYCL_ROOT}/lib/clang/11.0.0/include/")
+set(SYCL_INCLUDE_DIR "${SYCL_ROOT}/lib/clang/14.0.0/include/")
 set(SYCL_LIB "${SYCL_ROOT}/lib/libsycl.so")
 set(SYCL_FLAGS "-fsycl"
-    "-fsycl-targets=nvptx64-nvidia-cuda-sycldevice"
-    "-fsycl-unnamed-lambda")
+    "-fsycl-targets=nvptx64-nvidia-cuda"
+    "-fsycl-unnamed-lambda"
+    "-Wno-linker-warnings")
 
 
 # Build the CUDA code
-add_executable(sgemm_cuda sgemm.cu)
-target_compile_features(sgemm_cuda PUBLIC cxx_std_11)
-set_target_properties(sgemm_cuda PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-set_property(TARGET sgemm_cuda PROPERTY BUILD_RPATH "${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}")
-target_link_libraries(sgemm_cuda CUDA::toolkit CUDA::cublas)
+add_executable(cuda_sgemm sgemm.cu)
+target_compile_features(cuda_sgemm PUBLIC cxx_std_11)
+set_target_properties(cuda_sgemm PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+set_property(TARGET cuda_sgemm PROPERTY BUILD_RPATH "${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}")
+target_link_libraries(cuda_sgemm CUDA::toolkit CUDA::cublas)
 
 # Build the SYCL code
 add_executable (sycl_sgemm sycl_sgemm.cpp)
```

example-02/README.md

Lines changed: 6 additions & 3 deletions
````diff
@@ -17,11 +17,14 @@ Building the example
 =====================
 
 ``` sh
-$ mkdir build && cd build
-$ cmake ../ -DSYCL_ROOT=${SYCL_ROOT_DIR} -DCMAKE_CXX_COMPILER=${SYCL_ROOT_DIR}/bin/clang++
-$ make -j 8
+$ bash build.sh
 ```
 
+or (SYCL version only):
+
+```
+${SYCL_ROOT_DIR}/bin/clang++ -DCUDA_NO_HALF -isystem /usr/local/cuda/include -fsycl -fsycl-targets=nvptx64-nvidia-cuda -fsycl-unnamed-lambda -std=gnu++17 -L/usr/local/cuda/lib64 -lcublas -lcudart -lcuda -o sycl_sgemm sycl_sgemm.cpp
+```
 Example
 =========
 
````

example-02/build.sh

Lines changed: 3 additions & 0 deletions
```diff
@@ -0,0 +1,3 @@
+rm -rf build && mkdir build && cd build
+cmake ../ -DSYCL_ROOT=${SYCL_ROOT_DIR} -DCMAKE_CXX_COMPILER=${SYCL_ROOT_DIR}/bin/clang++
+make -j
```

example-02/sycl_sgemm.cpp

Lines changed: 6 additions & 9 deletions
```diff
@@ -33,16 +33,13 @@ void inline checkCudaErrorMsg(CUresult status, const char *msg) {
 
 class CUDASelector : public sycl::device_selector {
 public:
-  int operator()(const sycl::device &Device) const override {
-    using namespace sycl::info;
-
-    const std::string DriverVersion = Device.get_info<device::driver_version>();
-
-    if (Device.is_gpu() && (DriverVersion.find("CUDA") != std::string::npos)) {
+  int operator()(const sycl::device &device) const override {
+    if(device.get_platform().get_backend() == sycl::backend::cuda){
       std::cout << " CUDA device found " << std::endl;
       return 1;
-    };
-    return -1;
+    } else{
+      return -1;
+    }
   }
 };
 
@@ -83,7 +80,7 @@ int main() {
     auto d_B = b_B.get_access<sycl::access::mode::read>(h);
     auto d_C = b_C.get_access<sycl::access::mode::write>(h);
 
-    h.codeplay_host_task([=](sycl::interop_handle ih) {
+    h.host_task([=](sycl::interop_handle ih) {
       cuCtxSetCurrent(ih.get_native_context<backend::cuda>());
       cublasSetStream(handle, ih.get_native_queue<backend::cuda>());
      auto cuA = reinterpret_cast<float *>(ih.get_native_mem<backend::cuda>(d_A));
```
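The hunk stops partway through the host_task body; the remainder, untouched by this commit and so not shown here, issues the cuBLAS call on the native stream configured above. For orientation, a hedged sketch of how such a body typically continues, where `WIDTH` and the alpha/beta scalars are placeholders rather than the example's actual values:

```cpp
// Sketch of a typical continuation (not part of this diff). Assumes square
// WIDTH x WIDTH matrices and computes C = alpha * A * B + beta * C on the
// cuBLAS handle bound to the native stream above.
auto cuB = reinterpret_cast<float *>(ih.get_native_mem<backend::cuda>(d_B));
auto cuC = reinterpret_cast<float *>(ih.get_native_mem<backend::cuda>(d_C));

const float alpha = 1.0f, beta = 0.0f;
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WIDTH, WIDTH, WIDTH,
            &alpha, cuA, WIDTH, cuB, WIDTH, &beta, cuC, WIDTH);
```

Per the commit message, a trailing cudaDeviceSynchronize() was only strictly needed in the USM variant of example-04, though the author is unsure whether it ought to be present in both cases.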
