Commit 0904367

Addressing feedback from review
1 parent f68f1ca commit 0904367

File tree

3 files changed (+40, -40 lines)

example-03/README.md

Lines changed: 3 additions & 3 deletions
@@ -14,7 +14,7 @@ Pre-requisites
 ---------------
 
 You would need an installation of DPC++ with CUDA support,
-see [Getting Started Guide](https://github.com/intel/llvm/doc/GetStartedWithSYCLCompiler.md)
+see [Getting Started Guide](https://github.com/intel/llvm/blob/sycl/sycl/doc/GetStartedGuide.md#build-dpc-toolchain-with-support-for-nvidia-cuda)
 for details on how to build it.
 
 The example is built using Makefiles, since there is no support yet on
@@ -24,7 +24,7 @@ Building the example
 ---------------------
 
 ```sh
-$ SYCL_ROOT=/path/to/dpcpp make
+$ SYCL_ROOT=/path/to/dpcpp make
 ```
 
 This compiles the SYCL code with the LLVM CUDA support, and generates
@@ -42,7 +42,7 @@ The path to `libsycl.so` and the PI plugins must be in `LD_LIBRARY_PATH`.
 A simple way of running the example is as follows:
 
 ```
-$ LD_LIBRARY_PATH=/path/to/dpcpp/lib ./vec_add.exe
+$ LD_LIBRARY_PATH=/path/to/dpcpp/lib:$LD_LIBRARY_PATH ./vec_add.exe
 ```
 
 
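Taken together, the two updated README snippets give the full build-and-run flow. The sketch below only consolidates them; the paths are the README's placeholder paths, not real install locations:

```sh
# Build the example against a DPC++ toolchain compiled with CUDA support
# (SYCL_ROOT points at the toolchain install prefix; placeholder path).
SYCL_ROOT=/path/to/dpcpp make

# Prepend the DPC++ runtime directory so libsycl.so and the PI plugins are
# found, without discarding whatever is already on LD_LIBRARY_PATH.
LD_LIBRARY_PATH=/path/to/dpcpp/lib:$LD_LIBRARY_PATH ./vec_add.exe
```
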
example-03/vec_add.cu

Lines changed: 18 additions & 17 deletions
@@ -16,7 +16,7 @@ public:
     const std::string DriverVersion = Device.get_info<device::driver_version>();
 
     if (Device.is_gpu() && (DriverVersion.find("CUDA") != std::string::npos)) {
-      std::cout << " CUDA device found " << std::endl;
+      std::cout << " CUDA device found \n";
       return 1;
     };
     return -1;
@@ -41,23 +41,23 @@ int main(int argc, char *argv[]) {
   // Create a SYCL context for interoperability with CUDA Runtime API
   // This is temporary until the property extension is implemented
   const bool UsePrimaryContext = true;
-  sycl::device dev{CUDASelector().select_device()};
-  sycl::context myContext{dev, {}, UsePrimaryContext};
-  sycl::queue myQueue{myContext, dev};
+  device dev{CUDASelector().select_device()};
+  context myContext{dev, {}, UsePrimaryContext};
+  queue myQueue{myContext, dev};
 
   {
     buffer<double> bA{range<1>(n)};
     buffer<double> bB{range<1>(n)};
     buffer<double> bC{range<1>(n)};
 
     {
-      auto h_a = bA.get_access<access::mode::write>();
-      auto h_b = bB.get_access<access::mode::write>();
+      auto hA = bA.get_access<access::mode::write>();
+      auto hB = bB.get_access<access::mode::write>();
 
       // Initialize vectors on host
       for (int i = 0; i < n; i++) {
-        h_a[i] = sin(i) * sin(i);
-        h_b[i] = cos(i) * cos(i);
+        hA[i] = sin(i) * sin(i);
+        hB[i] = cos(i) * cos(i);
       }
     }
 
@@ -68,28 +68,29 @@ int main(int argc, char *argv[]) {
       auto accC = bC.get_access<access::mode::write>(h);
 
       h.interop_task([=](interop_handler ih) {
-        auto d_a = reinterpret_cast<double*>(ih.get_mem<backend::cuda>(accA));
-        auto d_b = reinterpret_cast<double*>(ih.get_mem<backend::cuda>(accB));
-        auto d_c = reinterpret_cast<double*>(ih.get_mem<backend::cuda>(accC));
+        auto dA = reinterpret_cast<double*>(ih.get_mem<backend::cuda>(accA));
+        auto dB = reinterpret_cast<double*>(ih.get_mem<backend::cuda>(accB));
+        auto dC = reinterpret_cast<double*>(ih.get_mem<backend::cuda>(accC));
 
         int blockSize, gridSize;
        // Number of threads in each thread block
         blockSize = 1024;
         // Number of thread blocks in grid
-        gridSize = (int)ceil((float)n / blockSize);
+        gridSize = static_cast<int>(ceil(static_cast<float>(n) / blockSize));
         // Call the CUDA kernel directly from SYCL
-        vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
+        vecAdd<<<gridSize, blockSize>>>(dA, dB, dC, n);
       });
     });
 
     {
-      auto h_c = bC.get_access<access::mode::read>();
+      auto hC = bC.get_access<access::mode::read>();
       // Sum up vector c and print result divided by n, this should equal 1 within
       // error
       double sum = 0;
-      for (int i = 0; i < n; i++)
-        sum += h_c[i];
-      printf("final result: %f\n", sum / n);
+      for (int i = 0; i < n; i++) {
+        sum += hC[i];
+      }
+      std::cout << "Final result " << sum / n << std::endl;
     }
   }
 
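The hunks above launch a `vecAdd` CUDA kernel that is defined earlier in `vec_add.cu` and is not part of this diff. A minimal sketch of a kernel with the implied signature is shown below for context; the actual body in the repository may differ:

```cuda
// Sketch only: an element-wise vector-add kernel matching the
// vecAdd<<<gridSize, blockSize>>>(dA, dB, dC, n) launch used above.
__global__ void vecAdd(double *a, double *b, double *c, int n) {
  // Global thread index across all blocks
  int id = blockIdx.x * blockDim.x + threadIdx.x;
  // Guard against the last block running past the end of the vectors
  if (id < n) {
    c[id] = a[id] + b[id];
  }
}
```
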
example-03/vec_add_usm.cu

Lines changed: 19 additions & 20 deletions
@@ -12,7 +12,7 @@ public:
     const std::string DriverVersion = Device.get_info<device::driver_version>();
 
     if (Device.is_gpu() && (DriverVersion.find("CUDA") != std::string::npos)) {
-      std::cout << " CUDA device found " << std::endl;
+      std::cout << " CUDA device found \n";
       return 1;
     };
     return -1;
@@ -41,33 +41,31 @@ int main(int argc, char *argv[]) {
   // Create a SYCL context for interoperability with CUDA Runtime API
   // This is temporary until the property extension is implemented
   const bool UsePrimaryContext = true;
-  sycl::device dev{CUDASelector().select_device()};
-  sycl::context myContext{dev, {}, UsePrimaryContext};
-  sycl::queue myQueue{myContext, dev};
+  device dev{CUDASelector().select_device()};
+  context myContext{dev, {}, UsePrimaryContext};
+  queue myQueue{myContext, dev};
 
   // Allocate memory for each vector on host
-  double* d_a = (double *)malloc_shared(bytes, myQueue);
-  double* d_b = (double *)malloc_shared(bytes, myQueue);
-  double* d_c = (double *)malloc_shared(bytes, myQueue);
+  auto d_A = reinterpret_cast<double*>(malloc_shared(bytes, myQueue));
+  auto d_B = reinterpret_cast<double*>(malloc_shared(bytes, myQueue));
+  auto d_C = reinterpret_cast<double*>(malloc_shared(bytes, myQueue));
 
   // Initialize vectors on host
   for (int i = 0; i < n; i++) {
-    d_a[i] = sin(i) * sin(i);
-    d_b[i] = cos(i) * cos(i);
+    d_A[i] = sin(i) * sin(i);
+    d_B[i] = cos(i) * cos(i);
   }
 
   myQueue.submit([&](handler& h) {
     h.interop_task([=](interop_handler ih) {
-      int blockSize, gridSize;
-
       // Number of threads in each thread block
-      blockSize = 1024;
+      int blockSize = 1024;
 
       // Number of thread blocks in grid
-      gridSize = (int)ceil((float)n / blockSize);
+      int gridSize = static_cast<int>(ceil(static_cast<float>(n) / blockSize));
 
       // Execute the kernel
-      vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
+      vecAdd<<<gridSize, blockSize>>>(d_A, d_B, d_C, n);
     });
   });
 
@@ -76,13 +74,14 @@ int main(int argc, char *argv[]) {
   // Sum up vector c and print result divided by n, this should equal 1 within
   // error
   double sum = 0;
-  for (int i = 0; i < n; i++)
-    sum += d_c[i];
-  printf("final result: %f\n", sum / n);
+  for (int i = 0; i < n; i++) {
+    sum += d_C[i];
+  }
+  std::cout << "Final result " << sum / n << std::endl;
 
-  sycl::free(d_a, myContext);
-  sycl::free(d_b, myContext);
-  sycl::free(d_c, myContext);
+  free(d_A, myContext);
+  free(d_B, myContext);
+  free(d_C, myContext);
 
   return 0;
 }
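The USM hunks rely on a few pieces that sit outside the diff context: the element count `n`, the `bytes` size passed to `malloc_shared`, and a synchronization point before the host loop reads `d_C`. A rough sketch of those pieces, with assumed placeholder values, is:

```cuda
// Sketch only: surrounding code implied by the hunks above, with assumed values.
int n = 100000;                     // number of vector elements (placeholder)
size_t bytes = n * sizeof(double);  // size handed to malloc_shared

// ... shared allocations, host initialization and kernel submission
//     exactly as in the hunks above ...

// The host loop that sums d_C can only run once the CUDA kernel launched
// inside interop_task has finished, so the queue is waited on first.
myQueue.wait();
```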
