Commit 0f9f058

MPI + SYCL Example (#13)
* MPI + SYCL Example: example showing how to call MPI functions from SYCL with CUDA-aware MPI
* Fixed: build errors
1 parent 1416d9e commit 0f9f058

File tree

3 files changed, +225 -0 lines changed


example-04/Makefile

Lines changed: 15 additions & 0 deletions
# MPI compiler wrapper invoking DPC++ with the CUDA backend
MPICOMP=mpicxx -I$(HOME)/sycl_workspace/build_dpcpp/install/include/sycl/ -O1 -fsycl-unnamed-lambda -std=c++17 -fsycl -fsycl-targets=nvptx64-nvidia-cuda-sycldevice

sycl-mpi-sample: SYCL-MPI-Sample.o
	$(MPICOMP) SYCL-MPI-Sample.o -o sycl-mpi-sample

SYCL-MPI-Sample.o: SYCL-MPI-Sample.cpp
	$(MPICOMP) -c SYCL-MPI-Sample.cpp

run: sycl-mpi-sample
	mpirun -np 2 ./sycl-mpi-sample

.PHONY: clean

clean:
	rm -f sycl-mpi-sample *.o

example-04/README.md

Lines changed: 26 additions & 0 deletions
## MPI + SYCL example

This example shows how to integrate MPI calls into the SYCL DAG by wrapping them in host tasks.
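For orientation, the core pattern is sketched below: the MPI call is wrapped in a host task so that it becomes a node of the SYCL DAG, and the native CUDA pointer behind the buffer is handed to a CUDA-aware MPI. This is a minimal, illustrative sketch only; the helper name `broadcast_buffer` and the use of `MPI_Bcast` are not part of the sample, which uses `MPI_Scatter`, `MPI_Allreduce`, and `MPI_Gather` in the same way.

```cpp
#include <CL/sycl.hpp>
#include <CL/sycl/backend/cuda.hpp>
#include <mpi.h>

// Broadcast the contents of a SYCL buffer from rank 0 using CUDA-aware MPI.
// The MPI call runs inside a host task, so it is ordered against any kernels
// that use the same buffer.
void broadcast_buffer(sycl::queue &q, sycl::buffer<double> &buf) {
  const int count = static_cast<int>(buf.get_range()[0]);
  q.submit([&](sycl::handler &h) {
    sycl::accessor acc{buf, h, sycl::read_write};
    h.codeplay_host_task([=](sycl::interop_handle ih) {
      // Native CUDA device pointer backing the accessor: a CUDA-aware MPI
      // implementation can read/write this device memory directly.
      auto ptr = reinterpret_cast<double *>(
          ih.get_native_mem<sycl::backend::cuda>(acc));
      MPI_Bcast(ptr, count, MPI_DOUBLE, 0, MPI_COMM_WORLD);
    });
  });
}
```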
## Prerequisites

The provided Makefile assumes that the MPICXX compiler wrapper points to the DPC++ compiler with CUDA support.
This requires the MPI implementation to have been built with, or configured to use, the DPC++ compiler.
The MPI implementation also needs to have been built with CUDA support (typically called "CUDA-aware MPI").
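As a quick illustration (assuming Open MPI; other MPI implementations expose this differently), the build can be inspected from the shell:

```sh
# Show which compiler and flags the MPI wrapper invokes
mpicxx -show

# For Open MPI, query whether it was built with CUDA support
ompi_info --parsable --all | grep mpi_built_with_cuda_support:value
```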
## Compilation

If MPICXX points to DPC++ with CUDA support and it is on the PATH, `make` should build the program.
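If the wrapper does not already invoke DPC++, many MPI implementations let you override the underlying C++ compiler through an environment variable. For example (the compiler name shown is illustrative and should point to the DPC++ driver built with CUDA support):

```sh
# Open MPI: override the C++ compiler used by mpicxx
export OMPI_CXX=clang++

# MPICH and derivatives
export MPICH_CXX=clang++

make
```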
## Execution

The Makefile contains a target to execute the program on two processes:

```sh
make run
```

The target assumes mpirun is on the PATH.

example-04/SYCL-MPI-Sample.cpp

Lines changed: 184 additions & 0 deletions
#include <CL/sycl.hpp>

#include <CL/sycl/backend/cuda.hpp>
#include <algorithm>
#include <functional>
#include <iostream>
#include <mpi.h>
#include <mpi-ext.h>
#include <numeric>
#include <stdio.h>
#include <stdlib.h>
#include <vector>

int main(int argc, char *argv[]) {
  /* -------------------------------------------------------------------------------------------
     SYCL initialization, which internally sets the CUDA device
  --------------------------------------------------------------------------------------------*/
  sycl::queue q(cl::sycl::gpu_selector{});

  /* -------------------------------------------------------------------------------------------
     Check whether the MPI library is CUDA-aware
  --------------------------------------------------------------------------------------------*/
  printf("Run time check:\n");
#if defined(MPIX_CUDA_AWARE_SUPPORT)
  if (1 == MPIX_Query_cuda_support()) {
    printf("This MPI library has CUDA-aware support.\n");
  } else {
    printf("This MPI library does not have CUDA-aware support.\n");
  }
#else  /* !defined(MPIX_CUDA_AWARE_SUPPORT) */
  printf("This MPI library cannot determine if there is CUDA-aware support.\n");
#endif /* MPIX_CUDA_AWARE_SUPPORT */

  /* -------------------------------------------------------------------------------------------
     MPI initialization
  --------------------------------------------------------------------------------------------*/
  MPI_Init(&argc, &argv);

  int size;
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  if (size != 2) {
    if (rank == 0) {
      printf("This program requires exactly 2 MPI ranks, but you are "
             "attempting to use %d! Exiting...\n",
             size);
    }
    MPI_Finalize();
    exit(0);
  }

  /* -------------------------------------------------------------------------------------------
     Set the data size and allocate the data on the host (N doubles, initialized to 1.0)
  --------------------------------------------------------------------------------------------*/
  long int N = 1 << 10;
  std::vector<double> A(N, 1.0);
  size_t local_size = N / size;

  /* -------------------------------------------------------------------------------------------
     Create SYCL buffers
  --------------------------------------------------------------------------------------------*/
  sycl::buffer<double> input_buffer(std::begin(A), std::end(A));
  sycl::buffer<double> local_buffer(sycl::range{local_size});
  sycl::buffer<double> out_buffer(sycl::range{1});
  sycl::buffer<double> global_sum(sycl::range{1});

  double start_time, stop_time, elapsed_time;
  start_time = MPI_Wtime();
  /* -------------------------------------------------------------------------------------------
     Use SYCL-CUDA interoperability inside a host task to scatter the data between the two
     MPI processes
  --------------------------------------------------------------------------------------------*/
  auto ht = [&](sycl::handler &h) {
    sycl::accessor input_acc{input_buffer, h, sycl::read_write};
    sycl::accessor local_acc{local_buffer, h, sycl::read_write};
    h.codeplay_host_task([=](sycl::interop_handle ih) {
      auto cuda_ptr = reinterpret_cast<double *>(
          ih.get_native_mem<sycl::backend::cuda>(input_acc));
      auto cuda_local_ptr = reinterpret_cast<double *>(
          ih.get_native_mem<sycl::backend::cuda>(local_acc));
      MPI_Scatter(cuda_ptr, local_size, MPI_DOUBLE, cuda_local_ptr, local_size,
                  MPI_DOUBLE, 0, MPI_COMM_WORLD);
    });
  };
  q.submit(ht);

  /* -------------------------------------------------------------------------------------------
     Create a SYCL GPU kernel to scale each element of the data based on the MPI process ID
  --------------------------------------------------------------------------------------------*/
  auto cg = [&](sycl::handler &h) {
    auto acc = local_buffer.get_access(h);
    auto kern = [=](cl::sycl::id<1> id) { acc[id] *= (rank + 1); };
    h.parallel_for(sycl::range<1>{local_size}, kern);
  };
  q.submit(cg);

  /* -------------------------------------------------------------------------------------------
     Create a SYCL GPU kernel to partially reduce the local data into a scalar
  --------------------------------------------------------------------------------------------*/
  auto cg2 = [&](sycl::handler &h) {
    auto acc = local_buffer.get_access(h);
    h.parallel_for(sycl::nd_range<1>(
                       cl::sycl::range<1>(local_size),
                       cl::sycl::range<1>(std::min(local_size, size_t(256)))),
                   sycl::reduction(out_buffer, h, 1.0, std::plus<double>()),
                   [=](sycl::nd_item<1> idx, auto &reducer) {
                     reducer.combine(acc[idx.get_global_id(0)]);
                   });
  };
  q.submit(cg2);
  /* -------------------------------------------------------------------------------------------
     Use SYCL-CUDA interoperability inside a host task to compute the total sum of the
     reduced scalars produced by each MPI process
  --------------------------------------------------------------------------------------------*/
  auto ht2 = [&](sycl::handler &h) {
    sycl::accessor out_acc{out_buffer, h, sycl::read_write};
    sycl::accessor global_sum_acc{global_sum, h, sycl::read_write};
    h.codeplay_host_task([=](sycl::interop_handle ih) {
      auto cuda_out_ptr = reinterpret_cast<double *>(
          ih.get_native_mem<sycl::backend::cuda>(out_acc));
      auto cuda_global_sum_ptr = reinterpret_cast<double *>(
          ih.get_native_mem<sycl::backend::cuda>(global_sum_acc));
      MPI_Allreduce(cuda_out_ptr, cuda_global_sum_ptr, 1, MPI_DOUBLE, MPI_SUM,
                    MPI_COMM_WORLD);
    });
  };

  q.submit(ht2);

  /* -------------------------------------------------------------------------------------------
     Create a SYCL GPU kernel to normalize the local buffer based on the global sum result
  --------------------------------------------------------------------------------------------*/
  auto cg3 = [&](sycl::handler &h) {
    auto acc = local_buffer.get_access(h);
    auto global_sum_acc = global_sum.get_access(h);
    auto kern = [=](cl::sycl::id<1> id) { acc[id] /= global_sum_acc[0]; };
    h.parallel_for(sycl::range<1>{local_size}, kern);
  };
  q.submit(cg3);

  /* -------------------------------------------------------------------------------------------
     Use SYCL-CUDA interoperability inside a host task to gather the normalized values back
     into the original input
  --------------------------------------------------------------------------------------------*/
  auto ht3 = [&](sycl::handler &h) {
    sycl::accessor input_acc{input_buffer, h, sycl::read_write};
    sycl::accessor local_acc{local_buffer, h, sycl::read_write};
    h.codeplay_host_task([=](sycl::interop_handle ih) {
      auto cuda_local_ptr = reinterpret_cast<double *>(
          ih.get_native_mem<sycl::backend::cuda>(local_acc));
      auto cuda_input_ptr = reinterpret_cast<double *>(
          ih.get_native_mem<sycl::backend::cuda>(input_acc));
      MPI_Gather(cuda_local_ptr, local_size, MPI_DOUBLE, cuda_input_ptr,
                 local_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);
    });
  };

  q.submit(ht3);
  q.wait_and_throw();
  stop_time = MPI_Wtime();
  elapsed_time = stop_time - start_time;

  /* -------------------------------------------------------------------------------------------
     Print the output
  --------------------------------------------------------------------------------------------*/
  if (rank == 0) {
    std::cout << "elapsed_time: " << elapsed_time << "\n";
#if defined(PRINT_DEBUG_MODE)
    auto p = input_buffer.get_host_access();
    for (int i = 0; i < 1; i++) {
      std::cout << " value at i : " << p[i] << "\n";
    }
#endif
  }
  MPI_Finalize();

  return 0;
}
