
Commit 130a487

mehdi-goli and joeatodd authored
Adding distributed batch gemm example using SYCL-BLAS library (#15)
* Adding distributed batch gemm example using SYCL-BLAS library
* Update example-05/distributed-batch-gemm.cpp

Co-authored-by: Joe Todd <joeatodd@users.noreply.github.com>
1 parent 0f9f058 commit 130a487

File tree

3 files changed: +221 -0 lines changed


example-05/Makefile

Lines changed: 15 additions & 0 deletions
MPICOMP = mpicxx -I$(HOME)/sycl_workspace/build_dpcpp/install/include/sycl/ -I$(HOME)/sycl-blas/include -I$(HOME)/sycl-blas/external/computecpp-sdk/include/ -L$(HOME)/sycl-blas/build -O3 -fsycl-unnamed-lambda -std=c++17 -fsycl -fsycl-targets=nvptx64-nvidia-cuda-sycldevice -lsycl_blas

distributed-batch-gemm: distributed-batch-gemm.o
	$(MPICOMP) distributed-batch-gemm.o -o distributed-batch-gemm

distributed-batch-gemm.o: distributed-batch-gemm.cpp
	$(MPICOMP) -c distributed-batch-gemm.cpp

run: distributed-batch-gemm
	LD_LIBRARY_PATH=~/sycl_workspace/build_dpcpp/install/lib:$(HOME)/sycl-blas/build mpirun -np 2 --mca pml ucx -mca btl ^uct -x UCX_NET_DEVICES=mlx5_0:1 ./distributed-batch-gemm

.PHONY: clean

clean:
	rm -f distributed-batch-gemm *.o

example-05/README.md

Lines changed: 31 additions & 0 deletions
## Distributed Batch GEMM example

This example shows how to integrate MPI calls within the SYCL DAG using host tasks, in order to distribute a batch GEMM across MPI processes. The core pattern is sketched below.
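
The full source is in `distributed-batch-gemm.cpp`; as a quick orientation, the scatter step condensed from that file looks roughly like this (the `scatter_into_local` helper is illustrative and not part of the example):

```c++
#include <CL/sycl.hpp>
#include <CL/sycl/backend/cuda.hpp>
#include <mpi.h>

// Issue an MPI_Scatter from inside a host task so it runs as a node of the
// SYCL DAG, once the accessors it captures are ready.
void scatter_into_local(sycl::queue &q, sycl::buffer<float, 1> &global_buf,
                        sycl::buffer<float, 1> &local_buf, int count) {
  q.submit([&](sycl::handler &h) {
    auto global_acc = global_buf.get_access<sycl::access::mode::read>(h);
    auto local_acc = local_buf.get_access<sycl::access::mode::write>(h);
    h.codeplay_host_task([=](sycl::interop_handle ih) {
      // Native CUDA pointers let the CUDA-aware MPI library move device
      // memory directly, without staging through the host.
      auto global_ptr = reinterpret_cast<float *>(
          ih.get_native_mem<sycl::backend::cuda>(global_acc));
      auto local_ptr = reinterpret_cast<float *>(
          ih.get_native_mem<sycl::backend::cuda>(local_acc));
      MPI_Scatter(global_ptr, count, MPI_FLOAT, local_ptr, count, MPI_FLOAT,
                  0, MPI_COMM_WORLD);
    });
  });
}
```
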
## Prerequisites

The provided Makefile assumes that the MPICXX compiler wrapper points to the DPCPP compiler with CUDA support.
This requires the MPI implementation to be built with, or configured to use, the DPCPP compiler.
The MPI implementation also needs to have been built with CUDA support (typically called "CUDA-aware" MPI).
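
The example reports this at run time; a condensed version of that check from `distributed-batch-gemm.cpp` (wrapped here in an illustrative `report_cuda_aware_mpi` helper) is:

```c++
#include <mpi.h>
#include <mpi-ext.h>
#include <stdio.h>

// Print whether the MPI library was built with CUDA-aware support.
void report_cuda_aware_mpi() {
#if defined(MPIX_CUDA_AWARE_SUPPORT)
  if (1 == MPIX_Query_cuda_support()) {
    printf("This MPI library has CUDA-aware support.\n");
  } else {
    printf("This MPI library does not have CUDA-aware support.\n");
  }
#else
  printf("This MPI library cannot determine if there is CUDA-aware support.\n");
#endif
}
```
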
The example uses the [SYCL-BLAS](https://github.com/codeplaysoftware/sycl-blas) library to call the GEMM routine.
The SYCL-BLAS library should be [compiled with the DPCPP compiler](https://github.com/codeplaysoftware/sycl-blas#compile-with-dpc) to target the CUDA backend. The following command line is used to build the SYCL-BLAS library:

```bash
cmake -GNinja ../ -DTARGET=NVIDIA_GPU -DSYCL_COMPILER=dpcpp -DBLAS_DATA_TYPES=float -DGEMM_VECTORIZATION_SUPPORT=ON -DBLAS_ENABLE_TESTING=OFF -DENABLE_EXPRESSION_TESTS=OFF -DBLAS_ENABLE_BENCHMARK=OFF -DBLAS_VERIFY_BENCHMARK=OFF -DBLAS_BUILD_SAMPLES=OFF
```

## Compilation

If MPICXX points to DPC++ with CUDA support and it is on the PATH, `make` should build the program.

## Execution

The Makefile contains a target to run the program with two MPI processes:

```sh
make run
```

The target assumes `mpirun` is on the PATH.

example-05/distributed-batch-gemm.cpp

Lines changed: 175 additions & 0 deletions
#include <CL/sycl.hpp>
#include <CL/sycl/backend/cuda.hpp>
#include <algorithm>
#include <iostream>
#include <mpi.h>
#include <mpi-ext.h>
#include <numeric>
#include <stdio.h>
#include <stdlib.h>
#include <sycl_blas.h>
#include <vector>

#define PRINT_DEBUG_MODE 1

int main(int argc, char **argv) {
  /* Create a SYCL queue with the GPU device selector */
  sycl::queue q(cl::sycl::gpu_selector{});

  /* -------------------------------------------------------------------------------------------
     Check to see if the MPI library is CUDA-aware
     --------------------------------------------------------------------------------------------*/
  printf("Run time check:\n");
#if defined(MPIX_CUDA_AWARE_SUPPORT)
  if (1 == MPIX_Query_cuda_support()) {
    printf("This MPI library has CUDA-aware support.\n");
  } else {
    printf("This MPI library does not have CUDA-aware support.\n");
  }
#else  /* !defined(MPIX_CUDA_AWARE_SUPPORT) */
  printf("This MPI library cannot determine if there is CUDA-aware support.\n");
#endif /* MPIX_CUDA_AWARE_SUPPORT */

  /* -------------------------------------------------------------------------------------------
     MPI Initialization
     --------------------------------------------------------------------------------------------*/
  MPI_Init(&argc, &argv);

  int size;
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  if (size != 2) {
    if (rank == 0) {
      printf(
          "This program requires exactly 2 MPI ranks, but you are "
          "attempting to use %d! Exiting...\n",
          size);
    }
    MPI_Finalize();
    exit(0);
  }

  double start_time, stop_time, elapsed_time;
  /* Create a SYCL-BLAS executor and get the policy handler */
  blas::Executor<blas::PolicyHandler<blas::codeplay_policy>> executor(q);
  auto policy_handler = executor.get_policy_handler();

  /* Arguments of the Gemm operation.
   * Note: these matrix dimensions are too small to get a performance gain by
   * using SYCL-BLAS, but they are convenient for this sample */
  const int m = 32;
  const int k = 32;
  const int n = 32;
  const int lda = m;
  const int ldb = k;
  const int ldc = m;
  const float alpha = 1;
  const float beta = 0;
  const int batch = 2;

  /* Create the local buffers (one batch per rank) */
  auto local_a_gpu = blas::make_sycl_iterator_buffer<float>(lda * k);
  auto local_b_gpu = blas::make_sycl_iterator_buffer<float>(ldb * n);
  auto local_c_gpu = blas::make_sycl_iterator_buffer<float>(ldc * n);

  /* Create the global buffers (all batches, held by the root rank) */
  auto global_a_gpu = blas::make_sycl_iterator_buffer<float>(batch * lda * k);
  auto global_b_gpu = blas::make_sycl_iterator_buffer<float>(batch * ldb * n);
  auto global_c_gpu = blas::make_sycl_iterator_buffer<float>(batch * ldc * n);

  if (rank == 0) {
    // Set the buffer values for A and B
    std::vector<float> A = std::vector<float>(batch * lda * k, float(1.0));
    std::vector<float> B = std::vector<float>(batch * ldb * n, float(1.0));
    policy_handler.copy_to_device(A.data(), global_a_gpu, batch * lda * k);
    policy_handler.copy_to_device(B.data(), global_b_gpu, batch * ldb * n);
  }
  /* -------------------------------------------------------------------------------------------
     Use SYCL host tasks with CUDA interoperability to scatter one batch of A
     and B to each of the two MPI processes
     --------------------------------------------------------------------------------------------*/
  start_time = MPI_Wtime();
  auto ht_a = [&](sycl::handler &h) {
    auto global_a_acc =
        global_a_gpu.get_buffer().template get_access<sycl::access::mode::read>(
            h);
    auto local_a_acc =
        local_a_gpu.get_buffer().template get_access<sycl::access::mode::write>(
            h);
    h.codeplay_host_task([=](sycl::interop_handle ih) {
      auto global_a_ptr = reinterpret_cast<float *>(
          ih.get_native_mem<sycl::backend::cuda>(global_a_acc));
      auto local_a_ptr = reinterpret_cast<float *>(
          ih.get_native_mem<sycl::backend::cuda>(local_a_acc));
      MPI_Scatter(global_a_ptr, lda * k, MPI_FLOAT, local_a_ptr, lda * k,
                  MPI_FLOAT, 0, MPI_COMM_WORLD);
    });
  };
  q.submit(ht_a);

  auto ht_b = [&](sycl::handler &h) {
    auto global_b_acc =
        global_b_gpu.get_buffer().template get_access<sycl::access::mode::read>(
            h);
    auto local_b_acc =
        local_b_gpu.get_buffer().template get_access<sycl::access::mode::write>(
            h);
    h.codeplay_host_task([=](sycl::interop_handle ih) {
      auto global_b_ptr = reinterpret_cast<float *>(
          ih.get_native_mem<sycl::backend::cuda>(global_b_acc));
      auto local_b_ptr = reinterpret_cast<float *>(
          ih.get_native_mem<sycl::backend::cuda>(local_b_acc));
      MPI_Scatter(global_b_ptr, ldb * n, MPI_FLOAT, local_b_ptr, ldb * n,
                  MPI_FLOAT, 0, MPI_COMM_WORLD);
    });
  };
  q.submit(ht_b);
  q.wait_and_throw();

  /* Execute the GEMM operation on the local batch */
  auto event = blas::_gemm(executor, 'n', 'n', m, n, k, alpha, local_a_gpu, lda,
                           local_b_gpu, ldb, beta, local_c_gpu, ldc);
  policy_handler.wait(event);

  /* -------------------------------------------------------------------------------------------
     Use a SYCL host task with CUDA interoperability to gather the local C
     results back into the global buffer on the root process
     --------------------------------------------------------------------------------------------*/
  auto ht_c = [&](sycl::handler &h) {
    auto global_c_acc = global_c_gpu.get_buffer()
                            .template get_access<sycl::access::mode::write>(h);
    auto local_c_acc =
        local_c_gpu.get_buffer().template get_access<sycl::access::mode::read>(
            h);
    h.codeplay_host_task([=](sycl::interop_handle ih) {
      auto local_c_ptr = reinterpret_cast<float *>(
          ih.get_native_mem<sycl::backend::cuda>(local_c_acc));
      auto global_c_ptr = reinterpret_cast<float *>(
          ih.get_native_mem<sycl::backend::cuda>(global_c_acc));
      MPI_Gather(local_c_ptr, ldc * n, MPI_FLOAT, global_c_ptr, ldc * n,
                 MPI_FLOAT, 0, MPI_COMM_WORLD);
    });
  };

  q.submit(ht_c);
  q.wait_and_throw();
  stop_time = MPI_Wtime();
  elapsed_time = stop_time - start_time;

  /* -------------------------------------------------------------------------------------------
     Print the output
     --------------------------------------------------------------------------------------------*/
  if (rank == 0) {
    std::cout << "elapsed_time: " << elapsed_time << "\n";
#if defined(PRINT_DEBUG_MODE)
    auto C = global_c_gpu.get_buffer().get_host_access();
    for (int i = 0; i < batch * ldc * n; i++) {
      std::cout << " value at " << i << " : " << C[i] << "\n";
    }
#endif
  }

  MPI_Finalize();
  return 0;
}
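
Since A and B are filled with ones and alpha = 1, beta = 0, every element of the gathered C should equal k (32 here). A minimal sketch of a correctness check that could be run on rank 0 is shown below; the `verify_result` helper and the tolerance are illustrative and not part of the commit:

```c++
#include <cmath>
#include <cstdio>
#include <vector>

// Check that every gathered C element equals the expected GEMM result.
// With all-ones inputs, alpha = 1 and beta = 0, that expected value is k.
bool verify_result(const std::vector<float> &C, int k, float tol = 1e-5f) {
  for (std::size_t i = 0; i < C.size(); ++i) {
    if (std::fabs(C[i] - static_cast<float>(k)) > tol) {
      std::printf("Mismatch at %zu: got %f, expected %d\n", i, C[i], k);
      return false;
    }
  }
  return true;
}
```

On rank 0, the host accessor obtained in the debug-print block could be copied into a `std::vector<float>` and passed to this helper.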
