
Commit 130a487

mehdi-goli and joeatodd authored
Adding distributed batch gemm example using SYCL-BLAS library (#15)
* Adding distributed batch gemm example using SYCL-BLAS library
* Update example-05/distributed-batch-gemm.cpp

Co-authored-by: Joe Todd <joeatodd@users.noreply.github.com>
1 parent 0f9f058 commit 130a487

File tree

3 files changed: +221 -0 lines changed


example-05/Makefile

Lines changed: 15 additions & 0 deletions
MPICOMP = mpicxx -I$(HOME)/sycl_workspace/build_dpcpp/install/include/sycl/ -I$(HOME)/sycl-blas/include -I$(HOME)/sycl-blas/external/computecpp-sdk/include/ -L$(HOME)/sycl-blas/build -O3 -fsycl-unnamed-lambda -std=c++17 -fsycl -fsycl-targets=nvptx64-nvidia-cuda-sycldevice -lsycl_blas

distributed-batch-gemm: distributed-batch-gemm.o
	$(MPICOMP) distributed-batch-gemm.o -o distributed-batch-gemm

distributed-batch-gemm.o: distributed-batch-gemm.cpp
	$(MPICOMP) -c distributed-batch-gemm.cpp

run: distributed-batch-gemm
	LD_LIBRARY_PATH=~/sycl_workspace/build_dpcpp/install/lib:$(HOME)/sycl-blas/build mpirun -np 2 --mca pml ucx -mca btl ^uct -x UCX_NET_DEVICES=mlx5_0:1 ./distributed-batch-gemm

.PHONY: clean

clean:
	rm -f distributed-batch-gemm *.o

example-05/README.md

Lines changed: 31 additions & 0 deletions
## Distributed Batch GEMM example

This example shows how to integrate MPI calls within the SYCL DAG using host tasks, in order to distribute a batch GEMM across MPI processes. The core pattern is sketched below.
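
The full source is in `distributed-batch-gemm.cpp`; as a quick orientation, the scatter step condensed from that file looks roughly like this (the `scatter_into_local` helper is illustrative and not part of the example):

```c++
#include <CL/sycl.hpp>
#include <CL/sycl/backend/cuda.hpp>
#include <mpi.h>

// Issue an MPI_Scatter from inside a host task so it runs as a node of the
// SYCL DAG, once the accessors it captures are ready.
void scatter_into_local(sycl::queue &q, sycl::buffer<float, 1> &global_buf,
                        sycl::buffer<float, 1> &local_buf, int count) {
  q.submit([&](sycl::handler &h) {
    auto global_acc = global_buf.get_access<sycl::access::mode::read>(h);
    auto local_acc = local_buf.get_access<sycl::access::mode::write>(h);
    h.codeplay_host_task([=](sycl::interop_handle ih) {
      // Native CUDA pointers let the CUDA-aware MPI library move device
      // memory directly, without staging through the host.
      auto global_ptr = reinterpret_cast<float *>(
          ih.get_native_mem<sycl::backend::cuda>(global_acc));
      auto local_ptr = reinterpret_cast<float *>(
          ih.get_native_mem<sycl::backend::cuda>(local_acc));
      MPI_Scatter(global_ptr, count, MPI_FLOAT, local_ptr, count, MPI_FLOAT,
                  0, MPI_COMM_WORLD);
    });
  });
}
```
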
## Prerequisites

The provided Makefile assumes that the MPICXX compiler wrapper points to the DPCPP compiler with CUDA support.
This requires the MPI implementation to be built with, or configured to use, the DPCPP compiler.
The MPI implementation also needs to have been built with CUDA support (typically called "CUDA-aware" MPI).
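
The example reports this at run time; a condensed version of that check from `distributed-batch-gemm.cpp` (wrapped here in an illustrative `report_cuda_aware_mpi` helper) is:

```c++
#include <mpi.h>
#include <mpi-ext.h>
#include <stdio.h>

// Print whether the MPI library was built with CUDA-aware support.
void report_cuda_aware_mpi() {
#if defined(MPIX_CUDA_AWARE_SUPPORT)
  if (1 == MPIX_Query_cuda_support()) {
    printf("This MPI library has CUDA-aware support.\n");
  } else {
    printf("This MPI library does not have CUDA-aware support.\n");
  }
#else
  printf("This MPI library cannot determine if there is CUDA-aware support.\n");
#endif
}
```
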
The example uses the [SYCL-BLAS](https://github.com/codeplaysoftware/sycl-blas) library to call the GEMM routine.
The SYCL-BLAS library should be [compiled with the DPCPP compiler](https://github.com/codeplaysoftware/sycl-blas#compile-with-dpc) to target the CUDA backend. The following command line is used to build the SYCL-BLAS library:

```bash
cmake -GNinja ../ -DTARGET=NVIDIA_GPU -DSYCL_COMPILER=dpcpp -DBLAS_DATA_TYPES=float -DGEMM_VECTORIZATION_SUPPORT=ON -DBLAS_ENABLE_TESTING=OFF -DENABLE_EXPRESSION_TESTS=OFF -DBLAS_ENABLE_BENCHMARK=OFF -DBLAS_VERIFY_BENCHMARK=OFF -DBLAS_BUILD_SAMPLES=OFF
```

## Compilation

If MPICXX points to DPC++ with CUDA support and it is on the PATH, `make` should build the program.

## Execution

The Makefile contains a target to run the program with two MPI processes:

```sh
make run
```

The target assumes `mpirun` is on the PATH.

example-05/distributed-batch-gemm.cpp

Lines changed: 175 additions & 0 deletions
#include <CL/sycl.hpp>
#include <CL/sycl/backend/cuda.hpp>
#include <algorithm>
#include <iostream>
#include <mpi.h>
#include <mpi-ext.h>
#include <numeric>
#include <stdio.h>
#include <stdlib.h>
#include <sycl_blas.h>
#include <vector>

#define PRINT_DEBUG_MODE 1

int main(int argc, char **argv) {
  /* Create a SYCL queue with the GPU device selector */
  sycl::queue q(cl::sycl::gpu_selector{});

  /* -------------------------------------------------------------------------------------------
     Check to see if the MPI library is CUDA-aware
     --------------------------------------------------------------------------------------------*/
  printf("Run time check:\n");
#if defined(MPIX_CUDA_AWARE_SUPPORT)
  if (1 == MPIX_Query_cuda_support()) {
    printf("This MPI library has CUDA-aware support.\n");
  } else {
    printf("This MPI library does not have CUDA-aware support.\n");
  }
#else  /* !defined(MPIX_CUDA_AWARE_SUPPORT) */
  printf("This MPI library cannot determine if there is CUDA-aware support.\n");
#endif /* MPIX_CUDA_AWARE_SUPPORT */

  /* -------------------------------------------------------------------------------------------
     MPI Initialization
     --------------------------------------------------------------------------------------------*/
  MPI_Init(&argc, &argv);

  int size;
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  if (size != 2) {
    if (rank == 0) {
      printf(
          "This program requires exactly 2 MPI ranks, but you are "
          "attempting to use %d! Exiting...\n",
          size);
    }
    MPI_Finalize();
    exit(0);
  }

  double start_time, stop_time, elapsed_time;
  /* Create a SYCL-BLAS executor and get the policy handler */
  blas::Executor<blas::PolicyHandler<blas::codeplay_policy>> executor(q);
  auto policy_handler = executor.get_policy_handler();

  /* Arguments of the Gemm operation.
   * Note: these matrix dimensions are too small to get a performance gain by
   * using SYCL-BLAS, but they are convenient for this sample */
  const int m = 32;
  const int k = 32;
  const int n = 32;
  const int lda = m;
  const int ldb = k;
  const int ldc = m;
  const float alpha = 1;
  const float beta = 0;
  const int batch = 2;

  /* Create the local buffers (one batch per rank) */
  auto local_a_gpu = blas::make_sycl_iterator_buffer<float>(lda * k);
  auto local_b_gpu = blas::make_sycl_iterator_buffer<float>(ldb * n);
  auto local_c_gpu = blas::make_sycl_iterator_buffer<float>(ldc * n);

  /* Create the global buffers (all batches, held by the root rank) */
  auto global_a_gpu = blas::make_sycl_iterator_buffer<float>(batch * lda * k);
  auto global_b_gpu = blas::make_sycl_iterator_buffer<float>(batch * ldb * n);
  auto global_c_gpu = blas::make_sycl_iterator_buffer<float>(batch * ldc * n);

  if (rank == 0) {
    // Set the buffer values for A and B
    std::vector<float> A = std::vector<float>(batch * lda * k, float(1.0));
    std::vector<float> B = std::vector<float>(batch * ldb * n, float(1.0));
    policy_handler.copy_to_device(A.data(), global_a_gpu, batch * lda * k);
    policy_handler.copy_to_device(B.data(), global_b_gpu, batch * ldb * n);
  }
  /* -------------------------------------------------------------------------------------------
     Use SYCL host tasks with CUDA interoperability to scatter one batch of A
     and B to each of the two MPI processes
     --------------------------------------------------------------------------------------------*/
  start_time = MPI_Wtime();
  auto ht_a = [&](sycl::handler &h) {
    auto global_a_acc =
        global_a_gpu.get_buffer().template get_access<sycl::access::mode::read>(
            h);
    auto local_a_acc =
        local_a_gpu.get_buffer().template get_access<sycl::access::mode::write>(
            h);
    h.codeplay_host_task([=](sycl::interop_handle ih) {
      auto global_a_ptr = reinterpret_cast<float *>(
          ih.get_native_mem<sycl::backend::cuda>(global_a_acc));
      auto local_a_ptr = reinterpret_cast<float *>(
          ih.get_native_mem<sycl::backend::cuda>(local_a_acc));
      MPI_Scatter(global_a_ptr, lda * k, MPI_FLOAT, local_a_ptr, lda * k,
                  MPI_FLOAT, 0, MPI_COMM_WORLD);
    });
  };
  q.submit(ht_a);

  auto ht_b = [&](sycl::handler &h) {
    auto global_b_acc =
        global_b_gpu.get_buffer().template get_access<sycl::access::mode::read>(
            h);
    auto local_b_acc =
        local_b_gpu.get_buffer().template get_access<sycl::access::mode::write>(
            h);
    h.codeplay_host_task([=](sycl::interop_handle ih) {
      auto global_b_ptr = reinterpret_cast<float *>(
          ih.get_native_mem<sycl::backend::cuda>(global_b_acc));
      auto local_b_ptr = reinterpret_cast<float *>(
          ih.get_native_mem<sycl::backend::cuda>(local_b_acc));
      MPI_Scatter(global_b_ptr, ldb * n, MPI_FLOAT, local_b_ptr, ldb * n,
                  MPI_FLOAT, 0, MPI_COMM_WORLD);
    });
  };
  q.submit(ht_b);
  q.wait_and_throw();

  /* Execute the GEMM operation on the local batch */
  auto event = blas::_gemm(executor, 'n', 'n', m, n, k, alpha, local_a_gpu, lda,
                           local_b_gpu, ldb, beta, local_c_gpu, ldc);
  policy_handler.wait(event);

  /* -------------------------------------------------------------------------------------------
     Use a SYCL host task with CUDA interoperability to gather the local C
     results back into the global buffer on the root process
     --------------------------------------------------------------------------------------------*/
  auto ht_c = [&](sycl::handler &h) {
    auto global_c_acc = global_c_gpu.get_buffer()
                            .template get_access<sycl::access::mode::write>(h);
    auto local_c_acc =
        local_c_gpu.get_buffer().template get_access<sycl::access::mode::read>(
            h);
    h.codeplay_host_task([=](sycl::interop_handle ih) {
      auto local_c_ptr = reinterpret_cast<float *>(
          ih.get_native_mem<sycl::backend::cuda>(local_c_acc));
      auto global_c_ptr = reinterpret_cast<float *>(
          ih.get_native_mem<sycl::backend::cuda>(global_c_acc));
      MPI_Gather(local_c_ptr, ldc * n, MPI_FLOAT, global_c_ptr, ldc * n,
                 MPI_FLOAT, 0, MPI_COMM_WORLD);
    });
  };

  q.submit(ht_c);
  q.wait_and_throw();
  stop_time = MPI_Wtime();
  elapsed_time = stop_time - start_time;

  /* -------------------------------------------------------------------------------------------
     Print the output
     --------------------------------------------------------------------------------------------*/
  if (rank == 0) {
    std::cout << "elapsed_time: " << elapsed_time << "\n";
#if defined(PRINT_DEBUG_MODE)
    auto C = global_c_gpu.get_buffer().get_host_access();
    for (int i = 0; i < batch * ldc * n; i++) {
      std::cout << " value at " << i << " : " << C[i] << "\n";
    }
#endif
  }

  MPI_Finalize();
  return 0;
}
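
Since A and B are filled with ones and alpha = 1, beta = 0, every element of the gathered C should equal k (32 here). A minimal sketch of a correctness check that could be run on rank 0 is shown below; the `verify_result` helper and the tolerance are illustrative and not part of the commit:

```c++
#include <cmath>
#include <cstdio>
#include <vector>

// Check that every gathered C element equals the expected GEMM result.
// With all-ones inputs, alpha = 1 and beta = 0, that expected value is k.
bool verify_result(const std::vector<float> &C, int k, float tol = 1e-5f) {
  for (std::size_t i = 0; i < C.size(); ++i) {
    if (std::fabs(C[i] - static_cast<float>(k)) > tol) {
      std::printf("Mismatch at %zu: got %f, expected %d\n", i, C[i], k);
      return false;
    }
  }
  return true;
}
```

On rank 0, the host accessor obtained in the debug-print block could be copied into a `std::vector<float>` and passed to this helper.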
