Skip to content

Commit a0193bd

Browse files
committed
Improve support for AMD ROCm GPU devices
This commit enhances the support for AMD ROCm devices in Open MPI. The core aspect of this commit is to enable using hipMemcpy* based operations for packing/unpacking derived datatypes. At the moment, support for AMD GPUs is only available through ROCm enabled UCX. Note that applications using basic datatypes and contiguous buffers with ROCm enabled UCX already work even without this commit, and even applications using derived datatypes worked in most instances due to the fact that AMD GPUs require enabling PCIe large bar support. The most notable limitations: - rocm mca variables are not listed with ompi_info, since there is no component (at the moment) that is using rocm. - the 'compiled but not enabled' has received very limited testing. Compiling a hip/MPI code will require a few additional flags. A typical compile line is: mpiCC -D__HIP_PLATFORM_AMD__ -I/opt/rocm/include/hip -I/opt/rocm/include hipMPItest.cc -L/opt/rocm/lib -lamdhip64 Signed-off-by: Edgar <edgar.gabriel@amd.com>
1 parent 450ae3a commit a0193bd

26 files changed

+921
-35
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,8 @@ ompi/mpiext/cuda/c/MPIX_Query_cuda_support.3
290290
ompi/mpiext/cuda/c/mpiext_cuda_c.h
291291
ompi/mpiext/cuda/c/cuda_c.h
292292

293+
ompi/mpiext/rocm/c/mpiext_rocm_c.h
294+
293295
ompi/mpiext/pcollreq/c/MPIX_*.3
294296
ompi/mpiext/pcollreq/c/profile/pallgather_init.c
295297
ompi/mpiext/pcollreq/c/profile/pallgatherv_init.c

config/opal_check_rocm.m4

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
dnl
2+
dnl Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved.
3+
dnl $COPYRIGHT$
4+
dnl
5+
dnl Additional copyrights may follow
6+
dnl
7+
dnl $HEADER$
8+
dnl
9+
10+
11+
# OPAL_CHECK_ROCM(prefix, [action-if-found], [action-if-not-found])
12+
# --------------------------------------------------------
13+
# check if ROCM support can be found. sets prefix_{CPPFLAGS,
14+
# LDFLAGS, LIBS} as needed and runs action-if-found if there is
15+
# support, otherwise executes action-if-not-found
16+
17+
18+
#
19+
# Check for ROCm support
20+
#
21+
AC_DEFUN([OPAL_CHECK_ROCM],[
22+
23+
# Scratch variables that are scoped to this macro; the scope is closed
# again at the very end of the macro body.
OPAL_VAR_SCOPE_PUSH([opal_check_rocm_happy rocm_save_CPPFLAGS rocm_save_LDFLAGS rocm_CPPFLAGS rocm_LDFLAGS])
24+
25+
# Remember the incoming flags so that they can be put back after the probe.
rocm_save_CPPFLAGS="$CPPFLAGS"
26+
rocm_save_LDFLAGS="$LDFLAGS"
27+
28+
# Get some configuration information
29+
AC_ARG_WITH([rocm],
30+
[AS_HELP_STRING([--with-rocm(=DIR)],
31+
[Build ROCm support, optionally adding DIR/include, DIR/lib, and DIR/lib64 to the search path for headers and libraries])])
32+
33+
34+
# A bare --with-rocm, which yields the value yes, falls back to the
# default prefix /opt/rocm.
AS_IF([ test -n "$with_rocm" && test "$with_rocm" = "yes" ],
35+
[ with_rocm="/opt/rocm"] )
36+
37+
# Extra flags for the probe: the HIP platform define and the lib/hip
# directory below the ROCm prefix.
rocm_CPPFLAGS="-D__HIP_PLATFORM_AMD__"
38+
rocm_LDFLAGS="-L${with_rocm}/lib/hip"
39+
40+
# The extra flags are only applied when with_rocm is non-empty and not no.
AS_IF([ test -n "$with_rocm" && test "$with_rocm" != "no" ],
41+
[ OPAL_APPEND([CPPFLAGS], [$rocm_CPPFLAGS])
42+
OPAL_APPEND([LDFLAGS], [$rocm_LDFLAGS]) ])
43+
44+
# Probe for the hip/hip_runtime.h header and for hipFree in the amdhip64
# library, recording the outcome in opal_check_rocm_happy.
# NOTE(review): the exact behaviour of the package check helper is defined
# outside this file; confirm which prefix variables it fills in.
OAC_CHECK_PACKAGE([rocm],
45+
[$1],
46+
[hip/hip_runtime.h],
47+
[amdhip64],
48+
[hipFree],
49+
[opal_check_rocm_happy="yes"],
50+
[opal_check_rocm_happy="no"])
51+
52+
# LDFLAGS is always restored, while the CPPFLAGS collected for the
# requested prefix are appended to the global CPPFLAGS.
LDFLAGS="$rocm_save_LDFLAGS"
53+
OPAL_APPEND([CPPFLAGS], [${$1_CPPFLAGS}] )
54+
55+
# CPPFLAGS is only rolled back to its saved value when the probe failed.
AS_IF([ test "$opal_check_rocm_happy" = "no" ],
56+
[ CPPFLAGS="$rocm_save_CPPFLAGS"])
57+
58+
# Publish the result: the ROCm support preprocessor define is set to 1 or
# 0, and the shell variable ROCM_SUPPORT mirrors that value.
AS_IF([ test "$opal_check_rocm_happy" = "yes" ],
59+
[ AC_DEFINE_UNQUOTED([OPAL_ROCM_SUPPORT], [1], [Enable ROCm support])
60+
ROCM_SUPPORT=1 ],
61+
[ AC_DEFINE_UNQUOTED([OPAL_ROCM_SUPPORT], [0], [Disable ROCm support])
62+
ROCM_SUPPORT=0 ])
63+
64+
# On success run the action-if-found argument.  On failure abort configure
# when ROCm was explicitly requested, otherwise run action-if-not-found.
AS_IF([ test "$opal_check_rocm_happy" = "yes" ],
65+
[$2],
66+
[AS_IF([test -n "$with_rocm" && test "$with_rocm" != "no"],
67+
[AC_MSG_ERROR([ROCm support requested but not found. Aborting])])
68+
$3])
69+
70+
# Make the outcome available to the makefiles as the OPAL_rocm_support
# conditional, then close the variable scope opened at the top.
AM_CONDITIONAL([OPAL_rocm_support], [test "$opal_check_rocm_happy" = "yes"])
71+
OPAL_VAR_SCOPE_POP
72+
])
73+
74+
AC_DEFUN([OPAL_CHECK_ROCM_AFTER_OPAL_DL],[
    # We cannot have ROCm support without OPAL DL support.  Error out
    # if ROCm support was detected but we do not have OPAL DL support.
    #
    # The decision is based on ROCM_SUPPORT, which the ROCm check assigns
    # to 1 or 0 outside of its scoped variable list.  The variable
    # opal_check_rocm_happy must not be used here: it belongs to the
    # variable scope of the ROCm check and is not expected to survive past
    # the end of that macro, which would silently disable this guard.
    #
    # OPAL_HAVE_DL_SUPPORT is quoted so that an unset value simply makes
    # the comparison false instead of producing a test syntax error.
    AS_IF([test "$OPAL_HAVE_DL_SUPPORT" = "0" && test "$ROCM_SUPPORT" = "1"],
          [AC_MSG_WARN([--with-rocm was specified, but dlopen support is disabled.])
           AC_MSG_WARN([You must reconfigure Open MPI with dlopen ("dl") support.])
           AC_MSG_ERROR([Cannot continue.])])
])

config/opal_config_files.m4

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ AC_DEFUN([OPAL_CONFIG_FILES],[
1818
AC_CONFIG_FILES([
1919
opal/Makefile
2020
opal/cuda/Makefile
21+
opal/rocm/Makefile
2122
opal/etc/Makefile
2223
opal/include/Makefile
2324
opal/datatype/Makefile

configure.ac

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1003,8 +1003,24 @@ AC_CACHE_SAVE
10031003
opal_show_title "System-specific tests"
10041004

10051005
OPAL_CHECK_CUDA
1006+
##################################
1007+
# ROCm support
1008+
##################################
1009+
OPAL_CHECK_ROCM([opal_rocm],
                [opal_rocm_happy="yes"],
                [opal_rocm_happy="no"])
OPAL_SUMMARY_ADD([Miscellaneous], [ROCm support], [], [$opal_rocm_happy])

# CUDA and ROCm support are mutually exclusive; abort if both were enabled.
# The ROCm side is tested through opal_rocm_happy, which is assigned by the
# check directly above.  The ROCm check only assigns the shell variable
# ROCM_SUPPORT and the preprocessor define, so a shell variable named
# OPAL_ROCM_SUPPORT would always be empty at this point.
# NOTE(review): confirm that the CUDA check really assigns the shell
# variable OPAL_CUDA_SUPPORT; otherwise this guard can never trigger.
AS_IF([test "$OPAL_CUDA_SUPPORT" = "1" && test "$opal_rocm_happy" = "yes"],
      [AC_MSG_WARN([Cannot support both CUDA and ROCm.])
       AC_MSG_WARN([You must reconfigure Open MPI choosing either CUDA or ROCm .])
       AC_MSG_ERROR([Cannot continue.])])
1018+
1019+
##################################
10061020
OPAL_CHECK_OS_FLAVORS
10071021

1022+
1023+
10081024
# Do we have _SC_NPROCESSORS_ONLN? (only going to pass if we also have
10091025
# <unistd.h> and sysconf(), which is ok) OS X 10.4 has <unistd.h> and
10101026
# sysconf(), but does not have _SC_NPROCESSORS_ONLN. Doh!
@@ -1247,6 +1263,8 @@ AC_CACHE_SAVE
12471263

12481264
OPAL_CHECK_CUDA_AFTER_OPAL_DL
12491265

1266+
OPAL_CHECK_ROCM_AFTER_OPAL_DL
1267+
12501268
##################################
12511269
# MPI Extended Interfaces
12521270
##################################
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
.. _mpix_query_rocm_support:
2+
3+
4+
MPIX_Query_rocm_support
5+
=======================
6+
7+
.. include_body
8+
9+
**MPIX_Query_rocm_support** - Returns 1 if there is AMD ROCm aware support
10+
and 0 if there is not.
11+
12+
13+
SYNTAX
14+
------
15+
16+
17+
C Syntax
18+
^^^^^^^^
19+
20+
.. code-block:: c
21+
22+
#include <mpi.h>
23+
#include <mpi-ext.h>
24+
25+
int MPIX_Query_rocm_support(void)
26+
27+
28+
Fortran Syntax
29+
^^^^^^^^^^^^^^
30+
31+
There is no Fortran binding for this function.
32+
33+
34+
C++ Syntax
35+
^^^^^^^^^^
36+
37+
There is no C++ binding for this function.
38+
39+
40+
DESCRIPTION
41+
-----------
42+
43+
This routine returns 1 if the MPI library is built with ROCm support and the runtime
44+
supports ROCm buffers. This routine must be called after MPI is
45+
initialized by a call to :ref:`MPI_Init` or :ref:`MPI_Init_thread`.
46+
47+
48+
Examples
49+
^^^^^^^^
50+
51+
::
52+
53+
54+
#include <stdio.h>
55+
#include "mpi.h"
56+
57+
#include "mpi-ext.h" /* Needed for ROCm-aware check */
58+
59+
int main(int argc, char *argv[])
60+
{
61+
62+
MPI_Init(&argc, &argv);
63+
64+
if (MPIX_Query_rocm_support()) {
65+
printf("This MPI library has ROCm-aware support.\n");
66+
} else {
67+
printf("This MPI library does not have ROCm-aware support.\n");
68+
}
69+
MPI_Finalize();
70+
71+
return 0;
72+
}

docs/man-openmpi/man3/index.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -466,4 +466,5 @@ MPI API manual pages (section 3)
466466
MPI_Wtick.3.rst
467467
MPI_Wtime.3.rst
468468
MPIX_Query_cuda_support.3.rst
469+
MPIX_Query_rocm_support.3.rst
469470
OMPI_Affinity_str.3.rst

ompi/mpiext/rocm/Makefile.am

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
#
2+
# Copyright (c) 2004-2009 The Trustees of Indiana University and Indiana
3+
# University Research and Technology
4+
# Corporation. All rights reserved.
5+
# Copyright (c) 2010-2012 Cisco Systems, Inc. All rights reserved.
6+
# Copyright (c) 2015 NVIDIA, Inc. All rights reserved
7+
# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved
8+
# $COPYRIGHT$
9+
#
10+
# Additional copyrights may follow
11+
#
12+
# $HEADER$
13+
#
14+
15+
# This Makefile is not traversed during a normal "make all" in an OMPI
16+
# build. It *is* traversed during "make dist", however. So you can
17+
# put EXTRA_DIST targets in here.
18+
#
19+
# You can also use this as a convenience for building this MPI
20+
# extension (i.e., "make all" in this directory to invoke "make all"
21+
# in all the subdirectories).
22+
23+
SUBDIRS = c

ompi/mpiext/rocm/c/Makefile.am

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
#
2+
# Copyright (c) 2004-2009 The Trustees of Indiana University and Indiana
3+
# University Research and Technology
4+
# Corporation. All rights reserved.
5+
# Copyright (c) 2010-2014 Cisco Systems, Inc. All rights reserved.
6+
# Copyright (c) 2015 NVIDIA, Inc. All rights reserved.
7+
# Copyright (c) 2018 Research Organization for Information Science
8+
# and Technology (RIST). All rights reserved.
9+
# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
10+
#
11+
# $COPYRIGHT$
12+
#
13+
# Additional copyrights may follow
14+
#
15+
# $HEADER$
16+
#
17+
18+
# This file builds the C bindings for MPI extensions. It must be
19+
# present in all MPI extensions.
20+
21+
# We must set these #defines so that the inner OMPI MPI prototype
22+
# header files do the Right Thing.
23+
AM_CPPFLAGS = -DOMPI_PROFILE_LAYER=0 -DOMPI_COMPILING_FORTRAN_WRAPPERS=1
24+
25+
# Convenience libtool library that will be slurped up into libmpi.la.
26+
noinst_LTLIBRARIES = libmpiext_rocm_c.la
27+
28+
# This is where the top-level header file (that is included in
29+
# <mpi-ext.h>) must be installed.
30+
ompidir = $(ompiincludedir)/mpiext
31+
32+
# This is the header file that is installed.
33+
nodist_ompi_HEADERS = mpiext_rocm_c.h
34+
35+
# Sources for the convenience libtool library. Other than the one
36+
# header file, all source files in the extension have no file naming
37+
# conventions.
38+
libmpiext_rocm_c_la_SOURCES = \
39+
$(ompi_HEADERS) \
40+
mpiext_rocm.c
41+
libmpiext_rocm_c_la_LDFLAGS = -module -avoid-version
42+

ompi/mpiext/rocm/c/mpiext_rocm.c

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
/*
2+
* Copyright (c) 2004-2009 The Trustees of Indiana University and Indiana
3+
* University Research and Technology
4+
* Corporation. All rights reserved.
5+
* Copyright (c) 2010-2012 Cisco Systems, Inc. All rights reserved.
6+
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
7+
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
8+
* reserved.
9+
* Copyright (c) 2015 NVIDIA, Inc. All rights reserved.
10+
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
11+
* $COPYRIGHT$
12+
*
13+
* Additional copyrights may follow
14+
*
15+
* $HEADER$
16+
*
17+
*/
18+
19+
#include "ompi_config.h"
20+
21+
#include <stdio.h>
22+
#include <string.h>
23+
24+
#include "opal/constants.h"
25+
#include "opal/runtime/opal_params.h"
26+
#include "ompi/mpiext/rocm/c/mpiext_rocm_c.h"
27+
28+
#if OPAL_ROCM_SUPPORT
29+
#include "opal/rocm/common_rocm_prototypes.h"
30+
#endif
31+
32+
int MPIX_Query_rocm_support(void)
33+
{
34+
35+
if (!opal_built_with_rocm_support) {
36+
return 0;
37+
} else {
38+
if ( opal_rocm_runtime_initialized ) {
39+
return 1;
40+
}
41+
#if OPAL_ROCM_SUPPORT
42+
// There is a chance that the rocm runtime has simply not
43+
// been initialized yet, since that is done during the first convertor creation
44+
// Invoke a function that will trigger the rocm runtime initialized and
45+
// check the value again after that.
46+
47+
int val1, val2;
48+
mca_common_rocm_check_bufs((char *)&val1, (char *)&val2);
49+
#endif
50+
}
51+
52+
return opal_rocm_runtime_initialized;
53+
}

ompi/mpiext/rocm/c/mpiext_rocm_c.h.in

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
/*
2+
* Copyright (c) 2004-2009 The Trustees of Indiana University.
3+
* All rights reserved.
4+
* Copyright (c) 2010-2012 Cisco Systems, Inc. All rights reserved.
5+
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
6+
* Copyright (c) 2015 NVIDIA, Inc. All rights reserved.
7+
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
8+
* $COPYRIGHT$
9+
*
10+
* Additional copyrights may follow
11+
*
12+
* $HEADER$
13+
*
14+
*/
15+
16+
#define MPIX_ROCM_AWARE_SUPPORT @MPIX_ROCM_AWARE_SUPPORT@
17+
OMPI_DECLSPEC int MPIX_Query_rocm_support(void);

0 commit comments

Comments
 (0)