Skip to content

Commit 19e24fa

Browse files
committed
MPI 4: Add MPI_COMM_TYPE_HW_UNGUIDED and MPI_COMM_TYPE_HW_GUIDED
* `MPI_COMM_TYPE_HW_GUIDED` supports all of the existing `OMPI_COMM_TYPE_` options. * `MPI_COMM_TYPE_HW_UNGUIDED` is recognized, but not supported, so it returns `MPI_COMM_NULL`, indicating that the MPI library cannot split the communicator any further. Signed-off-by: Joshua Hursey <jhursey@us.ibm.com>
1 parent d32ce3f commit 19e24fa

File tree

4 files changed

+135
-8
lines changed

4 files changed

+135
-8
lines changed

ompi/communicator/comm.c

Lines changed: 101 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
* and Technology (RIST). All rights reserved.
2323
* Copyright (c) 2014-2020 Intel, Inc. All rights reserved.
2424
* Copyright (c) 2015 Mellanox Technologies. All rights reserved.
25-
* Copyright (c) 2017 IBM Corporation. All rights reserved.
25+
* Copyright (c) 2017-2022 IBM Corporation. All rights reserved.
2626
* Copyright (c) 2021 Nanook Consulting. All rights reserved.
2727
* Copyright (c) 2018-2022 Triad National Security, LLC. All rights
2828
* reserved.
@@ -57,6 +57,28 @@
5757

5858
#include "ompi/runtime/params.h"
5959

60+
struct ompi_comm_split_type_hw_guided_t {
61+
const char *info_value;
62+
int split_type;
63+
};
64+
typedef struct ompi_comm_split_type_hw_guided_t ompi_comm_split_type_hw_guided_t;
65+
66+
static const ompi_comm_split_type_hw_guided_t ompi_comm_split_type_hw_guided_support[] = {
67+
{.info_value = "mpi_shared_memory", .split_type = MPI_COMM_TYPE_SHARED},
68+
{.info_value = "hwthread", .split_type = OMPI_COMM_TYPE_HWTHREAD},
69+
{.info_value = "core", .split_type = OMPI_COMM_TYPE_CORE},
70+
{.info_value = "l1cache", .split_type = OMPI_COMM_TYPE_L1CACHE},
71+
{.info_value = "l2cache", .split_type = OMPI_COMM_TYPE_L2CACHE},
72+
{.info_value = "l3cache", .split_type = OMPI_COMM_TYPE_L3CACHE},
73+
{.info_value = "socket", .split_type = OMPI_COMM_TYPE_SOCKET},
74+
{.info_value = "numanode", .split_type = OMPI_COMM_TYPE_NUMA},
75+
{.info_value = "board", .split_type = OMPI_COMM_TYPE_BOARD},
76+
{.info_value = "host", .split_type = OMPI_COMM_TYPE_HOST},
77+
{.info_value = "cu", .split_type = OMPI_COMM_TYPE_CU},
78+
{.info_value = "cluster", .split_type = OMPI_COMM_TYPE_CLUSTER},
79+
{.info_value = NULL},
80+
};
81+
6082
/*
6183
** sort-function for MPI_Comm_split
6284
*/
@@ -764,6 +786,15 @@ static int ompi_comm_split_type_get_part (ompi_group_t *group, const int split_t
764786
case OMPI_COMM_TYPE_CLUSTER:
765787
include = OPAL_PROC_ON_LOCAL_CLUSTER(locality);
766788
break;
789+
case MPI_COMM_TYPE_HW_GUIDED:
790+
case MPI_COMM_TYPE_HW_UNGUIDED:
791+
/*
792+
* MPI_COMM_TYPE_HW_(UN)GUIDED handled in calling function.
793+
* We should not get here as the split type will be changed
794+
* at a higher level.
795+
*/
796+
opal_output(0, "Error: in ompi_comm_split_type_get_part() unexpected split_type=%d", split_type);
797+
return OMPI_ERR_BAD_PARAM;
767798
}
768799

769800
if (include) {
@@ -837,8 +868,9 @@ int ompi_comm_split_type (ompi_communicator_t *comm, int split_type, int key,
837868
ompi_communicator_t *newcomp = MPI_COMM_NULL;
838869
int my_size, my_rsize = 0, mode, inter;
839870
int *lranks = NULL, *rranks = NULL;
840-
int global_split_type, ok, tmp[4];
871+
int global_split_type, ok, tmp[6];
841872
int rc;
873+
int orig_split_type = split_type;
842874

843875
/* silence clang warning. newcomm should never be NULL */
844876
if (OPAL_UNLIKELY(NULL == newcomm)) {
@@ -847,14 +879,58 @@ int ompi_comm_split_type (ompi_communicator_t *comm, int split_type, int key,
847879

848880
inter = OMPI_COMM_IS_INTER(comm);
849881

882+
/* Step 0: Convert MPI_COMM_TYPE_HW_GUIDED to the internal type */
883+
if (MPI_COMM_TYPE_HW_GUIDED == split_type) {
884+
int flag;
885+
opal_cstring_t *value = NULL;
886+
887+
opal_info_get(info, "mpi_hw_resource_type", &value, &flag);
888+
/* If key is not in the 'info', then return MPI_COMM_NULL.
889+
* This is caught at the MPI interface level, but it doesn't hurt to
890+
* check it again.
891+
*/
892+
if (!flag) {
893+
*newcomm = MPI_COMM_NULL;
894+
return OMPI_SUCCESS;
895+
}
896+
897+
/* Verify the value associated with the "mpi_hw_resource_type" key
898+
* - is supported, and
899+
* - is the same value at all ranks
900+
*
901+
* If not supported, then return MPI_COMM_NULL.
902+
* If not the same at all ranks, throw an error.
903+
*/
904+
flag = 0;
905+
for (int i = 0; ompi_comm_split_type_hw_guided_support[i].info_value; ++i) {
906+
if (0 == strncasecmp(value->string, ompi_comm_split_type_hw_guided_support[i].info_value, strlen(ompi_comm_split_type_hw_guided_support[i].info_value))) {
907+
split_type = ompi_comm_split_type_hw_guided_support[i].split_type;
908+
flag = 1;
909+
break;
910+
}
911+
}
912+
/* If not supported, then return MPI_COMM_NULL. */
913+
if (0 == flag) {
914+
*newcomm = MPI_COMM_NULL;
915+
return OMPI_SUCCESS;
916+
}
917+
}
918+
850919
/* Step 1: verify all ranks have supplied the same value for split type. All split types
851920
* must be the same or MPI_UNDEFINED (which is negative). */
852-
tmp[0] = split_type;
853-
tmp[1] = -split_type;
921+
tmp[0] = orig_split_type;
922+
tmp[1] = -orig_split_type;
854923
tmp[2] = key;
855924
tmp[3] = -key;
925+
/* For MPI_COMM_TYPE_HW_GUIDED, verify all ranks have supplied the same
926+
* split_type (represented by orig_split_type) and info 'value' (represented by split_type).
927+
*
928+
* For split_type != MPI_COMM_TYPE_HW_GUIDED then orig_split_type == split_type.
929+
*/
930+
tmp[4] = split_type;
931+
tmp[5] = -split_type;
856932

857-
rc = comm->c_coll->coll_allreduce (MPI_IN_PLACE, &tmp, 4, MPI_INT, MPI_MAX, comm,
933+
rc = comm->c_coll->coll_allreduce (MPI_IN_PLACE, &tmp, 6, MPI_INT, MPI_MAX, comm,
858934
comm->c_coll->coll_allreduce_module);
859935
if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
860936
return rc;
@@ -899,6 +975,26 @@ int ompi_comm_split_type (ompi_communicator_t *comm, int split_type, int key,
899975
return OMPI_SUCCESS;
900976
}
901977

978+
/* MPI_COMM_TYPE_HW_GUIDED: Check if 'value' the same at all ranks */
979+
if (tmp[4] != -tmp[5]) {
980+
if (0 == ompi_comm_rank(comm)) {
981+
opal_output(0, "Error: Mismatched info values for MPI_COMM_TYPE_HW_GUIDED");
982+
}
983+
return OMPI_ERR_BAD_PARAM;
984+
}
985+
986+
/* TODO: Make this better...
987+
*
988+
* See Example 7.4 in the MPI 4.0 standard for example usage.
989+
*
990+
* Stage 0: Recognized, but not implemented.
991+
* Stage 1: Do better than that
992+
*/
993+
if (MPI_COMM_TYPE_HW_UNGUIDED == global_split_type) {
994+
*newcomm = MPI_COMM_NULL;
995+
return OMPI_SUCCESS;
996+
}
997+
902998
/* Step 2: Build potential communicator groups. If any ranks will not be part of
903999
* the ultimate communicator we will drop them later. This saves doing an extra
9041000
* allgather on the whole communicator. By using ompi_comm_split() later only

ompi/include/mpi.h.in

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
* Copyright (c) 2015 University of Houston. All rights reserved.
2020
* Copyright (c) 2015-2021 Research Organization for Information Science
2121
* and Technology (RIST). All rights reserved.
22-
* Copyright (c) 2017-2019 IBM Corporation. All rights reserved.
22+
* Copyright (c) 2017-2022 IBM Corporation. All rights reserved.
2323
* Copyright (c) 2018 FUJITSU LIMITED. All rights reserved.
2424
* Copyright (c) 2021-2022 Google, LLC. All rights reserved.
2525
* Copyright (c) 2021-2022 Amazon.com, Inc. or its affiliates. All Rights
@@ -852,7 +852,9 @@ enum {
852852
OMPI_COMM_TYPE_BOARD,
853853
OMPI_COMM_TYPE_HOST,
854854
OMPI_COMM_TYPE_CU,
855-
OMPI_COMM_TYPE_CLUSTER
855+
OMPI_COMM_TYPE_CLUSTER,
856+
MPI_COMM_TYPE_HW_UNGUIDED,
857+
MPI_COMM_TYPE_HW_GUIDED
856858
};
857859
#define OMPI_COMM_TYPE_NODE MPI_COMM_TYPE_SHARED
858860

ompi/include/mpif-values.pl

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
# Copyright (c) 2020 The University of Tennessee and The University
88
# of Tennessee Research Foundation. All rights
99
# reserved.
10+
# Copyright (c) 2022 IBM Corporation. All rights reserved.
1011
# $COPYRIGHT$
1112
#
1213
# Additional copyrights may follow
@@ -395,6 +396,8 @@ sub write_file {
395396
$constants->{OMPI_COMM_TYPE_HOST} = 9;
396397
$constants->{OMPI_COMM_TYPE_CU} = 10;
397398
$constants->{OMPI_COMM_TYPE_CLUSTER} = 11;
399+
$constants->{MPI_COMM_TYPE_HW_UNGUIDED} = 12;
400+
$constants->{MPI_COMM_TYPE_HW_GUIDED} = 13;
398401

399402
#----------------------------------------------------------------------------
400403

ompi/mpi/c/comm_split_type.c

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
* Copyright (c) 2010-2012 Oak Ridge National Labs. All rights reserved.
1515
* Copyright (c) 2015 Research Organization for Information Science
1616
* and Technology (RIST). All rights reserved.
17-
* Copyright (c) 2017 IBM Corporation. All rights reserved.
17+
* Copyright (c) 2017-2022 IBM Corporation. All rights reserved.
1818
* $COPYRIGHT$
1919
*
2020
* Additional copyrights may follow
@@ -65,6 +65,8 @@ int MPI_Comm_split_type(MPI_Comm comm, int split_type, int key,
6565
}
6666

6767
if ( MPI_COMM_TYPE_SHARED != split_type && // Same as OMPI_COMM_TYPE_NODE
68+
MPI_COMM_TYPE_HW_UNGUIDED != split_type &&
69+
MPI_COMM_TYPE_HW_GUIDED != split_type &&
6870
OMPI_COMM_TYPE_CLUSTER != split_type &&
6971
OMPI_COMM_TYPE_CU != split_type &&
7072
OMPI_COMM_TYPE_HOST != split_type &&
@@ -99,6 +101,30 @@ int MPI_Comm_split_type(MPI_Comm comm, int split_type, int key,
99101
}
100102
#endif
101103

104+
if ( MPI_COMM_TYPE_HW_GUIDED == split_type ) {
105+
int flag;
106+
opal_cstring_t *value = NULL;
107+
108+
/* MPI_Info is required for this split_type.
109+
* Not an error condition, per MPI 4.0.
110+
*/
111+
if ( MPI_INFO_NULL == info ) {
112+
*newcomm = MPI_COMM_NULL;
113+
rc = MPI_SUCCESS;
114+
OMPI_ERRHANDLER_RETURN ( rc, comm, rc, FUNC_NAME);
115+
}
116+
117+
/* MPI_Info with key "mpi_hw_resource_type" is required for this split_type.
118+
* Not an error condition, per MPI 4.0.
119+
*/
120+
ompi_info_get(info, "mpi_hw_resource_type", &value, &flag);
121+
if ( !flag ) {
122+
*newcomm = MPI_COMM_NULL;
123+
rc = MPI_SUCCESS;
124+
OMPI_ERRHANDLER_RETURN ( rc, comm, rc, FUNC_NAME);
125+
}
126+
}
127+
102128
if( (MPI_COMM_SELF == comm) && (MPI_UNDEFINED == split_type) ) {
103129
*newcomm = MPI_COMM_NULL;
104130
rc = MPI_SUCCESS;

0 commit comments

Comments
 (0)