22
22
* and Technology (RIST). All rights reserved.
23
23
* Copyright (c) 2014-2020 Intel, Inc. All rights reserved.
24
24
* Copyright (c) 2015 Mellanox Technologies. All rights reserved.
25
- * Copyright (c) 2017 IBM Corporation. All rights reserved.
25
+ * Copyright (c) 2017-2022 IBM Corporation. All rights reserved.
26
26
* Copyright (c) 2021 Nanook Consulting. All rights reserved.
27
27
* Copyright (c) 2018-2022 Triad National Security, LLC. All rights
28
28
* reserved.
57
57
58
58
#include "ompi/runtime/params.h"
59
59
60
+ struct ompi_comm_split_type_hw_guided_t { /* One entry pairing an info value string with the split type it selects. */
61
+ const char * info_value ; /* string compared against the value of the "mpi_hw_resource_type" info key */
62
+ int split_type ; /* split type used in place of MPI_COMM_TYPE_HW_GUIDED when info_value matches */
63
+ };
64
+ typedef struct ompi_comm_split_type_hw_guided_t ompi_comm_split_type_hw_guided_t ;
65
+
66
+ static const ompi_comm_split_type_hw_guided_t ompi_comm_split_type_hw_guided_support [] = { /* Table scanned in order by ompi_comm_split_type() to translate an MPI_COMM_TYPE_HW_GUIDED request; the first matching entry wins. */
67
+ {.info_value = "mpi_shared_memory" , .split_type = MPI_COMM_TYPE_SHARED },
68
+ {.info_value = "hwthread" , .split_type = OMPI_COMM_TYPE_HWTHREAD },
69
+ {.info_value = "core" , .split_type = OMPI_COMM_TYPE_CORE },
70
+ {.info_value = "l1cache" , .split_type = OMPI_COMM_TYPE_L1CACHE },
71
+ {.info_value = "l2cache" , .split_type = OMPI_COMM_TYPE_L2CACHE },
72
+ {.info_value = "l3cache" , .split_type = OMPI_COMM_TYPE_L3CACHE },
73
+ {.info_value = "socket" , .split_type = OMPI_COMM_TYPE_SOCKET },
74
+ {.info_value = "numanode" , .split_type = OMPI_COMM_TYPE_NUMA },
75
+ {.info_value = "board" , .split_type = OMPI_COMM_TYPE_BOARD },
76
+ {.info_value = "host" , .split_type = OMPI_COMM_TYPE_HOST },
77
+ {.info_value = "cu" , .split_type = OMPI_COMM_TYPE_CU },
78
+ {.info_value = "cluster" , .split_type = OMPI_COMM_TYPE_CLUSTER },
79
+ {.info_value = NULL }, /* sentinel: the lookup loop stops at the first NULL info_value */
80
+ };
81
+
60
82
/*
61
83
** sort-function for MPI_Comm_split
62
84
*/
@@ -764,6 +786,15 @@ static int ompi_comm_split_type_get_part (ompi_group_t *group, const int split_t
764
786
case OMPI_COMM_TYPE_CLUSTER :
765
787
include = OPAL_PROC_ON_LOCAL_CLUSTER (locality );
766
788
break ;
789
+ case MPI_COMM_TYPE_HW_GUIDED :
790
+ case MPI_COMM_TYPE_HW_UNGUIDED :
791
+ /*
792
+ * MPI_COMM_TYPE_HW_(UN)GUIDED handled in calling function.
793
+ * We should not get here as the split type will be changed
794
+ * at a higher level.
795
+ */
796
+ opal_output (0 , "Error: in ompi_comm_split_type_get_part() unexpected split_type=%d" , split_type );
797
+ return OMPI_ERR_BAD_PARAM ;
767
798
}
768
799
769
800
if (include ) {
@@ -837,8 +868,9 @@ int ompi_comm_split_type (ompi_communicator_t *comm, int split_type, int key,
837
868
ompi_communicator_t * newcomp = MPI_COMM_NULL ;
838
869
int my_size , my_rsize = 0 , mode , inter ;
839
870
int * lranks = NULL , * rranks = NULL ;
840
- int global_split_type , ok , tmp [4 ];
871
+ int global_split_type , ok , tmp [6 ];
841
872
int rc ;
873
+ int orig_split_type = split_type ;
842
874
843
875
/* silence clang warning. newcomm should never be NULL */
844
876
if (OPAL_UNLIKELY (NULL == newcomm )) {
@@ -847,14 +879,58 @@ int ompi_comm_split_type (ompi_communicator_t *comm, int split_type, int key,
847
879
848
880
inter = OMPI_COMM_IS_INTER (comm );
849
881
882
+ /* Step 0: Convert MPI_COMM_TYPE_HW_GUIDED to the internal type */
883
+ if (MPI_COMM_TYPE_HW_GUIDED == split_type ) {
884
+ int flag ;
885
+ opal_cstring_t * value = NULL ;
886
+
887
+ opal_info_get (info , "mpi_hw_resource_type" , & value , & flag );
888
+ /* If key is not in the 'info', then return MPI_COMM_NULL.
889
+ * This is caught at the MPI interface level, but it doesn't hurt to
890
+ * check it again.
891
+ */
892
+ if (!flag ) {
893
+ * newcomm = MPI_COMM_NULL ;
894
+ return OMPI_SUCCESS ;
895
+ }
896
+
897
+ /* Verify the value associated with the "mpi_hw_resource_type" key
898
+ * - is supported, and
899
+ * - is the same value at all ranks
900
+ *
901
+ * If not supported, then return MPI_COMM_NULL.
902
+ * If not the same at all ranks, throw an error.
903
+ */
904
+ flag = 0 ;
905
+ for (int i = 0 ; ompi_comm_split_type_hw_guided_support [i ].info_value ; ++ i ) {
906
+ if (0 == strncasecmp (value -> string , ompi_comm_split_type_hw_guided_support [i ].info_value , strlen (ompi_comm_split_type_hw_guided_support [i ].info_value ))) {
907
+ split_type = ompi_comm_split_type_hw_guided_support [i ].split_type ;
908
+ flag = 1 ;
909
+ break ;
910
+ }
911
+ }
912
+ /* If not supported, then return MPI_COMM_NULL. */
913
+ if (0 == flag ) {
914
+ * newcomm = MPI_COMM_NULL ;
915
+ return OMPI_SUCCESS ;
916
+ }
917
+ }
918
+
850
919
/* Step 1: verify all ranks have supplied the same value for split type. All split types
851
920
* must be the same or MPI_UNDEFINED (which is negative). */
852
- tmp [0 ] = split_type ;
853
- tmp [1 ] = - split_type ;
921
+ tmp [0 ] = orig_split_type ;
922
+ tmp [1 ] = - orig_split_type ;
854
923
tmp [2 ] = key ;
855
924
tmp [3 ] = - key ;
925
+ /* For MPI_COMM_TYPE_HW_GUIDED, verify all ranks have supplied the same
926
+ * split_type (represented by orig_split_type) and info 'value' (represented by split_type).
927
+ *
928
+ * When the original split type is not MPI_COMM_TYPE_HW_GUIDED, orig_split_type == split_type.
929
+ */
930
+ tmp [4 ] = split_type ;
931
+ tmp [5 ] = - split_type ;
856
932
857
- rc = comm -> c_coll -> coll_allreduce (MPI_IN_PLACE , & tmp , 4 , MPI_INT , MPI_MAX , comm ,
933
+ rc = comm -> c_coll -> coll_allreduce (MPI_IN_PLACE , & tmp , 6 , MPI_INT , MPI_MAX , comm ,
858
934
comm -> c_coll -> coll_allreduce_module );
859
935
if (OPAL_UNLIKELY (OMPI_SUCCESS != rc )) {
860
936
return rc ;
@@ -899,6 +975,26 @@ int ompi_comm_split_type (ompi_communicator_t *comm, int split_type, int key,
899
975
return OMPI_SUCCESS ;
900
976
}
901
977
978
+ /* MPI_COMM_TYPE_HW_GUIDED: Check if 'value' is the same at all ranks */
979
+ if (tmp [4 ] != - tmp [5 ]) {
980
+ if (0 == ompi_comm_rank (comm )) {
981
+ opal_output (0 , "Error: Mismatched info values for MPI_COMM_TYPE_HW_GUIDED" );
982
+ }
983
+ return OMPI_ERR_BAD_PARAM ;
984
+ }
985
+
986
+ /* TODO: Make this better...
987
+ *
988
+ * See Example 7.4 in the MPI 4.0 standard for example usage.
989
+ *
990
+ * Stage 0: Recognized, but not implemented.
991
+ * Stage 1: Do better than that
992
+ */
993
+ if (MPI_COMM_TYPE_HW_UNGUIDED == global_split_type ) {
994
+ * newcomm = MPI_COMM_NULL ;
995
+ return OMPI_SUCCESS ;
996
+ }
997
+
902
998
/* Step 2: Build potential communicator groups. If any ranks will not be part of
903
999
* the ultimate communicator we will drop them later. This saves doing an extra
904
1000
* allgather on the whole communicator. By using ompi_comm_split() later only
0 commit comments