@@ -881,12 +881,14 @@ static void ompi_osc_rdma_ensure_local_add_procs (void)
881
881
* @return OMPI_SUCCESS if BTLs can be found
882
882
* @return OMPI_ERR_UNREACH if no BTLs can be found that match
883
883
*
884
- * In this case an "alternate" BTL is a BTL that does not provide true RDMA but
885
- * can use active messages using the BTL base AM RDMA/atomics. Since more than
886
- * one BTL may be needed for this support the OSC component will disable the
887
- * use of registration-based RDMA (these BTLs will not be used) and will use
888
- * any remaining BTL. By default the BTLs used will be tcp and sm but any single
889
- * (or pair) of BTLs may be used.
884
+ * In this case an "alternate" BTL is a BTL that does not meet the
885
+ * requirements of a BTL outlined in ompi_osc_rdma_query_btls().
886
+ * Either it does not provide connectivity to all peers, provide
887
+ * remote completion, or natively support put/get/atomic. Since more
888
+ * than one BTL may be needed for this support the OSC component will
889
+ * disable the use of registration-based RDMA (these BTLs will not be
890
+ * used) and will use any remaining BTL. By default the BTLs used will
891
+ * be tcp and sm but any single (or pair) of BTLs may be used.
890
892
*/
891
893
static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t * comm , ompi_osc_rdma_module_t * module )
892
894
{
@@ -935,6 +937,26 @@ static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_o
935
937
return btls_found > 0 ? OMPI_SUCCESS : OMPI_ERR_UNREACH ;
936
938
}
937
939
940
+ /*
941
+ * Attempt to find a BTL that can be used for native RDMA
942
+ *
943
+ * Attempt to find an "accelerated" BTL that can be used directly, as
944
+ * opposed to emulated rdma semantics with the alternate BTLs. To be
945
+ * an accelerated BTL, four conditions must be true:
946
+ *
947
+ * 1) The BTL must be able to communicate with all peers in the
948
+ * Window
949
+ * 2) The BTL must provide remote completion
950
+ * 3) The BTL must be able to register the entire target window
951
+ * 4) The BTL must natively support put/get/atomic operations
952
+ *
953
+ * Testing (1) is expensive, so as an optimization, the
954
+ * ompi_osc_rdma_full_connectivity_btls list contains the list of BTL
955
+ * components we know can achieve (1) in almost all usage scenarios.
956
+ *
957
+ * If module is NULL, the code acts as a query mechanism to find any
958
+ * potential BTLs, and is used to implement osc_rdma_query().
959
+ */
938
960
static int ompi_osc_rdma_query_btls (ompi_communicator_t * comm , ompi_osc_rdma_module_t * module )
939
961
{
940
962
struct mca_btl_base_module_t * * possible_btls = NULL ;
@@ -948,14 +970,15 @@ static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, ompi_osc_rdma_mo
948
970
char * * btls_to_use ;
949
971
void * tmp ;
950
972
951
- btls_to_use = opal_argv_split (ompi_osc_rdma_full_connectivity_btls , ',' );
952
-
953
973
if (module ) {
954
974
ompi_osc_rdma_selected_btl_insert (module , NULL , 0 );
955
975
module -> btls_in_use = 0 ;
956
976
module -> use_memory_registration = false;
957
977
}
958
978
979
+ /* Check for BTLs in the list of BTLs we know can reach all peers
980
+ in general usage. */
981
+ btls_to_use = opal_argv_split (ompi_osc_rdma_full_connectivity_btls , ',' );
959
982
if (btls_to_use ) {
960
983
/* rdma and atomics are only supported with BTLs at the moment
961
984
* If a btl does not support remote completion, it cannot be used as the primary btl.
@@ -992,7 +1015,14 @@ static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, ompi_osc_rdma_mo
992
1015
993
1016
/* if osc/rdma gets selected we need to ensure that all local procs have been added */
994
1017
ompi_osc_rdma_ensure_local_add_procs ();
995
-
1018
+
1019
+ /*
1020
+ * A BTL from the list of BTLs known to reach all peers that met our
1021
+ * other requirements was not found. Look for BTLs that may be
1022
+ * able to talk to all peers. This is obviously more expensive
1023
+ * than the check above.
1024
+ */
1025
+
996
1026
for (int rank = 0 ; rank < comm_size ; ++ rank ) {
997
1027
ompi_proc_t * proc = ompi_comm_peer_lookup (comm , rank );
998
1028
mca_bml_base_endpoint_t * endpoint ;
@@ -1036,10 +1066,16 @@ static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, ompi_osc_rdma_mo
1036
1066
btl_counts = tmp ;
1037
1067
1038
1068
for (int i_btl = 0 ; i_btl < num_btls ; ++ i_btl ) {
1039
- /* for this implementation we need only compare-and-swap and fetch-and-add
1069
+ /* Check for BTL requirements:
1070
+ * 1) RDMA (put/get) and ATOMIC operations. We only
1071
+ * require cswap and fetch and add and will emulate
1072
+ * other operations with those two as necessary.
1073
+ * 2) Remote Completion
1040
1074
*
1041
- * If a btl does not support remote completion, it cannot be used as the primary btl.
1042
- * It can still be selected as an alternate btl */
1075
+ * If the BTL meets all those requirements, increment the
1076
+ * btl_counts to indicate that this btl can talk to the
1077
+ * current peer proc.
1078
+ */
1043
1079
if (((endpoint -> btl_rdma .bml_btls [i_btl ].btl -> btl_flags & (MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_ATOMIC_FOPS )) ==
1044
1080
(MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_ATOMIC_FOPS )) &&
1045
1081
(endpoint -> btl_rdma .bml_btls [i_btl ].btl -> btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_ADD ) &&
@@ -1081,7 +1117,9 @@ static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, ompi_osc_rdma_mo
1081
1117
}
1082
1118
1083
1119
if (possible_btls [i ]-> btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB ) {
1084
- /* do not need to use the btl for self communication */
1120
+ /* The onesided component can, if BTL atomics are atomic
1121
+ relative to CPU atomics, handle atomics to self, so
1122
+ increment the counter once to cover that case. */
1085
1123
btl_count ++ ;
1086
1124
}
1087
1125
0 commit comments