Skip to content

Commit e509302

Browse files
committed
osc/rdma: Add initialization documentation
No functional changes (an initialization was reordered to make documentation easier), but add documentation about BTL selection behavior. Signed-off-by: Brian Barrett <bbarrett@amazon.com>
1 parent fcd215d commit e509302

File tree

1 file changed

+51
-13
lines changed

1 file changed

+51
-13
lines changed

ompi/mca/osc/rdma/osc_rdma_component.c

Lines changed: 51 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -881,12 +881,14 @@ static void ompi_osc_rdma_ensure_local_add_procs (void)
881881
* @return OMPI_SUCCESS if BTLs can be found
882882
* @return OMPI_ERR_UNREACH if no BTLs can be found that match
883883
*
884-
* In this case an "alternate" BTL is a BTL that does not provide true RDMA but
885-
* can use active messages using the BTL base AM RDMA/atomics. Since more than
886-
* one BTL may be needed for this support the OSC component will disable the
887-
* use of registration-based RDMA (these BTLs will not be used) and will use
888-
* any remaining BTL. By default the BTLs used will be tcp and sm but any single
889-
* (or pair) of BTLs may be used.
884+
* In this case an "alternate" BTL is a BTL that does not meet the
885+
* requirements of a BTL outlined in ompi_osc_rdma_query_btls().
886+
* Either it does not provide connectivity to all peers, provide
887+
* remote completion, or natively support put/get/atomic. Since more
888+
* than one BTL may be needed for this support the OSC component will
889+
* disable the use of registration-based RDMA (these BTLs will not be
890+
* used) and will use any remaining BTL. By default the BTLs used will
891+
* be tcp and sm but any single (or pair) of BTLs may be used.
890892
*/
891893
static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_osc_rdma_module_t *module)
892894
{
@@ -935,6 +937,26 @@ static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_o
935937
return btls_found > 0 ? OMPI_SUCCESS : OMPI_ERR_UNREACH;
936938
}
937939

940+
/*
941+
* Attempt to find a BTL that can be used for native RDMA
942+
*
943+
* Attempt to find an "accelerated" BTL that can be used directly, as
944+
* opposed to emulated rdma semantics with the alternate BTLs. To be
945+
* an accelerated BTL, four conditions must be true:
946+
*
947+
* 1) The BTL must be able to communicate with all peers in the
948+
* Window
949+
* 2) The BTL must provide remote completion
950+
* 3) The BTL must be able to register the entire target window
951+
* 4) The BTL must natively support put/get/atomic operations
952+
*
953+
* Testing (1) is expensive, so as an optimization, the
954+
* ompi_osc_rdma_full_connectivity_btls list contains the list of BTL
955+
* components we know can achieve (1) in almost all usage scenarios.
956+
*
957+
* If module is NULL, the code acts as a query mechanism to find any
958+
* potential BTLs, and is used to implement osc_rdma_query().
959+
*/
938960
static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, ompi_osc_rdma_module_t *module)
939961
{
940962
struct mca_btl_base_module_t **possible_btls = NULL;
@@ -948,14 +970,15 @@ static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, ompi_osc_rdma_mo
948970
char **btls_to_use;
949971
void *tmp;
950972

951-
btls_to_use = opal_argv_split (ompi_osc_rdma_full_connectivity_btls, ',');
952-
953973
if (module) {
954974
ompi_osc_rdma_selected_btl_insert(module, NULL, 0);
955975
module->btls_in_use = 0;
956976
module->use_memory_registration = false;
957977
}
958978

979+
/* Check for BTLs in the list of BTLs we know can reach all peers
980+
in general usage. */
981+
btls_to_use = opal_argv_split (ompi_osc_rdma_full_connectivity_btls, ',');
959982
if (btls_to_use) {
960983
/* rdma and atomics are only supported with BTLs at the moment
961984
* If a btl does not support remote completion, it cannot be used as the primary btl.
@@ -992,7 +1015,14 @@ static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, ompi_osc_rdma_mo
9921015

9931016
/* if osc/rdma gets selected we need to ensure that all local procs have been added */
9941017
ompi_osc_rdma_ensure_local_add_procs ();
995-
1018+
1019+
/*
1020+
* A BTL from the list of BTLs known to reach all peers that also met our
1021+
* other requirements was not found. Look for BTLs that may be
1022+
* able to talk to all peers. This is obviously more expensive
1023+
* than the check above.
1024+
*/
1025+
9961026
for (int rank = 0 ; rank < comm_size ; ++rank) {
9971027
ompi_proc_t *proc = ompi_comm_peer_lookup (comm, rank);
9981028
mca_bml_base_endpoint_t *endpoint;
@@ -1036,10 +1066,16 @@ static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, ompi_osc_rdma_mo
10361066
btl_counts = tmp;
10371067

10381068
for (int i_btl = 0 ; i_btl < num_btls ; ++i_btl) {
1039-
/* for this implementation we need only compare-and-swap and fetch-and-add
1069+
/* Check for BTL requirements:
1070+
* 1) RDMA (put/get) and ATOMIC operations. We only
1071+
* require cswap and fetch and add and will emulate
1072+
* other operations with those two as necessary.
1073+
* 2) Remote Completion
10401074
*
1041-
* If a btl does not support remote completion, it cannot be used as the primary btl.
1042-
* It can still be selected as an alternate btl */
1075+
* If the BTL meets all those requirements, increment the
1076+
* btl_counts to indicate that this btl can talk to the
1077+
* current peer proc.
1078+
*/
10431079
if (((endpoint->btl_rdma.bml_btls[i_btl].btl->btl_flags & (MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_ATOMIC_FOPS)) ==
10441080
(MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_ATOMIC_FOPS)) &&
10451081
(endpoint->btl_rdma.bml_btls[i_btl].btl->btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_ADD) &&
@@ -1081,7 +1117,9 @@ static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, ompi_osc_rdma_mo
10811117
}
10821118

10831119
if (possible_btls[i]->btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB) {
1084-
/* do not need to use the btl for self communication */
1120+
/* The onesided component can, if BTL atomics are atomic
1121+
relative to CPU atomics, handle atomics to self, so
1122+
increment the counter once to cover that case. */
10851123
btl_count++;
10861124
}
10871125

0 commit comments

Comments
 (0)