@@ -410,16 +410,17 @@ static int ompi_osc_rdma_initialize_region (ompi_osc_rdma_module_t *module, void
410
410
region -> len = size ;
411
411
412
412
if (module -> use_memory_registration && size ) {
413
+ assert (module -> use_accelerated_btl );
413
414
if (MPI_WIN_FLAVOR_ALLOCATE != module -> flavor || NULL == module -> state_handle ) {
414
415
ret = ompi_osc_rdma_register (module , MCA_BTL_ENDPOINT_ANY , * base , size , MCA_BTL_REG_FLAG_ACCESS_ANY ,
415
416
& module -> base_handle );
416
417
if (OPAL_UNLIKELY (OMPI_SUCCESS != ret )) {
417
418
return OMPI_ERR_OUT_OF_RESOURCE ;
418
419
}
419
420
420
- memcpy (region -> btl_handle_data , module -> base_handle , module -> selected_btls [ 0 ] -> btl_registration_handle_size );
421
+ memcpy (region -> btl_handle_data , module -> base_handle , module -> accelerated_btl -> btl_registration_handle_size );
421
422
} else {
422
- memcpy (region -> btl_handle_data , module -> state_handle , module -> selected_btls [ 0 ] -> btl_registration_handle_size );
423
+ memcpy (region -> btl_handle_data , module -> state_handle , module -> accelerated_btl -> btl_registration_handle_size );
423
424
}
424
425
}
425
426
@@ -580,8 +581,12 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
580
581
module -> use_cpu_atomics = module -> single_node ;
581
582
582
583
if (!module -> single_node ) {
583
- for (int i = 0 ; i < module -> btls_in_use ; ++ i ) {
584
- module -> use_cpu_atomics = module -> use_cpu_atomics && !!(module -> selected_btls [i ]-> btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB );
584
+ if (module -> use_accelerated_btl ) {
585
+ module -> use_cpu_atomics = !!(module -> accelerated_btl -> btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB );
586
+ } else {
587
+ for (int i = 0 ; i < module -> alternate_btl_count ; ++ i ) {
588
+ module -> use_cpu_atomics &= !!(module -> alternate_btls [i ]-> btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB );
589
+ }
585
590
}
586
591
}
587
592
@@ -703,14 +708,16 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
703
708
if (0 == local_rank ) {
704
709
/* unlink the shared memory backing file */
705
710
opal_shmem_unlink (& module -> seg_ds );
706
- /* just go ahead and register the whole segment */
707
- ret = ompi_osc_rdma_register (module , MCA_BTL_ENDPOINT_ANY , module -> segment_base , total_size ,
708
- MCA_BTL_REG_FLAG_ACCESS_ANY , & module -> state_handle );
709
- if (OPAL_LIKELY (OMPI_SUCCESS == ret )) {
710
- state_region -> base = (intptr_t ) module -> segment_base ;
711
- if (module -> state_handle ) {
712
- memcpy (state_region -> btl_handle_data , module -> state_handle ,
713
- module -> selected_btls [0 ]-> btl_registration_handle_size );
711
+ if (module -> use_accelerated_btl ) {
712
+ /* just go ahead and register the whole segment */
713
+ ret = ompi_osc_rdma_register (module , MCA_BTL_ENDPOINT_ANY , module -> segment_base , total_size ,
714
+ MCA_BTL_REG_FLAG_ACCESS_ANY , & module -> state_handle );
715
+ if (OPAL_LIKELY (OMPI_SUCCESS == ret )) {
716
+ state_region -> base = (intptr_t ) module -> segment_base ;
717
+ if (module -> state_handle ) {
718
+ memcpy (state_region -> btl_handle_data , module -> state_handle ,
719
+ module -> accelerated_btl -> btl_registration_handle_size );
720
+ }
714
721
}
715
722
}
716
723
}
@@ -730,8 +737,9 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
730
737
region -> base = state_region -> base + my_base_offset ;
731
738
region -> len = size ;
732
739
if (module -> use_memory_registration ) {
733
- memcpy (region -> btl_handle_data , state_region -> btl_handle_data ,
734
- module -> selected_btls [0 ]-> btl_registration_handle_size );
740
+ assert (module -> use_accelerated_btl );
741
+ memcpy (region -> btl_handle_data , state_region -> btl_handle_data ,
742
+ module -> accelerated_btl -> btl_registration_handle_size );
735
743
}
736
744
}
737
745
@@ -910,12 +918,23 @@ static int btl_latency_sort_fn(const void *a, const void *b)
910
918
*/
911
919
static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t * comm , ompi_osc_rdma_module_t * module )
912
920
{
921
+ size_t btl_count ;
922
+ size_t index = 0 ;
913
923
mca_btl_base_selected_module_t * item ;
914
924
int ret ;
915
925
916
926
assert (NULL != module );
917
927
918
- module -> btls_in_use = 0 ;
928
+ btl_count = opal_list_get_size (& mca_btl_base_modules_initialized );
929
+ if (btl_count > UINT8_MAX ) {
930
+ return OMPI_ERROR ;
931
+ }
932
+
933
+ module -> alternate_btl_count = btl_count ;
934
+ module -> alternate_btls = malloc (sizeof (struct mca_btl_base_module_t * ) * btl_count );
935
+ if (NULL == module -> alternate_btls ) {
936
+ return OMPI_ERR_TEMP_OUT_OF_RESOURCE ;
937
+ }
919
938
920
939
/* add all alternate btls to the alternate_btls array, not worrying
921
940
about ordering yet. We have to add all btls unless we want to
@@ -937,17 +956,17 @@ static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_o
937
956
if (OMPI_SUCCESS != ret ) {
938
957
return ret ;
939
958
}
940
- ompi_osc_rdma_selected_btl_insert ( module , item -> btl_module , module -> btls_in_use ++ ) ;
959
+ module -> alternate_btls [ index ++ ] = item -> btl_module ;
941
960
}
961
+ assert (index == btl_count );
942
962
943
963
/* sort based on latency, lowest first */
944
- qsort (module -> selected_btls , module -> btls_in_use ,
964
+ qsort (module -> alternate_btls , module -> alternate_btl_count ,
945
965
sizeof (struct mca_btl_base_module_t * ), btl_latency_sort_fn );
946
966
947
- /* osc/rdma always use active message RDMA/atomics on alternate btls, whic does not require explicit memory registration */
948
967
module -> use_memory_registration = false;
949
968
950
- return module -> btls_in_use > 0 ? OMPI_SUCCESS : OMPI_ERR_UNREACH ;
969
+ return OMPI_SUCCESS ;
951
970
}
952
971
953
972
@@ -991,8 +1010,7 @@ static int ompi_osc_rdma_query_accelerated_btls (ompi_communicator_t *comm, ompi
991
1010
992
1011
assert (NULL != module );
993
1012
994
- ompi_osc_rdma_selected_btl_insert (module , NULL , 0 );
995
- module -> btls_in_use = 0 ;
1013
+ module -> use_accelerated_btl = false;
996
1014
module -> use_memory_registration = false;
997
1015
998
1016
/* Check for BTLs in the list of BTLs we know can reach all peers
@@ -1106,8 +1124,8 @@ static int ompi_osc_rdma_query_accelerated_btls (ompi_communicator_t *comm, ompi
1106
1124
}
1107
1125
1108
1126
btl_selection_complete :
1109
- ompi_osc_rdma_selected_btl_insert ( module , selected_btl , 0 ) ;
1110
- module -> btls_in_use = 1 ;
1127
+ module -> use_accelerated_btl = true ;
1128
+ module -> accelerated_btl = selected_btl ;
1111
1129
module -> use_memory_registration = selected_btl -> btl_register_mem != NULL ;
1112
1130
1113
1131
opal_output_verbose (MCA_BASE_VERBOSE_INFO , ompi_osc_base_framework .framework_output ,
@@ -1152,7 +1170,8 @@ static int ompi_osc_rdma_share_data (ompi_osc_rdma_module_t *module)
1152
1170
my_data -> len = (osc_rdma_size_t ) my_rank ;
1153
1171
1154
1172
if (module -> use_memory_registration && module -> state_handle ) {
1155
- memcpy (my_data -> btl_handle_data , module -> state_handle , module -> selected_btls [0 ]-> btl_registration_handle_size );
1173
+ assert (module -> use_accelerated_btl );
1174
+ memcpy (my_data -> btl_handle_data , module -> state_handle , module -> accelerated_btl -> btl_registration_handle_size );
1156
1175
}
1157
1176
1158
1177
/* gather state data at each node leader */
@@ -1326,9 +1345,6 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base,
1326
1345
module -> acc_use_amo = mca_osc_rdma_component .acc_use_amo ;
1327
1346
module -> network_amo_max_count = mca_osc_rdma_component .network_amo_max_count ;
1328
1347
1329
- module -> selected_btls_size = MCA_OSC_RDMA_BTLS_SIZE_INIT ;
1330
- module -> selected_btls = calloc (module -> selected_btls_size , sizeof (struct mca_btl_base_module_t * ));
1331
-
1332
1348
module -> all_sync .module = module ;
1333
1349
1334
1350
module -> flavor = flavor ;
@@ -1386,6 +1402,7 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base,
1386
1402
}
1387
1403
1388
1404
/* find rdma capable endpoints */
1405
+ module -> use_accelerated_btl = false;
1389
1406
ret = ompi_osc_rdma_query_accelerated_btls (module -> comm , module );
1390
1407
if (OMPI_SUCCESS != ret ) {
1391
1408
opal_output_verbose (MCA_BASE_VERBOSE_WARN , ompi_osc_base_framework .framework_output ,
@@ -1404,7 +1421,8 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base,
1404
1421
1405
1422
module -> region_size = sizeof (ompi_osc_rdma_region_t );
1406
1423
if (module -> use_memory_registration ) {
1407
- module -> region_size += module -> selected_btls [0 ]-> btl_registration_handle_size ;
1424
+ assert (module -> use_accelerated_btl );
1425
+ module -> region_size += module -> accelerated_btl -> btl_registration_handle_size ;
1408
1426
}
1409
1427
1410
1428
module -> state_size = sizeof (ompi_osc_rdma_state_t );
0 commit comments