Skip to content

Commit 75e36e5

Browse files
committed
osc/rdma: Split btl storage based on mode
Split the storage of btls based on whether the window is using accelerated or alternate btls. This makes it more obvious when the code has made assumptions about mode that may not be true (such as the memory registration calls throughout the code that assumed selected_btl[0] was the one true BTL). Signed-off-by: Brian Barrett <bbarrett@amazon.com>
1 parent 6a15883 commit 75e36e5

File tree

5 files changed

+144
-85
lines changed

5 files changed

+144
-85
lines changed

ompi/mca/osc/rdma/osc_rdma.h

Lines changed: 35 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,6 @@
5757

5858
#define RANK_ARRAY_COUNT(module) ((ompi_comm_size ((module)->comm) + (module)->node_count - 1) / (module)->node_count)
5959

60-
#define MCA_OSC_RDMA_BTLS_SIZE_INIT 4
61-
6260
enum {
6361
OMPI_OSC_RDMA_LOCKING_TWO_LEVEL,
6462
OMPI_OSC_RDMA_LOCKING_ON_DEMAND,
@@ -260,14 +258,23 @@ struct ompi_osc_rdma_module_t {
260258
/** lock for peer hash table/array */
261259
opal_mutex_t peer_lock;
262260

263-
264-
/** BTL(s) in use. Currently this is only used to support RDMA emulation over
265-
* non-RDMA BTLs. The typical usage is btl/sm + btl/tcp. In the future this
266-
* could be used to support multiple RDMA-capable BTLs but the memory registration
267-
* paths will need to be updated to pack/unpack multiple registration handles. */
268-
struct mca_btl_base_module_t **selected_btls;
269-
uint8_t selected_btls_size;
270-
uint8_t btls_in_use;
261+
/* we currently support two modes of operation, a single
262+
* accelerated btl (which can use memory registration and can use
263+
* btl_flush()) and one or more alternate btls, which cannot use
264+
* flush() or rely on memory registration. Since it is an
265+
* either/or situation, we use a union to simplify the code.
266+
*/
267+
bool use_accelerated_btl;
268+
269+
union {
270+
struct {
271+
struct mca_btl_base_module_t *accelerated_btl;
272+
};
273+
struct {
274+
struct mca_btl_base_module_t **alternate_btls;
275+
uint8_t alternate_btl_count;
276+
};
277+
};
271278

272279
/** Only true if one BTL is in use. Memory registration is only supported when
273280
* using a single BTL. */
@@ -383,10 +390,11 @@ static inline int _ompi_osc_rdma_register (ompi_osc_rdma_module_t *module, struc
383390
size_t size, uint32_t flags, mca_btl_base_registration_handle_t **handle, int line, const char *file)
384391
{
385392
if (module->use_memory_registration) {
393+
assert(module->use_accelerated_btl);
386394
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "registering segment with btl. range: %p - %p (%lu bytes)",
387395
ptr, (void*)((char *) ptr + size), size);
388396

389-
*handle = module->selected_btls[0]->btl_register_mem (module->selected_btls[0], endpoint, ptr, size, flags);
397+
*handle = module->accelerated_btl->btl_register_mem(module->accelerated_btl, endpoint, ptr, size, flags);
390398
if (OPAL_UNLIKELY(NULL == *handle)) {
391399
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "failed to register pointer with selected BTL. base: %p, "
392400
"size: %lu. file: %s, line: %d", ptr, (unsigned long) size, file, line);
@@ -404,7 +412,9 @@ static inline int _ompi_osc_rdma_register (ompi_osc_rdma_module_t *module, struc
404412
static inline void _ompi_osc_rdma_deregister (ompi_osc_rdma_module_t *module, mca_btl_base_registration_handle_t *handle, int line, const char *file)
405413
{
406414
if (handle) {
407-
module->selected_btls[0]->btl_deregister_mem (module->selected_btls[0], handle);
415+
assert(module->use_memory_registration);
416+
assert(module->use_accelerated_btl);
417+
module->accelerated_btl->btl_deregister_mem(module->accelerated_btl, handle);
408418
}
409419
}
410420

@@ -536,10 +546,11 @@ static inline ompi_osc_rdma_sync_t *ompi_osc_rdma_module_sync_lookup (ompi_osc_r
536546
static bool ompi_osc_rdma_use_btl_flush (ompi_osc_rdma_module_t *module)
537547
{
538548
#if defined(BTL_VERSION) && (BTL_VERSION >= 310)
539-
return !!(module->selected_btls[0]->btl_flush);
540-
#else
541-
return false;
549+
if (module->use_accelerated_btl) {
550+
return (NULL != module->accelerated_btl->btl_flush);
551+
}
542552
#endif
553+
return false;
543554
}
544555

545556
/**
@@ -601,13 +612,13 @@ static inline void ompi_osc_rdma_sync_rdma_complete (ompi_osc_rdma_sync_t *sync)
601612
opal_progress ();
602613
} while (ompi_osc_rdma_sync_get_count (sync));
603614
#else
604-
mca_btl_base_module_t *btl_module = sync->module->selected_btls[0];
605-
606615
do {
607616
if (!ompi_osc_rdma_use_btl_flush (sync->module)) {
608617
opal_progress ();
609618
} else {
610-
btl_module->btl_flush (btl_module, NULL);
619+
assert(sync->module->use_accelerated_btl);
620+
mca_btl_base_module_t *btl_module = sync->module->accelerated_btl;
621+
btl_module->btl_flush(btl_module, NULL);
611622
}
612623
} while (ompi_osc_rdma_sync_get_count (sync) || (sync->module->rdma_frag && (sync->module->rdma_frag->pending > 1)));
613624
#endif
@@ -637,17 +648,13 @@ static inline bool ompi_osc_rdma_oor (int rc)
637648

638649
__opal_attribute_always_inline__
639650
static inline mca_btl_base_module_t *ompi_osc_rdma_selected_btl (ompi_osc_rdma_module_t *module, uint8_t btl_index) {
640-
return module->selected_btls[btl_index];
641-
}
642-
643-
__opal_attribute_always_inline__
644-
static inline void ompi_osc_rdma_selected_btl_insert (ompi_osc_rdma_module_t *module, struct mca_btl_base_module_t *btl, uint8_t btl_index) {
645-
if(btl_index == module->selected_btls_size) {
646-
module->selected_btls_size *= 2;
647-
module->selected_btls = realloc(module->selected_btls, module->selected_btls_size * sizeof(struct mca_btl_base_module_t *));
648-
assert(NULL != module->selected_btls);
651+
if (module->use_accelerated_btl) {
652+
assert(0 == btl_index);
653+
return module->accelerated_btl;
654+
} else {
655+
assert(btl_index < module->alternate_btl_count);
656+
return module->alternate_btls[btl_index];
649657
}
650-
module->selected_btls[btl_index] = btl;
651658
}
652659

653660
#endif /* OMPI_OSC_RDMA_H */

ompi/mca/osc/rdma/osc_rdma_component.c

Lines changed: 46 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -410,16 +410,17 @@ static int ompi_osc_rdma_initialize_region (ompi_osc_rdma_module_t *module, void
410410
region->len = size;
411411

412412
if (module->use_memory_registration && size) {
413+
assert(module->use_accelerated_btl);
413414
if (MPI_WIN_FLAVOR_ALLOCATE != module->flavor || NULL == module->state_handle) {
414415
ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, *base, size, MCA_BTL_REG_FLAG_ACCESS_ANY,
415416
&module->base_handle);
416417
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
417418
return OMPI_ERR_OUT_OF_RESOURCE;
418419
}
419420

420-
memcpy (region->btl_handle_data, module->base_handle, module->selected_btls[0]->btl_registration_handle_size);
421+
memcpy (region->btl_handle_data, module->base_handle, module->accelerated_btl->btl_registration_handle_size);
421422
} else {
422-
memcpy (region->btl_handle_data, module->state_handle, module->selected_btls[0]->btl_registration_handle_size);
423+
memcpy (region->btl_handle_data, module->state_handle, module->accelerated_btl->btl_registration_handle_size);
423424
}
424425
}
425426

@@ -580,8 +581,12 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
580581
module->use_cpu_atomics = module->single_node;
581582

582583
if (!module->single_node) {
583-
for (int i = 0 ; i < module->btls_in_use ; ++i) {
584-
module->use_cpu_atomics = module->use_cpu_atomics && !!(module->selected_btls[i]->btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB);
584+
if (module->use_accelerated_btl) {
585+
module->use_cpu_atomics = !!(module->accelerated_btl->btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB);
586+
} else {
587+
for (int i = 0 ; i < module->alternate_btl_count ; ++i) {
588+
module->use_cpu_atomics &= !!(module->alternate_btls[i]->btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB);
589+
}
585590
}
586591
}
587592

@@ -703,14 +708,16 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
703708
if (0 == local_rank) {
704709
/* unlink the shared memory backing file */
705710
opal_shmem_unlink (&module->seg_ds);
706-
/* just go ahead and register the whole segment */
707-
ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, module->segment_base, total_size,
708-
MCA_BTL_REG_FLAG_ACCESS_ANY, &module->state_handle);
709-
if (OPAL_LIKELY(OMPI_SUCCESS == ret)) {
710-
state_region->base = (intptr_t) module->segment_base;
711-
if (module->state_handle) {
712-
memcpy (state_region->btl_handle_data, module->state_handle,
713-
module->selected_btls[0]->btl_registration_handle_size);
711+
if (module->use_accelerated_btl) {
712+
/* just go ahead and register the whole segment */
713+
ret = ompi_osc_rdma_register(module, MCA_BTL_ENDPOINT_ANY, module->segment_base, total_size,
714+
MCA_BTL_REG_FLAG_ACCESS_ANY, &module->state_handle);
715+
if (OPAL_LIKELY(OMPI_SUCCESS == ret)) {
716+
state_region->base = (intptr_t) module->segment_base;
717+
if (module->state_handle) {
718+
memcpy(state_region->btl_handle_data, module->state_handle,
719+
module->accelerated_btl->btl_registration_handle_size);
720+
}
714721
}
715722
}
716723
}
@@ -730,8 +737,9 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
730737
region->base = state_region->base + my_base_offset;
731738
region->len = size;
732739
if (module->use_memory_registration) {
733-
memcpy (region->btl_handle_data, state_region->btl_handle_data,
734-
module->selected_btls[0]->btl_registration_handle_size);
740+
assert(module->use_accelerated_btl);
741+
memcpy(region->btl_handle_data, state_region->btl_handle_data,
742+
module->accelerated_btl->btl_registration_handle_size);
735743
}
736744
}
737745

@@ -910,12 +918,23 @@ static int btl_latency_sort_fn(const void *a, const void *b)
910918
*/
911919
static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_osc_rdma_module_t *module)
912920
{
921+
size_t btl_count;
922+
size_t index = 0;
913923
mca_btl_base_selected_module_t *item;
914924
int ret;
915925

916926
assert(NULL != module);
917927

918-
module->btls_in_use = 0;
928+
btl_count = opal_list_get_size(&mca_btl_base_modules_initialized);
929+
if (btl_count > UINT8_MAX) {
930+
return OMPI_ERROR;
931+
}
932+
933+
module->alternate_btl_count = btl_count;
934+
module->alternate_btls = malloc(sizeof(struct mca_btl_base_module_t *) * btl_count);
935+
if (NULL == module->alternate_btls) {
936+
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
937+
}
919938

920939
/* add all alternate btls to the alternate_btls list, not worrying
921940
about ordering yet. We have to add all btls unless we want to
@@ -937,17 +956,17 @@ static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_o
937956
if (OMPI_SUCCESS != ret) {
938957
return ret;
939958
}
940-
ompi_osc_rdma_selected_btl_insert(module, item->btl_module, module->btls_in_use++);
959+
module->alternate_btls[index++] = item->btl_module;
941960
}
961+
assert(index == btl_count);
942962

943963
/* sort based on latency, lowest first */
944-
qsort(module->selected_btls, module->btls_in_use,
964+
qsort(module->alternate_btls, module->alternate_btl_count,
945965
sizeof(struct mca_btl_base_module_t*), btl_latency_sort_fn);
946966

947-
/* osc/rdma always use active message RDMA/atomics on alternate btls, whic does not require explicit memory registration */
948967
module->use_memory_registration = false;
949968

950-
return module->btls_in_use > 0 ? OMPI_SUCCESS : OMPI_ERR_UNREACH;
969+
return OMPI_SUCCESS;
951970
}
952971

953972

@@ -991,8 +1010,7 @@ static int ompi_osc_rdma_query_accelerated_btls (ompi_communicator_t *comm, ompi
9911010

9921011
assert(NULL != module);
9931012

994-
ompi_osc_rdma_selected_btl_insert(module, NULL, 0);
995-
module->btls_in_use = 0;
1013+
module->use_accelerated_btl = false;
9961014
module->use_memory_registration = false;
9971015

9981016
/* Check for BTLs in the list of BTLs we know can reach all peers
@@ -1106,8 +1124,8 @@ static int ompi_osc_rdma_query_accelerated_btls (ompi_communicator_t *comm, ompi
11061124
}
11071125

11081126
btl_selection_complete:
1109-
ompi_osc_rdma_selected_btl_insert(module, selected_btl, 0);
1110-
module->btls_in_use = 1;
1127+
module->use_accelerated_btl = true;
1128+
module->accelerated_btl = selected_btl;
11111129
module->use_memory_registration = selected_btl->btl_register_mem != NULL;
11121130

11131131
opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output,
@@ -1152,7 +1170,8 @@ static int ompi_osc_rdma_share_data (ompi_osc_rdma_module_t *module)
11521170
my_data->len = (osc_rdma_size_t) my_rank;
11531171

11541172
if (module->use_memory_registration && module->state_handle) {
1155-
memcpy (my_data->btl_handle_data, module->state_handle, module->selected_btls[0]->btl_registration_handle_size);
1173+
assert(module->use_accelerated_btl);
1174+
memcpy (my_data->btl_handle_data, module->state_handle, module->accelerated_btl->btl_registration_handle_size);
11561175
}
11571176

11581177
/* gather state data at each node leader */
@@ -1326,9 +1345,6 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base,
13261345
module->acc_use_amo = mca_osc_rdma_component.acc_use_amo;
13271346
module->network_amo_max_count = mca_osc_rdma_component.network_amo_max_count;
13281347

1329-
module->selected_btls_size = MCA_OSC_RDMA_BTLS_SIZE_INIT;
1330-
module->selected_btls = calloc(module->selected_btls_size, sizeof(struct mca_btl_base_module_t *));
1331-
13321348
module->all_sync.module = module;
13331349

13341350
module->flavor = flavor;
@@ -1386,6 +1402,7 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base,
13861402
}
13871403

13881404
/* find rdma capable endpoints */
1405+
module->use_accelerated_btl = false;
13891406
ret = ompi_osc_rdma_query_accelerated_btls (module->comm, module);
13901407
if (OMPI_SUCCESS != ret) {
13911408
opal_output_verbose(MCA_BASE_VERBOSE_WARN, ompi_osc_base_framework.framework_output,
@@ -1404,7 +1421,8 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base,
14041421

14051422
module->region_size = sizeof (ompi_osc_rdma_region_t);
14061423
if (module->use_memory_registration) {
1407-
module->region_size += module->selected_btls[0]->btl_registration_handle_size;
1424+
assert(module->use_accelerated_btl);
1425+
module->region_size += module->accelerated_btl->btl_registration_handle_size;
14081426
}
14091427

14101428
module->state_size = sizeof (ompi_osc_rdma_state_t);

ompi/mca/osc/rdma/osc_rdma_dynamic.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -252,7 +252,8 @@ int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len)
252252
return OMPI_ERR_RMA_ATTACH;
253253
}
254254

255-
memcpy (region->btl_handle_data, handle, module->selected_btls[0]->btl_registration_handle_size);
255+
assert(module->use_accelerated_btl);
256+
memcpy(region->btl_handle_data, handle, module->accelerated_btl->btl_registration_handle_size);
256257
rdma_region_handle->btl_handle = handle;
257258
} else {
258259
rdma_region_handle->btl_handle = NULL;

ompi/mca/osc/rdma/osc_rdma_module.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,9 @@ int ompi_osc_rdma_free(ompi_win_t *win)
144144
free (module->outstanding_lock_array);
145145
mca_mpool_base_default_module->mpool_free(mca_mpool_base_default_module,
146146
module->free_after);
147-
free (module->selected_btls);
147+
if (!module->use_accelerated_btl) {
148+
free(module->alternate_btls);
149+
}
148150
free (module);
149151

150152
return OMPI_SUCCESS;

0 commit comments

Comments
 (0)