Skip to content

Commit d524fa4

Browse files
committed
osc/rdma: add support for mpi_minimum_memory_alignment info key
Signed-off-by: Joseph Schuchart <schuchart@hlrs.de>
1 parent 8820fd2 commit d524fa4

File tree

3 files changed

+80
-40
lines changed

3 files changed

+80
-40
lines changed

ompi/mca/osc/rdma/osc_rdma.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
* All rights reserved.
55
* Copyright (c) 2004-2006 The Trustees of the University of Tennessee.
66
* All rights reserved.
7-
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
7+
* Copyright (c) 2004-2020 High Performance Computing Center Stuttgart,
88
* University of Stuttgart. All rights reserved.
99
* Copyright (c) 2004-2005 The Regents of the University of California.
1010
* All rights reserved.
@@ -116,6 +116,9 @@ struct ompi_osc_rdma_component_t {
116116

117117
/** maximum count for network AMO usage */
118118
unsigned long network_amo_max_count;
119+
120+
/** memory alignment to be used for new windows */
121+
size_t memory_alignment;
119122
};
120123
typedef struct ompi_osc_rdma_component_t ompi_osc_rdma_component_t;
121124

@@ -221,6 +224,9 @@ struct ompi_osc_rdma_module_t {
221224
/** offset in the shared memory segment where the state array starts */
222225
size_t state_offset;
223226

227+
/** memory alignment to be used for new windows */
228+
size_t memory_alignment;
229+
224230
/* ********************* sync data ************************ */
225231

226232
/** global sync object (PSCW, fence, lock all) */

ompi/mca/osc/rdma/osc_rdma_component.c

Lines changed: 68 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
* Copyright (c) 2004-2017 The University of Tennessee and The University
66
* of Tennessee Research Foundation. All rights
77
* reserved.
8-
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
8+
* Copyright (c) 2004-2020 High Performance Computing Center Stuttgart,
99
* University of Stuttgart. All rights reserved.
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
@@ -54,6 +54,7 @@
5454
#include "opal/mca/common/cuda/common_cuda.h"
5555
#endif /* OPAL_CUDA_SUPPORT */
5656
#include "opal/util/info_subscriber.h"
57+
#include "opal/mca/mpool/base/base.h"
5758

5859
#include "ompi/info/info.h"
5960
#include "ompi/communicator/communicator.h"
@@ -305,6 +306,16 @@ static int ompi_osc_rdma_component_register (void)
305306
MCA_BASE_VAR_TYPE_UNSIGNED_LONG, NULL, 0, 0, OPAL_INFO_LVL_3,
306307
MCA_BASE_VAR_SCOPE_LOCAL, &mca_osc_rdma_component.network_amo_max_count);
307308

309+
mca_osc_rdma_component.memory_alignment = opal_getpagesize();
310+
opal_asprintf(&description_str, "The minimum memory alignment used to allocate local window memory (default: %zu). "
311+
"This is a best effort approach. Alignments larger than the page size may not be supported.",
312+
mca_osc_rdma_component.memory_alignment);
313+
(void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "minimum_memory_alignment",
314+
description_str,
315+
MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, OPAL_INFO_LVL_3,
316+
MCA_BASE_VAR_SCOPE_READONLY, &mca_osc_rdma_component.memory_alignment);
317+
free(description_str);
318+
308319
/* register performance variables */
309320

310321
(void) mca_base_component_pvar_register (&mca_osc_rdma_component.super.osc_version, "put_retry_count",
@@ -390,7 +401,7 @@ static int ompi_osc_rdma_component_query (struct ompi_win_t *win, void **base, s
390401
{
391402

392403
if (MPI_WIN_FLAVOR_SHARED == flavor) {
393-
return -1;
404+
return OMPI_ERR_RMA_SHARED;
394405
}
395406

396407
#if OPAL_CUDA_SUPPORT
@@ -448,9 +459,10 @@ static int ompi_osc_rdma_initialize_region (ompi_osc_rdma_module_t *module, void
448459

449460
static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, size_t size)
450461
{
451-
size_t total_size, local_rank_array_size, leader_peer_data_size;
462+
size_t total_size, local_rank_array_size, leader_peer_data_size, base_data_size;
452463
ompi_osc_rdma_peer_t *my_peer;
453464
int ret, my_rank;
465+
size_t memory_alignment = module->memory_alignment;
454466

455467
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "allocating private internal state");
456468

@@ -463,32 +475,34 @@ static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, s
463475
* registration handles needed to access this data. */
464476
total_size = local_rank_array_size + module->region_size +
465477
module->state_size + leader_peer_data_size;
466-
total_size += OPAL_ALIGN_PAD_AMOUNT(total_size, OPAL_ALIGN_MIN);
478+
base_data_size = total_size;
467479

468480
if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) {
469-
total_size += size;
481+
base_data_size += OPAL_ALIGN_PAD_AMOUNT(base_data_size, memory_alignment);
482+
total_size = base_data_size + size;
470483
}
471484

472485
/* the local data is ordered as follows: rank array (leader, offset mapping), state, leader peer data, and base
473486
* (if using MPI_Win_allocate). In this case the leader peer data array does not need to be stored in the same
474487
* segment but placing it there simplifies the peer data fetch and cleanup code. */
475488

476-
module->rank_array = calloc (total_size, 1);
489+
module->rank_array = mca_mpool_base_default_module->mpool_alloc(mca_mpool_base_default_module, total_size,
490+
memory_alignment, 0);
477491
if (OPAL_UNLIKELY(NULL == module->rank_array)) {
478492
return OMPI_ERR_OUT_OF_RESOURCE;
479493
}
480494

481-
// Note, the extra module->region_size space added after local_rank_array_size
482-
// is unused but is there to match what happens in allocte_state_shared()
483-
// This allows module->state_offset to be uniform across the ranks which
484-
// is part of how they pull peer info from each other.
495+
/* Note, the extra module->region_size space added after local_rank_array_size
496+
* is unused but is there to match what happens in allocate_state_shared().
497+
* This allows module->state_offset to be uniform across the ranks which
498+
* is part of how they pull peer info from each other. */
485499
module->state_offset = local_rank_array_size + module->region_size;
486500

487501
module->state = (ompi_osc_rdma_state_t *) ((intptr_t) module->rank_array + module->state_offset);
488502
module->node_comm_info = (unsigned char *) ((intptr_t) module->state + module->state_size);
489503

490504
if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) {
491-
*base = (void *) ((intptr_t) module->node_comm_info + leader_peer_data_size);
505+
*base = (void *) ((intptr_t) module->rank_array + base_data_size);
492506
}
493507

494508
/* just go ahead and register the whole segment */
@@ -583,7 +597,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
583597
ompi_osc_rdma_region_t *state_region;
584598
struct _local_data *temp;
585599
char *data_file;
586-
int page_size = opal_getpagesize();
600+
size_t memory_alignment = module->memory_alignment;
587601

588602
shared_comm = module->shared_comm;
589603

@@ -620,8 +634,8 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
620634

621635
/* ensure proper alignment */
622636
if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) {
623-
data_base += OPAL_ALIGN_PAD_AMOUNT(data_base, page_size);
624-
size += OPAL_ALIGN_PAD_AMOUNT(size, page_size);
637+
data_base += OPAL_ALIGN_PAD_AMOUNT(data_base, memory_alignment);
638+
size += OPAL_ALIGN_PAD_AMOUNT(size, memory_alignment);
625639
}
626640

627641
do {
@@ -649,6 +663,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
649663
my_base_offset = total_size;
650664
}
651665
total_size += temp[i].size;
666+
total_size += OPAL_ALIGN_PAD_AMOUNT(total_size, memory_alignment);
652667
}
653668
}
654669

@@ -660,12 +675,12 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
660675
if (0 > ret) {
661676
ret = OMPI_ERR_OUT_OF_RESOURCE;
662677
} else {
663-
/* allocate enough space for the state + data for all local ranks */
664-
ret = opal_shmem_segment_create (&module->seg_ds, data_file, total_size);
665-
free (data_file);
666-
if (OPAL_SUCCESS != ret) {
667-
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to create shared memory segment");
668-
}
678+
/* allocate enough space for the state + data for all local ranks */
679+
ret = opal_shmem_segment_create (&module->seg_ds, data_file, total_size);
680+
free (data_file);
681+
if (OPAL_SUCCESS != ret) {
682+
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to create shared memory segment");
683+
}
669684
}
670685
}
671686

@@ -692,6 +707,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
692707
}
693708

694709
if (size && MPI_WIN_FLAVOR_ALLOCATE == module->flavor) {
710+
size_t page_size = opal_getpagesize();
695711
char *baseptr = (char *)((intptr_t) module->segment_base + my_base_offset);
696712
*base = (void *)baseptr;
697713
// touch each page to force allocation on local NUMA node
@@ -795,7 +811,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
795811
}
796812

797813
if (my_rank == peer_rank) {
798-
module->my_peer = peer;
814+
module->my_peer = peer;
799815
}
800816

801817
if (MPI_WIN_FLAVOR_DYNAMIC != module->flavor && MPI_WIN_FLAVOR_CREATE != module->flavor &&
@@ -855,12 +871,12 @@ static int ompi_osc_rdma_query_mtls (void)
855871

856872
mtls_to_use = opal_argv_split (ompi_osc_rdma_mtl_names, ',');
857873
if (mtls_to_use && ompi_mtl_base_selected_component) {
858-
for (int i = 0 ; mtls_to_use[i] ; ++i) {
859-
if (0 == strcmp (mtls_to_use[i], ompi_mtl_base_selected_component->mtl_version.mca_component_name)) {
860-
opal_argv_free(mtls_to_use);
861-
return OMPI_SUCCESS;
862-
}
863-
}
874+
for (int i = 0 ; mtls_to_use[i] ; ++i) {
875+
if (0 == strcmp (mtls_to_use[i], ompi_mtl_base_selected_component->mtl_version.mca_component_name)) {
876+
opal_argv_free(mtls_to_use);
877+
return OMPI_SUCCESS;
878+
}
879+
}
864880
}
865881
opal_argv_free(mtls_to_use);
866882
return -1;
@@ -1305,6 +1321,8 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base,
13051321
int world_size = ompi_comm_size (comm);
13061322
int init_limit = 256;
13071323
int ret;
1324+
int flag;
1325+
char infoval[32];
13081326
char *name;
13091327

13101328
/* the osc/sm component is the exclusive provider for support for shared
@@ -1343,6 +1361,18 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base,
13431361
module->win = win;
13441362
module->disp_unit = disp_unit;
13451363
module->size = size;
1364+
module->memory_alignment = mca_osc_rdma_component.memory_alignment;
1365+
if (NULL != info) {
1366+
opal_cstring_t *align_info_str;
1367+
opal_info_get(info, "mpi_minimum_memory_alignment", &align_info_str, &flag);
1368+
if (flag) {
1369+
ssize_t tmp_align = atoll(align_info_str->string);
1370+
OBJ_RELEASE(align_info_str);
1371+
if (OPAL_ALIGN_MIN < tmp_align) {
1372+
module->memory_alignment = tmp_align;
1373+
}
1374+
}
1375+
}
13461376

13471377
/* set the module so we properly cleanup */
13481378
win->w_osc_module = (ompi_osc_base_module_t*) module;
@@ -1415,15 +1445,16 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base,
14151445
} else {
14161446
module->state_size += mca_osc_rdma_component.max_attach * module->region_size;
14171447
}
1418-
/*
1419-
* These are the info's that this module is interested in
1420-
*/
1448+
1449+
/*
1450+
* These are the info keys that this module is interested in
1451+
*/
14211452
opal_infosubscribe_subscribe(&win->super, "no_locks", "false", ompi_osc_rdma_set_no_lock_info);
14221453

1423-
/*
1424-
* TODO: same_size, same_disp_unit have w_flag entries, but do not appear
1425-
* to be used anywhere. If that changes, they should be subscribed
1426-
*/
1454+
/*
1455+
* TODO: same_size, same_disp_unit have w_flag entries, but do not appear
1456+
* to be used anywhere. If that changes, they should be subscribed
1457+
*/
14271458

14281459
/* fill in the function pointer part */
14291460
memcpy(&module->super, &ompi_osc_rdma_module_rdma_template, sizeof(module->super));
@@ -1541,8 +1572,8 @@ ompi_osc_rdma_set_no_lock_info(opal_infosubscriber_t *obj, const char *key, cons
15411572
}
15421573
/* enforce collectiveness... */
15431574
module->comm->c_coll->coll_barrier(module->comm, module->comm->c_coll->coll_barrier_module);
1544-
/*
1545-
* Accept any value
1546-
*/
1575+
/*
1576+
* Accept any value
1577+
*/
15471578
return module->no_locks ? "true" : "false";
15481579
}

ompi/mca/osc/rdma/osc_rdma_module.c

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
* All rights reserved.
55
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
66
* All rights reserved.
7-
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
7+
* Copyright (c) 2004-2020 High Performance Computing Center Stuttgart,
88
* University of Stuttgart. All rights reserved.
99
* Copyright (c) 2004-2005 The Regents of the University of California.
1010
* All rights reserved.
@@ -22,6 +22,8 @@
2222
* $HEADER$
2323
*/
2424

25+
#include "opal/mca/mpool/base/base.h"
26+
2527
#include "osc_rdma.h"
2628
#include "osc_rdma_lock.h"
2729

@@ -140,7 +142,8 @@ int ompi_osc_rdma_free(ompi_win_t *win)
140142

141143
free (module->peer_array);
142144
free (module->outstanding_lock_array);
143-
free (module->free_after);
145+
mca_mpool_base_default_module->mpool_free(mca_mpool_base_default_module,
146+
module->free_after);
144147
free (module->selected_btls);
145148
free (module);
146149

0 commit comments

Comments
 (0)