Skip to content

Commit 0811c16

Browse files
authored
Merge pull request #7985 from devreal/add-mpi-minimum-alignment-info-key
Add support for mpi_minimum_alignment info key
2 parents d25cc39 + 67baec6 commit 0811c16

File tree

7 files changed

+165
-69
lines changed

7 files changed

+165
-69
lines changed

ompi/mca/osc/portals4/osc_portals4_component.c

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
* reserved.
1111
* Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
1212
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
13+
* Copyright (c) 2020 High Performance Computing Center Stuttgart,
14+
* University of Stuttgart. All rights reserved.
1315
* $COPYRIGHT$
1416
*
1517
* Additional copyrights may follow
@@ -20,6 +22,8 @@
2022
#include "ompi_config.h"
2123

2224
#include "opal/util/printf.h"
25+
#include "opal/include/opal/align.h"
26+
#include "opal/mca/mpool/base/base.h"
2327

2428
#include "ompi/mca/osc/base/base.h"
2529
#include "ompi/mca/osc/base/osc_base_obj_convert.h"
@@ -384,13 +388,27 @@ component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit
384388
{
385389
ompi_osc_portals4_module_t *module = NULL;
386390
int ret = OMPI_ERROR;
387-
int tmp;
391+
int tmp, flag;
388392
ptl_md_t md;
389393
ptl_me_t me;
390394
char *name;
395+
size_t memory_alignment = OPAL_ALIGN_MIN;
391396

392397
if (MPI_WIN_FLAVOR_SHARED == flavor) return OMPI_ERR_NOT_SUPPORTED;
393398

399+
if (NULL != info) {
400+
opal_cstring_t *align_info_str;
401+
opal_info_get(info, "mpi_minimum_memory_alignment",
402+
&align_info_str, &flag);
403+
if (flag) {
404+
ssize_t tmp_align = atoll(infoval);
405+
OBJ_RELEASE(align_info_str);
406+
if (OPAL_ALIGN_MIN < tmp_align) {
407+
memory_alignment = tmp_align;
408+
}
409+
}
410+
}
411+
394412
/* create module structure */
395413
module = (ompi_osc_portals4_module_t*)
396414
calloc(1, sizeof(ompi_osc_portals4_module_t));
@@ -402,8 +420,10 @@ component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit
402420

403421
/* fill in our part */
404422
if (MPI_WIN_FLAVOR_ALLOCATE == flavor) {
405-
module->free_after = *base = malloc(size);
423+
*base = mca_mpool_base_default_module->mpool_alloc(mca_mpool_base_default_module, size,
424+
memory_alignment, 0);
406425
if (NULL == *base) goto error;
426+
module->free_after = *base;
407427
} else {
408428
module->free_after = NULL;
409429
}
@@ -646,7 +666,8 @@ ompi_osc_portals4_free(struct ompi_win_t *win)
646666
PtlCTFree(module->ct_h);
647667
if (NULL != module->disp_units) free(module->disp_units);
648668
ompi_comm_free(&module->comm);
649-
if (NULL != module->free_after) free(module->free_after);
669+
mca_mpool_base_default_module->mpool_free(mca_mpool_base_default_module,
670+
module->free_after);
650671

651672
if (!opal_list_is_empty(&module->outstanding_locks)) {
652673
ret = OMPI_ERR_RMA_SYNC;

ompi/mca/osc/rdma/osc_rdma.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
* All rights reserved.
55
* Copyright (c) 2004-2006 The Trustees of the University of Tennessee.
66
* All rights reserved.
7-
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
7+
* Copyright (c) 2004-2020 High Performance Computing Center Stuttgart,
88
* University of Stuttgart. All rights reserved.
99
* Copyright (c) 2004-2005 The Regents of the University of California.
1010
* All rights reserved.
@@ -116,6 +116,9 @@ struct ompi_osc_rdma_component_t {
116116

117117
/** maximum count for network AMO usage */
118118
unsigned long network_amo_max_count;
119+
120+
/** memory alignment to be used for new windows */
121+
size_t memory_alignment;
119122
};
120123
typedef struct ompi_osc_rdma_component_t ompi_osc_rdma_component_t;
121124

@@ -221,6 +224,9 @@ struct ompi_osc_rdma_module_t {
221224
/** offset in the shared memory segment where the state array starts */
222225
size_t state_offset;
223226

227+
/** memory alignment to be used for new windows */
228+
size_t memory_alignment;
229+
224230
/* ********************* sync data ************************ */
225231

226232
/** global sync object (PSCW, fence, lock all) */

ompi/mca/osc/rdma/osc_rdma_component.c

Lines changed: 68 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
* Copyright (c) 2004-2017 The University of Tennessee and The University
66
* of Tennessee Research Foundation. All rights
77
* reserved.
8-
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
8+
* Copyright (c) 2004-2020 High Performance Computing Center Stuttgart,
99
* University of Stuttgart. All rights reserved.
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
@@ -54,6 +54,7 @@
5454
#include "opal/mca/common/cuda/common_cuda.h"
5555
#endif /* OPAL_CUDA_SUPPORT */
5656
#include "opal/util/info_subscriber.h"
57+
#include "opal/mca/mpool/base/base.h"
5758

5859
#include "ompi/info/info.h"
5960
#include "ompi/communicator/communicator.h"
@@ -305,6 +306,16 @@ static int ompi_osc_rdma_component_register (void)
305306
MCA_BASE_VAR_TYPE_UNSIGNED_LONG, NULL, 0, 0, OPAL_INFO_LVL_3,
306307
MCA_BASE_VAR_SCOPE_LOCAL, &mca_osc_rdma_component.network_amo_max_count);
307308

309+
mca_osc_rdma_component.memory_alignment = opal_getpagesize();
310+
opal_asprintf(&description_str, "The minimum memory alignment used to allocate local window memory (default: %zu). "
311+
"This is a best effort approach. Alignments larger than the page size may not be supported.",
312+
mca_osc_rdma_component.memory_alignment);
313+
(void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "minimum_memory_alignment",
314+
description_str,
315+
MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0, OPAL_INFO_LVL_3,
316+
MCA_BASE_VAR_SCOPE_READONLY, &mca_osc_rdma_component.memory_alignment);
317+
free(description_str);
318+
308319
/* register performance variables */
309320

310321
(void) mca_base_component_pvar_register (&mca_osc_rdma_component.super.osc_version, "put_retry_count",
@@ -390,7 +401,7 @@ static int ompi_osc_rdma_component_query (struct ompi_win_t *win, void **base, s
390401
{
391402

392403
if (MPI_WIN_FLAVOR_SHARED == flavor) {
393-
return -1;
404+
return OMPI_ERR_RMA_SHARED;
394405
}
395406

396407
#if OPAL_CUDA_SUPPORT
@@ -448,9 +459,10 @@ static int ompi_osc_rdma_initialize_region (ompi_osc_rdma_module_t *module, void
448459

449460
static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, size_t size)
450461
{
451-
size_t total_size, local_rank_array_size, leader_peer_data_size;
462+
size_t total_size, local_rank_array_size, leader_peer_data_size, base_data_size;
452463
ompi_osc_rdma_peer_t *my_peer;
453464
int ret, my_rank;
465+
size_t memory_alignment = module->memory_alignment;
454466

455467
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "allocating private internal state");
456468

@@ -463,32 +475,34 @@ static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, s
463475
* registration handles needed to access this data. */
464476
total_size = local_rank_array_size + module->region_size +
465477
module->state_size + leader_peer_data_size;
466-
total_size += OPAL_ALIGN_PAD_AMOUNT(total_size, OPAL_ALIGN_MIN);
478+
base_data_size = total_size;
467479

468480
if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) {
469-
total_size += size;
481+
base_data_size += OPAL_ALIGN_PAD_AMOUNT(base_data_size, memory_alignment);
482+
total_size = base_data_size + size;
470483
}
471484

472485
/* the local data is ordered as follows: rank array (leader, offset mapping), state, leader peer data, and base
473486
* (if using MPI_Win_allocate). In this case the leader peer data array does not need to be stored in the same
474487
* segment but placing it there simplifies the peer data fetch and cleanup code. */
475488

476-
module->rank_array = calloc (total_size, 1);
489+
module->rank_array = mca_mpool_base_default_module->mpool_alloc(mca_mpool_base_default_module, total_size,
490+
memory_alignment, 0);
477491
if (OPAL_UNLIKELY(NULL == module->rank_array)) {
478492
return OMPI_ERR_OUT_OF_RESOURCE;
479493
}
480494

481-
// Note, the extra module->region_size space added after local_rank_array_size
482-
// is unused but is there to match what happens in allocte_state_shared()
483-
// This allows module->state_offset to be uniform across the ranks which
484-
// is part of how they pull peer info from each other.
495+
/* Note, the extra module->region_size space added after local_rank_array_size
496+
* is unused but is there to match what happens in allocate_state_shared()
497+
* This allows module->state_offset to be uniform across the ranks which
498+
* is part of how they pull peer info from each other. */
485499
module->state_offset = local_rank_array_size + module->region_size;
486500

487501
module->state = (ompi_osc_rdma_state_t *) ((intptr_t) module->rank_array + module->state_offset);
488502
module->node_comm_info = (unsigned char *) ((intptr_t) module->state + module->state_size);
489503

490504
if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) {
491-
*base = (void *) ((intptr_t) module->node_comm_info + leader_peer_data_size);
505+
*base = (void *) ((intptr_t) module->rank_array + base_data_size);
492506
}
493507

494508
/* just go ahead and register the whole segment */
@@ -583,7 +597,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
583597
ompi_osc_rdma_region_t *state_region;
584598
struct _local_data *temp;
585599
char *data_file;
586-
int page_size = opal_getpagesize();
600+
size_t memory_alignment = module->memory_alignment;
587601

588602
shared_comm = module->shared_comm;
589603

@@ -616,8 +630,8 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
616630

617631
/* ensure proper alignment */
618632
if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) {
619-
data_base += OPAL_ALIGN_PAD_AMOUNT(data_base, page_size);
620-
size += OPAL_ALIGN_PAD_AMOUNT(size, page_size);
633+
data_base += OPAL_ALIGN_PAD_AMOUNT(data_base, memory_alignment);
634+
size += OPAL_ALIGN_PAD_AMOUNT(size, memory_alignment);
621635
}
622636

623637
do {
@@ -645,6 +659,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
645659
my_base_offset = total_size;
646660
}
647661
total_size += temp[i].size;
662+
total_size += OPAL_ALIGN_PAD_AMOUNT(total_size, memory_alignment);
648663
}
649664
}
650665

@@ -656,12 +671,12 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
656671
if (0 > ret) {
657672
ret = OMPI_ERR_OUT_OF_RESOURCE;
658673
} else {
659-
/* allocate enough space for the state + data for all local ranks */
660-
ret = opal_shmem_segment_create (&module->seg_ds, data_file, total_size);
661-
free (data_file);
662-
if (OPAL_SUCCESS != ret) {
663-
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to create shared memory segment");
664-
}
674+
/* allocate enough space for the state + data for all local ranks */
675+
ret = opal_shmem_segment_create (&module->seg_ds, data_file, total_size);
676+
free (data_file);
677+
if (OPAL_SUCCESS != ret) {
678+
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to create shared memory segment");
679+
}
665680
}
666681
}
667682

@@ -688,6 +703,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
688703
}
689704

690705
if (size && MPI_WIN_FLAVOR_ALLOCATE == module->flavor) {
706+
size_t page_size = opal_getpagesize();
691707
char *baseptr = (char *)((intptr_t) module->segment_base + my_base_offset);
692708
*base = (void *)baseptr;
693709
// touch each page to force allocation on local NUMA node
@@ -794,7 +810,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
794810
}
795811

796812
if (my_rank == peer_rank) {
797-
module->my_peer = peer;
813+
module->my_peer = peer;
798814
}
799815

800816
if (MPI_WIN_FLAVOR_DYNAMIC != module->flavor && MPI_WIN_FLAVOR_CREATE != module->flavor &&
@@ -854,12 +870,12 @@ static int ompi_osc_rdma_query_mtls (void)
854870

855871
mtls_to_use = opal_argv_split (ompi_osc_rdma_mtl_names, ',');
856872
if (mtls_to_use && ompi_mtl_base_selected_component) {
857-
for (int i = 0 ; mtls_to_use[i] ; ++i) {
858-
if (0 == strcmp (mtls_to_use[i], ompi_mtl_base_selected_component->mtl_version.mca_component_name)) {
859-
opal_argv_free(mtls_to_use);
860-
return OMPI_SUCCESS;
861-
}
862-
}
873+
for (int i = 0 ; mtls_to_use[i] ; ++i) {
874+
if (0 == strcmp (mtls_to_use[i], ompi_mtl_base_selected_component->mtl_version.mca_component_name)) {
875+
opal_argv_free(mtls_to_use);
876+
return OMPI_SUCCESS;
877+
}
878+
}
863879
}
864880
opal_argv_free(mtls_to_use);
865881
return -1;
@@ -1304,6 +1320,8 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base,
13041320
int world_size = ompi_comm_size (comm);
13051321
int init_limit = 256;
13061322
int ret;
1323+
int flag;
1324+
char infoval[32];
13071325
char *name;
13081326

13091327
/* the osc/sm component is the exclusive provider for support for shared
@@ -1342,6 +1360,18 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base,
13421360
module->win = win;
13431361
module->disp_unit = disp_unit;
13441362
module->size = size;
1363+
module->memory_alignment = mca_osc_rdma_component.memory_alignment;
1364+
if (NULL != info) {
1365+
opal_cstring_t *align_info_str;
1366+
opal_info_get(info, "mpi_minimum_memory_alignment", &align_info_str, &flag);
1367+
if (flag) {
1368+
ssize_t tmp_align = atoll(align_info_str->string);
1369+
OBJ_RELEASE(align_info_str);
1370+
if (OPAL_ALIGN_MIN < tmp_align) {
1371+
module->memory_alignment = tmp_align;
1372+
}
1373+
}
1374+
}
13451375

13461376
/* set the module so we properly cleanup */
13471377
win->w_osc_module = (ompi_osc_base_module_t*) module;
@@ -1414,15 +1444,16 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base,
14141444
} else {
14151445
module->state_size += mca_osc_rdma_component.max_attach * module->region_size;
14161446
}
1417-
/*
1418-
* These are the info's that this module is interested in
1419-
*/
1447+
1448+
/*
1449+
* These are the info keys that this module is interested in
1450+
*/
14201451
opal_infosubscribe_subscribe(&win->super, "no_locks", "false", ompi_osc_rdma_set_no_lock_info);
14211452

1422-
/*
1423-
* TODO: same_size, same_disp_unit have w_flag entries, but do not appear
1424-
* to be used anywhere. If that changes, they should be subscribed
1425-
*/
1453+
/*
1454+
* TODO: same_size, same_disp_unit have w_flag entries, but do not appear
1455+
* to be used anywhere. If that changes, they should be subscribed
1456+
*/
14261457

14271458
/* fill in the function pointer part */
14281459
memcpy(&module->super, &ompi_osc_rdma_module_rdma_template, sizeof(module->super));
@@ -1540,8 +1571,8 @@ ompi_osc_rdma_set_no_lock_info(opal_infosubscriber_t *obj, const char *key, cons
15401571
}
15411572
/* enforce collectiveness... */
15421573
module->comm->c_coll->coll_barrier(module->comm, module->comm->c_coll->coll_barrier_module);
1543-
/*
1544-
* Accept any value
1545-
*/
1574+
/*
1575+
* Accept any value
1576+
*/
15461577
return module->no_locks ? "true" : "false";
15471578
}

ompi/mca/osc/rdma/osc_rdma_module.c

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
* All rights reserved.
55
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
66
* All rights reserved.
7-
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
7+
* Copyright (c) 2004-2020 High Performance Computing Center Stuttgart,
88
* University of Stuttgart. All rights reserved.
99
* Copyright (c) 2004-2005 The Regents of the University of California.
1010
* All rights reserved.
@@ -22,6 +22,8 @@
2222
* $HEADER$
2323
*/
2424

25+
#include "opal/mca/mpool/base/base.h"
26+
2527
#include "osc_rdma.h"
2628
#include "osc_rdma_lock.h"
2729

@@ -140,7 +142,8 @@ int ompi_osc_rdma_free(ompi_win_t *win)
140142

141143
free (module->peer_array);
142144
free (module->outstanding_lock_array);
143-
free (module->free_after);
145+
mca_mpool_base_default_module->mpool_free(mca_mpool_base_default_module,
146+
module->free_after);
144147
free (module->selected_btls);
145148
free (module);
146149

0 commit comments

Comments
 (0)