5
5
* Copyright (c) 2004-2017 The University of Tennessee and The University
6
6
* of Tennessee Research Foundation. All rights
7
7
* reserved.
8
- * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
8
+ * Copyright (c) 2004-2020 High Performance Computing Center Stuttgart,
9
9
* University of Stuttgart. All rights reserved.
10
10
* Copyright (c) 2004-2005 The Regents of the University of California.
11
11
* All rights reserved.
54
54
#include "opal/mca/common/cuda/common_cuda.h"
55
55
#endif /* OPAL_CUDA_SUPPORT */
56
56
#include "opal/util/info_subscriber.h"
57
+ #include "opal/mca/mpool/base/base.h"
57
58
58
59
#include "ompi/info/info.h"
59
60
#include "ompi/communicator/communicator.h"
@@ -305,6 +306,16 @@ static int ompi_osc_rdma_component_register (void)
305
306
MCA_BASE_VAR_TYPE_UNSIGNED_LONG , NULL , 0 , 0 , OPAL_INFO_LVL_3 ,
306
307
MCA_BASE_VAR_SCOPE_LOCAL , & mca_osc_rdma_component .network_amo_max_count );
307
308
309
+ mca_osc_rdma_component .memory_alignment = opal_getpagesize ();
310
+ opal_asprintf (& description_str , "The minimum memory alignment used to allocate local window memory (default: %zu). "
311
+ "This is a best effort approach. Alignments larger than the page size may not be supported." ,
312
+ mca_osc_rdma_component .memory_alignment );
313
+ (void ) mca_base_component_var_register (& mca_osc_rdma_component .super .osc_version , "minimum_memory_alignment" ,
314
+ description_str ,
315
+ MCA_BASE_VAR_TYPE_SIZE_T , NULL , 0 , 0 , OPAL_INFO_LVL_3 ,
316
+ MCA_BASE_VAR_SCOPE_READONLY , & mca_osc_rdma_component .memory_alignment );
317
+ free (description_str );
318
+
308
319
/* register performance variables */
309
320
310
321
(void ) mca_base_component_pvar_register (& mca_osc_rdma_component .super .osc_version , "put_retry_count" ,
@@ -390,7 +401,7 @@ static int ompi_osc_rdma_component_query (struct ompi_win_t *win, void **base, s
390
401
{
391
402
392
403
if (MPI_WIN_FLAVOR_SHARED == flavor ) {
393
- return -1 ;
404
+ return OMPI_ERR_RMA_SHARED ;
394
405
}
395
406
396
407
#if OPAL_CUDA_SUPPORT
@@ -448,9 +459,10 @@ static int ompi_osc_rdma_initialize_region (ompi_osc_rdma_module_t *module, void
448
459
449
460
static int allocate_state_single (ompi_osc_rdma_module_t * module , void * * base , size_t size )
450
461
{
451
- size_t total_size , local_rank_array_size , leader_peer_data_size ;
462
+ size_t total_size , local_rank_array_size , leader_peer_data_size , base_data_size ;
452
463
ompi_osc_rdma_peer_t * my_peer ;
453
464
int ret , my_rank ;
465
+ size_t memory_alignment = module -> memory_alignment ;
454
466
455
467
OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_TRACE , "allocating private internal state" );
456
468
@@ -463,32 +475,34 @@ static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, s
463
475
* registration handles needed to access this data. */
464
476
total_size = local_rank_array_size + module -> region_size +
465
477
module -> state_size + leader_peer_data_size ;
466
- total_size += OPAL_ALIGN_PAD_AMOUNT ( total_size , OPAL_ALIGN_MIN ) ;
478
+ base_data_size = total_size ;
467
479
468
480
if (MPI_WIN_FLAVOR_ALLOCATE == module -> flavor ) {
469
- total_size += size ;
481
+ base_data_size += OPAL_ALIGN_PAD_AMOUNT (base_data_size , memory_alignment );
482
+ total_size = base_data_size + size ;
470
483
}
471
484
472
485
/* the local data is ordered as follows: rank array (leader, offset mapping), state, leader peer data, and base
473
486
* (if using MPI_Win_allocate). In this case the leader peer data array does not need to be stored in the same
474
487
* segment but placing it there simplifies the peer data fetch and cleanup code. */
475
488
476
- module -> rank_array = calloc (total_size , 1 );
489
+ module -> rank_array = mca_mpool_base_default_module -> mpool_alloc (mca_mpool_base_default_module , total_size ,
490
+ memory_alignment , 0 );
477
491
if (OPAL_UNLIKELY (NULL == module -> rank_array )) {
478
492
return OMPI_ERR_OUT_OF_RESOURCE ;
479
493
}
480
494
481
- // Note, the extra module->region_size space added after local_rank_array_size
482
- // is unused but is there to match what happens in allocte_state_shared()
483
- // This allows module->state_offset to be uniform across the ranks which
484
- // is part of how they pull peer info from each other.
495
+ /* Note, the extra module->region_size space added after local_rank_array_size
496
+ * is unused but is there to match what happens in allocate_state_shared()
497
+ * This allows module->state_offset to be uniform across the ranks which
498
+ * is part of how they pull peer info from each other. */
485
499
module -> state_offset = local_rank_array_size + module -> region_size ;
486
500
487
501
module -> state = (ompi_osc_rdma_state_t * ) ((intptr_t ) module -> rank_array + module -> state_offset );
488
502
module -> node_comm_info = (unsigned char * ) ((intptr_t ) module -> state + module -> state_size );
489
503
490
504
if (MPI_WIN_FLAVOR_ALLOCATE == module -> flavor ) {
491
- * base = (void * ) ((intptr_t ) module -> node_comm_info + leader_peer_data_size );
505
+ * base = (void * ) ((intptr_t ) module -> rank_array + base_data_size );
492
506
}
493
507
494
508
/* just go ahead and register the whole segment */
@@ -583,7 +597,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
583
597
ompi_osc_rdma_region_t * state_region ;
584
598
struct _local_data * temp ;
585
599
char * data_file ;
586
- int page_size = opal_getpagesize () ;
600
+ size_t memory_alignment = module -> memory_alignment ;
587
601
588
602
shared_comm = module -> shared_comm ;
589
603
@@ -620,8 +634,8 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
620
634
621
635
/* ensure proper alignment */
622
636
if (MPI_WIN_FLAVOR_ALLOCATE == module -> flavor ) {
623
- data_base += OPAL_ALIGN_PAD_AMOUNT (data_base , page_size );
624
- size += OPAL_ALIGN_PAD_AMOUNT (size , page_size );
637
+ data_base += OPAL_ALIGN_PAD_AMOUNT (data_base , memory_alignment );
638
+ size += OPAL_ALIGN_PAD_AMOUNT (size , memory_alignment );
625
639
}
626
640
627
641
do {
@@ -649,6 +663,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
649
663
my_base_offset = total_size ;
650
664
}
651
665
total_size += temp [i ].size ;
666
+ total_size += OPAL_ALIGN_PAD_AMOUNT (total_size , memory_alignment );
652
667
}
653
668
}
654
669
@@ -660,12 +675,12 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
660
675
if (0 > ret ) {
661
676
ret = OMPI_ERR_OUT_OF_RESOURCE ;
662
677
} else {
663
- /* allocate enough space for the state + data for all local ranks */
664
- ret = opal_shmem_segment_create (& module -> seg_ds , data_file , total_size );
665
- free (data_file );
666
- if (OPAL_SUCCESS != ret ) {
667
- OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_ERROR , "failed to create shared memory segment" );
668
- }
678
+ /* allocate enough space for the state + data for all local ranks */
679
+ ret = opal_shmem_segment_create (& module -> seg_ds , data_file , total_size );
680
+ free (data_file );
681
+ if (OPAL_SUCCESS != ret ) {
682
+ OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_ERROR , "failed to create shared memory segment" );
683
+ }
669
684
}
670
685
}
671
686
@@ -692,6 +707,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
692
707
}
693
708
694
709
if (size && MPI_WIN_FLAVOR_ALLOCATE == module -> flavor ) {
710
+ size_t page_size = opal_getpagesize ();
695
711
char * baseptr = (char * )((intptr_t ) module -> segment_base + my_base_offset );
696
712
* base = (void * )baseptr ;
697
713
// touch each page to force allocation on local NUMA node
@@ -795,7 +811,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
795
811
}
796
812
797
813
if (my_rank == peer_rank ) {
798
- module -> my_peer = peer ;
814
+ module -> my_peer = peer ;
799
815
}
800
816
801
817
if (MPI_WIN_FLAVOR_DYNAMIC != module -> flavor && MPI_WIN_FLAVOR_CREATE != module -> flavor &&
@@ -855,12 +871,12 @@ static int ompi_osc_rdma_query_mtls (void)
855
871
856
872
mtls_to_use = opal_argv_split (ompi_osc_rdma_mtl_names , ',' );
857
873
if (mtls_to_use && ompi_mtl_base_selected_component ) {
858
- for (int i = 0 ; mtls_to_use [i ] ; ++ i ) {
859
- if (0 == strcmp (mtls_to_use [i ], ompi_mtl_base_selected_component -> mtl_version .mca_component_name )) {
860
- opal_argv_free (mtls_to_use );
861
- return OMPI_SUCCESS ;
862
- }
863
- }
874
+ for (int i = 0 ; mtls_to_use [i ] ; ++ i ) {
875
+ if (0 == strcmp (mtls_to_use [i ], ompi_mtl_base_selected_component -> mtl_version .mca_component_name )) {
876
+ opal_argv_free (mtls_to_use );
877
+ return OMPI_SUCCESS ;
878
+ }
879
+ }
864
880
}
865
881
opal_argv_free (mtls_to_use );
866
882
return -1 ;
@@ -1305,6 +1321,8 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base,
1305
1321
int world_size = ompi_comm_size (comm );
1306
1322
int init_limit = 256 ;
1307
1323
int ret ;
1324
+ int flag ;
1325
+ char infoval [32 ];
1308
1326
char * name ;
1309
1327
1310
1328
/* the osc/sm component is the exclusive provider for support for shared
@@ -1343,6 +1361,18 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base,
1343
1361
module -> win = win ;
1344
1362
module -> disp_unit = disp_unit ;
1345
1363
module -> size = size ;
1364
+ module -> memory_alignment = mca_osc_rdma_component .memory_alignment ;
1365
+ if (NULL != info ) {
1366
+ opal_cstring_t * align_info_str ;
1367
+ opal_info_get (info , "mpi_minimum_memory_alignment" , & align_info_str , & flag );
1368
+ if (flag ) {
1369
+ ssize_t tmp_align = atoll (align_info_str -> string );
1370
+ OBJ_RELEASE (align_info_str );
1371
+ if (OPAL_ALIGN_MIN < tmp_align ) {
1372
+ module -> memory_alignment = tmp_align ;
1373
+ }
1374
+ }
1375
+ }
1346
1376
1347
1377
/* set the module so we properly cleanup */
1348
1378
win -> w_osc_module = (ompi_osc_base_module_t * ) module ;
@@ -1415,15 +1445,16 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base,
1415
1445
} else {
1416
1446
module -> state_size += mca_osc_rdma_component .max_attach * module -> region_size ;
1417
1447
}
1418
- /*
1419
- * These are the info's that this module is interested in
1420
- */
1448
+
1449
+ /*
1450
+ * These are the info keys that this module is interested in
1451
+ */
1421
1452
opal_infosubscribe_subscribe (& win -> super , "no_locks" , "false" , ompi_osc_rdma_set_no_lock_info );
1422
1453
1423
- /*
1424
- * TODO: same_size, same_disp_unit have w_flag entries, but do not appear
1425
- * to be used anywhere. If that changes, they should be subscribed
1426
- */
1454
+ /*
1455
+ * TODO: same_size, same_disp_unit have w_flag entries, but do not appear
1456
+ * to be used anywhere. If that changes, they should be subscribed
1457
+ */
1427
1458
1428
1459
/* fill in the function pointer part */
1429
1460
memcpy (& module -> super , & ompi_osc_rdma_module_rdma_template , sizeof (module -> super ));
@@ -1541,8 +1572,8 @@ ompi_osc_rdma_set_no_lock_info(opal_infosubscriber_t *obj, const char *key, cons
1541
1572
}
1542
1573
/* enforce collectiveness... */
1543
1574
module -> comm -> c_coll -> coll_barrier (module -> comm , module -> comm -> c_coll -> coll_barrier_module );
1544
- /*
1545
- * Accept any value
1546
- */
1575
+ /*
1576
+ * Accept any value
1577
+ */
1547
1578
return module -> no_locks ? "true" : "false" ;
1548
1579
}
0 commit comments