5
5
* Copyright (c) 2004-2017 The University of Tennessee and The University
6
6
* of Tennessee Research Foundation. All rights
7
7
* reserved.
8
- * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
8
+ * Copyright (c) 2004-2020 High Performance Computing Center Stuttgart,
9
9
* University of Stuttgart. All rights reserved.
10
10
* Copyright (c) 2004-2005 The Regents of the University of California.
11
11
* All rights reserved.
54
54
#include "opal/mca/common/cuda/common_cuda.h"
55
55
#endif /* OPAL_CUDA_SUPPORT */
56
56
#include "opal/util/info_subscriber.h"
57
+ #include "opal/mca/mpool/base/base.h"
57
58
58
59
#include "ompi/info/info.h"
59
60
#include "ompi/communicator/communicator.h"
@@ -305,6 +306,16 @@ static int ompi_osc_rdma_component_register (void)
305
306
MCA_BASE_VAR_TYPE_UNSIGNED_LONG , NULL , 0 , 0 , OPAL_INFO_LVL_3 ,
306
307
MCA_BASE_VAR_SCOPE_LOCAL , & mca_osc_rdma_component .network_amo_max_count );
307
308
309
+ mca_osc_rdma_component .memory_alignment = opal_getpagesize ();
310
+ opal_asprintf (& description_str , "The minimum memory alignment used to allocate local window memory (default: %zu). "
311
+ "This is a best effort approach. Alignments larger than the page size may not be supported." ,
312
+ mca_osc_rdma_component .memory_alignment );
313
+ (void ) mca_base_component_var_register (& mca_osc_rdma_component .super .osc_version , "minimum_memory_alignment" ,
314
+ description_str ,
315
+ MCA_BASE_VAR_TYPE_SIZE_T , NULL , 0 , 0 , OPAL_INFO_LVL_3 ,
316
+ MCA_BASE_VAR_SCOPE_READONLY , & mca_osc_rdma_component .memory_alignment );
317
+ free (description_str );
318
+
308
319
/* register performance variables */
309
320
310
321
(void ) mca_base_component_pvar_register (& mca_osc_rdma_component .super .osc_version , "put_retry_count" ,
@@ -390,7 +401,7 @@ static int ompi_osc_rdma_component_query (struct ompi_win_t *win, void **base, s
390
401
{
391
402
392
403
if (MPI_WIN_FLAVOR_SHARED == flavor ) {
393
- return -1 ;
404
+ return OMPI_ERR_RMA_SHARED ;
394
405
}
395
406
396
407
#if OPAL_CUDA_SUPPORT
@@ -448,9 +459,10 @@ static int ompi_osc_rdma_initialize_region (ompi_osc_rdma_module_t *module, void
448
459
449
460
static int allocate_state_single (ompi_osc_rdma_module_t * module , void * * base , size_t size )
450
461
{
451
- size_t total_size , local_rank_array_size , leader_peer_data_size ;
462
+ size_t total_size , local_rank_array_size , leader_peer_data_size , base_data_size ;
452
463
ompi_osc_rdma_peer_t * my_peer ;
453
464
int ret , my_rank ;
465
+ size_t memory_alignment = module -> memory_alignment ;
454
466
455
467
OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_TRACE , "allocating private internal state" );
456
468
@@ -463,32 +475,34 @@ static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, s
463
475
* registration handles needed to access this data. */
464
476
total_size = local_rank_array_size + module -> region_size +
465
477
module -> state_size + leader_peer_data_size ;
466
- total_size += OPAL_ALIGN_PAD_AMOUNT ( total_size , OPAL_ALIGN_MIN ) ;
478
+ base_data_size = total_size ;
467
479
468
480
if (MPI_WIN_FLAVOR_ALLOCATE == module -> flavor ) {
469
- total_size += size ;
481
+ base_data_size += OPAL_ALIGN_PAD_AMOUNT (base_data_size , memory_alignment );
482
+ total_size = base_data_size + size ;
470
483
}
471
484
472
485
/* the local data is ordered as follows: rank array (leader, offset mapping), state, leader peer data, and base
473
486
* (if using MPI_Win_allocate). In this case the leader peer data array does not need to be stored in the same
474
487
* segment but placing it there simplifies the peer data fetch and cleanup code. */
475
488
476
- module -> rank_array = calloc (total_size , 1 );
489
+ module -> rank_array = mca_mpool_base_default_module -> mpool_alloc (mca_mpool_base_default_module , total_size ,
490
+ memory_alignment , 0 );
477
491
if (OPAL_UNLIKELY (NULL == module -> rank_array )) {
478
492
return OMPI_ERR_OUT_OF_RESOURCE ;
479
493
}
480
494
481
- // Note, the extra module->region_size space added after local_rank_array_size
482
- // is unused but is there to match what happens in allocte_state_shared()
483
- // This allows module->state_offset to be uniform across the ranks which
484
- // is part of how they pull peer info from each other.
495
+ /* Note, the extra module->region_size space added after local_rank_array_size
496
+ * is unused but is there to match what happens in allocate_state_shared()
497
+ * This allows module->state_offset to be uniform across the ranks which
498
+ * is part of how they pull peer info from each other. */
485
499
module -> state_offset = local_rank_array_size + module -> region_size ;
486
500
487
501
module -> state = (ompi_osc_rdma_state_t * ) ((intptr_t ) module -> rank_array + module -> state_offset );
488
502
module -> node_comm_info = (unsigned char * ) ((intptr_t ) module -> state + module -> state_size );
489
503
490
504
if (MPI_WIN_FLAVOR_ALLOCATE == module -> flavor ) {
491
- * base = (void * ) ((intptr_t ) module -> node_comm_info + leader_peer_data_size );
505
+ * base = (void * ) ((intptr_t ) module -> rank_array + base_data_size );
492
506
}
493
507
494
508
/* just go ahead and register the whole segment */
@@ -583,7 +597,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
583
597
ompi_osc_rdma_region_t * state_region ;
584
598
struct _local_data * temp ;
585
599
char * data_file ;
586
- int page_size = opal_getpagesize () ;
600
+ size_t memory_alignment = module -> memory_alignment ;
587
601
588
602
shared_comm = module -> shared_comm ;
589
603
@@ -616,8 +630,8 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
616
630
617
631
/* ensure proper alignment */
618
632
if (MPI_WIN_FLAVOR_ALLOCATE == module -> flavor ) {
619
- data_base += OPAL_ALIGN_PAD_AMOUNT (data_base , page_size );
620
- size += OPAL_ALIGN_PAD_AMOUNT (size , page_size );
633
+ data_base += OPAL_ALIGN_PAD_AMOUNT (data_base , memory_alignment );
634
+ size += OPAL_ALIGN_PAD_AMOUNT (size , memory_alignment );
621
635
}
622
636
623
637
do {
@@ -645,6 +659,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
645
659
my_base_offset = total_size ;
646
660
}
647
661
total_size += temp [i ].size ;
662
+ total_size += OPAL_ALIGN_PAD_AMOUNT (total_size , memory_alignment );
648
663
}
649
664
}
650
665
@@ -656,12 +671,12 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
656
671
if (0 > ret ) {
657
672
ret = OMPI_ERR_OUT_OF_RESOURCE ;
658
673
} else {
659
- /* allocate enough space for the state + data for all local ranks */
660
- ret = opal_shmem_segment_create (& module -> seg_ds , data_file , total_size );
661
- free (data_file );
662
- if (OPAL_SUCCESS != ret ) {
663
- OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_ERROR , "failed to create shared memory segment" );
664
- }
674
+ /* allocate enough space for the state + data for all local ranks */
675
+ ret = opal_shmem_segment_create (& module -> seg_ds , data_file , total_size );
676
+ free (data_file );
677
+ if (OPAL_SUCCESS != ret ) {
678
+ OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_ERROR , "failed to create shared memory segment" );
679
+ }
665
680
}
666
681
}
667
682
@@ -688,6 +703,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
688
703
}
689
704
690
705
if (size && MPI_WIN_FLAVOR_ALLOCATE == module -> flavor ) {
706
+ size_t page_size = opal_getpagesize ();
691
707
char * baseptr = (char * )((intptr_t ) module -> segment_base + my_base_offset );
692
708
* base = (void * )baseptr ;
693
709
// touch each page to force allocation on local NUMA node
@@ -794,7 +810,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
794
810
}
795
811
796
812
if (my_rank == peer_rank ) {
797
- module -> my_peer = peer ;
813
+ module -> my_peer = peer ;
798
814
}
799
815
800
816
if (MPI_WIN_FLAVOR_DYNAMIC != module -> flavor && MPI_WIN_FLAVOR_CREATE != module -> flavor &&
@@ -854,12 +870,12 @@ static int ompi_osc_rdma_query_mtls (void)
854
870
855
871
mtls_to_use = opal_argv_split (ompi_osc_rdma_mtl_names , ',' );
856
872
if (mtls_to_use && ompi_mtl_base_selected_component ) {
857
- for (int i = 0 ; mtls_to_use [i ] ; ++ i ) {
858
- if (0 == strcmp (mtls_to_use [i ], ompi_mtl_base_selected_component -> mtl_version .mca_component_name )) {
859
- opal_argv_free (mtls_to_use );
860
- return OMPI_SUCCESS ;
861
- }
862
- }
873
+ for (int i = 0 ; mtls_to_use [i ] ; ++ i ) {
874
+ if (0 == strcmp (mtls_to_use [i ], ompi_mtl_base_selected_component -> mtl_version .mca_component_name )) {
875
+ opal_argv_free (mtls_to_use );
876
+ return OMPI_SUCCESS ;
877
+ }
878
+ }
863
879
}
864
880
opal_argv_free (mtls_to_use );
865
881
return -1 ;
@@ -1304,6 +1320,8 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base,
1304
1320
int world_size = ompi_comm_size (comm );
1305
1321
int init_limit = 256 ;
1306
1322
int ret ;
1323
+ int flag ;
1324
+ char infoval [32 ];
1307
1325
char * name ;
1308
1326
1309
1327
/* the osc/sm component is the exclusive provider for support for shared
@@ -1342,6 +1360,18 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base,
1342
1360
module -> win = win ;
1343
1361
module -> disp_unit = disp_unit ;
1344
1362
module -> size = size ;
1363
+ module -> memory_alignment = mca_osc_rdma_component .memory_alignment ;
1364
+ if (NULL != info ) {
1365
+ opal_cstring_t * align_info_str ;
1366
+ opal_info_get (info , "mpi_minimum_memory_alignment" , & align_info_str , & flag );
1367
+ if (flag ) {
1368
+ ssize_t tmp_align = atoll (align_info_str -> string );
1369
+ OBJ_RELEASE (align_info_str );
1370
+ if (OPAL_ALIGN_MIN < tmp_align ) {
1371
+ module -> memory_alignment = tmp_align ;
1372
+ }
1373
+ }
1374
+ }
1345
1375
1346
1376
/* set the module so we properly cleanup */
1347
1377
win -> w_osc_module = (ompi_osc_base_module_t * ) module ;
@@ -1414,15 +1444,16 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base,
1414
1444
} else {
1415
1445
module -> state_size += mca_osc_rdma_component .max_attach * module -> region_size ;
1416
1446
}
1417
- /*
1418
- * These are the info's that this module is interested in
1419
- */
1447
+
1448
+ /*
1449
+ * These are the info keys that this module is interested in
1450
+ */
1420
1451
opal_infosubscribe_subscribe (& win -> super , "no_locks" , "false" , ompi_osc_rdma_set_no_lock_info );
1421
1452
1422
- /*
1423
- * TODO: same_size, same_disp_unit have w_flag entries, but do not appear
1424
- * to be used anywhere. If that changes, they should be subscribed
1425
- */
1453
+ /*
1454
+ * TODO: same_size, same_disp_unit have w_flag entries, but do not appear
1455
+ * to be used anywhere. If that changes, they should be subscribed
1456
+ */
1426
1457
1427
1458
/* fill in the function pointer part */
1428
1459
memcpy (& module -> super , & ompi_osc_rdma_module_rdma_template , sizeof (module -> super ));
@@ -1540,8 +1571,8 @@ ompi_osc_rdma_set_no_lock_info(opal_infosubscriber_t *obj, const char *key, cons
1540
1571
}
1541
1572
/* enforce collectiveness... */
1542
1573
module -> comm -> c_coll -> coll_barrier (module -> comm , module -> comm -> c_coll -> coll_barrier_module );
1543
- /*
1544
- * Accept any value
1545
- */
1574
+ /*
1575
+ * Accept any value
1576
+ */
1546
1577
return module -> no_locks ? "true" : "false" ;
1547
1578
}
0 commit comments