@@ -427,7 +427,7 @@ static int ompi_osc_rdma_initialize_region (ompi_osc_rdma_module_t *module, void
427
427
return OMPI_SUCCESS ;
428
428
}
429
429
430
- static int allocate_state_single (ompi_osc_rdma_module_t * module , void * * base , size_t size )
430
+ static int allocate_state_single (ompi_osc_rdma_module_t * module , void * * base , size_t size , bool use_cpu_atomics )
431
431
{
432
432
size_t total_size , local_rank_array_size , leader_peer_data_size , base_data_size ;
433
433
ompi_osc_rdma_peer_t * my_peer ;
@@ -507,7 +507,7 @@ static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, s
507
507
my_peer -> flags |= OMPI_OSC_RDMA_PEER_LOCAL_BASE ;
508
508
my_peer -> state = (uint64_t ) (uintptr_t ) module -> state ;
509
509
510
- if (module -> use_cpu_atomics ) {
510
+ if (use_cpu_atomics ) {
511
511
/* all peers are local or it is safe to mix cpu and nic atomics */
512
512
my_peer -> flags |= OMPI_OSC_RDMA_PEER_LOCAL_STATE ;
513
513
} else {
@@ -526,7 +526,7 @@ static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, s
526
526
ex_peer -> size = size ;
527
527
}
528
528
529
- if (!module -> use_cpu_atomics ) {
529
+ if (!use_cpu_atomics ) {
530
530
if (MPI_WIN_FLAVOR_ALLOCATE == module -> flavor ) {
531
531
/* base is local and cpu atomics are available */
532
532
ex_peer -> super .base_handle = module -> state_handle ;
@@ -570,6 +570,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
570
570
struct _local_data * temp ;
571
571
char * data_file ;
572
572
size_t memory_alignment = module -> memory_alignment ;
573
+ bool use_cpu_atomics ;
573
574
574
575
shared_comm = module -> shared_comm ;
575
576
@@ -578,21 +579,26 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
578
579
579
580
/* CPU atomics can be used if every process is on the same node or the NIC allows mixing CPU and NIC atomics */
580
581
module -> single_node = local_size == global_size ;
581
- module -> use_cpu_atomics = module -> single_node ;
582
582
583
- if (!module -> single_node ) {
584
- if (module -> use_accelerated_btl ) {
585
- module -> use_cpu_atomics = !!(module -> accelerated_btl -> btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB );
586
- } else {
587
- for (int i = 0 ; i < module -> alternate_btl_count ; ++ i ) {
588
- module -> use_cpu_atomics &= !!(module -> alternate_btls [i ]-> btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB );
589
- }
590
- }
583
+ if (module -> single_node ) {
584
+ use_cpu_atomics = true;
585
+ } else if (module -> use_accelerated_btl ) {
586
+ use_cpu_atomics = !!(module -> accelerated_btl -> btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB );
587
+ } else {
588
+ /* using the shared state optimization that is enabled by
589
+ * being able to use cpu atomics was never enabled for
590
+ * alternate btls, due to a previous bug in the enablement
591
+ * logic when alternate btls were first supported. It is
592
+ * likely that this optimization could work with sufficient
593
+ * testing, but for now, always disable to not introduce new
594
+ * correctness risks.
595
+ */
596
+ use_cpu_atomics = false;
591
597
}
592
598
593
599
if (1 == local_size ) {
594
600
/* no point using a shared segment if there are no other processes on this node */
595
- return allocate_state_single (module , base , size );
601
+ return allocate_state_single (module , base , size , use_cpu_atomics );
596
602
}
597
603
598
604
opal_output_verbose (MCA_BASE_VERBOSE_TRACE , ompi_osc_base_framework .framework_output ,
@@ -771,7 +777,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
771
777
ex_peer = (ompi_osc_rdma_peer_extended_t * ) peer ;
772
778
773
779
/* set up peer state */
774
- if (module -> use_cpu_atomics ) {
780
+ if (use_cpu_atomics ) {
775
781
/* all peers are local or it is safe to mix cpu and nic atomics */
776
782
peer -> flags |= OMPI_OSC_RDMA_PEER_LOCAL_STATE ;
777
783
peer -> state = (osc_rdma_counter_t ) peer_state ;
@@ -796,7 +802,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
796
802
}
797
803
798
804
if (MPI_WIN_FLAVOR_DYNAMIC != module -> flavor && MPI_WIN_FLAVOR_CREATE != module -> flavor &&
799
- !module -> use_cpu_atomics && temp [i ].size && i > 0 ) {
805
+ !use_cpu_atomics && temp [i ].size && i > 0 ) {
800
806
/* use the local leader's endpoint */
801
807
peer -> data_endpoint = local_leader -> data_endpoint ;
802
808
peer -> data_btl_index = local_leader -> data_btl_index ;
@@ -805,7 +811,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
805
811
ompi_osc_module_add_peer (module , peer );
806
812
807
813
if (MPI_WIN_FLAVOR_DYNAMIC == module -> flavor ) {
808
- if (module -> use_cpu_atomics && peer_rank == my_rank ) {
814
+ if (use_cpu_atomics && peer_rank == my_rank ) {
809
815
peer -> flags |= OMPI_OSC_RDMA_PEER_LOCAL_BASE ;
810
816
}
811
817
/* nothing more to do */
@@ -821,7 +827,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
821
827
ex_peer -> size = temp [i ].size ;
822
828
}
823
829
824
- if (module -> use_cpu_atomics && (MPI_WIN_FLAVOR_ALLOCATE == module -> flavor || peer_rank == my_rank )) {
830
+ if (use_cpu_atomics && (MPI_WIN_FLAVOR_ALLOCATE == module -> flavor || peer_rank == my_rank )) {
825
831
/* base is local and cpu atomics are available */
826
832
if (MPI_WIN_FLAVOR_ALLOCATE == module -> flavor ) {
827
833
ex_peer -> super .base = (uintptr_t ) module -> segment_base + offset ;
0 commit comments