@@ -523,6 +523,19 @@ struct _local_data {
523
523
size_t size ;
524
524
};
525
525
526
+ static int synchronize_errorcode (int errorcode , ompi_communicator_t * comm )
527
+ {
528
+ int ret ;
529
+ int err = errorcode ;
530
+ /* This assumes that error codes are negative integers */
531
+ ret = comm -> c_coll -> coll_allreduce (MPI_IN_PLACE , & err , 1 , MPI_INT , MPI_MIN ,
532
+ comm , comm -> c_coll -> coll_allreduce_module );
533
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != ret )) {
534
+ err = ret ;
535
+ }
536
+ return err ;
537
+ }
538
+
526
539
static int allocate_state_shared (ompi_osc_rdma_module_t * module , void * * base , size_t size )
527
540
{
528
541
ompi_communicator_t * shared_comm ;
@@ -593,28 +606,35 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
593
606
OMPI_PROC_MY_NAME -> jobid , ompi_comm_get_cid (module -> comm ));
594
607
if (0 > ret ) {
595
608
ret = OMPI_ERR_OUT_OF_RESOURCE ;
596
- break ;
609
+ } else {
610
+ /* allocate enough space for the state + data for all local ranks */
611
+ ret = opal_shmem_segment_create (& module -> seg_ds , data_file , total_size );
612
+ free (data_file );
613
+ if (OPAL_SUCCESS != ret ) {
614
+ OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_ERROR , "failed to create shared memory segment" );
615
+ }
597
616
}
617
+ }
598
618
599
- /* allocate enough space for the state + data for all local ranks */
600
- ret = opal_shmem_segment_create (& module -> seg_ds , data_file , total_size );
601
- free (data_file );
602
- if (OPAL_SUCCESS != ret ) {
603
- OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_ERROR , "failed to create shared memory segment" );
604
- break ;
605
- }
619
+ ret = synchronize_errorcode (ret , shared_comm );
620
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != ret )) {
621
+ break ;
606
622
}
607
623
608
- ret = module -> comm -> c_coll -> coll_bcast (& module -> seg_ds , sizeof (module -> seg_ds ), MPI_BYTE , 0 ,
624
+ ret = shared_comm -> c_coll -> coll_bcast (& module -> seg_ds , sizeof (module -> seg_ds ), MPI_BYTE , 0 ,
609
625
shared_comm , shared_comm -> c_coll -> coll_bcast_module );
610
- if (OMPI_SUCCESS != ret ) {
626
+ if (OPAL_UNLIKELY ( OMPI_SUCCESS != ret ) ) {
611
627
break ;
612
628
}
613
629
614
630
module -> segment_base = opal_shmem_segment_attach (& module -> seg_ds );
615
631
if (NULL == module -> segment_base ) {
616
632
OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_ERROR , "failed to attach to the shared memory segment" );
617
633
ret = OPAL_ERROR ;
634
+ }
635
+
636
+ ret = synchronize_errorcode (ret , shared_comm );
637
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != ret )) {
618
638
break ;
619
639
}
620
640
@@ -643,27 +663,23 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
643
663
/* just go ahead and register the whole segment */
644
664
ret = ompi_osc_rdma_register (module , MCA_BTL_ENDPOINT_ANY , module -> segment_base , total_size , MCA_BTL_REG_FLAG_ACCESS_ANY ,
645
665
& module -> state_handle );
646
- if (OPAL_UNLIKELY (OMPI_SUCCESS != ret )) {
647
- break ;
648
- }
649
-
650
- state_region -> base = (intptr_t ) module -> segment_base ;
651
- if (module -> state_handle ) {
652
- memcpy (state_region -> btl_handle_data , module -> state_handle , module -> selected_btl -> btl_registration_handle_size );
666
+ if (OPAL_LIKELY (OMPI_SUCCESS == ret )) {
667
+ state_region -> base = (intptr_t ) module -> segment_base ;
668
+ if (module -> state_handle ) {
669
+ memcpy (state_region -> btl_handle_data , module -> state_handle , module -> selected_btl -> btl_registration_handle_size );
670
+ }
653
671
}
654
672
}
655
673
656
- /* barrier to make sure memory is registered */
657
- shared_comm -> c_coll -> coll_barrier (shared_comm , shared_comm -> c_coll -> coll_barrier_module );
674
+ /* synchronization to make sure memory is registered */
675
+ ret = synchronize_errorcode (ret , shared_comm );
676
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != ret )) {
677
+ break ;
678
+ }
658
679
659
680
if (MPI_WIN_FLAVOR_CREATE == module -> flavor ) {
660
681
ret = ompi_osc_rdma_initialize_region (module , base , size );
661
- if (OMPI_SUCCESS != ret ) {
662
- break ;
663
- }
664
- }
665
-
666
- if (MPI_WIN_FLAVOR_ALLOCATE == module -> flavor ) {
682
+ } else if (MPI_WIN_FLAVOR_ALLOCATE == module -> flavor ) {
667
683
ompi_osc_rdma_region_t * region = (ompi_osc_rdma_region_t * ) module -> state -> regions ;
668
684
module -> state -> disp_unit = module -> disp_unit ;
669
685
module -> state -> region_count = 1 ;
@@ -674,8 +690,11 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
674
690
}
675
691
}
676
692
677
- /* barrier to make sure all ranks have set up their region data */
678
- shared_comm -> c_coll -> coll_barrier (shared_comm , shared_comm -> c_coll -> coll_barrier_module );
693
+ /* synchronization to make sure all ranks have set up their region data */
694
+ ret = synchronize_errorcode (ret , shared_comm );
695
+ if (OPAL_UNLIKELY (OMPI_SUCCESS != ret )) {
696
+ break ;
697
+ }
679
698
680
699
offset = data_base ;
681
700
for (int i = 0 ; i < local_size ; ++ i ) {
@@ -994,13 +1013,7 @@ static int ompi_osc_rdma_share_data (ompi_osc_rdma_module_t *module)
994
1013
free (temp );
995
1014
} while (0 );
996
1015
997
-
998
- ret = module -> comm -> c_coll -> coll_allreduce (& ret , & global_result , 1 , MPI_INT , MPI_MIN , module -> comm ,
999
- module -> comm -> c_coll -> coll_allreduce_module );
1000
-
1001
- if (OMPI_SUCCESS != ret ) {
1002
- global_result = ret ;
1003
- }
1016
+ global_result = synchronize_errorcode (ret , module -> comm );
1004
1017
1005
1018
/* none of these communicators are needed anymore so free them now*/
1006
1019
if (MPI_COMM_NULL != module -> local_leaders ) {
@@ -1235,6 +1248,9 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base,
1235
1248
1236
1249
/* fill in our part */
1237
1250
ret = allocate_state_shared (module , base , size );
1251
+
1252
+ /* notify all others if something went wrong */
1253
+ ret = synchronize_errorcode (ret , module -> comm );
1238
1254
if (OPAL_UNLIKELY (OMPI_SUCCESS != ret )) {
1239
1255
OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_ERROR , "failed to allocate internal state" );
1240
1256
ompi_osc_rdma_free (win );
0 commit comments