@@ -420,6 +420,7 @@ static const char *era_status_to_string(era_proc_status_t s) {
420
420
}
421
421
return "UNDEFINED STATUS" ;
422
422
}
423
+ #endif /* OPAL_ENABLE_DEBUG */
423
424
424
425
static const char * era_msg_type_to_string (int type ) {
425
426
switch (type ) {
@@ -432,7 +433,6 @@ static const char *era_msg_type_to_string(int type) {
432
433
}
433
434
return "UNDEFINED MESSAGE TYPE" ;
434
435
}
435
- #endif /* OPAL_ENABLE_DEBUG */
436
436
437
437
static ompi_coll_ftagree_era_agreement_info_t * era_lookup_agreement_info (era_identifier_t agreement_id )
438
438
{
@@ -808,12 +808,12 @@ static void era_update_new_dead_list(ompi_coll_ftagree_era_agreement_info_t *ci)
808
808
}
809
809
810
810
OPAL_OUTPUT_VERBOSE ((30 , ompi_ftmpi_output_handle ,
811
- "%s ftagree:agreement (ERA) agreement (%d.%d).%d -- adding %d procs to the list of newly dead processes" ,
811
+ "%s ftagree:agreement (ERA) agreement (%d.%d).%d -- adding %d procs to the list of newly dead processes (%d currently; AFR size is %d) " ,
812
812
OMPI_NAME_PRINT (OMPI_PROC_MY_NAME ),
813
813
ci -> agreement_id .ERAID_FIELDS .contextid ,
814
814
ci -> agreement_id .ERAID_FIELDS .epoch ,
815
815
ci -> agreement_id .ERAID_FIELDS .agreementid ,
816
- r ));
816
+ r , ci -> current_value -> new_dead_array , ags -> afr_size ));
817
817
818
818
#if OPAL_ENABLE_DEBUG
819
819
{
@@ -1372,16 +1372,14 @@ static void era_build_tree_structure(ompi_coll_ftagree_era_agreement_info_t *ci)
1372
1372
1373
1373
era_call_tree_fn (ci );
1374
1374
1375
- if ( ompi_comm_rank (ci -> comm ) == 0 ) {
1376
- OPAL_OUTPUT_VERBOSE ((4 , ompi_ftmpi_output_handle ,
1377
- "%s ftagree:agreement (ERA) Agreement (%d.%d).%d: re-built the tree structure with size %d: %s\n" ,
1378
- OMPI_NAME_PRINT (OMPI_PROC_MY_NAME ),
1379
- ci -> agreement_id .ERAID_FIELDS .contextid ,
1380
- ci -> agreement_id .ERAID_FIELDS .epoch ,
1381
- ci -> agreement_id .ERAID_FIELDS .agreementid ,
1382
- AGS (ci -> comm )-> tree_size ,
1383
- era_debug_tree (ci -> ags -> tree , ci -> ags -> tree_size )));
1384
- }
1375
+ OPAL_OUTPUT_VERBOSE (((ompi_comm_rank (ci -> comm ) == 0 )? 4 : 50 , ompi_ftmpi_output_handle ,
1376
+ "%s ftagree:agreement (ERA) Agreement (%d.%d).%d: re-built the tree structure with size %d: %s\n" ,
1377
+ OMPI_NAME_PRINT (OMPI_PROC_MY_NAME ),
1378
+ ci -> agreement_id .ERAID_FIELDS .contextid ,
1379
+ ci -> agreement_id .ERAID_FIELDS .epoch ,
1380
+ ci -> agreement_id .ERAID_FIELDS .agreementid ,
1381
+ AGS (ci -> comm )-> tree_size ,
1382
+ era_debug_tree (ci -> ags -> tree , ci -> ags -> tree_size )));
1385
1383
1386
1384
#if OPAL_ENABLE_DEBUG
1387
1385
era_tree_check (ci -> ags -> tree , ci -> ags -> tree_size , 0 );
@@ -2184,7 +2182,21 @@ static void send_msg(ompi_communicator_t *comm,
2184
2182
}
2185
2183
assert (NULL != peer );
2186
2184
endpoint = mca_bml_base_get_endpoint (peer );
2187
- assert (NULL != endpoint );
2185
+ if (NULL == endpoint ) {
2186
+ opal_output_verbose (5 , ompi_ftmpi_output_handle ,
2187
+ "%s ftagree:agreement (ERA) CANNOT send message [(%d.%d).%d, %s, %08x.%d.%d..] to %d/%s (no endpoint)\n" ,
2188
+ OMPI_NAME_PRINT (OMPI_PROC_MY_NAME ),
2189
+ agreement_id .ERAID_FIELDS .contextid ,
2190
+ agreement_id .ERAID_FIELDS .epoch ,
2191
+ agreement_id .ERAID_FIELDS .agreementid ,
2192
+ era_msg_type_to_string (type ),
2193
+ (NULL != value -> bytes )? * (int * )value -> bytes : 0 ,
2194
+ value -> header .ret ,
2195
+ value -> header .nb_new_dead ,
2196
+ dst ,
2197
+ NULL != proc_name ? OMPI_NAME_PRINT (proc_name ) : "(null)" );
2198
+ return ; /* bail out: the algorithm should reconnect when the failed proc is detected */
2199
+ }
2188
2200
bml_btl = mca_bml_base_btl_array_get_index (& endpoint -> btl_eager , 0 );
2189
2201
assert (NULL != bml_btl );
2190
2202
btl_endpoint = bml_btl -> btl_endpoint ;
@@ -2570,7 +2582,7 @@ static void msg_down(era_msg_header_t *msg_header, uint8_t *bytes, int *new_dead
2570
2582
*/
2571
2583
return ;
2572
2584
}
2573
- /** if I receive a down message on an agreement I know about, I already participated.
2585
+ /** if I receive a down message on an agreement I know about, I already participated.
2574
2586
* Even when the code itself is correct, an erroneous execution may also trigger this assert:
2575
2587
* consider the following case with false detection:
2576
2588
* 1. some ancestor A has detected the current process C as failed
0 commit comments