@@ -420,6 +420,7 @@ static const char *era_status_to_string(era_proc_status_t s) {
420
420
}
421
421
return "UNDEFINED STATUS" ;
422
422
}
423
+ #endif /* OPAL_ENABLE_DEBUG */
423
424
424
425
static const char * era_msg_type_to_string (int type ) {
425
426
switch (type ) {
@@ -432,7 +433,6 @@ static const char *era_msg_type_to_string(int type) {
432
433
}
433
434
return "UNDEFINED MESSAGE TYPE" ;
434
435
}
435
- #endif /* OPAL_ENABLE_DEBUG */
436
436
437
437
static ompi_coll_ftagree_era_agreement_info_t * era_lookup_agreement_info (era_identifier_t agreement_id )
438
438
{
@@ -2184,7 +2184,21 @@ static void send_msg(ompi_communicator_t *comm,
2184
2184
}
2185
2185
assert (NULL != peer );
2186
2186
endpoint = mca_bml_base_get_endpoint (peer );
2187
- assert (NULL != endpoint );
2187
+ if (NULL == endpoint ) {
2188
+ opal_output_verbose (5 , ompi_ftmpi_output_handle ,
2189
+ "%s ftagree:agreement (ERA) CANNOT send message [(%d.%d).%d, %s, %08x.%d.%d..] to %d/%s (no endpoint)\n" ,
2190
+ OMPI_NAME_PRINT (OMPI_PROC_MY_NAME ),
2191
+ agreement_id .ERAID_FIELDS .contextid ,
2192
+ agreement_id .ERAID_FIELDS .epoch ,
2193
+ agreement_id .ERAID_FIELDS .agreementid ,
2194
+ era_msg_type_to_string (type ),
2195
+ (NULL != value -> bytes )? * (int * )value -> bytes : 0 ,
2196
+ value -> header .ret ,
2197
+ value -> header .nb_new_dead ,
2198
+ dst ,
2199
+ NULL != proc_name ? OMPI_NAME_PRINT (proc_name ) : "(null)" );
2200
+ return ; /* bail out: the algorithm should reconnect when the failed proc is detected */
2201
+ }
2188
2202
bml_btl = mca_bml_base_btl_array_get_index (& endpoint -> btl_eager , 0 );
2189
2203
assert (NULL != bml_btl );
2190
2204
btl_endpoint = bml_btl -> btl_endpoint ;
@@ -2570,7 +2584,7 @@ static void msg_down(era_msg_header_t *msg_header, uint8_t *bytes, int *new_dead
2570
2584
*/
2571
2585
return ;
2572
2586
}
2573
- /** if I receive a down message on an agreement I know about, I already participated.
2587
+ /** if I receive a down message on an agreement I know about, I already participated.
2574
2588
* There is a non-erroneous code; erroneous execution that may also trigger this assert:
2575
2589
* consider the following case with false detection:
2576
2590
* 1. some ancestor A has detected the current process C as failed
0 commit comments