Skip to content

Commit 7732085

Browse files
committed
ulfm/agree_era: when a process is dead, send_msg may not succeed at obtaining the modex info for that process, so we should not assert/abort in this case.
Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>
1 parent 75b7eb5 commit 7732085

File tree

1 file changed

+17
-3
lines changed

1 file changed

+17
-3
lines changed

ompi/mca/coll/ftagree/coll_ftagree_earlyreturning.c

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -420,6 +420,7 @@ static const char *era_status_to_string(era_proc_status_t s) {
420420
}
421421
return "UNDEFINED STATUS";
422422
}
423+
#endif /* OPAL_ENABLE_DEBUG */
423424

424425
static const char *era_msg_type_to_string(int type) {
425426
switch(type) {
@@ -432,7 +433,6 @@ static const char *era_msg_type_to_string(int type) {
432433
}
433434
return "UNDEFINED MESSAGE TYPE";
434435
}
435-
#endif /* OPAL_ENABLE_DEBUG */
436436

437437
static ompi_coll_ftagree_era_agreement_info_t *era_lookup_agreement_info(era_identifier_t agreement_id)
438438
{
@@ -2184,7 +2184,21 @@ static void send_msg(ompi_communicator_t *comm,
21842184
}
21852185
assert(NULL != peer);
21862186
endpoint = mca_bml_base_get_endpoint(peer);
2187-
assert(NULL != endpoint);
2187+
if(NULL == endpoint) {
2188+
opal_output_verbose(5, ompi_ftmpi_output_handle,
2189+
"%s ftagree:agreement (ERA) CANNOT send message [(%d.%d).%d, %s, %08x.%d.%d..] to %d/%s (no endpoint)\n",
2190+
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME),
2191+
agreement_id.ERAID_FIELDS.contextid,
2192+
agreement_id.ERAID_FIELDS.epoch,
2193+
agreement_id.ERAID_FIELDS.agreementid,
2194+
era_msg_type_to_string(type),
2195+
(NULL != value->bytes)? *(int*)value->bytes: 0,
2196+
value->header.ret,
2197+
value->header.nb_new_dead,
2198+
dst,
2199+
NULL != proc_name ? OMPI_NAME_PRINT(proc_name) : "(null)");
2200+
return; /* bail out: the algorithm should reconnect when the failed proc is detected */
2201+
}
21882202
bml_btl = mca_bml_base_btl_array_get_index(&endpoint->btl_eager, 0);
21892203
assert(NULL != bml_btl);
21902204
btl_endpoint = bml_btl->btl_endpoint;
@@ -2570,7 +2584,7 @@ static void msg_down(era_msg_header_t *msg_header, uint8_t *bytes, int *new_dead
25702584
*/
25712585
return;
25722586
}
2573-
/** if I receive a down message on an agreement I know about, I already participated.
2587+
/** if I receive a down message on an agreement I know about, I already participated.
25742588
* There is a non-erroneous code; erroneous execution that may also trigger this assert:
25752589
* consider the following case with false detection:
25762590
* 1. some ancestor A has detected the current process C as failed

0 commit comments

Comments
 (0)