Skip to content

Commit c0e3a7c

Browse files
authored
Merge pull request #11261 from abouteiller/ulfm/era_null_modex
ulfm fix comm_agree deadlocks due to faults not being visible
2 parents c332b1a + d1a1192 commit c0e3a7c

File tree

3 files changed

+33
-15
lines changed

3 files changed

+33
-15
lines changed

ompi/communicator/ft/comm_ft.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,11 @@ bool ompi_comm_is_proc_active(ompi_communicator_t *comm, int peer_id, bool remot
339339

340340
int ompi_comm_set_rank_failed(ompi_communicator_t *comm, int peer_id, bool remote)
341341
{
342+
/* populate the proc in the comm's group array so that it is not a sentinel and can be read as failed */
343+
ompi_proc_t *ompi_proc = ompi_group_get_proc_ptr((remote ? comm->c_remote_group : comm->c_local_group),
344+
peer_id, true);
345+
assert(NULL != ompi_proc);
346+
342347
/* Disable ANY_SOURCE */
343348
comm->any_source_enabled = false;
344349
opal_atomic_wmb(); /* non-locked update needs a memory barrier to propagate */

ompi/mca/coll/ftagree/coll_ftagree_earlyreturning.c

Lines changed: 27 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -420,6 +420,7 @@ static const char *era_status_to_string(era_proc_status_t s) {
420420
}
421421
return "UNDEFINED STATUS";
422422
}
423+
#endif /* OPAL_ENABLE_DEBUG */
423424

424425
static const char *era_msg_type_to_string(int type) {
425426
switch(type) {
@@ -432,7 +433,6 @@ static const char *era_msg_type_to_string(int type) {
432433
}
433434
return "UNDEFINED MESSAGE TYPE";
434435
}
435-
#endif /* OPAL_ENABLE_DEBUG */
436436

437437
static ompi_coll_ftagree_era_agreement_info_t *era_lookup_agreement_info(era_identifier_t agreement_id)
438438
{
@@ -808,12 +808,12 @@ static void era_update_new_dead_list(ompi_coll_ftagree_era_agreement_info_t *ci)
808808
}
809809

810810
OPAL_OUTPUT_VERBOSE((30, ompi_ftmpi_output_handle,
811-
"%s ftagree:agreement (ERA) agreement (%d.%d).%d -- adding %d procs to the list of newly dead processes",
811+
"%s ftagree:agreement (ERA) agreement (%d.%d).%d -- adding %d procs to the list of newly dead processes (%d currently; AFR size is %d)",
812812
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME),
813813
ci->agreement_id.ERAID_FIELDS.contextid,
814814
ci->agreement_id.ERAID_FIELDS.epoch,
815815
ci->agreement_id.ERAID_FIELDS.agreementid,
816-
r));
816+
r, ci->current_value->new_dead_array, ags->afr_size));
817817

818818
#if OPAL_ENABLE_DEBUG
819819
{
@@ -1372,16 +1372,14 @@ static void era_build_tree_structure(ompi_coll_ftagree_era_agreement_info_t *ci)
13721372

13731373
era_call_tree_fn(ci);
13741374

1375-
if( ompi_comm_rank(ci->comm) == 0 ) {
1376-
OPAL_OUTPUT_VERBOSE((4, ompi_ftmpi_output_handle,
1377-
"%s ftagree:agreement (ERA) Agreement (%d.%d).%d: re-built the tree structure with size %d: %s\n",
1378-
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME),
1379-
ci->agreement_id.ERAID_FIELDS.contextid,
1380-
ci->agreement_id.ERAID_FIELDS.epoch,
1381-
ci->agreement_id.ERAID_FIELDS.agreementid,
1382-
AGS(ci->comm)->tree_size,
1383-
era_debug_tree(ci->ags->tree, ci->ags->tree_size)));
1384-
}
1375+
OPAL_OUTPUT_VERBOSE(((ompi_comm_rank(ci->comm) == 0)? 4: 50, ompi_ftmpi_output_handle,
1376+
"%s ftagree:agreement (ERA) Agreement (%d.%d).%d: re-built the tree structure with size %d: %s\n",
1377+
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME),
1378+
ci->agreement_id.ERAID_FIELDS.contextid,
1379+
ci->agreement_id.ERAID_FIELDS.epoch,
1380+
ci->agreement_id.ERAID_FIELDS.agreementid,
1381+
AGS(ci->comm)->tree_size,
1382+
era_debug_tree(ci->ags->tree, ci->ags->tree_size)));
13851383

13861384
#if OPAL_ENABLE_DEBUG
13871385
era_tree_check(ci->ags->tree, ci->ags->tree_size, 0);
@@ -2184,7 +2182,21 @@ static void send_msg(ompi_communicator_t *comm,
21842182
}
21852183
assert(NULL != peer);
21862184
endpoint = mca_bml_base_get_endpoint(peer);
2187-
assert(NULL != endpoint);
2185+
if(NULL == endpoint) {
2186+
opal_output_verbose(5, ompi_ftmpi_output_handle,
2187+
"%s ftagree:agreement (ERA) CANNOT send message [(%d.%d).%d, %s, %08x.%d.%d..] to %d/%s (no endpoint)\n",
2188+
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME),
2189+
agreement_id.ERAID_FIELDS.contextid,
2190+
agreement_id.ERAID_FIELDS.epoch,
2191+
agreement_id.ERAID_FIELDS.agreementid,
2192+
era_msg_type_to_string(type),
2193+
(NULL != value->bytes)? *(int*)value->bytes: 0,
2194+
value->header.ret,
2195+
value->header.nb_new_dead,
2196+
dst,
2197+
NULL != proc_name ? OMPI_NAME_PRINT(proc_name) : "(null)");
2198+
return; /* bail out: the algorithm should reconnect when the failed proc is detected */
2199+
}
21882200
bml_btl = mca_bml_base_btl_array_get_index(&endpoint->btl_eager, 0);
21892201
assert(NULL != bml_btl);
21902202
btl_endpoint = bml_btl->btl_endpoint;
@@ -2570,7 +2582,7 @@ static void msg_down(era_msg_header_t *msg_header, uint8_t *bytes, int *new_dead
25702582
*/
25712583
return;
25722584
}
2573-
/** if I receive a down message on an agreement I know about, I already participated.
2585+
/** if I receive a down message on an agreement I know about, I already participated.
25742586
* There is a non-erroneous code; erroneous execution that may also trigger this assert:
25752587
* consider the following case with false detection:
25762588
* 1. some ancestor A has detected the current process C as failed

ompi/proc/proc.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -471,6 +471,7 @@ static inline void ompi_proc_mark_as_failed(ompi_proc_t *proc) {
471471
abort();
472472
}
473473
proc->proc_active = false;
474+
opal_atomic_wmb(); /* non-locked update needs a memory barrier to propagate */
474475
}
475476
#endif /* OPAL_ENABLE_FT_MPI */
476477

0 commit comments

Comments
 (0)