Skip to content

Commit b1844fc

Browse files
committed
Must use the key of the PMIx event to screen for proc names in fault notifications.
Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>
1 parent a75f933 commit b1844fc

File tree

1 file changed

+9
-3
lines changed

1 file changed

+9
-3
lines changed

ompi/errhandler/errhandler.c

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -472,18 +472,24 @@ static void *ompi_errhandler_event_cb(int fd, int flags, void *context) {
472472
case PMIX_ERR_PROC_ABORTED_BY_SIG:
473473
case PMIX_ERR_PROC_ABORTED: /* that is, proc aborted by pmix_abort */
474474
for(int i = 0; i < event->nvalue; i++) {
475-
if (PMIX_PROC != event->info[i].value.type) {
475+
if (strcmp(PMIX_EVENT_AFFECTED_PROC, event->info[i].key)) {
476476
OPAL_OUTPUT_VERBOSE((70, ompi_ftmpi_output_handle,
477477
"%s ompi: ignoring the following key for a PMIx fault event: %s",
478478
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME),
479479
event->info[i].key));
480480
continue;
481481
}
482+
assert(event->info[i].value.type == PMIX_PROC);
482483
OPAL_PMIX_CONVERT_PROCT(rc, &prc, event->info[i].value.data.proc);
483484
if (OPAL_SUCCESS != rc) {
484485
OPAL_ERROR_LOG(rc);
485486
break;
486487
}
488+
OPAL_OUTPUT_VERBOSE((5, ompi_ftmpi_output_handle,
489+
"%s ompi: proc %s reported dead by PMIx event handler (found at %s): %s",
490+
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME),
491+
OPAL_NAME_PRINT(prc), OPAL_NAME_PRINT(source),
492+
PMIx_Error_string(status)));
487493
ompi_proc_t *proc = (ompi_proc_t*)ompi_proc_for_name(prc);
488494
if( NULL == proc ) {
489495
continue; /* we are not 'MPI connected' with this proc. */
@@ -502,12 +508,12 @@ static void *ompi_errhandler_event_cb(int fd, int flags, void *context) {
502508
default:
503509
/* An unmanaged type of failure, let it do its thing. */
504510
opal_output_verbose(1, ompi_ftmpi_output_handle,
505-
"%s ompi: Error event reported through PMIx from %s (state = %d). "
511+
"%s ompi: Error event reported through PMIx from %s (state = %s). "
506512
"This error type is not handled by the fault tolerant layer "
507513
"and the application will now presumably abort.",
508514
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME),
509515
OPAL_NAME_PRINT(source),
510-
status );
516+
PMIx_Error_string(status));
511517
}
512518
#endif /* OPAL_ENABLE_FT_MPI */
513519
opal_event_del(&event->super);

0 commit comments

Comments
 (0)