Skip to content

Commit 1bd3e59

Browse files
authored
Merge pull request #11173 from abouteiller/ulfm/pmix_err_events
Change the PMIx events observed to capture faults reported by PRTE
2 parents 2ab1a15 + c4bd78c commit 1bd3e59

File tree

3 files changed

+37
-24
lines changed

3 files changed

+37
-24
lines changed

ompi/errhandler/errhandler.c

Lines changed: 29 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
44
* University Research and Technology
55
* Corporation. All rights reserved.
6-
* Copyright (c) 2004-2020 The University of Tennessee and The University
6+
* Copyright (c) 2004-2022 The University of Tennessee and The University
77
* of Tennessee Research Foundation. All rights
88
* reserved.
99
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -343,10 +343,10 @@ int ompi_errhandler_proc_failed_internal(ompi_proc_t* ompi_proc, int status, boo
343343
opal_mutex_unlock(&errhandler_ftmpi_lock);
344344

345345
opal_output_verbose(1, ompi_ftmpi_output_handle,
346-
"%s ompi: Process %s failed (state = %d).",
346+
"%s ompi: Process %s failed (state = %d %s).",
347347
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME),
348348
OMPI_NAME_PRINT(&ompi_proc->super.proc_name),
349-
status );
349+
status, PMIx_Error_string(status) );
350350

351351
if(90 < opal_output_get_verbosity(ompi_ftmpi_output_handle)) {
352352
/* how did we get there? */
@@ -413,7 +413,7 @@ int ompi_errhandler_proc_failed_internal(ompi_proc_t* ompi_proc, int status, boo
413413
* The wait function has a check, so all we need to do here is
414414
* signal it so it will check again.
415415
*/
416-
wait_sync_global_wakeup(MPI_ERR_PROC_FAILED);
416+
wait_sync_global_wakeup(PMIX_ERR_PROC_ABORTED == status? MPI_ERR_PROC_ABORTED: MPI_ERR_PROC_FAILED);
417417

418418
/* Collectives:
419419
* Propagate the error (this has been selected rather than the "roll
@@ -430,12 +430,11 @@ int ompi_errhandler_proc_failed_internal(ompi_proc_t* ompi_proc, int status, boo
430430
pmix_info_t pmix_info[1];
431431
pmix_status_t prc;
432432

433-
assert(OPAL_ERR_PROC_ABORTED == status);
434433
OPAL_PMIX_CONVERT_NAME(&pmix_source, OMPI_PROC_MY_NAME);
435434
OPAL_PMIX_CONVERT_NAME(&pmix_proc, &ompi_proc->super.proc_name);
436435
PMIX_INFO_CONSTRUCT(&pmix_info[0]);
437436
PMIX_INFO_LOAD(&pmix_info[0], PMIX_EVENT_AFFECTED_PROC, &pmix_proc, PMIX_PROC);
438-
prc = PMIx_Notify_event(PMIX_ERR_PROC_ABORTED, &pmix_source, PMIX_RANGE_LOCAL,
437+
prc = PMIx_Notify_event(PMIX_ERR_PROC_TERM_WO_SYNC, &pmix_source, PMIX_RANGE_LOCAL,
439438
pmix_info, 1, NULL, &active);
440439
if( PMIX_SUCCESS != prc &&
441440
PMIX_OPERATION_SUCCEEDED != prc ) {
@@ -468,10 +467,16 @@ static void *ompi_errhandler_event_cb(int fd, int flags, void *context) {
468467
opal_process_name_t prc;
469468
int rc;
470469
#if OPAL_ENABLE_FT_MPI
471-
if( PMIX_ERR_PROC_ABORTED == status ) {
472-
int i;
473-
for(i = 0; i < event->nvalue; i++) {
470+
switch( status ) {
471+
case PMIX_ERR_PROC_TERM_WO_SYNC:
472+
case PMIX_ERR_PROC_ABORTED_BY_SIG:
473+
case PMIX_ERR_PROC_ABORTED: /* that is, proc aborted by pmix_abort */
474+
for(int i = 0; i < event->nvalue; i++) {
474475
if (PMIX_PROC != event->info[i].value.type) {
476+
OPAL_OUTPUT_VERBOSE((70, ompi_ftmpi_output_handle,
477+
"%s ompi: ignoring the following key for a PMIx fault event: %s",
478+
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME),
479+
event->info[i].key));
475480
continue;
476481
}
477482
OPAL_PMIX_CONVERT_PROCT(rc, &prc, event->info[i].value.data.proc);
@@ -484,20 +489,26 @@ static void *ompi_errhandler_event_cb(int fd, int flags, void *context) {
484489
continue; /* we are not 'MPI connected' with this proc. */
485490
}
486491
assert( !ompi_proc_is_sentinel(proc) );
487-
ompi_errhandler_proc_failed_internal(proc, OPAL_ERR_PROC_ABORTED, false);
492+
ompi_errhandler_proc_failed_internal(proc, status, false);
488493
}
489494
opal_event_del(&event->super);
490495
free(event);
491496
return NULL;
497+
case PMIX_ERR_LOST_CONNECTION:
498+
opal_output_verbose(1, ompi_ftmpi_output_handle,
499+
"%s ompi: Error event PMIX_ERR_LOST_CONNECTION reported, that usually means that my daemon died thus I need to go away.",
500+
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME));
501+
break;
502+
default:
503+
/* An unmanaged type of failure, let it do its thing. */
504+
opal_output_verbose(1, ompi_ftmpi_output_handle,
505+
"%s ompi: Error event reported through PMIx from %s (state = %d). "
506+
"This error type is not handled by the fault tolerant layer "
507+
"and the application will now presumably abort.",
508+
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME),
509+
OPAL_NAME_PRINT(source),
510+
status );
492511
}
493-
/* An unmanaged type of failure, let it do its thing. */
494-
opal_output_verbose(1, ompi_ftmpi_output_handle,
495-
"%s ompi: Error event reported through PMIx from %s (state = %d). "
496-
"This error type is not handled by the fault tolerant layer "
497-
"and the application will now presumably abort.",
498-
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME),
499-
OPAL_NAME_PRINT(source),
500-
status );
501512
#endif /* OPAL_ENABLE_FT_MPI */
502513
opal_event_del(&event->super);
503514
free(event);

ompi/errhandler/errhandler.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
44
* University Research and Technology
55
* Corporation. All rights reserved.
6-
* Copyright (c) 2004-2021 The University of Tennessee and The University
6+
* Copyright (c) 2004-2022 The University of Tennessee and The University
77
* of Tennessee Research Foundation. All rights
88
* reserved.
99
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -453,7 +453,7 @@ struct ompi_proc_t;
453453

454454
OMPI_DECLSPEC int ompi_errhandler_proc_failed_internal(struct ompi_proc_t *ompi_proc, int status, bool forward);
455455
static inline int ompi_errhandler_proc_failed(struct ompi_proc_t* ompi_proc) {
456-
return ompi_errhandler_proc_failed_internal(ompi_proc, OPAL_ERR_PROC_ABORTED, true);
456+
return ompi_errhandler_proc_failed_internal(ompi_proc, PMIX_ERR_PROC_TERM_WO_SYNC, true);
457457
}
458458
#endif /* OPAL_ENABLE_FT_MPI */
459459

ompi/instance/instance.c

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33
* Copyright (c) 2018-2022 Triad National Security, LLC. All rights
44
* reserved.
55
* Copyright (c) 2022 Cisco Systems, Inc. All rights reserved.
6+
* Copyright (c) 2022 The University of Tennessee and The University
7+
* of Tennessee Research Foundation. All rights
8+
* reserved.
69
* $COPYRIGHT$
710
*
811
* Additional copyrights may follow
@@ -435,11 +438,10 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
435438
/* give it a name so we can distinguish it */
436439
PMIX_INFO_LOAD(&info[1], PMIX_EVENT_HDLR_NAME, "ULFM-Event-handler", PMIX_STRING);
437440
OPAL_PMIX_CONSTRUCT_LOCK(&mylock);
438-
pmix_status_t codes[4] = {
439-
PMIX_ERR_PROC_ABORTED,
440-
PMIX_ERR_EXIT_NONZERO_TERM,
441+
pmix_status_t codes[3] = {
442+
PMIX_ERR_PROC_TERM_WO_SYNC,
441443
PMIX_ERR_PROC_ABORTED_BY_SIG,
442-
PMIX_ERR_LOST_CONNECTION
444+
PMIX_ERR_PROC_ABORTED
443445
};
444446
PMIx_Register_event_handler(codes, 3, info, 2, ompi_errhandler_callback, evhandler_reg_callbk, (void*)&mylock);
445447
OPAL_PMIX_WAIT_THREAD(&mylock);

0 commit comments

Comments
 (0)