3
3
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
4
4
* University Research and Technology
5
5
* Corporation. All rights reserved.
6
- * Copyright (c) 2004-2020 The University of Tennessee and The University
6
+ * Copyright (c) 2004-2022 The University of Tennessee and The University
7
7
* of Tennessee Research Foundation. All rights
8
8
* reserved.
9
9
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -343,10 +343,10 @@ int ompi_errhandler_proc_failed_internal(ompi_proc_t* ompi_proc, int status, boo
343
343
opal_mutex_unlock (& errhandler_ftmpi_lock );
344
344
345
345
opal_output_verbose (1 , ompi_ftmpi_output_handle ,
346
- "%s ompi: Process %s failed (state = %d)." ,
346
+ "%s ompi: Process %s failed (state = %d %s )." ,
347
347
OMPI_NAME_PRINT (OMPI_PROC_MY_NAME ),
348
348
OMPI_NAME_PRINT (& ompi_proc -> super .proc_name ),
349
- status );
349
+ status , PMIx_Error_string ( status ) );
350
350
351
351
if (90 < opal_output_get_verbosity (ompi_ftmpi_output_handle )) {
352
352
/* how did we get here? */
@@ -413,7 +413,7 @@ int ompi_errhandler_proc_failed_internal(ompi_proc_t* ompi_proc, int status, boo
413
413
* The wait function has a check, so all we need to do here is
414
414
* signal it so it will check again.
415
415
*/
416
- wait_sync_global_wakeup (MPI_ERR_PROC_FAILED );
416
+ wait_sync_global_wakeup (PMIX_ERR_PROC_ABORTED == status ? MPI_ERR_PROC_ABORTED : MPI_ERR_PROC_FAILED );
417
417
418
418
/* Collectives:
419
419
* Propagate the error (this has been selected rather than the "roll
@@ -430,12 +430,11 @@ int ompi_errhandler_proc_failed_internal(ompi_proc_t* ompi_proc, int status, boo
430
430
pmix_info_t pmix_info [1 ];
431
431
pmix_status_t prc ;
432
432
433
- assert (OPAL_ERR_PROC_ABORTED == status );
434
433
OPAL_PMIX_CONVERT_NAME (& pmix_source , OMPI_PROC_MY_NAME );
435
434
OPAL_PMIX_CONVERT_NAME (& pmix_proc , & ompi_proc -> super .proc_name );
436
435
PMIX_INFO_CONSTRUCT (& pmix_info [0 ]);
437
436
PMIX_INFO_LOAD (& pmix_info [0 ], PMIX_EVENT_AFFECTED_PROC , & pmix_proc , PMIX_PROC );
438
- prc = PMIx_Notify_event (PMIX_ERR_PROC_ABORTED , & pmix_source , PMIX_RANGE_LOCAL ,
437
+ prc = PMIx_Notify_event (PMIX_ERR_PROC_TERM_WO_SYNC , & pmix_source , PMIX_RANGE_LOCAL ,
439
438
pmix_info , 1 , NULL , & active );
440
439
if ( PMIX_SUCCESS != prc &&
441
440
PMIX_OPERATION_SUCCEEDED != prc ) {
@@ -468,10 +467,16 @@ static void *ompi_errhandler_event_cb(int fd, int flags, void *context) {
468
467
opal_process_name_t prc ;
469
468
int rc ;
470
469
#if OPAL_ENABLE_FT_MPI
471
- if ( PMIX_ERR_PROC_ABORTED == status ) {
472
- int i ;
473
- for (i = 0 ; i < event -> nvalue ; i ++ ) {
470
+ switch ( status ) {
471
+ case PMIX_ERR_PROC_TERM_WO_SYNC :
472
+ case PMIX_ERR_PROC_ABORTED_BY_SIG :
473
+ case PMIX_ERR_PROC_ABORTED : /* that is, proc aborted by pmix_abort */
474
+ for (int i = 0 ; i < event -> nvalue ; i ++ ) {
474
475
if (PMIX_PROC != event -> info [i ].value .type ) {
476
+ OPAL_OUTPUT_VERBOSE ((70 , ompi_ftmpi_output_handle ,
477
+ "%s ompi: ignoring the following key for a PMIx fault event: %s" ,
478
+ OMPI_NAME_PRINT (OMPI_PROC_MY_NAME ),
479
+ event -> info [i ].key ));
475
480
continue ;
476
481
}
477
482
OPAL_PMIX_CONVERT_PROCT (rc , & prc , event -> info [i ].value .data .proc );
@@ -484,20 +489,26 @@ static void *ompi_errhandler_event_cb(int fd, int flags, void *context) {
484
489
continue ; /* we are not 'MPI connected' with this proc. */
485
490
}
486
491
assert ( !ompi_proc_is_sentinel (proc ) );
487
- ompi_errhandler_proc_failed_internal (proc , OPAL_ERR_PROC_ABORTED , false);
492
+ ompi_errhandler_proc_failed_internal (proc , status , false);
488
493
}
489
494
opal_event_del (& event -> super );
490
495
free (event );
491
496
return NULL ;
497
+ case PMIX_ERR_LOST_CONNECTION :
498
+ opal_output_verbose (1 , ompi_ftmpi_output_handle ,
499
+ "%s ompi: Error event PMIX_ERR_LOST_CONNECTION reported, that usually means that my daemon died thus I need to go away." ,
500
+ OMPI_NAME_PRINT (OMPI_PROC_MY_NAME ));
501
+ break ;
502
+ default :
503
+ /* An unmanaged type of failure, let it do its thing. */
504
+ opal_output_verbose (1 , ompi_ftmpi_output_handle ,
505
+ "%s ompi: Error event reported through PMIx from %s (state = %d). "
506
+ "This error type is not handled by the fault tolerant layer "
507
+ "and the application will now presumably abort." ,
508
+ OMPI_NAME_PRINT (OMPI_PROC_MY_NAME ),
509
+ OPAL_NAME_PRINT (source ),
510
+ status );
492
511
}
493
- /* An unmanaged type of failure, let it do its thing. */
494
- opal_output_verbose (1 , ompi_ftmpi_output_handle ,
495
- "%s ompi: Error event reported through PMIx from %s (state = %d). "
496
- "This error type is not handled by the fault tolerant layer "
497
- "and the application will now presumably abort." ,
498
- OMPI_NAME_PRINT (OMPI_PROC_MY_NAME ),
499
- OPAL_NAME_PRINT (source ),
500
- status );
501
512
#endif /* OPAL_ENABLE_FT_MPI */
502
513
opal_event_del (& event -> super );
503
514
free (event );
0 commit comments