@@ -472,18 +472,24 @@ static void *ompi_errhandler_event_cb(int fd, int flags, void *context) {
472
472
case PMIX_ERR_PROC_ABORTED_BY_SIG :
473
473
case PMIX_ERR_PROC_ABORTED : /* that is, proc aborted by pmix_abort */
474
474
for (int i = 0 ; i < event -> nvalue ; i ++ ) {
475
- if (PMIX_PROC != event -> info [i ].value . type ) {
475
+ if (strcmp ( PMIX_EVENT_AFFECTED_PROC , event -> info [i ].key ) ) {
476
476
OPAL_OUTPUT_VERBOSE ((70 , ompi_ftmpi_output_handle ,
477
477
"%s ompi: ignoring the following key for a PMIx fault event: %s" ,
478
478
OMPI_NAME_PRINT (OMPI_PROC_MY_NAME ),
479
479
event -> info [i ].key ));
480
480
continue ;
481
481
}
482
+ assert (event -> info [i ].value .type == PMIX_PROC );
482
483
OPAL_PMIX_CONVERT_PROCT (rc , & prc , event -> info [i ].value .data .proc );
483
484
if (OPAL_SUCCESS != rc ) {
484
485
OPAL_ERROR_LOG (rc );
485
486
break ;
486
487
}
488
+ OPAL_OUTPUT_VERBOSE ((5 , ompi_ftmpi_output_handle ,
489
+ "%s ompi: proc %s reported dead by PMIx event handler (found at %s): %s" ,
490
+ OMPI_NAME_PRINT (OMPI_PROC_MY_NAME ),
491
+ OPAL_NAME_PRINT (prc ), OPAL_NAME_PRINT (source ),
492
+ PMIx_Error_string (status )));
487
493
ompi_proc_t * proc = (ompi_proc_t * )ompi_proc_for_name (prc );
488
494
if ( NULL == proc ) {
489
495
continue ; /* we are not 'MPI connected' with this proc. */
@@ -502,12 +508,12 @@ static void *ompi_errhandler_event_cb(int fd, int flags, void *context) {
502
508
default :
503
509
/* An unmanaged type of failure, let it do its thing. */
504
510
opal_output_verbose (1 , ompi_ftmpi_output_handle ,
505
- "%s ompi: Error event reported through PMIx from %s (state = %d ). "
511
+ "%s ompi: Error event reported through PMIx from %s (state = %s ). "
506
512
"This error type is not handled by the fault tolerant layer "
507
513
"and the application will now presumably abort." ,
508
514
OMPI_NAME_PRINT (OMPI_PROC_MY_NAME ),
509
515
OPAL_NAME_PRINT (source ),
510
- status );
516
+ PMIx_Error_string ( status ) );
511
517
}
512
518
#endif /* OPAL_ENABLE_FT_MPI */
513
519
opal_event_del (& event -> super );
0 commit comments