Skip to content

Commit 7dfe6c1

Browse files
committed
Thread-shift errors reported by PMIx to the main MPI progress engine
Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>

make things happen before the terminal call

Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>
1 parent 06c5636 commit 7dfe6c1

File tree

2 files changed

+47
-14
lines changed

2 files changed

+47
-14
lines changed

ompi/errhandler/errhandler.c

Lines changed: 43 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,23 @@ ompi_errhandler_t *ompi_errhandler_create(ompi_errhandler_type_t object_type,
290290
return new_errhandler;
291291
}
292292

293+
/* helper to move the error report back from the RTE thread to the MPI thread */
294+
typedef struct ompi_errhandler_event_s {
295+
opal_event_t super;
296+
opal_process_name_t procname;
297+
int status;
298+
} ompi_errhandler_event_t;
299+
300+
static void *ompi_errhandler_event_cb(int fd, int flags, void *context) {
301+
ompi_errhandler_event_t *event = (ompi_errhandler_event_t*) context;
302+
int status = event->status;
303+
opal_event_del(&event->super);
304+
free(event);
305+
/* our default action is to abort */
306+
OMPI_ERRHANDLER_NOHANDLE_INVOKE(status, "PMIx Event notification");
307+
return NULL;
308+
}
309+
293310
/* registration callback */
294311
void ompi_errhandler_registration_callback(int status,
295312
size_t errhandler_ref,
@@ -312,13 +329,37 @@ void ompi_errhandler_callback(size_t refid, pmix_status_t status,
312329
pmix_event_notification_cbfunc_fn_t cbfunc,
313330
void *cbdata)
314331
{
332+
int rc;
333+
/* an error has been found, report to the MPI layer and let it take
334+
* further action. */
335+
/* transition this from the RTE thread to the MPI progress engine */
336+
ompi_errhandler_event_t *event = malloc(sizeof(*event));
337+
if(NULL == event) {
338+
OMPI_ERROR_LOG(OMPI_ERR_OUT_OF_RESOURCE);
339+
goto error;
340+
}
341+
OPAL_PMIX_CONVERT_PROCT(rc, &event->procname, (pmix_proc_t*)source);
342+
if(OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
343+
OMPI_ERROR_LOG(rc);
344+
free(event);
345+
goto error;
346+
}
347+
event->status = status;
348+
opal_event_set(opal_sync_event_base, &event->super, -1, OPAL_EV_READ,
349+
ompi_errhandler_event_cb, event);
350+
opal_event_active(&event->super, OPAL_EV_READ, 1);
315351
/* tell the event chain engine to go no further - we
316352
* will handle this */
317353
if (NULL != cbfunc) {
318354
cbfunc(PMIX_EVENT_ACTION_COMPLETE, NULL, 0, NULL, NULL, cbdata);
319355
}
320-
/* our default action is to abort */
321-
OMPI_ERRHANDLER_NOHANDLE_INVOKE(status, "PMIx Event notification");
356+
return;
357+
358+
error:
359+
if (NULL != cbfunc) {
360+
/* We can't handle this, let the default action abort. */
361+
cbfunc(PMIX_EVENT_NO_ACTION_TAKEN, NULL, 0, NULL, NULL, cbdata);
362+
}
322363
}
323364

324365
/**************************************************************************

ompi/errhandler/errhandler.h

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -339,19 +339,11 @@ struct ompi_request_t;
339339
}
340340

341341
/* Same as OMPI_ERRHANDLER_RETURN for non-handle attached errors */
342-
#define OMPI_ERRHANDLER_NOHANDLE_RETURN(rc, err_code, message) \
342+
#define OMPI_ERRHANDLER_NOHANDLE_RETURN(rc, err_code, message) {\
343+
OMPI_ERRHANDLER_NOHANDLE_CHECK(rc, err_code, message) \
343344
OPAL_CR_EXIT_LIBRARY() \
344-
if ( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { \
345-
int __mpi_err_code = ompi_errcode_get_mpi_code(err_code); \
346-
ompi_errhandler_invoke(NULL, \
347-
NULL, \
348-
-1, \
349-
(__mpi_err_code), \
350-
(message)); \
351-
return (__mpi_err_code); \
352-
} else { \
353-
return MPI_SUCCESS; \
354-
}
345+
return MPI_SUCCESS; \
346+
}
355347

356348
/**
357349
* Initialize the error handler interface.

0 commit comments

Comments
 (0)