
Commit efbc6ff

Merge pull request #7798 from abouteiller/mpi-next/unbounderr-self
MPI-4 error handling: 'unbound' errors to MPI_COMM_SELF
2 parents dfb0ae7 + ee149fc
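In MPI-4 terms, an error is "unbound" when it cannot be attached to a communicator, window, or file handle; the standard directs such errors to the error handler of MPI_COMM_SELF, rather than Open MPI's historical behavior of aborting MPI_COMM_WORLD. A minimal user-side sketch of what this routing enables (illustrative names, not code from this commit):

#include <mpi.h>
#include <stdio.h>

/* User-defined handler invoked for 'unbound' errors under MPI-4 semantics. */
static void self_errhandler_fn(MPI_Comm *comm, int *errcode, ...)
{
    char msg[MPI_MAX_ERROR_STRING];
    int len;
    (void)comm;
    MPI_Error_string(*errcode, msg, &len);
    fprintf(stderr, "unbound MPI error: %s\n", msg);
}

int main(int argc, char *argv[])
{
    MPI_Errhandler eh;
    MPI_Init(&argc, &argv);
    MPI_Comm_create_errhandler(self_errhandler_fn, &eh);
    /* With this commit, errors that have no associated handle are
     * raised on MPI_COMM_SELF's handler instead of aborting
     * MPI_COMM_WORLD. */
    MPI_Comm_set_errhandler(MPI_COMM_SELF, eh);
    /* ... application ... */
    MPI_Errhandler_free(&eh);
    MPI_Finalize();
    return 0;
}

An MPI-3-compatible escape hatch is kept: ompi_errhandler_invoke() falls back to MPI_COMM_WORLD when ompi_mpi_compat_mpi3 is set (see errhandler_invoke.c below).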

346 files changed: +1,146 −964 lines

Note: large commits have some content hidden by default; only a subset of the 346 changed files is reproduced below.

ompi/errhandler/errhandler.c

Lines changed: 46 additions & 2 deletions

@@ -290,6 +290,26 @@ ompi_errhandler_t *ompi_errhandler_create(ompi_errhandler_type_t object_type,
     return new_errhandler;
 }
 
+/* helper to move the error report back from the RTE thread to the MPI thread */
+typedef struct ompi_errhandler_event_s {
+    opal_event_t super;
+    opal_process_name_t procname;
+    int status;
+} ompi_errhandler_event_t;
+
+static void *ompi_errhandler_event_cb(int fd, int flags, void *context) {
+    ompi_errhandler_event_t *event = (ompi_errhandler_event_t*) context;
+    int status = event->status;
+    opal_event_del(&event->super);
+    free(event);
+    /* our default action is to abort */
+    /* TODO: this error should return to the caller and invoke an error
+     * handler from the MPI API call.
+     * For now, it is fatal. */
+    ompi_mpi_errors_are_fatal_comm_handler(NULL, status, "PMIx Event Notification");
+    return NULL;
+}
+
 /* registration callback */
 void ompi_errhandler_registration_callback(int status,
                                            size_t errhandler_ref,
@@ -312,13 +332,37 @@ void ompi_errhandler_callback(size_t refid, pmix_status_t status,
                               pmix_event_notification_cbfunc_fn_t cbfunc,
                               void *cbdata)
 {
+    int rc;
+    /* an error has been found, report to the MPI layer and let it take
+     * further action. */
+    /* transition this from the RTE thread to the MPI progress engine */
+    ompi_errhandler_event_t *event = malloc(sizeof(*event));
+    if(NULL == event) {
+        OMPI_ERROR_LOG(OMPI_ERR_OUT_OF_RESOURCE);
+        goto error;
+    }
+    OPAL_PMIX_CONVERT_PROCT(rc, &event->procname, (pmix_proc_t*)source);
+    if(OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
+        OMPI_ERROR_LOG(rc);
+        free(event);
+        goto error;
+    }
+    event->status = status;
+    opal_event_set(opal_sync_event_base, &event->super, -1, OPAL_EV_READ,
+                   ompi_errhandler_event_cb, event);
+    opal_event_active(&event->super, OPAL_EV_READ, 1);
     /* tell the event chain engine to go no further - we
      * will handle this */
     if (NULL != cbfunc) {
         cbfunc(PMIX_EVENT_ACTION_COMPLETE, NULL, 0, NULL, NULL, cbdata);
     }
-    /* our default action is to abort */
-    ompi_mpi_abort(MPI_COMM_WORLD, status);
+    return;
+
+error:
+    if (NULL != cbfunc) {
+        /* We can't handle this, let the default action abort. */
+        cbfunc(PMIX_EVENT_NO_ACTION_TAKEN, NULL, 0, NULL, NULL, cbdata);
+    }
 }
 
 /**************************************************************************
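The pattern above — packaging the PMIx status into a heap-allocated event and activating it on the MPI progress event base — is the standard libevent "deferred callback" idiom (opal_event is a thin wrapper over libevent). A standalone sketch of the same handoff using libevent directly, offered as an analogy rather than OMPI code:

#include <event2/event.h>
#include <stdio.h>
#include <stdlib.h>

struct err_event {
    struct event *ev;
    int status;
};

/* Runs on the thread driving the event base, not the reporting thread. */
static void err_cb(evutil_socket_t fd, short flags, void *arg)
{
    struct err_event *ee = arg;
    fprintf(stderr, "handling error %d on the progress thread\n", ee->status);
    event_free(ee->ev);
    free(ee);
}

/* Called from the reporting (RTE) thread: defer the error to the base. */
static void report_error(struct event_base *base, int status)
{
    struct err_event *ee = malloc(sizeof(*ee));
    if (NULL == ee) return;                     /* mirror the OOM bail-out above */
    ee->status = status;
    ee->ev = event_new(base, -1, 0, err_cb, ee);
    event_active(ee->ev, 0, 0);                 /* fire without waiting on an fd */
}

int main(void)
{
    struct event_base *base = event_base_new();
    report_error(base, 42);
    event_base_dispatch(base);                  /* runs err_cb, then exits */
    event_base_free(base);
    return 0;
}

The OMPI code achieves the same effect with opal_event_set()/opal_event_active() on opal_sync_event_base.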

ompi/errhandler/errhandler.h

Lines changed: 32 additions & 4 deletions

@@ -38,6 +38,7 @@
 #include "opal/mca/pmix/pmix-internal.h"
 
 #include "ompi/runtime/mpiruntime.h"
+#include "ompi/runtime/params.h"
 #include "ompi/errhandler/errhandler_predefined.h"
 #include "ompi/errhandler/errcode-internal.h"
 
@@ -256,10 +257,19 @@ struct ompi_request_t;
  */
 #define OMPI_ERRHANDLER_INVOKE(mpi_object, err_code, message) \
     ompi_errhandler_invoke((mpi_object)->error_handler, \
-                          (mpi_object), \
+                           (mpi_object), \
                            (int)(mpi_object)->errhandler_type, \
                            ompi_errcode_get_mpi_code(err_code), \
-                          (message));
+                           (message));
+
+/**
+ * This is the macro to route errors to the 'default' communicator
+ * for non-handle attached errors (e.g., a datatype operation error).
+ */
+#define OMPI_ERRHANDLER_NOHANDLE_INVOKE(err_code, message) \
+    ompi_errhandler_invoke(NULL, NULL, -1, \
+                           ompi_errcode_get_mpi_code(err_code), \
+                           (message));
 
 /**
  * Conditionally invoke an MPI error handler.
@@ -279,13 +289,26 @@ struct ompi_request_t;
         int __mpi_err_code = ompi_errcode_get_mpi_code(err_code); \
         OPAL_CR_EXIT_LIBRARY() \
         ompi_errhandler_invoke((mpi_object)->error_handler, \
-                              (mpi_object), \
+                               (mpi_object), \
                                (int) (mpi_object)->errhandler_type, \
                                (__mpi_err_code), \
                                (message)); \
         return (__mpi_err_code); \
     }
 
+/* Same as OMPI_ERRHANDLER_CHECK for non-handle attached errors */
+#define OMPI_ERRHANDLER_NOHANDLE_CHECK(rc, err_code, message) \
+    if( OPAL_UNLIKELY(rc != OMPI_SUCCESS) ) { \
+        int __mpi_err_code = ompi_errcode_get_mpi_code(err_code); \
+        OPAL_CR_EXIT_LIBRARY() \
+        ompi_errhandler_invoke(NULL, \
+                               NULL, \
+                               -1, \
+                               (__mpi_err_code), \
+                               (message)); \
+        return (__mpi_err_code); \
+    }
+
 /**
  * Conditionally invoke an MPI error handler; if there is no error,
  * return MPI_SUCCESS.
@@ -315,7 +338,12 @@ struct ompi_request_t;
         return MPI_SUCCESS; \
     }
 
-
+/* Same as OMPI_ERRHANDLER_RETURN for non-handle attached errors */
+#define OMPI_ERRHANDLER_NOHANDLE_RETURN(rc, err_code, message) {\
+    OMPI_ERRHANDLER_NOHANDLE_CHECK(rc, err_code, message) \
+    OPAL_CR_EXIT_LIBRARY() \
+    return MPI_SUCCESS; \
+    }
 
 /**
  * Initialize the error handler interface.
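To show the intended use of the new NOHANDLE macros inside the OMPI tree, here is a hypothetical API entry point (the function name and validation are illustrative, not from this commit):

/* Hypothetical entry point operating on a datatype: no communicator,
 * window, or file handle is in scope, so any error is 'unbound'. */
int MPI_Type_example(MPI_Datatype type)
{
    int rc = OMPI_SUCCESS;

    if (MPI_DATATYPE_NULL == type) {
        rc = OMPI_ERR_BAD_PARAM;
    }

    /* ompi_errhandler_invoke(NULL, NULL, -1, ...) underneath routes the
     * error to MPI_COMM_SELF (or MPI_COMM_WORLD in MPI-3 compat mode). */
    OMPI_ERRHANDLER_NOHANDLE_RETURN(rc, MPI_ERR_TYPE, "MPI_Type_example");
}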

ompi/errhandler/errhandler_invoke.c

Lines changed: 1 addition & 1 deletion

@@ -47,7 +47,7 @@ int ompi_errhandler_invoke(ompi_errhandler_t *errhandler, void *mpi_object,
         int32_t state = ompi_mpi_state;
         if (state >= OMPI_MPI_STATE_INIT_COMPLETED &&
             state < OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) {
-            comm = &ompi_mpi_comm_self.comm;
+            comm = (ompi_mpi_compat_mpi3)? &ompi_mpi_comm_world.comm: &ompi_mpi_comm_self.comm;
             comm->error_handler->eh_comm_fn(&comm, &err_code, message, NULL);
         }
         else {

ompi/mca/fcoll/two_phase/fcoll_two_phase_support_fns.c

Lines changed: 6 additions & 2 deletions

@@ -3,7 +3,7 @@
  * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
  *                         University Research and Technology
  *                         Corporation. All rights reserved.
- * Copyright (c) 2004-2017 The University of Tennessee and The University
+ * Copyright (c) 2004-2020 The University of Tennessee and The University
  *                         of Tennessee Research Foundation. All rights
  *                         reserved.
  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -171,12 +171,16 @@ int mca_fcoll_two_phase_calc_aggregator(ompio_file_t *fh,
     }
 
     if (rank_index >= num_aggregators || rank_index < 0) {
+        int err = MPI_ERR_INTERN;
         fprintf(stderr,
                 "Error in mca_fcoll_two_phase_calc_aggregator:");
         fprintf(stderr,
                 "rank_index(%d) >= num_aggregators(%d) fd_size=%lld off=%lld min_off=%lld striping_unit=%d\n",
                 rank_index, num_aggregators, fd_size, off, min_off, striping_unit);
-        ompi_mpi_abort(&ompi_mpi_comm_world.comm, 1);
+        /* TODO: this error should return to the caller and invoke an error
+         * handler from the MPI API call.
+         * For now, it is fatal. */
+        ompi_mpi_errors_are_fatal_file_handler(NULL, &err, "Invalid rank in fcoll aggregator");
     }
 

ompi/mca/osc/base/osc_base_obj_convert.c

Lines changed: 17 additions & 12 deletions

@@ -2,8 +2,9 @@
 /*
  * Copyright (c) 2004-2005 The Trustees of Indiana University.
  *                         All rights reserved.
- * Copyright (c) 2004-2006 The Trustees of the University of Tennessee.
- *                         All rights reserved.
+ * Copyright (c) 2004-2020 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation. All rights
+ *                         reserved.
  * Copyright (c) 2004-2008 High Performance Computing Center Stuttgart,
  *                         University of Stuttgart. All rights reserved.
  * Copyright (c) 2004-2005 The Regents of the University of California.
@@ -84,16 +85,20 @@ int ompi_osc_base_process_op (void *outbuf, void *inbuf, size_t inbuflen,
             MPI_DOUBLE_INT == datatype ||
             MPI_LONG_INT == datatype ||
             MPI_LONG_DOUBLE_INT == datatype) {
-            ompi_communicator_t *comm = &ompi_mpi_comm_world.comm;
-            opal_output(0, "Error: %s datatype is currently "
-                        "unsupported for MPI_MINLOC/MPI_MAXLOC "
-                        "operation\n", datatype->name);
-            opal_show_help("help-mpi-api.txt", "mpi-abort", true,
-                           comm->c_my_rank,
-                           ('\0' != comm->c_name[0]) ? comm->c_name : "<Unknown>",
-                           -1);
-
-            ompi_mpi_abort(comm, -1);
+            int err = MPI_ERR_UNSUPPORTED_DATAREP;
+            char *reason = NULL;
+            opal_asprintf(&reason,
+                          "%s datatype is currently "
+                          "unsupported for MPI_MINLOC/MPI_MAXLOC "
+                          "operation\n", datatype->name);
+            opal_show_help("help-mpi-api.txt", "MPI function not supported", true,
+                           (MPI_MINLOC==op)?"MPI_MINLOC":"MPI_MAXLOC",
+                           reason);
+            free(reason);
+            /* TODO: this error should return to the caller and invoke an error
+             * handler from the MPI API call.
+             * For now, it is fatal. */
+            ompi_mpi_errors_are_fatal_win_handler(NULL, &err, "OSC unsupported MINLOC/MAXLOC datatype");
         }
     }
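For context, the pair datatypes being rejected here exist for MPI_MINLOC/MPI_MAXLOC reductions, which operate on a (value, index) pair. Standard MPI usage of such a pair type (not part of the diff):

#include <mpi.h>

/* Find the global minimum value and the rank that owns it. */
struct double_int { double value; int rank; };

void global_min_loc(double local, MPI_Comm comm, double *min, int *owner)
{
    struct double_int in, out;
    MPI_Comm_rank(comm, &in.rank);
    in.value = local;
    /* MPI_DOUBLE_INT describes the pair layout expected by MPI_MINLOC. */
    MPI_Allreduce(&in, &out, 1, MPI_DOUBLE_INT, MPI_MINLOC, comm);
    *min = out.value;
    *owner = out.rank;
}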

ompi/mca/pml/ob1/pml_ob1.c

Lines changed: 4 additions & 1 deletion

@@ -826,7 +826,10 @@ void mca_pml_ob1_error_handler(
         return;
     }
 
-    ompi_rte_abort(-1, btlinfo);
+    /* TODO: this error should return to the caller and invoke an error
+     * handler from the MPI API call.
+     * For now, it is fatal. */
+    ompi_mpi_errors_are_fatal_comm_handler(NULL, -1, btlinfo);
 }
 
 #if OPAL_ENABLE_FT_CR == 0

ompi/mca/pml/ob1/pml_ob1_recvreq.h

Lines changed: 4 additions & 2 deletions

@@ -3,7 +3,7 @@
  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
  *                         University Research and Technology
  *                         Corporation. All rights reserved.
- * Copyright (c) 2004-2016 The University of Tennessee and The University
+ * Copyright (c) 2004-2020 The University of Tennessee and The University
  *                         of Tennessee Research Foundation. All rights
  *                         reserved.
  * Copyright (c) 2004-2007 High Performance Computing Center Stuttgart,
@@ -179,7 +179,9 @@ recv_request_pml_complete(mca_pml_ob1_recv_request_t *recvreq)
 
     if(true == recvreq->req_recv.req_base.req_free_called) {
         if( MPI_SUCCESS != recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR ) {
-            ompi_mpi_abort(&ompi_mpi_comm_world.comm, MPI_ERR_REQUEST);
+            /* An error after freeing the request MUST be fatal
+             * MPI3 ch3.7: MPI_REQUEST_FREE */
+            ompi_mpi_errors_are_fatal_comm_handler(NULL, MPI_ERR_REQUEST, "Recv error after request freed");
         }
         MCA_PML_OB1_RECV_REQUEST_RETURN(recvreq);
     } else {

ompi/mca/pml/ob1/pml_ob1_sendreq.h

Lines changed: 4 additions & 2 deletions

@@ -3,7 +3,7 @@
  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
  *                         University Research and Technology
  *                         Corporation. All rights reserved.
- * Copyright (c) 2004-2016 The University of Tennessee and The University
+ * Copyright (c) 2004-2020 The University of Tennessee and The University
  *                         of Tennessee Research Foundation. All rights
  *                         reserved.
  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -276,7 +276,9 @@ send_request_pml_complete(mca_pml_ob1_send_request_t *sendreq)
         MCA_PML_OB1_SEND_REQUEST_MPI_COMPLETE(sendreq, true);
     } else {
         if( MPI_SUCCESS != sendreq->req_send.req_base.req_ompi.req_status.MPI_ERROR ) {
-            ompi_mpi_abort(&ompi_mpi_comm_world.comm, MPI_ERR_REQUEST);
+            /* An error after freeing the request MUST be fatal
+             * MPI3 ch3.7: MPI_REQUEST_FREE */
+            ompi_mpi_errors_are_fatal_comm_handler(NULL, MPI_ERR_REQUEST, "Send error after request freed");
         }
     }
 } else {
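Both ob1 hunks enforce the rule their comments cite: per MPI-3 §3.7, once MPI_Request_free has been called the handle is gone, so a transfer error detected later has no request (and no handle-attached error handler) through which it could be reported. An illustrative user-side sequence (standard MPI, not from the diff):

#include <mpi.h>

void send_and_forget(const int *buf, int n, int peer, MPI_Comm comm)
{
    MPI_Request req;
    MPI_Isend(buf, n, MPI_INT, peer, /*tag=*/0, comm, &req);
    MPI_Request_free(&req);   /* req becomes MPI_REQUEST_NULL immediately */
    /* If the send fails after this point, no handle remains to carry the
     * error back to the application, which is why ob1 now raises it
     * through the fatal predefined handler instead of aborting
     * MPI_COMM_WORLD directly. */
}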

ompi/mca/pml/ucx/pml_ucx_datatype.c

Lines changed: 18 additions & 3 deletions

@@ -2,6 +2,9 @@
  * Copyright (C) Mellanox Technologies Ltd. 2001-2011. ALL RIGHTS RESERVED.
  * Copyright (c) 2019      Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
+ * Copyright (c) 2020      The University of Tennessee and The University
+ *                         of Tennessee Research Foundation. All rights
+ *                         reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -176,8 +179,12 @@ pml_ucx_datatype_t *mca_pml_ucx_init_nbx_datatype(ompi_datatype_t *datatype,
 
     pml_datatype = malloc(sizeof(*pml_datatype));
     if (pml_datatype == NULL) {
+        int err = MPI_ERR_INTERN;
         PML_UCX_ERROR("Failed to allocate datatype structure");
-        ompi_mpi_abort(&ompi_mpi_comm_world.comm, 1);
+        /* TODO: this error should return to the caller and invoke an error
+         * handler from the MPI API call.
+         * For now, it is fatal. */
+        ompi_mpi_errors_are_fatal_comm_handler(NULL, &err, "Failed to allocate datatype structure");
    }
 
     pml_datatype->datatype = ucp_datatype;
@@ -219,8 +226,12 @@ ucp_datatype_t mca_pml_ucx_init_datatype(ompi_datatype_t *datatype)
     status = ucp_dt_create_generic(&pml_ucx_generic_datatype_ops,
                                    datatype, &ucp_datatype);
     if (status != UCS_OK) {
+        int err = MPI_ERR_INTERN;
         PML_UCX_ERROR("Failed to create UCX datatype for %s", datatype->name);
-        ompi_mpi_abort(&ompi_mpi_comm_world.comm, 1);
+        /* TODO: this error should return to the caller and invoke an error
+         * handler from the MPI API call.
+         * For now, it is fatal. */
+        ompi_mpi_errors_are_fatal_comm_handler(NULL, &err, "Failed to allocate datatype structure");
     }
 
     /* Add custom attribute, to clean up UCX resources when OMPI datatype is
@@ -234,9 +245,13 @@ ucp_datatype_t mca_pml_ucx_init_datatype(ompi_datatype_t *datatype)
                                    ompi_pml_ucx.datatype_attr_keyval,
                                    (void*)ucp_datatype, false);
     if (ret != OMPI_SUCCESS) {
+        int err = MPI_ERR_INTERN;
         PML_UCX_ERROR("Failed to add UCX datatype attribute for %s: %d",
                       datatype->name, ret);
-        ompi_mpi_abort(&ompi_mpi_comm_world.comm, 1);
+        /* TODO: this error should return to the caller and invoke an error
+         * handler from the MPI API call.
+         * For now, it is fatal. */
+        ompi_mpi_errors_are_fatal_comm_handler(NULL, &err, "Failed to allocate datatype structure");
     }
 }
 out:
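For readers unfamiliar with the call at the center of the second hunk: ucp_dt_create_generic() registers user pack/unpack callbacks with UCX, and its failure is the condition that now trips the fatal handler. A stripped-down sketch with stub callbacks (this is not the PML's real convertor-backed implementation; function and variable names are ours):

#include <ucp/api/ucp.h>
#include <stdio.h>

/* Stub pack/unpack callbacks; the real PML implements these on top of
 * the OMPI datatype convertor. */
static void *stub_start_pack(void *ctx, const void *buf, size_t count) { (void)ctx; (void)count; return (void *)buf; }
static void *stub_start_unpack(void *ctx, void *buf, size_t count) { (void)ctx; (void)count; return buf; }
static size_t stub_packed_size(void *state) { (void)state; return 0; }
static size_t stub_pack(void *state, size_t off, void *dst, size_t max) { (void)state; (void)off; (void)dst; (void)max; return 0; }
static ucs_status_t stub_unpack(void *state, size_t off, const void *src, size_t len) { (void)state; (void)off; (void)src; (void)len; return UCS_OK; }
static void stub_finish(void *state) { (void)state; }

static const ucp_generic_dt_ops_t stub_ops = {
    .start_pack   = stub_start_pack,
    .start_unpack = stub_start_unpack,
    .packed_size  = stub_packed_size,
    .pack         = stub_pack,
    .unpack       = stub_unpack,
    .finish       = stub_finish,
};

int register_generic_datatype(void *context, ucp_datatype_t *dt_out)
{
    ucs_status_t status = ucp_dt_create_generic(&stub_ops, context, dt_out);
    if (status != UCS_OK) {
        /* This is the failure the commit now routes through
         * ompi_mpi_errors_are_fatal_comm_handler(). */
        fprintf(stderr, "ucp_dt_create_generic: %s\n", ucs_status_string(status));
        return -1;
    }
    return 0;
}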

ompi/mca/vprotocol/pessimist/vprotocol_pessimist_sender_based.c

Lines changed: 5 additions & 4 deletions

@@ -1,6 +1,7 @@
 /*
- * Copyright (c) 2004-2014 The Trustees of the University of Tennessee.
- *                         All rights reserved.
+ * Copyright (c) 2004-2020 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation. All rights
+ *                         reserved.
  * Copyright (c) 2018      Amazon.com, Inc. or its affiliates. All Rights reserved.
  * $COPYRIGHT$
  *
@@ -56,7 +57,7 @@ static void sb_mmap_alloc(void)
         V_OUTPUT_ERR("pml_v: vprotocol_pessimist: sender_based_alloc: ftruncate: %s",
                      strerror(errno));
         close(sb.sb_fd);
-        ompi_mpi_abort(MPI_COMM_NULL, MPI_ERR_NO_SPACE);
+        ompi_mpi_abort(MPI_COMM_SELF, MPI_ERR_NO_SPACE);
     }
     sb.sb_addr = (uintptr_t) mmap((void *) sb.sb_addr, sb.sb_length,
                                   PROT_WRITE | PROT_READ,
@@ -67,7 +68,7 @@ static void sb_mmap_alloc(void)
         V_OUTPUT_ERR("pml_v: vprotocol_pessimist: sender_based_alloc: mmap: %s",
                      strerror(errno));
         close(sb.sb_fd);
-        ompi_mpi_abort(MPI_COMM_NULL, MPI_ERR_NO_SPACE);
+        ompi_mpi_abort(MPI_COMM_SELF, MPI_ERR_NO_SPACE);
     }
 }
