Skip to content

Commit b37202c

Browse files
committed
Add compliance mode with MPI-4 routing of errors to MPI_COMM_SELF by
default, and other streamlining of aborting behavior.
Signed-off-by: Aurélien Bouteiller <bouteill@icl.utk.edu>
Remove OMPI_COMM_ERRORS and use NOHANDLE macros instead.
Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>
Route unbound errors to the self error handler.
Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>
Do not raise the error handler from within components.
Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>
1 parent 366e92c commit b37202c

File tree

343 files changed

+1021
-961
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

343 files changed

+1021
-961
lines changed

ompi/errhandler/errhandler.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -318,7 +318,7 @@ void ompi_errhandler_callback(size_t refid, pmix_status_t status,
318318
cbfunc(PMIX_EVENT_ACTION_COMPLETE, NULL, 0, NULL, NULL, cbdata);
319319
}
320320
/* our default action is to abort */
321-
ompi_mpi_abort(MPI_COMM_WORLD, status);
321+
OMPI_ERRHANDLER_NOHANDLE_INVOKE(status, "PMIx Event notification");
322322
}
323323

324324
/**************************************************************************

ompi/errhandler/errhandler.h

Lines changed: 40 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
#include "opal/mca/pmix/pmix-internal.h"
3939

4040
#include "ompi/runtime/mpiruntime.h"
41+
#include "ompi/runtime/params.h"
4142
#include "ompi/errhandler/errhandler_predefined.h"
4243
#include "ompi/errhandler/errcode-internal.h"
4344

@@ -256,10 +257,19 @@ struct ompi_request_t;
256257
*/
257258
#define OMPI_ERRHANDLER_INVOKE(mpi_object, err_code, message) \
258259
ompi_errhandler_invoke((mpi_object)->error_handler, \
259-
(mpi_object), \
260+
(mpi_object), \
260261
(int)(mpi_object)->errhandler_type, \
261262
ompi_errcode_get_mpi_code(err_code), \
262-
(message));
263+
(message));
264+
265+
/**
266+
* This is the macro to route errors to the 'default' communicator
267+
* for non-handle attached errors (e.g., a datatype operation error).
268+
*/
269+
#define OMPI_ERRHANDLER_NOHANDLE_INVOKE(err_code, message) \
270+
ompi_errhandler_invoke(NULL, NULL, -1, \
271+
ompi_errcode_get_mpi_code(err_code), \
272+
(message));
263273

264274
/**
265275
* Conditionally invoke an MPI error handler.
@@ -279,13 +289,26 @@ struct ompi_request_t;
279289
int __mpi_err_code = ompi_errcode_get_mpi_code(err_code); \
280290
OPAL_CR_EXIT_LIBRARY() \
281291
ompi_errhandler_invoke((mpi_object)->error_handler, \
282-
(mpi_object), \
292+
(mpi_object), \
283293
(int) (mpi_object)->errhandler_type, \
284294
(__mpi_err_code), \
285295
(message)); \
286296
return (__mpi_err_code); \
287297
}
288298

299+
/* Same as OMPI_ERRHANDLER_CHECK for non-handle attached errors */
300+
#define OMPI_ERRHANDLER_NOHANDLE_CHECK(rc, err_code, message) \
301+
if( OPAL_UNLIKELY(rc != OMPI_SUCCESS) ) { \
302+
int __mpi_err_code = ompi_errcode_get_mpi_code(err_code); \
303+
OPAL_CR_EXIT_LIBRARY() \
304+
ompi_errhandler_invoke(NULL, \
305+
NULL, \
306+
-1, \
307+
(__mpi_err_code), \
308+
(message)); \
309+
return (__mpi_err_code); \
310+
}
311+
289312
/**
290313
* Conditionally invoke an MPI error handler; if there is no error,
291314
* return MPI_SUCCESS.
@@ -315,7 +338,20 @@ struct ompi_request_t;
315338
return MPI_SUCCESS; \
316339
}
317340

318-
341+
/* Same as OMPI_ERRHANDLER_RETURN for non-handle attached errors */
342+
#define OMPI_ERRHANDLER_NOHANDLE_RETURN(rc, err_code, message) \
343+
OPAL_CR_EXIT_LIBRARY() \
344+
if ( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { \
345+
int __mpi_err_code = ompi_errcode_get_mpi_code(err_code); \
346+
ompi_errhandler_invoke(NULL, \
347+
NULL, \
348+
-1, \
349+
(__mpi_err_code), \
350+
(message)); \
351+
return (__mpi_err_code); \
352+
} else { \
353+
return MPI_SUCCESS; \
354+
}
319355

320356
/**
321357
* Initialize the error handler interface.

ompi/errhandler/errhandler_invoke.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ int ompi_errhandler_invoke(ompi_errhandler_t *errhandler, void *mpi_object,
4747
int32_t state = ompi_mpi_state;
4848
if (state >= OMPI_MPI_STATE_INIT_COMPLETED &&
4949
state < OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) {
50-
comm = &ompi_mpi_comm_self.comm;
50+
comm = (ompi_mpi_errors_mpi3)? &ompi_mpi_comm_world.comm: &ompi_mpi_comm_self.comm;
5151
comm->error_handler->eh_comm_fn(&comm, &err_code, message, NULL);
5252
}
5353
else {

ompi/mca/fcoll/two_phase/fcoll_two_phase_support_fns.c

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
44
* University Research and Technology
55
* Corporation. All rights reserved.
6-
* Copyright (c) 2004-2017 The University of Tennessee and The University
6+
* Copyright (c) 2004-2020 The University of Tennessee and The University
77
* of Tennessee Research Foundation. All rights
88
* reserved.
99
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -171,12 +171,16 @@ int mca_fcoll_two_phase_calc_aggregator(ompio_file_t *fh,
171171
}
172172

173173
if (rank_index >= num_aggregators || rank_index < 0) {
174+
int err = MPI_ERR_INTERN;
174175
fprintf(stderr,
175176
"Error in mca_fcoll_two_phase_calc_aggregator:");
176177
fprintf(stderr,
177178
"rank_index(%d) >= num_aggregators(%d) fd_size=%lld off=%lld min_off=%lld striping_unit=%d\n",
178179
rank_index, num_aggregators, fd_size, off, min_off, striping_unit);
179-
ompi_mpi_abort(&ompi_mpi_comm_world.comm, 1);
180+
/* TODO: this error should return to the caller and invoke an error
181+
* handler from the MPI API call.
182+
* For now, it is fatal. */
183+
ompi_mpi_errors_are_fatal_file_handler(NULL, &err, "Invalid rank in fcoll aggregator");
180184
}
181185

182186

ompi/mca/osc/base/osc_base_obj_convert.c

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,9 @@
22
/*
33
* Copyright (c) 2004-2005 The Trustees of Indiana University.
44
* All rights reserved.
5-
* Copyright (c) 2004-2006 The Trustees of the University of Tennessee.
6-
* All rights reserved.
5+
* Copyright (c) 2004-2020 The University of Tennessee and The University
6+
* of Tennessee Research Foundation. All rights
7+
* reserved.
78
* Copyright (c) 2004-2008 High Performance Computing Center Stuttgart,
89
* University of Stuttgart. All rights reserved.
910
* Copyright (c) 2004-2005 The Regents of the University of California.
@@ -84,16 +85,20 @@ int ompi_osc_base_process_op (void *outbuf, void *inbuf, size_t inbuflen,
8485
MPI_DOUBLE_INT == datatype ||
8586
MPI_LONG_INT == datatype ||
8687
MPI_LONG_DOUBLE_INT == datatype) {
87-
ompi_communicator_t *comm = &ompi_mpi_comm_world.comm;
88-
opal_output(0, "Error: %s datatype is currently "
89-
"unsupported for MPI_MINLOC/MPI_MAXLOC "
90-
"operation\n", datatype->name);
91-
opal_show_help("help-mpi-api.txt", "mpi-abort", true,
92-
comm->c_my_rank,
93-
('\0' != comm->c_name[0]) ? comm->c_name : "<Unknown>",
94-
-1);
95-
96-
ompi_mpi_abort(comm, -1);
88+
int err = MPI_ERR_UNSUPPORTED_DATAREP;
89+
char *reason = NULL;
90+
opal_asprintf(&reason,
91+
"%s datatype is currently "
92+
"unsupported for MPI_MINLOC/MPI_MAXLOC "
93+
"operation\n", datatype->name);
94+
opal_show_help("help-mpi-api.txt", "MPI function not supported", true,
95+
(MPI_MINLOC==op)?"MPI_MINLOC":"MPI_MAXLOC",
96+
reason);
97+
free(reason);
98+
/* TODO: this error should return to the caller and invoke an error
99+
* handler from the MPI API call.
100+
* For now, it is fatal. */
101+
ompi_mpi_errors_are_fatal_win_handler(NULL, &err, "OSC unsupported MINLOC/MAXLOC datatype");
97102
}
98103
}
99104

ompi/mca/pml/ob1/pml_ob1_recvreq.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
44
* University Research and Technology
55
* Corporation. All rights reserved.
6-
* Copyright (c) 2004-2016 The University of Tennessee and The University
6+
* Copyright (c) 2004-2020 The University of Tennessee and The University
77
* of Tennessee Research Foundation. All rights
88
* reserved.
99
* Copyright (c) 2004-2007 High Performance Computing Center Stuttgart,
@@ -179,7 +179,7 @@ recv_request_pml_complete(mca_pml_ob1_recv_request_t *recvreq)
179179

180180
if(true == recvreq->req_recv.req_base.req_free_called) {
181181
if( MPI_SUCCESS != recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR ) {
182-
ompi_mpi_abort(&ompi_mpi_comm_world.comm, MPI_ERR_REQUEST);
182+
OMPI_ERRHANDLER_NOHANDLE_INVOKE(MPI_ERR_REQUEST, "Recv error after request freed.");
183183
}
184184
MCA_PML_OB1_RECV_REQUEST_RETURN(recvreq);
185185
} else {

ompi/mca/pml/ob1/pml_ob1_sendreq.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
44
* University Research and Technology
55
* Corporation. All rights reserved.
6-
* Copyright (c) 2004-2016 The University of Tennessee and The University
6+
* Copyright (c) 2004-2020 The University of Tennessee and The University
77
* of Tennessee Research Foundation. All rights
88
* reserved.
99
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -276,7 +276,7 @@ send_request_pml_complete(mca_pml_ob1_send_request_t *sendreq)
276276
MCA_PML_OB1_SEND_REQUEST_MPI_COMPLETE(sendreq, true);
277277
} else {
278278
if( MPI_SUCCESS != sendreq->req_send.req_base.req_ompi.req_status.MPI_ERROR ) {
279-
ompi_mpi_abort(&ompi_mpi_comm_world.comm, MPI_ERR_REQUEST);
279+
OMPI_ERRHANDLER_NOHANDLE_INVOKE(MPI_ERR_REQUEST, "Send error after request freed");
280280
}
281281
}
282282
} else {

ompi/mca/pml/ucx/pml_ucx_datatype.c

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22
* Copyright (C) Mellanox Technologies Ltd. 2001-2011. ALL RIGHTS RESERVED.
33
* Copyright (c) 2019 Research Organization for Information Science
44
* and Technology (RIST). All rights reserved.
5+
* Copyright (c) 2020 The University of Tennessee and The University
6+
* of Tennessee Research Foundation. All rights
7+
* reserved.
58
* $COPYRIGHT$
69
*
710
* Additional copyrights may follow
@@ -176,8 +179,12 @@ pml_ucx_datatype_t *mca_pml_ucx_init_nbx_datatype(ompi_datatype_t *datatype,
176179

177180
pml_datatype = malloc(sizeof(*pml_datatype));
178181
if (pml_datatype == NULL) {
182+
int err = MPI_ERR_INTERN;
179183
PML_UCX_ERROR("Failed to allocate datatype structure");
180-
ompi_mpi_abort(&ompi_mpi_comm_world.comm, 1);
184+
/* TODO: this error should return to the caller and invoke an error
185+
* handler from the MPI API call.
186+
* For now, it is fatal. */
187+
ompi_mpi_errors_are_fatal_comm_handler(NULL, &err, "Failed to allocate datatype structure");
181188
}
182189

183190
pml_datatype->datatype = ucp_datatype;
@@ -219,8 +226,12 @@ ucp_datatype_t mca_pml_ucx_init_datatype(ompi_datatype_t *datatype)
219226
status = ucp_dt_create_generic(&pml_ucx_generic_datatype_ops,
220227
datatype, &ucp_datatype);
221228
if (status != UCS_OK) {
229+
int err = MPI_ERR_INTERN;
222230
PML_UCX_ERROR("Failed to create UCX datatype for %s", datatype->name);
223-
ompi_mpi_abort(&ompi_mpi_comm_world.comm, 1);
231+
/* TODO: this error should return to the caller and invoke an error
232+
* handler from the MPI API call.
233+
* For now, it is fatal. */
234+
ompi_mpi_errors_are_fatal_comm_handler(NULL, &err, "Failed to allocate datatype structure");
224235
}
225236

226237
/* Add custom attribute, to clean up UCX resources when OMPI datatype is
@@ -234,9 +245,13 @@ ucp_datatype_t mca_pml_ucx_init_datatype(ompi_datatype_t *datatype)
234245
ompi_pml_ucx.datatype_attr_keyval,
235246
(void*)ucp_datatype, false);
236247
if (ret != OMPI_SUCCESS) {
248+
int err = MPI_ERR_INTERN;
237249
PML_UCX_ERROR("Failed to add UCX datatype attribute for %s: %d",
238250
datatype->name, ret);
239-
ompi_mpi_abort(&ompi_mpi_comm_world.comm, 1);
251+
/* TODO: this error should return to the caller and invoke an error
252+
* handler from the MPI API call.
253+
* For now, it is fatal. */
254+
ompi_mpi_errors_are_fatal_comm_handler(NULL, &err, "Failed to allocate datatype structure");
240255
}
241256
}
242257
out:

ompi/mca/vprotocol/pessimist/vprotocol_pessimist_sender_based.c

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
/*
2-
* Copyright (c) 2004-2014 The Trustees of the University of Tennessee.
3-
* All rights reserved.
2+
* Copyright (c) 2004-2020 The University of Tennessee and The University
3+
* of Tennessee Research Foundation. All rights
4+
* reserved.
45
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
56
* $COPYRIGHT$
67
*
@@ -56,7 +57,7 @@ static void sb_mmap_alloc(void)
5657
V_OUTPUT_ERR("pml_v: vprotocol_pessimist: sender_based_alloc: ftruncate: %s",
5758
strerror(errno));
5859
close(sb.sb_fd);
59-
ompi_mpi_abort(MPI_COMM_NULL, MPI_ERR_NO_SPACE);
60+
ompi_mpi_abort(MPI_COMM_SELF, MPI_ERR_NO_SPACE);
6061
}
6162
sb.sb_addr = (uintptr_t) mmap((void *) sb.sb_addr, sb.sb_length,
6263
PROT_WRITE | PROT_READ,
@@ -67,7 +68,7 @@ static void sb_mmap_alloc(void)
6768
V_OUTPUT_ERR("pml_v: vprotocol_pessimist: sender_based_alloc: mmap: %s",
6869
strerror(errno));
6970
close(sb.sb_fd);
70-
ompi_mpi_abort(MPI_COMM_NULL, MPI_ERR_NO_SPACE);
71+
ompi_mpi_abort(MPI_COMM_SELF, MPI_ERR_NO_SPACE);
7172
}
7273
}
7374

ompi/mpi/c/abort.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
33
* University Research and Technology
44
* Corporation. All rights reserved.
5-
* Copyright (c) 2004-2014 The University of Tennessee and The University
5+
* Copyright (c) 2004-2020 The University of Tennessee and The University
66
* of Tennessee Research Foundation. All rights
77
* reserved.
88
* Copyright (c) 2004-2008 High Performance Computing Center Stuttgart,
@@ -56,6 +56,7 @@ int MPI_Abort(MPI_Comm comm, int errorcode)
5656
opal_show_help("help-mpi-api.txt", "mpi-abort", true,
5757
ompi_comm_rank(comm),
5858
('\0' != comm->c_name[0]) ? comm->c_name : "<Unknown>",
59+
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME),
5960
errorcode);
6061
return ompi_mpi_abort(comm, errorcode);
6162
}

0 commit comments

Comments
 (0)