Skip to content

Commit 9c22ad8

Browse files
committed
Add the MPI_ERRORS_ABORT predefined error handler (conformance with
mpi-next) Signed-off-by: Aurélien Bouteiller <bouteill@icl.utk.edu> Ordering must match fortran definition index for errhandlers, and we don't want to change the old ones. Signed-off-by: Aurélien Bouteiller <bouteill@icl.utk.edu>
1 parent 6a3e781 commit 9c22ad8

15 files changed

+164
-43
lines changed

contrib/check_unnecessary_headers.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
44
# University Research and Technology
55
# Corporation. All rights reserved.
6-
# Copyright (c) 2004-2005 The University of Tennessee and The University
6+
# Copyright (c) 2004-2020 The University of Tennessee and The University
77
# of Tennessee Research Foundation. All rights
88
# reserved.
99
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -186,7 +186,7 @@ SEARCH_HEADER[5]="ompi/datatype/convertor.h OMPI_COMM_INTER OMPI_COMM_CART OMPI_
186186
SEARCH_HEADER[6]="ompi/datatype/datatype.h MPI_Datatype DT_MAX_PREDEFINED DT_FLAG_ MAX_DT_COMPONENT_COUNT opal_ddt_count_t dt_type_desc_t ompi_datatype_t ompi_predefined_datatype_t ompi_ddt_init ompi_ddt_finalize ompi_ddt_create_ ompi_ddt_duplicate ompi_ddt_is_predefined ompi_ddt_create_from_packed_description"
187187
SEARCH_HEADER[7]="ompi/datatype/datatype_internal.h DDT_DUMP_STACK DT_ ddt_elem_id_description ddt_elem_desc ddt_elem_desc_t ddt_loop_desc ddt_loop_desc_t ddt_endloop_desc ddt_endloop_desc_t dt_elem_desc CREATE_LOOP_START CREATE_LOOP_END CREATE_ELEM ompi_complex_float_t ompi_complex_double_t ompi_complex_long_double_t ompi_ddt_basicDatatypes BASIC_DDT_FROM_ELEM ompi_ddt_default_convertors_init ompi_ddt_default_convertors_fini SAVE_STACK PUSH_STACK ompi_ddt_safeguard_pointer_debug_breakpoint OMPI_DDT_SAFEGUARD_POINTER GET_FIRST_NON_LOOP UPDATE_INTERNAL_COUNTERS ompi_ddt_print_args"
188188
SEARCH_HEADER[8]="ompi/errhandler/errhandler.h OMPI_ERRHANDLER_LANG_ ompi_errhandler_lang_t OMPI_ERRHANDLER_TYPE_ ompi_errhandler_type_t ompi_errhandler_t ompi_predefined_errhandler_t ompi_mpi_errhandler_null OMPI_ERRHANDLER_CHECK OMPI_ERRHANDLER_RETURN ompi_errhandler_init ompi_errhandler_finalize OMPI_ERRHANDLER_INVOKE ompi_errhandler_invoke ompi_errhandler_request_invoke ompi_errhandler_create ompi_errhandler_is_intrinsic ompi_errhandler_fortran_handler_fn_t OMPI_ERR_INIT_FINALIZE MPI_Errhandler"
189-
SEARCH_HEADER[9]="ompi/errhandler/errhandler_predefined.h ompi_mpi_errors_are_fatal_ ompi_mpi_errors_return_ ompi_mpi_errors_throw_exceptions"
189+
SEARCH_HEADER[9]="ompi/errhandler/errhandler_predefined.h ompi_mpi_errors_are_fatal_ ompi_mpi_errors_return_ ompi_mpi_errors_abort_ ompi_mpi_errors_throw_exceptions"
190190
###
191191
SEARCH_HEADER[10]="ompi/file/file.h OMPI_FILE_ISCLOSED OMPI_FILE_HIDDEN ompi_file_t ompi_predefined_file_t ompi_mpi_file_null ompi_file_f_to_c_table ompi_file_init ompi_file_open ompi_file_set_name ompi_file_close ompi_file_finalize ompi_file_invalid MPI_File MPI_FILE_NULL ompi_mpi_cxx_file_errhandler_invoke" # THE LAST ONE WAS FOR THE CXX INTERFACE
192192
SEARCH_HEADER[11]="ompi/group/group.h ompi_group_sporadic_list_t ompi_group_sporadic_data_t ompi_group_strided_data_t ompi_group_bitmap_data_t ompi_group_t ompi_predefined_group_t OMPI_GROUP_ ompi_group_f_to_c_table ompi_mpi_group_null ompi_group_allocate ompi_group_increment_proc_count ompi_group_decrement_proc_count ompi_group_size ompi_group_rank ompi_set_group_rank ompi_group_translate_ranks ompi_group_free ompi_group_get_proc_ptr ompi_group_calc_ ompi_group_peer_lookup ompi_group_div_ceil MPI_Group"

ompi/debuggers/MPI_Handles_interface.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -314,8 +314,8 @@ C++: MPI::Errhandler
314314

315315
MPI allows applications to define their own error handlers. The
316316
default error handler is to abort the MPI job. Error handlers can be
317-
attached to communicators, files, and windows. There are 3 predefined
318-
error handlers (MPI_ERRORS_ARE_FATAL, MPI_ERRORS_RETURN,
317+
attached to communicators, files, and windows. There are 4 predefined
318+
error handlers (MPI_ERRORS_ARE_FATAL, MPI_ERRORS_RETURN, MPI_ERRORS_ABORT,
319319
MPI::ERRORS_THROW_EXCEPTIONS), and applications can create their own
320320
error handlers.
321321

ompi/debuggers/ompi_mpihandles_dll.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
22
* Copyright (c) 2007-2008 Cisco Systems, Inc. All rights reserved.
3-
* Copyright (c) 2004-2013 The University of Tennessee and The University
3+
* Copyright (c) 2004-2020 The University of Tennessee and The University
44
* of Tennessee Research Foundation. All rights
55
* reserved.
66
* Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
@@ -384,6 +384,8 @@ int mpidbg_init_per_process(mqs_process *process,
384384
int i = 0;
385385
fill_map(image, "MPI_ERRORS_ARE_FATAL", "ompi_mpi_errors_are_fatal",
386386
&mpidbg_errhandler_name_map[i++]);
387+
fill_map(image, "MPI_ERRORS_ABORT", "ompi_mpi_errors_abort",
388+
&mpidbg_errhandler_name_map[i++]);
387389
fill_map(image, "MPI_ERRORS_RETURN", "ompi_mpi_errors_return",
388390
&mpidbg_errhandler_name_map[i++]);
389391
fill_map(image, "MPI_ERRHANDLER_NULL", "ompi_mpi_errhandler_null",

ompi/errhandler/errhandler.c

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
44
* University Research and Technology
55
* Corporation. All rights reserved.
6-
* Copyright (c) 2004-2017 The University of Tennessee and The University
6+
* Copyright (c) 2004-2020 The University of Tennessee and The University
77
* of Tennessee Research Foundation. All rights
88
* reserved.
99
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -68,6 +68,9 @@ ompi_predefined_errhandler_t *ompi_mpi_errhandler_null_addr =
6868
ompi_predefined_errhandler_t ompi_mpi_errors_are_fatal = {{{0}}};
6969
ompi_predefined_errhandler_t *ompi_mpi_errors_are_fatal_addr =
7070
&ompi_mpi_errors_are_fatal;
71+
ompi_predefined_errhandler_t ompi_mpi_errors_abort = {{{0}}};
72+
ompi_predefined_errhandler_t *ompi_mpi_errors_abort_addr =
73+
&ompi_mpi_errors_abort;
7174
ompi_predefined_errhandler_t ompi_mpi_errors_return = {{{0}}};
7275
ompi_predefined_errhandler_t *ompi_mpi_errors_return_addr =
7376
&ompi_mpi_errors_return;
@@ -127,6 +130,19 @@ int ompi_errhandler_init(void)
127130
opal_string_copy(ompi_mpi_errors_return.eh.eh_name, "MPI_ERRORS_RETURN",
128131
sizeof(ompi_mpi_errors_return.eh.eh_name));
129132

133+
OBJ_CONSTRUCT( &ompi_mpi_errors_abort.eh, ompi_errhandler_t );
134+
if( ompi_mpi_errors_abort.eh.eh_f_to_c_index != OMPI_ERRORS_ABORT_FORTRAN )
135+
return OMPI_ERROR;
136+
ompi_mpi_errors_abort.eh.eh_mpi_object_type = OMPI_ERRHANDLER_TYPE_PREDEFINED;
137+
ompi_mpi_errors_abort.eh.eh_lang = OMPI_ERRHANDLER_LANG_C;
138+
ompi_mpi_errors_abort.eh.eh_comm_fn = ompi_mpi_errors_abort_comm_handler;
139+
ompi_mpi_errors_abort.eh.eh_file_fn = ompi_mpi_errors_abort_file_handler;
140+
ompi_mpi_errors_abort.eh.eh_win_fn = ompi_mpi_errors_abort_win_handler ;
141+
ompi_mpi_errors_abort.eh.eh_fort_fn = NULL;
142+
opal_string_copy(ompi_mpi_errors_abort.eh.eh_name,
143+
"MPI_ERRORS_ABORT",
144+
sizeof(ompi_mpi_errors_abort.eh.eh_name));
145+
130146
/* If we're going to use C++, functions will be fixed up during
131147
MPI::Init. Note that it is proper to use ERRHANDLER_LANG_C here;
132148
the dispatch function is in C (although in libmpi_cxx); the

ompi/errhandler/errhandler.h

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
44
* University Research and Technology
55
* Corporation. All rights reserved.
6-
* Copyright (c) 2004-2011 The University of Tennessee and The University
6+
* Copyright (c) 2004-2020 The University of Tennessee and The University
77
* of Tennessee Research Foundation. All rights
88
* reserved.
99
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -49,7 +49,8 @@ BEGIN_C_DECLS
4949
enum {
5050
OMPI_ERRHANDLER_NULL_FORTRAN = 0,
5151
OMPI_ERRORS_ARE_FATAL_FORTRAN,
52-
OMPI_ERRORS_RETURN_FORTRAN
52+
OMPI_ERRORS_RETURN_FORTRAN,
53+
OMPI_ERRORS_ABORT_FORTRAN,
5354
};
5455

5556

@@ -167,6 +168,12 @@ OMPI_DECLSPEC extern ompi_predefined_errhandler_t *ompi_mpi_errors_are_fatal_add
167168
OMPI_DECLSPEC extern ompi_predefined_errhandler_t ompi_mpi_errors_return;
168169
OMPI_DECLSPEC extern ompi_predefined_errhandler_t *ompi_mpi_errors_return_addr;
169170

171+
/*
172+
* Global variable for MPI_ERRORS_ABORT (_addr flavor is for F03 bindings)
173+
*/
174+
OMPI_DECLSPEC extern ompi_predefined_errhandler_t ompi_mpi_errors_abort;
175+
OMPI_DECLSPEC extern ompi_predefined_errhandler_t *ompi_mpi_errors_abort_addr;
176+
170177
/**
171178
* Global variable for MPI::ERRORS_THROW_EXCEPTIONS. Will abort if
172179
* MPI_INIT wasn't called as MPI::INIT (_addr flavor is for F03 bindings)

ompi/errhandler/errhandler_invoke.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
44
* University Research and Technology
55
* Corporation. All rights reserved.
6-
* Copyright (c) 2004-2005 The University of Tennessee and The University
6+
* Copyright (c) 2004-2020 The University of Tennessee and The University
77
* of Tennessee Research Foundation. All rights
88
* reserved.
99
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -41,10 +41,10 @@ int ompi_errhandler_invoke(ompi_errhandler_t *errhandler, void *mpi_object,
4141
ompi_win_t *win;
4242
ompi_file_t *file;
4343

44-
/* If we got no errorhandler, then just invoke errors_abort */
44+
/* If we got no errorhandler, then just invoke errors_are_fatal */
4545
if (NULL == errhandler) {
4646
ompi_mpi_errors_are_fatal_comm_handler(NULL, NULL, message);
47-
return err_code;
47+
return err_code;
4848
}
4949

5050
/* Figure out what kind of errhandler it is, figure out if it's

ompi/errhandler/errhandler_predefined.c

Lines changed: 91 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
33
* University Research and Technology
44
* Corporation. All rights reserved.
5-
* Copyright (c) 2004-2014 The University of Tennessee and The University
5+
* Copyright (c) 2004-2020 The University of Tennessee and The University
66
* of Tennessee Research Foundation. All rights
77
* reserved.
88
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -47,7 +47,7 @@
4747
/*
4848
* Local functions
4949
*/
50-
static void backend_fatal(char *type, struct ompi_communicator_t *comm,
50+
static void backend_abort(int fatal, char *type, struct ompi_communicator_t *comm,
5151
char *name, int *error_code, va_list arglist);
5252
static void out(char *str, char *arg);
5353

@@ -68,7 +68,7 @@ void ompi_mpi_errors_are_fatal_comm_handler(struct ompi_communicator_t **comm,
6868
name = NULL;
6969
abort_comm = NULL;
7070
}
71-
backend_fatal("communicator", abort_comm, name, error_code, arglist);
71+
backend_abort(true, "communicator", abort_comm, name, error_code, arglist);
7272
va_end(arglist);
7373
}
7474

@@ -89,7 +89,7 @@ void ompi_mpi_errors_are_fatal_file_handler(struct ompi_file_t **file,
8989
name = NULL;
9090
abort_comm = NULL;
9191
}
92-
backend_fatal("file", abort_comm, name, error_code, arglist);
92+
backend_abort(true, "file", abort_comm, name, error_code, arglist);
9393
va_end(arglist);
9494
}
9595

@@ -108,7 +108,67 @@ void ompi_mpi_errors_are_fatal_win_handler(struct ompi_win_t **win,
108108
} else {
109109
name = NULL;
110110
}
111-
backend_fatal("win", abort_comm, name, error_code, arglist);
111+
backend_abort(true, "win", abort_comm, name, error_code, arglist);
112+
va_end(arglist);
113+
}
114+
115+
void ompi_mpi_errors_abort_comm_handler(struct ompi_communicator_t **comm,
116+
int *error_code, ...)
117+
{
118+
char *name;
119+
struct ompi_communicator_t *abort_comm;
120+
va_list arglist;
121+
122+
va_start(arglist, error_code);
123+
124+
if ( (NULL != comm) && (NULL != *comm) ) {
125+
name = (*comm)->c_name;
126+
abort_comm = *comm;
127+
} else {
128+
name = NULL;
129+
abort_comm = NULL;
130+
}
131+
backend_abort(false, "communicator", abort_comm, name, error_code, arglist);
132+
va_end(arglist);
133+
}
134+
135+
136+
void ompi_mpi_errors_abort_file_handler(struct ompi_file_t **file,
137+
int *error_code, ...)
138+
{
139+
char *name;
140+
struct ompi_communicator_t *abort_comm;
141+
va_list arglist;
142+
143+
va_start(arglist, error_code);
144+
145+
if (NULL != file) {
146+
name = (*file)->f_filename;
147+
abort_comm = (*file)->f_comm;
148+
} else {
149+
name = NULL;
150+
abort_comm = NULL;
151+
}
152+
backend_abort(false, "file", abort_comm, name, error_code, arglist);
153+
va_end(arglist);
154+
}
155+
156+
157+
void ompi_mpi_errors_abort_win_handler(struct ompi_win_t **win,
158+
int *error_code, ...)
159+
{
160+
char *name;
161+
struct ompi_communicator_t *abort_comm = NULL;
162+
va_list arglist;
163+
164+
va_start(arglist, error_code);
165+
166+
if (NULL != win) {
167+
name = (*win)->w_name;
168+
} else {
169+
name = NULL;
170+
}
171+
backend_abort(false, "win", abort_comm, name, error_code, arglist);
112172
va_end(arglist);
113173
}
114174

@@ -175,7 +235,7 @@ static void out(char *str, char *arg)
175235
* there's no need to handle the pre-MPI_INIT and post-MPI_FINALIZE
176236
* errors here.
177237
*/
178-
static void backend_fatal_aggregate(char *type,
238+
static void backend_abort_aggregate(int fatal, char *type,
179239
struct ompi_communicator_t *comm,
180240
char *name, int *error_code,
181241
va_list arglist)
@@ -199,7 +259,7 @@ static void backend_fatal_aggregate(char *type,
199259
ompi_process_info.nodename,
200260
(int) ompi_process_info.pid) == -1) {
201261
prefix = NULL;
202-
// non-fatal, we could still go on to give useful information here...
262+
// non-fatal, we could still go on to give useful information here...
203263
opal_output(0, "%s", "Could not write node and PID to prefix");
204264
opal_output(0, "Node: %s", ompi_process_info.nodename);
205265
opal_output(0, "PID: %d", (int) ompi_process_info.pid);
@@ -224,7 +284,7 @@ static void backend_fatal_aggregate(char *type,
224284

225285
if (NULL != name) {
226286
opal_show_help("help-mpi-errors.txt",
227-
"mpi_errors_are_fatal",
287+
fatal? "mpi_errors_are_fatal": "mpi_errors_abort",
228288
false,
229289
usable_prefix,
230290
(NULL == arg) ? "" : "in",
@@ -267,15 +327,15 @@ static void backend_fatal_aggregate(char *type,
267327

268328
/*
269329
* Note that this function has to handle pre-MPI_INIT and
270-
* post-MPI_FINALIZE errors, which backend_fatal_aggregate() does not
330+
* post-MPI_FINALIZE errors, which backend_abort_aggregate() does not
271331
* have to handle.
272332
*
273333
* This function also intentionally does not call malloc(), just in
274334
* case we're being called due to some kind of stack/memory error --
275335
* we *might* be able to get a message out if we're not further
276336
* corrupting the stack by calling malloc()...
277337
*/
278-
static void backend_fatal_no_aggregate(char *type,
338+
static void backend_abort_no_aggregate(int fatal, char *type,
279339
struct ompi_communicator_t *comm,
280340
char *name, int *error_code,
281341
va_list arglist)
@@ -303,7 +363,7 @@ static void backend_fatal_no_aggregate(char *type,
303363
"*** Unfortunately, no further information is available on *which* MPI\n"
304364
"*** function was invoked, sorry. :-(\n", NULL);
305365
}
306-
out("*** Your MPI job will now abort.\n", NULL);
366+
if(fatal) out("*** Your MPI job will now abort.\n", NULL);
307367
} else if (state >= OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) {
308368
if (NULL != arg) {
309369
out("*** The %s() function was called after MPI_FINALIZE was invoked.\n"
@@ -314,7 +374,7 @@ static void backend_fatal_no_aggregate(char *type,
314374
"*** Unfortunately, no further information is available on *which* MPI\n"
315375
"*** function was invoked, sorry. :-(\n", NULL);
316376
}
317-
out("*** Your MPI job will now abort.\n", NULL);
377+
if(fatal) out("*** Your MPI job will now abort.\n", NULL);
318378
}
319379

320380
else {
@@ -365,23 +425,30 @@ static void backend_fatal_no_aggregate(char *type,
365425
out("*** Error code: %d (no associated error message)\n", intbuf);
366426
}
367427
}
368-
/* out("*** MPI_ERRORS_ARE_FATAL: your MPI job will now abort\n", NULL); */
369-
out("*** MPI_ERRORS_ARE_FATAL (processes in this %s will now abort,\n", type);
370-
out("*** and potentially your MPI job)\n", NULL);
371-
428+
/* out("*** MPI_ERRORS_ABORT: your MPI job will now abort\n", NULL); */
429+
if(fatal) {
430+
out("*** MPI_ERRORS_ARE_FATAL (processes in this %s will now abort,\n", type);
431+
out("*** and MPI will try to terminate your MPI job as well)\n", NULL);
432+
}
433+
else {
434+
out("*** MPI_ERRORS_ABORT (processes in this %s will now abort,\n", type);
435+
out("*** and potentially the rest of your MPI job)\n", NULL);
436+
}
372437
}
373438
va_end(arglist);
374439
}
375440

376-
static void backend_fatal(char *type, struct ompi_communicator_t *comm,
441+
static void backend_abort(int fatal, char *type, struct ompi_communicator_t *comm,
377442
char *name, int *error_code,
378443
va_list arglist)
379444
{
445+
int err = MPI_ERR_UNKNOWN;
446+
380447
/* We only want aggregation while the rte is initialized */
381448
if (ompi_rte_initialized) {
382-
backend_fatal_aggregate(type, comm, name, error_code, arglist);
449+
backend_abort_aggregate(fatal, type, comm, name, error_code, arglist);
383450
} else {
384-
backend_fatal_no_aggregate(type, comm, name, error_code, arglist);
451+
backend_abort_no_aggregate(fatal, type, comm, name, error_code, arglist);
385452
}
386453

387454
/* In most instances the communicator will be valid. If not, we are either early in
@@ -392,9 +459,9 @@ static void backend_fatal(char *type, struct ompi_communicator_t *comm,
392459
comm = &ompi_mpi_comm_self.comm;
393460
}
394461

395-
if (NULL != error_code) {
396-
ompi_mpi_abort(comm, *error_code);
397-
} else {
398-
ompi_mpi_abort(comm, 1);
399-
}
462+
if (NULL != error_code)
463+
err = *error_code;
464+
465+
/* When fatal, call abort without a specified comm to force RTE job termination; otherwise abort on comm */
466+
ompi_mpi_abort(fatal? NULL: comm, err);
400467
}

ompi/errhandler/errhandler_predefined.h

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
33
* University Research and Technology
44
* Corporation. All rights reserved.
5-
* Copyright (c) 2004-2005 The University of Tennessee and The University
5+
* Copyright (c) 2004-2020 The University of Tennessee and The University
66
* of Tennessee Research Foundation. All rights
77
* reserved.
88
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -35,6 +35,16 @@ OMPI_DECLSPEC void ompi_mpi_errors_are_fatal_file_handler(struct ompi_file_t **f
3535
OMPI_DECLSPEC void ompi_mpi_errors_are_fatal_win_handler(struct ompi_win_t **win,
3636
int *error_code, ...);
3737

38+
/**
39+
* Handler function for MPI_ERRORS_ABORT
40+
*/
41+
OMPI_DECLSPEC void ompi_mpi_errors_abort_comm_handler(struct ompi_communicator_t **comm,
42+
int *error_code, ...);
43+
OMPI_DECLSPEC void ompi_mpi_errors_abort_file_handler(struct ompi_file_t **file,
44+
int *error_code, ...);
45+
OMPI_DECLSPEC void ompi_mpi_errors_abort_win_handler(struct ompi_win_t **win,
46+
int *error_code, ...);
47+
3848
/**
3949
* Handler function for MPI_ERRORS_RETURN
4050
*/

0 commit comments

Comments
 (0)