2
2
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
3
3
* University Research and Technology
4
4
* Corporation. All rights reserved.
5
- * Copyright (c) 2004-2014 The University of Tennessee and The University
5
+ * Copyright (c) 2004-2020 The University of Tennessee and The University
6
6
* of Tennessee Research Foundation. All rights
7
7
* reserved.
8
8
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
47
47
/*
48
48
* Local functions
49
49
*/
50
- static void backend_fatal ( char * type , struct ompi_communicator_t * comm ,
50
+ static void backend_abort ( int fatal , char * type , struct ompi_communicator_t * comm ,
51
51
char * name , int * error_code , va_list arglist );
52
52
static void out (char * str , char * arg );
53
53
@@ -68,7 +68,7 @@ void ompi_mpi_errors_are_fatal_comm_handler(struct ompi_communicator_t **comm,
68
68
name = NULL ;
69
69
abort_comm = NULL ;
70
70
}
71
- backend_fatal ( "communicator" , abort_comm , name , error_code , arglist );
71
+ backend_abort (true, "communicator" , abort_comm , name , error_code , arglist );
72
72
va_end (arglist );
73
73
}
74
74
@@ -89,7 +89,7 @@ void ompi_mpi_errors_are_fatal_file_handler(struct ompi_file_t **file,
89
89
name = NULL ;
90
90
abort_comm = NULL ;
91
91
}
92
- backend_fatal ( "file" , abort_comm , name , error_code , arglist );
92
+ backend_abort (true, "file" , abort_comm , name , error_code , arglist );
93
93
va_end (arglist );
94
94
}
95
95
@@ -108,7 +108,67 @@ void ompi_mpi_errors_are_fatal_win_handler(struct ompi_win_t **win,
108
108
} else {
109
109
name = NULL ;
110
110
}
111
- backend_fatal ("win" , abort_comm , name , error_code , arglist );
111
+ backend_abort (true, "win" , abort_comm , name , error_code , arglist );
112
+ va_end (arglist );
113
+ }
114
+
115
+ void ompi_mpi_errors_abort_comm_handler (struct ompi_communicator_t * * comm ,
116
+ int * error_code , ...)
117
+ {
118
+ char * name ;
119
+ struct ompi_communicator_t * abort_comm ;
120
+ va_list arglist ;
121
+
122
+ va_start (arglist , error_code );
123
+
124
+ if ( (NULL != comm ) && (NULL != * comm ) ) {
125
+ name = (* comm )-> c_name ;
126
+ abort_comm = * comm ;
127
+ } else {
128
+ name = NULL ;
129
+ abort_comm = NULL ;
130
+ }
131
+ backend_abort (false, "communicator" , abort_comm , name , error_code , arglist );
132
+ va_end (arglist );
133
+ }
134
+
135
+
136
+ void ompi_mpi_errors_abort_file_handler (struct ompi_file_t * * file ,
137
+ int * error_code , ...)
138
+ {
139
+ char * name ;
140
+ struct ompi_communicator_t * abort_comm ;
141
+ va_list arglist ;
142
+
143
+ va_start (arglist , error_code );
144
+
145
+ if (NULL != file ) {
146
+ name = (* file )-> f_filename ;
147
+ abort_comm = (* file )-> f_comm ;
148
+ } else {
149
+ name = NULL ;
150
+ abort_comm = NULL ;
151
+ }
152
+ backend_abort (false, "file" , abort_comm , name , error_code , arglist );
153
+ va_end (arglist );
154
+ }
155
+
156
+
157
+ void ompi_mpi_errors_abort_win_handler (struct ompi_win_t * * win ,
158
+ int * error_code , ...)
159
+ {
160
+ char * name ;
161
+ struct ompi_communicator_t * abort_comm = NULL ;
162
+ va_list arglist ;
163
+
164
+ va_start (arglist , error_code );
165
+
166
+ if (NULL != win ) {
167
+ name = (* win )-> w_name ;
168
+ } else {
169
+ name = NULL ;
170
+ }
171
+ backend_abort (false, "win" , abort_comm , name , error_code , arglist );
112
172
va_end (arglist );
113
173
}
114
174
@@ -175,7 +235,7 @@ static void out(char *str, char *arg)
175
235
* there's no need to handle the pre-MPI_INIT and post-MPI_FINALIZE
176
236
* errors here.
177
237
*/
178
- static void backend_fatal_aggregate ( char * type ,
238
+ static void backend_abort_aggregate ( int fatal , char * type ,
179
239
struct ompi_communicator_t * comm ,
180
240
char * name , int * error_code ,
181
241
va_list arglist )
@@ -199,7 +259,7 @@ static void backend_fatal_aggregate(char *type,
199
259
ompi_process_info .nodename ,
200
260
(int ) ompi_process_info .pid ) == -1 ) {
201
261
prefix = NULL ;
202
- // non-fatal , we could still go on to give useful information here...
262
+ // non-fatal, we could still go on to give useful information here...
203
263
opal_output (0 , "%s" , "Could not write node and PID to prefix" );
204
264
opal_output (0 , "Node: %s" , ompi_process_info .nodename );
205
265
opal_output (0 , "PID: %d" , (int ) ompi_process_info .pid );
@@ -224,7 +284,7 @@ static void backend_fatal_aggregate(char *type,
224
284
225
285
if (NULL != name ) {
226
286
opal_show_help ("help-mpi-errors.txt" ,
227
- "mpi_errors_are_fatal" ,
287
+ fatal ? "mpi_errors_are_fatal" : "mpi_errors_abort " ,
228
288
false,
229
289
usable_prefix ,
230
290
(NULL == arg ) ? "" : "in" ,
@@ -267,15 +327,15 @@ static void backend_fatal_aggregate(char *type,
267
327
268
328
/*
269
329
* Note that this function has to handle pre-MPI_INIT and
270
- * post-MPI_FINALIZE errors, which backend_fatal_aggregate () does not
330
+ * post-MPI_FINALIZE errors, which backend_abort_aggregate () does not
271
331
* have to handle.
272
332
*
273
333
* This function also intentionally does not call malloc(), just in
274
334
* case we're being called due to some kind of stack/memory error --
275
335
* we *might* be able to get a message out if we're not further
276
336
* corrupting the stack by calling malloc()...
277
337
*/
278
- static void backend_fatal_no_aggregate ( char * type ,
338
+ static void backend_abort_no_aggregate ( int fatal , char * type ,
279
339
struct ompi_communicator_t * comm ,
280
340
char * name , int * error_code ,
281
341
va_list arglist )
@@ -303,7 +363,7 @@ static void backend_fatal_no_aggregate(char *type,
303
363
"*** Unfortunately, no further information is available on *which* MPI\n"
304
364
"*** function was invoked, sorry. :-(\n" , NULL );
305
365
}
306
- out ("*** Your MPI job will now abort.\n" , NULL );
366
+ if ( fatal ) out ("*** Your MPI job will now abort.\n" , NULL );
307
367
} else if (state >= OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT ) {
308
368
if (NULL != arg ) {
309
369
out ("*** The %s() function was called after MPI_FINALIZE was invoked.\n"
@@ -314,7 +374,7 @@ static void backend_fatal_no_aggregate(char *type,
314
374
"*** Unfortunately, no further information is available on *which* MPI\n"
315
375
"*** function was invoked, sorry. :-(\n" , NULL );
316
376
}
317
- out ("*** Your MPI job will now abort.\n" , NULL );
377
+ if ( fatal ) out ("*** Your MPI job will now abort.\n" , NULL );
318
378
}
319
379
320
380
else {
@@ -365,23 +425,30 @@ static void backend_fatal_no_aggregate(char *type,
365
425
out ("*** Error code: %d (no associated error message)\n" , intbuf );
366
426
}
367
427
}
368
- /* out("*** MPI_ERRORS_ARE_FATAL: your MPI job will now abort\n", NULL); */
369
- out ("*** MPI_ERRORS_ARE_FATAL (processes in this %s will now abort,\n" , type );
370
- out ("*** and potentially your MPI job)\n" , NULL );
371
-
428
+ /* out("*** MPI_ERRORS_ABORT: your MPI job will now abort\n", NULL); */
429
+ if (fatal ) {
430
+ out ("*** MPI_ERRORS_ARE_FATAL (processes in this %s will now abort,\n" , type );
431
+ out ("*** and MPI will try to terminate your MPI job as well)\n" , NULL );
432
+ }
433
+ else {
434
+ out ("*** MPI_ERRORS_ABORT (processes in this %s will now abort,\n" , type );
435
+ out ("*** and potentially the rest of your MPI job)\n" , NULL );
436
+ }
372
437
}
373
438
va_end (arglist );
374
439
}
375
440
376
- static void backend_fatal ( char * type , struct ompi_communicator_t * comm ,
441
+ static void backend_abort ( int fatal , char * type , struct ompi_communicator_t * comm ,
377
442
char * name , int * error_code ,
378
443
va_list arglist )
379
444
{
445
+ int err = MPI_ERR_UNKNOWN ;
446
+
380
447
/* We only want aggregation while the rte is initialized */
381
448
if (ompi_rte_initialized ) {
382
- backend_fatal_aggregate ( type , comm , name , error_code , arglist );
449
+ backend_abort_aggregate ( fatal , type , comm , name , error_code , arglist );
383
450
} else {
384
- backend_fatal_no_aggregate ( type , comm , name , error_code , arglist );
451
+ backend_abort_no_aggregate ( fatal , type , comm , name , error_code , arglist );
385
452
}
386
453
387
454
/* In most instances the communicator will be valid. If not, we are either early in
@@ -392,9 +459,9 @@ static void backend_fatal(char *type, struct ompi_communicator_t *comm,
392
459
comm = & ompi_mpi_comm_self .comm ;
393
460
}
394
461
395
- if (NULL != error_code ) {
396
- ompi_mpi_abort ( comm , * error_code ) ;
397
- } else {
398
- ompi_mpi_abort ( comm , 1 );
399
- }
462
+ if (NULL != error_code )
463
+ err = * error_code ;
464
+
465
+ /* Call abort without a specified comm to force RTE Job termination */
466
+ ompi_mpi_abort ( fatal ? NULL : comm , err );
400
467
}
0 commit comments