Skip to content

Commit e2f53b7

Browse files
committed
Add a tester for the ERRORS_ABORT and communicator abort features
Signed-off-by: Aurélien Bouteiller <bouteill@icl.utk.edu>
1 parent 9c22ad8 commit e2f53b7

File tree

2 files changed

+175
-1
lines changed

2 files changed

+175
-1
lines changed

test/simple/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort simple_spawn \
1+
PROGS = mpi_no_op mpi_barrier hello hello_nodename abort multi_abort comm_abort simple_spawn \
22
concurrent_spawn spawn_multiple mpi_spin delayed_abort loop_spawn loop_child \
33
bad_exit pubsub hello_barrier segv accept connect hello_output hello_show_help \
44
crisscross read_write ziatest slave reduce-hang ziaprobe ziatest bcast_loop \

test/simple/comm_abort.c

Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
/* -*- C -*-
2+
* Copyright (c) 2020 The University of Tennessee and The University
3+
* of Tennessee Research Foundation. All rights
4+
* reserved.
5+
* $COPYRIGHT$
6+
*
7+
* Additional copyrights may follow
8+
*
9+
* $HEADER$
10+
*
11+
* Test aborting communicators
12+
*/
13+
14+
#include <stdio.h>
15+
#include <unistd.h>
16+
#include "mpi.h"
17+
18+
/* Print a printf-style message from rank 0 only. Expects an `int rank`
 * variable to be in scope at the call site. Uses standard C99 variadic macro
 * syntax and is wrapped in do/while(0) so that it behaves as a single
 * statement (an `else` following a print1(...) call cannot attach to the
 * macro's internal `if`). */
#define print1(...) do { if(0 == rank) printf(__VA_ARGS__); } while(0)
19+
20+
21+
int main(int argc, char* argv[])
22+
{
23+
int rank, size, more;
24+
double start, now;
25+
MPI_Comm comm_pair_fatal, comm_pair_return, comm_pair_abort;
26+
27+
MPI_Init(&argc, &argv);
28+
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
29+
MPI_Comm_size(MPI_COMM_WORLD, &size);
30+
31+
if(0 == rank && size%2) {
32+
fprintf(stderr, "This test requires an even number of processes\n\n");
33+
MPI_Abort(MPI_COMM_WORLD, size);
34+
}
35+
36+
/* Setup: split our world in a set of 2-processes islands */
37+
MPI_Comm_split(MPI_COMM_WORLD, rank/2, rank, &comm_pair_fatal);
38+
MPI_Comm_set_errhandler(comm_pair_fatal, MPI_ERRORS_ARE_FATAL);
39+
MPI_Comm_split(MPI_COMM_WORLD, rank/2, rank, &comm_pair_return);
40+
MPI_Comm_set_errhandler(comm_pair_return, MPI_ERRORS_RETURN);
41+
MPI_Comm_split(MPI_COMM_WORLD, rank/2, rank, &comm_pair_abort);
42+
/* If this code fails to compile, the MPI implementation is not compliant
43+
* with MPI-4 (TODO: add ref to chapter/line when MPI-4 published). */
44+
MPI_Comm_set_errhandler(comm_pair_abort, MPI_ERRORS_ABORT);
45+
MPI_Barrier(MPI_COMM_WORLD);
46+
47+
print1(
48+
"This program will test partial abort functionality (communicator scoped abort).\n"
49+
" Each test will perform a loop of communication on a subcommunicator for about\n"
50+
" 1 second between printouts, and then, a 1 second cooldown.\n");
51+
52+
print1("\n\n"
53+
"Test1: MPI_Abort(MPI_COMM_SELF) aborts only one process?\n"
54+
" In a high quality implementation, all ranks except %d\n"
55+
" should report their presence.\n", 1);
56+
if(rank == 1) {
57+
MPI_Abort(MPI_COMM_SELF, 1);
58+
}
59+
/* Spin on communication for 1 second to let time for Abort to have an
60+
* effect, if any. */
61+
more = 1; start = MPI_Wtime();
62+
do {
63+
now = MPI_Wtime();
64+
if(now - start > 1.) more = 0;
65+
if(rank > 1) /* don't reduce on aborted pairs */
66+
MPI_Allreduce(MPI_IN_PLACE, &more, 1, MPI_INT, MPI_MIN, comm_pair_fatal);
67+
} while(more);
68+
printf(" This is rank %d: still kickin after %d MPI_Abort'ed self\n", rank, 1);
69+
70+
sleep(1);
71+
print1("===============================================================\n");
72+
73+
print1("\n\n"
74+
"Test2: MPI_Abort(comm) aborts all processes in comm?\n"
75+
" In a high quality implementation, all ranks except %d--%d\n"
76+
" should report their presence.\n", 1, 3);
77+
if(rank == 3) {
78+
MPI_Abort(comm_pair_return, 2);
79+
}
80+
/* Spin on communication for 1 second to let time for Abort to have an
81+
* effect, if any. */
82+
more = 1; start = MPI_Wtime();
83+
do {
84+
now = MPI_Wtime();
85+
if(now - start > 1.) more = 0;
86+
if(rank > 3) /* don't reduce on aborted pairs */
87+
MPI_Allreduce(MPI_IN_PLACE, &more, 1, MPI_INT, MPI_MIN, comm_pair_fatal);
88+
} while(more);
89+
printf(" This is rank %d: still kickin after %d aborted comm pair %d-%d\n", rank, 3, 2, 3);
90+
91+
/* This process should have aborted, give it an opportunity to do so if no
92+
* async progress: message to self to spin MPI progress. */
93+
if(rank == 2) {
94+
MPI_Sendrecv(&start, 1, MPI_DOUBLE, 0, 0,
95+
&now, 1, MPI_DOUBLE, 0, 0,
96+
MPI_COMM_SELF, MPI_STATUS_IGNORE);
97+
printf(" This is rank %d: ERROR: I SHOULD HAVE ABORTED!\n", 2);
98+
}
99+
100+
sleep(1);
101+
print1("===============================================================\n");
102+
103+
print1("\n\n"
104+
"Test3: MPI_ERRORS_ABORT aborts all processes in comm?\n"
105+
" In a high quality implementation, all ranks except %d--%d\n"
106+
" should report their presence.\n", 1, 5);
107+
if(rank == 5) {
108+
MPI_Comm_call_errhandler(comm_pair_abort, 3);
109+
}
110+
/* Spin on communication for 1 second to let time for Abort to have an
111+
* effect, if any. */
112+
more = 1; start = MPI_Wtime();
113+
do {
114+
now = MPI_Wtime();
115+
if(now - start > 1.) more = 0;
116+
if(rank > 5) /* don't reduce on aborted pairs */
117+
MPI_Allreduce(MPI_IN_PLACE, &more, 1, MPI_INT, MPI_MIN, comm_pair_fatal);
118+
} while(more);
119+
printf(" This is rank %d: still kickin after %d aborted comm pair %d-%d\n", rank, 5, 4, 5);
120+
121+
/* This process should have aborted, give it an opportunity to do so if no
122+
* async progress: message to self to spin MPI progress. */
123+
if(rank == 4) {
124+
MPI_Sendrecv(&start, 1, MPI_DOUBLE, 0, 0,
125+
&now, 1, MPI_DOUBLE, 0, 0,
126+
MPI_COMM_SELF, MPI_STATUS_IGNORE);
127+
printf(" This is rank %d: ERROR: I SHOULD HAVE ABORTED!\n", 4);
128+
}
129+
130+
sleep(1);
131+
print1("===============================================================\n");
132+
133+
print1("\n\n"\
134+
"Test4: Communicating with an aborted process %d returns a good error code?\n"
135+
" In a high quality implementation, rank %d should print an error string;\n"
136+
" In a higher quality implementation the error should be of class\n"
137+
" MPI_ERR_PROC_ABORTED.\n", 1, 0);
138+
if(rank == 0) {
139+
int err, class, slen;
140+
char str[MPI_MAX_ERROR_STRING];
141+
/* remember, 1 aborted in test1 */
142+
MPI_Error_class(err, &class);
143+
MPI_Error_string(err, str, &slen);
144+
err = MPI_Recv(&more, 1, MPI_INT, 1, 0, comm_pair_return, MPI_STATUS_IGNORE);
145+
printf(" This is rank %d: Recv(from=%d) returned code=%d: class=%d: %s\n", 0, 1, err, class, str);
146+
}
147+
148+
sleep(1);
149+
print1("===============================================================\n");
150+
151+
print1("\n\n"
152+
"Test5: MPI_ERRORS_ARE_FATAL aborts all processes?\n");
153+
if(rank == 0) {
154+
MPI_Comm_call_errhandler(comm_pair_fatal, 5);
155+
}
156+
/* Spin on communication for 1 second to let time for Abort to have an
157+
* effect, if any. */
158+
more = 1; start = MPI_Wtime();
159+
do {
160+
now = MPI_Wtime();
161+
if(now - start > 1.) more = 0;
162+
if(rank > 5) /* don't reduce on aborted pairs */
163+
MPI_Allreduce(MPI_IN_PLACE, &more, 1, MPI_INT, MPI_MIN, comm_pair_fatal);
164+
} while(more);
165+
MPI_Sendrecv(&start, 1, MPI_DOUBLE, 0, 0,
166+
&now, 1, MPI_DOUBLE, 0, 0,
167+
MPI_COMM_SELF, MPI_STATUS_IGNORE);
168+
printf(" This is rank %d: ERROR: I SHOULD HAVE ABORTED!\n", rank);
169+
170+
/* Should never get there */
171+
172+
MPI_Finalize();
173+
return 0;
174+
}

0 commit comments

Comments
 (0)