Commit f1ba706

Author: David Wootton

    Add testcases for MPI_Exscan, MPI_Reduce_scatter and MPI_Scan

    Add testcase for MPI_Alltoallv.
    Add test_alltoallv.c testcase source missed in previous commit.

    Signed-off-by: David Wootton <dwootton@us.ibm.com>

1 parent 3b931e1
7 files changed: 886 additions, 2 deletions

collective-big-count/Makefile: 20 additions, 0 deletions

@@ -46,11 +46,15 @@ TEST_PAYLOAD_SIZE=2147483647
 ######################################################################
 BINCC = \
 	test_alltoall \
+	test_alltoallv \
 	test_allgather test_allgatherv \
 	test_allreduce \
 	test_bcast \
+	test_exscan \
 	test_gather test_gatherv \
 	test_reduce \
+	test_reduce_scatter \
+	test_scan \
 	test_scatter test_scatterv \
 	diagnostic

@@ -84,10 +88,18 @@ test_alltoall: common.h test_alltoall.c
 	$(CC) $(CC_FLAGS) -DTEST_PAYLOAD_SIZE=$(TEST_PAYLOAD_SIZE) -o $@ -I. test_alltoall.c
 	$(CC) $(CC_FLAGS) -DTEST_UNIFORM_COUNT=$(TEST_UNIFORM_COUNT) -o $@_uniform_count -I. test_alltoall.c

+test_alltoallv: common.h test_alltoallv.c
+	$(CC) $(CC_FLAGS) -DTEST_PAYLOAD_SIZE=$(TEST_PAYLOAD_SIZE) -o $@ -I. test_alltoallv.c
+	$(CC) $(CC_FLAGS) -DTEST_UNIFORM_COUNT=$(TEST_UNIFORM_COUNT) -o $@_uniform_count -I. test_alltoallv.c
+
 test_bcast: common.h test_bcast.c
 	$(CC) $(CC_FLAGS) -DTEST_PAYLOAD_SIZE=$(TEST_PAYLOAD_SIZE) -o $@ -I. test_bcast.c
 	$(CC) $(CC_FLAGS) -DTEST_UNIFORM_COUNT=$(TEST_UNIFORM_COUNT) -o $@_uniform_count -I. test_bcast.c

+test_exscan: common.h test_exscan.c
+	$(CC) $(CC_FLAGS) -DTEST_PAYLOAD_SIZE=$(TEST_PAYLOAD_SIZE) -o $@ -I. test_exscan.c
+	$(CC) $(CC_FLAGS) -DTEST_UNIFORM_COUNT=$(TEST_UNIFORM_COUNT) -o $@_uniform_count -I. test_exscan.c
+
 test_gather: common.h test_gather.c
 	$(CC) $(CC_FLAGS) -DTEST_PAYLOAD_SIZE=$(TEST_PAYLOAD_SIZE) -o $@ -I. test_gather.c
 	$(CC) $(CC_FLAGS) -DTEST_UNIFORM_COUNT=$(TEST_UNIFORM_COUNT) -o $@_uniform_count -I. test_gather.c

@@ -100,6 +112,14 @@ test_reduce: common.h test_reduce.c
 	$(CC) $(CC_FLAGS) -DTEST_PAYLOAD_SIZE=$(TEST_PAYLOAD_SIZE) -o $@ -I. test_reduce.c
 	$(CC) $(CC_FLAGS) -DTEST_UNIFORM_COUNT=$(TEST_UNIFORM_COUNT) -o $@_uniform_count -I. test_reduce.c

+test_reduce_scatter: common.h test_reduce_scatter.c
+	$(CC) $(CC_FLAGS) -DTEST_PAYLOAD_SIZE=$(TEST_PAYLOAD_SIZE) -o $@ -I. test_reduce_scatter.c
+	$(CC) $(CC_FLAGS) -DTEST_UNIFORM_COUNT=$(TEST_UNIFORM_COUNT) -o $@_uniform_count -I. test_reduce_scatter.c
+
+test_scan: common.h test_scan.c
+	$(CC) $(CC_FLAGS) -DTEST_PAYLOAD_SIZE=$(TEST_PAYLOAD_SIZE) -o $@ -I. test_scan.c
+	$(CC) $(CC_FLAGS) -DTEST_UNIFORM_COUNT=$(TEST_UNIFORM_COUNT) -o $@_uniform_count -I. test_scan.c
+
 test_scatter: common.h test_scatter.c
 	$(CC) $(CC_FLAGS) -DTEST_PAYLOAD_SIZE=$(TEST_PAYLOAD_SIZE) -o $@ -I. test_scatter.c
 	$(CC) $(CC_FLAGS) -DTEST_UNIFORM_COUNT=$(TEST_UNIFORM_COUNT) -o $@_uniform_count -I. test_scatter.c
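Every target here follows the suite's existing two-binary convention: the same source is compiled once with -DTEST_PAYLOAD_SIZE (size the test by total payload in bytes) and once, as the _uniform_count binary, with -DTEST_UNIFORM_COUNT (size it by per-rank element count). A minimal sketch of that compile-time switch, assuming only the macro names visible in the rules above:

/* sketch.c -- illustrative only, not part of the commit.
 * Build it the same two ways the Makefile rules do:
 *   cc -DTEST_PAYLOAD_SIZE=2147483647 sketch.c -o sketch
 *   cc -DTEST_UNIFORM_COUNT=<count>   sketch.c -o sketch_uniform_count
 */
#include <stdio.h>

int main(void) {
#ifndef TEST_UNIFORM_COUNT
    /* Default build: the test sizes its buffers from a total payload in bytes. */
    printf("payload-size build: %lld bytes total\n", (long long)TEST_PAYLOAD_SIZE);
#else
    /* _uniform_count build: the test sizes its buffers from a per-rank count. */
    printf("uniform-count build: %lld elements per rank\n", (long long)TEST_UNIFORM_COUNT);
#endif
    return 0;
}

The new test sources added by this commit branch on the same macros.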

collective-big-count/README.md: 0 additions, 2 deletions

@@ -111,5 +111,3 @@ Rank 1: PASSED
 Collectives missing from this test suite:
  * Barrier (N/A)
  * Alltoallv
- * Reduce_scatter
- * Scan / Exscan

collective-big-count/common.h: 2 additions, 0 deletions

@@ -32,6 +32,8 @@
 #include <string.h>
 #include <unistd.h>

+#define PRIME_MODULUS 997
+
 /*
  * Debugging messages
  * 0 = off
collective-big-count/test_alltoallv.c: 278 additions, 0 deletions (new file)

/*
 * Copyright (c) 2022 IBM Corporation. All rights reserved.
 *
 * $COPYRIGHT$
 */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#include <mpi.h>
#include "common.h"

int my_c_test_core(MPI_Datatype dtype, size_t total_num_elements, bool blocking);

int main(int argc, char** argv) {
    // Initialize the MPI environment
    int ret = 0;

    MPI_Init(NULL, NULL);
    init_environment(argc, argv);

    // Run the tests
#ifndef TEST_UNIFORM_COUNT
    // Buffer size: 2 GB
    // V_SIZE_INT tells us how many elements are needed to reach 2GB payload
    // Each rank will send/recv a count of V_SIZE_INT / world_size
    // The function will try to get as close to that as possible.
    //
    // Each rank contributes: V_SIZE_INT / world_size elements
    // Largest buffer is    : V_SIZE_INT elements
    ret += my_c_test_core(MPI_INT, V_SIZE_INT, true);
    ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, V_SIZE_DOUBLE_COMPLEX, true);
    if (allow_nonblocked) {
        ret += my_c_test_core(MPI_INT, V_SIZE_INT, false);
        ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, V_SIZE_DOUBLE_COMPLEX, false);
    }
#else
    size_t proposed_count;

    // Each rank contributes: TEST_UNIFORM_COUNT elements
    // Largest buffer is    : TEST_UNIFORM_COUNT x world_size
    proposed_count = calc_uniform_count(sizeof(int), TEST_UNIFORM_COUNT,
                                        (size_t)world_size, (size_t)world_size);
    ret += my_c_test_core(MPI_INT, proposed_count * (size_t)world_size, true);

    proposed_count = calc_uniform_count(sizeof(double _Complex), TEST_UNIFORM_COUNT,
                                        (size_t)world_size, (size_t)world_size);
    ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, proposed_count * (size_t)world_size, true);
    if (allow_nonblocked) {
        proposed_count = calc_uniform_count(sizeof(int), TEST_UNIFORM_COUNT,
                                            (size_t)world_size, (size_t)world_size);
        ret += my_c_test_core(MPI_INT, proposed_count * (size_t)world_size, false);
        proposed_count = calc_uniform_count(sizeof(double _Complex), TEST_UNIFORM_COUNT,
                                            (size_t)world_size, (size_t)world_size);
        ret += my_c_test_core(MPI_C_DOUBLE_COMPLEX, proposed_count * (size_t)world_size, false);
    }
#endif

    /*
     * All done
     */
    MPI_Finalize();
    return ret;
}

int my_c_test_core(MPI_Datatype dtype, size_t total_num_elements, bool blocking)
{
    int ret = 0;
    size_t i;
    size_t j;

    // Actual payload size as divisible by the sizeof(dt)
    size_t payload_size_actual;
    size_t excess_size_actual;

    int *my_int_recv_vector = NULL;
    int *my_int_send_vector = NULL;
    double _Complex *my_dc_recv_vector = NULL;
    double _Complex *my_dc_send_vector = NULL;
    MPI_Request request;
    int exp;
    size_t num_wrong;
    int excess_count;
    size_t current_base;
    int receive_counts[world_size];
    int receive_offsets[world_size];
    int send_counts[world_size];
    int send_offsets[world_size];
    char *mpi_function = blocking ? "MPI_Alltoallv" : "MPI_Ialltoallv";

    assert(MPI_INT == dtype || MPI_C_DOUBLE_COMPLEX == dtype);
    if (total_num_elements > INT_MAX) {
        total_num_elements = INT_MAX;
    }

    // Force unequal distribution of data across ranks
    if ((total_num_elements % world_size) == 0) {
        total_num_elements = total_num_elements - 1;
    }
    excess_count = total_num_elements % world_size;

    // The value of total_num_elements passed to this function should not
    // exceed INT_MAX. Adding the excess elements that force unequal
    // distribution may push counts past INT_MAX, so the value must be
    // adjusted downward.
    if ((total_num_elements + excess_count) > INT_MAX) {
        total_num_elements = total_num_elements - world_size;
    }

    // Data sent by all ranks to all ranks other than the highest rank is
    // (total_num_elements / world_size) elements. All ranks send that
    // data plus the excess (total_num_elements % world_size) to the
    // highest rank. All ranks must receive exactly the number of elements
    // they were sent.
    current_base = 0;
    for (i = 0; i < world_size; i++) {
        send_counts[i] = total_num_elements / world_size;
        receive_counts[i] = total_num_elements / world_size;
        send_offsets[i] = current_base;
        receive_offsets[i] = current_base;
        current_base = current_base + send_counts[i];
    }
    send_counts[world_size - 1] += excess_count;

    // Since the highest rank receives excess elements due to unequal
    // distribution, its receive counts and receive offsets need to be
    // adjusted by that count.
    if (world_rank == (world_size - 1)) {
        current_base = 0;
        for (i = 0; i < world_size; i++) {
            receive_offsets[i] = current_base;
            receive_counts[i] = (total_num_elements / world_size) + excess_count;
            current_base = current_base + receive_counts[i];
        }
    }

    // Allocate send and receive buffers. The send buffer for each rank is
    // allocated to hold the total_num_elements sent to all ranks. Since
    // total_num_elements is forced to a value not evenly divisible by the
    // world_size, and the excess elements are sent by each rank to the last
    // rank, the receive buffer for the last rank must be larger than the
    // send buffer by excess_count * world_size. For the other ranks,
    // allocating send and receive buffers identically is sufficient.
    if( MPI_INT == dtype ) {
        payload_size_actual = total_num_elements * sizeof(int);
        if (world_rank == (world_size - 1)) {
            excess_size_actual = world_size * excess_count * sizeof(int);
            my_int_recv_vector = (int*)safe_malloc(payload_size_actual +
                                                   excess_size_actual);
        }
        else {
            my_int_recv_vector = (int*)safe_malloc(payload_size_actual);
        }
        my_int_send_vector = (int*)safe_malloc(payload_size_actual);
    } else {
        payload_size_actual = total_num_elements * sizeof(double _Complex);
        if (world_rank == (world_size - 1)) {
            excess_size_actual = world_size * excess_count * sizeof(double _Complex);
            my_dc_recv_vector = (double _Complex*)safe_malloc(payload_size_actual +
                                                              excess_size_actual);
        }
        else {
            my_dc_recv_vector = (double _Complex*)safe_malloc(payload_size_actual);
        }
        my_dc_send_vector = (double _Complex*)safe_malloc(payload_size_actual);
    }

    // Initialize the block of data to be sent to each rank to a unique range
    // of values, using the array index modulo a prime, offset by prime * rank
    if (MPI_INT == dtype) {
        for (i = 0; i < world_size; ++i) {
            for (j = 0; j < send_counts[i]; j++) {
                exp = (j % PRIME_MODULUS) + (PRIME_MODULUS * world_rank);
                my_int_send_vector[j + send_offsets[i]] = exp;
            }
        }
    }
    else {
        for (i = 0; i < world_size; ++i) {
            for (j = 0; j < send_counts[i]; j++) {
                exp = (j % PRIME_MODULUS) + (PRIME_MODULUS * world_rank);
                my_dc_send_vector[j + send_offsets[i]] = (1.0 * exp - 1.0 * exp * I);
            }
        }
    }

    if (world_rank == 0) {
        printf("---------------------\nResults from %s(%s x %zu = %zu or %s):\n",
               mpi_function, (MPI_INT == dtype ? "int" : "double _Complex"),
               total_num_elements, payload_size_actual, human_bytes(payload_size_actual));
    }

    // Perform the MPI_Alltoallv operation
    if (blocking) {
        if( MPI_INT == dtype ) {
            MPI_Alltoallv(my_int_send_vector, send_counts,
                          send_offsets, dtype,
                          my_int_recv_vector, receive_counts,
                          receive_offsets, dtype,
                          MPI_COMM_WORLD);
        } else {
            MPI_Alltoallv(my_dc_send_vector, send_counts,
                          send_offsets, dtype,
                          my_dc_recv_vector, receive_counts,
                          receive_offsets, dtype,
                          MPI_COMM_WORLD);
        }
    }
    else {
        if( MPI_INT == dtype ) {
            MPI_Ialltoallv(my_int_send_vector, send_counts,
                           send_offsets, dtype,
                           my_int_recv_vector, receive_counts,
                           receive_offsets, dtype,
                           MPI_COMM_WORLD, &request);
        } else {
            MPI_Ialltoallv(my_dc_send_vector, send_counts,
                           send_offsets, dtype,
                           my_dc_recv_vector, receive_counts,
                           receive_offsets, dtype,
                           MPI_COMM_WORLD, &request);
        }
        MPI_Wait(&request, MPI_STATUS_IGNORE);
    }

    // Check results. Each receive buffer segment must match the
    // values in the send buffer segment it was sent.
    num_wrong = 0;
    current_base = 0;
    if (MPI_INT == dtype) {
        for (i = 0; i < world_size; i++) {
            for (j = 0; j < receive_counts[i]; j++) {
                exp = (j % PRIME_MODULUS) + (PRIME_MODULUS * i);
                if (my_int_recv_vector[current_base + j] != exp) {
                    num_wrong = num_wrong + 1;
                }
            }
            current_base = current_base + receive_counts[i];
        }
    }
    else {
        for (i = 0; i < world_size; i++) {
            for (j = 0; j < receive_counts[i]; j++) {
                exp = (j % PRIME_MODULUS) + (PRIME_MODULUS * i);
                if (my_dc_recv_vector[current_base + j] != (1.0 * exp - 1.0 * exp * I)) {
                    num_wrong = num_wrong + 1;
                }
            }
            current_base = current_base + receive_counts[i];
        }
    }

    if (0 == num_wrong) {
        printf("Rank %2d: PASSED\n", world_rank);
    } else {
        printf("Rank %2d: ERROR: DI in %14zu of %14zu slots (%6.1f %% wrong)\n", world_rank,
               num_wrong, total_num_elements,
               ((num_wrong * 1.0) / total_num_elements * 100.0));
        ret = 1;
    }

    if (NULL != my_int_send_vector) {
        free(my_int_send_vector);
    }
    if (NULL != my_int_recv_vector) {
        free(my_int_recv_vector);
    }
    if (NULL != my_dc_send_vector) {
        free(my_dc_send_vector);
    }
    if (NULL != my_dc_recv_vector) {
        free(my_dc_recv_vector);
    }

    fflush(NULL);
    MPI_Barrier(MPI_COMM_WORLD);

    return ret;
}
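Because MPI_Alltoallv is the one collective in this commit whose counts and displacements vary per peer, the shape of the test is worth isolating. Below is a standalone sketch, not part of the commit, of the same uneven-distribution scheme at a deliberately tiny scale; the buffer names and the n = 10 total are assumptions for illustration:

/* Sketch of the scheme test_alltoallv.c uses: every rank sends n/size
 * elements to each peer, and everyone sends the remainder to the last
 * rank, which widens its receive counts/displacements to match. */
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv) {
    MPI_Init(&argc, &argv);
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    int n = 10;                    /* elements each rank sends in total (assumed) */
    if (n % size == 0) n -= 1;     /* force an uneven split, as the test does */
    int per_peer = n / size;
    int excess = n % size;

    int *scounts = malloc(size * sizeof(int)), *sdispls = malloc(size * sizeof(int));
    int *rcounts = malloc(size * sizeof(int)), *rdispls = malloc(size * sizeof(int));
    int base = 0;
    for (int i = 0; i < size; i++) {
        scounts[i] = rcounts[i] = per_peer;
        sdispls[i] = rdispls[i] = base;
        base += per_peer;
    }
    scounts[size - 1] += excess;   /* everyone sends the remainder to the last rank */
    if (rank == size - 1) {        /* ...so the last rank expects it from everyone */
        base = 0;
        for (int i = 0; i < size; i++) {
            rcounts[i] = per_peer + excess;
            rdispls[i] = base;
            base += rcounts[i];
        }
    }

    int *sendbuf = malloc(n * sizeof(int));
    int *recvbuf = malloc((per_peer + excess) * size * sizeof(int));
    for (int i = 0; i < n; i++) sendbuf[i] = rank;   /* trivially checkable payload */

    MPI_Alltoallv(sendbuf, scounts, sdispls, MPI_INT,
                  recvbuf, rcounts, rdispls, MPI_INT, MPI_COMM_WORLD);

    int total = 0;
    for (int i = 0; i < size; i++) total += rcounts[i];
    printf("rank %d received %d elements\n", rank, total);

    free(sendbuf); free(recvbuf);
    free(scounts); free(sdispls); free(rcounts); free(rdispls);
    MPI_Finalize();
    return 0;
}

The nonblocking path in the test differs only in substituting MPI_Ialltoallv plus MPI_Wait for the single blocking call.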
