Skip to content

Commit 4a8a330

Browse files
committed
common/ompio: use avg. file view size in the aggregator selection logic
This is a fix based on a bug report on GitHub/the mailing list from CGNS. The core of the problem was that different processes entered different branches of our aggregator selection logic, because on some processes the file_view size matched the contiguous chunk size (thus assuming a 1-D distribution), while on other processes it did not (thus assuming a 2-D distribution). The fix is to calculate the avg. file view size across all processes and use this value, thus ensuring that all processes enter the same branch. Fixes issue #7809. Signed-off-by: Edgar Gabriel <egabriel@central.uh.edu>
1 parent 4e59d97 commit 4a8a330

File tree

3 files changed

+9
-46
lines changed

3 files changed

+9
-46
lines changed

ompi/mca/common/ompio/common_ompio.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
* University of Stuttgart. All rights reserved.
1111
* Copyright (c) 2004-2005 The Regents of the University of California.
1212
* All rights reserved.
13-
* Copyright (c) 2008-2019 University of Houston. All rights reserved.
13+
* Copyright (c) 2008-2020 University of Houston. All rights reserved.
1414
* Copyright (c) 2018 Research Organization for Information Science
1515
* and Technology (RIST). All rights reserved.
1616
* Copyright (c) 2018 DataDirect Networks. All rights reserved.
@@ -168,6 +168,7 @@ struct ompio_file_t {
168168
size_t f_stripe_size;
169169
int f_stripe_count;
170170
size_t f_cc_size;
171+
size_t f_avg_view_size;
171172
int f_bytes_per_agg;
172173
enum ompio_fs_type f_fstype;
173174
ompi_request_t *f_split_coll_req;

ompi/mca/common/ompio/common_ompio_aggregators.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ int mca_common_ompio_simple_grouping(ompio_file_t *fh,
107107
/* Determine whether to use the formula for 1-D or 2-D data decomposition. Anything
108108
** that is not 1-D is assumed to be 2-D in this version
109109
*/
110-
mode = ( fh->f_cc_size == fh->f_view_size ) ? 1 : 2;
110+
mode = ( fh->f_cc_size == fh->f_avg_view_size ) ? 1 : 2;
111111

112112
/* Determine the increment size when searching the optimal
113113
** no. of aggregators

ompi/mca/common/ompio/common_ompio_file_view.c

Lines changed: 6 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -350,36 +350,28 @@ int mca_common_ompio_set_view (ompio_file_t *fh,
350350

351351
OMPI_MPI_OFFSET_TYPE get_contiguous_chunk_size (ompio_file_t *fh, int flag)
352352
{
353-
int uniform = 0;
354353
OMPI_MPI_OFFSET_TYPE avg[3] = {0,0,0};
355354
OMPI_MPI_OFFSET_TYPE global_avg[3] = {0,0,0};
356355
int i = 0;
357356

358-
/* This function does two things: first, it determines the average data chunk
359-
** size in the file view for each process and across all processes.
360-
** Second, it establishes whether the view across all processes is uniform.
361-
** By definition, uniform means:
362-
** 1. the file view of each process has the same number of contiguous sections
363-
** 2. each section in the file view has exactly the same size
357+
/* This function determines the average data chunk
358+
** size in the file view for each process and across all processes,
359+
** and the avg. file_view size across processes.
364360
*/
365361

366362
if ( flag ) {
367363
global_avg[0] = MCA_IO_DEFAULT_FILE_VIEW_SIZE;
364+
fh->f_avg_view_size = fh->f_view_size;
368365
}
369366
else {
370367
for (i=0 ; i<(int)fh->f_iov_count ; i++) {
371368
avg[0] += fh->f_decoded_iov[i].iov_len;
372-
if (i && 0 == uniform) {
373-
if (fh->f_decoded_iov[i].iov_len != fh->f_decoded_iov[i-1].iov_len) {
374-
uniform = 1;
375-
}
376-
}
377369
}
378370
if ( 0 != fh->f_iov_count ) {
379371
avg[0] = avg[0]/fh->f_iov_count;
380372
}
381373
avg[1] = (OMPI_MPI_OFFSET_TYPE) fh->f_iov_count;
382-
avg[2] = (OMPI_MPI_OFFSET_TYPE) uniform;
374+
avg[2] = (OMPI_MPI_OFFSET_TYPE) fh->f_view_size;
383375

384376
fh->f_comm->c_coll->coll_allreduce (avg,
385377
global_avg,
@@ -390,37 +382,7 @@ OMPI_MPI_OFFSET_TYPE get_contiguous_chunk_size (ompio_file_t *fh, int flag)
390382
fh->f_comm->c_coll->coll_allreduce_module);
391383
global_avg[0] = global_avg[0]/fh->f_size;
392384
global_avg[1] = global_avg[1]/fh->f_size;
393-
394-
#if 0
395-
/* Disabling the feature since we are not using it anyway. Saves us one allreduce operation. */
396-
int global_uniform=0;
397-
398-
if ( global_avg[0] == avg[0] &&
399-
global_avg[1] == avg[1] &&
400-
0 == avg[2] &&
401-
0 == global_avg[2] ) {
402-
uniform = 0;
403-
}
404-
else {
405-
uniform = 1;
406-
}
407-
408-
/* second confirmation round to see whether all processes agree
409-
** on having a uniform file view or not
410-
*/
411-
fh->f_comm->c_coll->coll_allreduce (&uniform,
412-
&global_uniform,
413-
1,
414-
MPI_INT,
415-
MPI_MAX,
416-
fh->f_comm,
417-
fh->f_comm->c_coll->coll_allreduce_module);
418-
419-
if ( 0 == global_uniform ){
420-
/* yes, everybody agrees on having a uniform file view */
421-
fh->f_flags |= OMPIO_UNIFORM_FVIEW;
422-
}
423-
#endif
385+
fh->f_avg_view_size = global_avg[2]/fh->f_size;
424386
}
425387

426388
return global_avg[0];

0 commit comments

Comments
 (0)