Skip to content

Commit 0c4f4e2

Browse files
committed
fbtl/posix: ensure progressing aio requests
This commit fixes a bug discovered while debugging issue #8350. Running our testsuite on macOS revealed that posting a large number of non-blocking read/write operations leads to an error message on this platform. A fix is already available and will be committed shortly. The issue stems from limitations on macOS regarding the number of aio_read/aio_write operations that can be pending concurrently. While the code already handled that correctly for a single request, this bug exposed that the overall limit has to be respected across all pending requests. The solution is to invoke mca_common_ompio_progress if we cannot post new aio operations. Fixes issue #8368 Signed-off-by: Edgar Gabriel <egabriel@central.uh.edu>
1 parent 8c89e3c commit 0c4f4e2

File tree

2 files changed

+37
-17
lines changed

2 files changed

+37
-17
lines changed

ompi/mca/fbtl/posix/fbtl_posix_ipreadv.c

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
* University of Stuttgart. All rights reserved.
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
12-
* Copyright (c) 2008-2015 University of Houston. All rights reserved.
12+
* Copyright (c) 2008-2021 University of Houston. All rights reserved.
1313
* Copyright (c) 2015-2018 Research Organization for Information Science
1414
* and Technology (RIST). All rights reserved.
1515
* $COPYRIGHT$
@@ -33,6 +33,8 @@
3333
#include "ompi/constants.h"
3434
#include "ompi/mca/fbtl/fbtl.h"
3535

36+
#define MAX_ATTEMPTS 10
37+
3638
ssize_t mca_fbtl_posix_ipreadv (ompio_file_t *fh,
3739
ompi_request_t *request)
3840
{
@@ -44,7 +46,7 @@ ssize_t mca_fbtl_posix_ipreadv (ompio_file_t *fh,
4446

4547
data = (mca_fbtl_posix_request_data_t *) malloc ( sizeof (mca_fbtl_posix_request_data_t));
4648
if ( NULL == data ) {
47-
opal_output (1,"could not allocate memory\n");
49+
opal_output (1,"mca_fbtl_posix_ipreadv: could not allocate memory\n");
4850
return 0;
4951
}
5052

@@ -56,14 +58,14 @@ ssize_t mca_fbtl_posix_ipreadv (ompio_file_t *fh,
5658
data->aio_reqs = (struct aiocb *) malloc (sizeof(struct aiocb) *
5759
fh->f_num_of_io_entries);
5860
if (NULL == data->aio_reqs) {
59-
opal_output(1, "OUT OF MEMORY\n");
61+
opal_output(1, "mca_fbtl_posix_ipreadv: could not allocate memory\n");
6062
free(data);
6163
return 0;
6264
}
6365

6466
data->aio_req_status = (int *) malloc (sizeof(int) * fh->f_num_of_io_entries);
6567
if (NULL == data->aio_req_status) {
66-
opal_output(1, "OUT OF MEMORY\n");
68+
opal_output(1, "mca_fbtl_posix_ipreadv: could not allocate memory\n");
6769
free(data->aio_reqs);
6870
free(data);
6971
return 0;
@@ -103,14 +105,22 @@ ssize_t mca_fbtl_posix_ipreadv (ompio_file_t *fh,
103105
}
104106

105107
for (i=0; i < data->aio_last_active_req; i++) {
106-
if (-1 == aio_read(&data->aio_reqs[i])) {
107-
opal_output(1, "mca_fbtl_posix_ipreadv: error in aio_read(): %s", strerror(errno));
108-
mca_fbtl_posix_unlock ( &data->aio_lock, data->aio_fh );
109-
free(data->aio_reqs);
110-
free(data->aio_req_status);
111-
free(data);
112-
return OMPI_ERROR;
113-
}
108+
int counter=0;
109+
while ( MAX_ATTEMPTS > counter ) {
110+
if ( -1 != aio_read(&data->aio_reqs[i]) ) {
111+
break;
112+
}
113+
counter++;
114+
mca_common_ompio_progress();
115+
}
116+
if ( MAX_ATTEMPTS == counter ) {
117+
opal_output(1, "mca_fbtl_posix_ipreadv: error in aio_read(): errno %d %s", errno, strerror(errno));
118+
mca_fbtl_posix_unlock ( &data->aio_lock, data->aio_fh );
119+
free(data->aio_reqs);
120+
free(data->aio_req_status);
121+
free(data);
122+
return OMPI_ERROR;
123+
}
114124
}
115125

116126
req->req_data = data;

ompi/mca/fbtl/posix/fbtl_posix_ipwritev.c

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
* University of Stuttgart. All rights reserved.
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
12-
* Copyright (c) 2008-2015 University of Houston. All rights reserved.
12+
* Copyright (c) 2008-2021 University of Houston. All rights reserved.
1313
* Copyright (c) 2015-2018 Research Organization for Information Science
1414
* and Technology (RIST). All rights reserved.
1515
* $COPYRIGHT$
@@ -32,6 +32,8 @@
3232
#include "ompi/constants.h"
3333
#include "ompi/mca/fbtl/fbtl.h"
3434

35+
#define MAX_ATTEMPTS 10
36+
3537
ssize_t mca_fbtl_posix_ipwritev (ompio_file_t *fh,
3638
ompi_request_t *request)
3739
{
@@ -43,7 +45,7 @@ ssize_t mca_fbtl_posix_ipwritev (ompio_file_t *fh,
4345

4446
data = (mca_fbtl_posix_request_data_t *) malloc ( sizeof (mca_fbtl_posix_request_data_t));
4547
if ( NULL == data ) {
46-
opal_output (1,"could not allocate memory\n");
48+
opal_output (1,"mca_fbtl_posix_ipwritev: could not allocate memory\n");
4749
return 0;
4850
}
4951

@@ -55,14 +57,14 @@ ssize_t mca_fbtl_posix_ipwritev (ompio_file_t *fh,
5557
data->aio_reqs = (struct aiocb *) malloc (sizeof(struct aiocb) *
5658
fh->f_num_of_io_entries);
5759
if (NULL == data->aio_reqs) {
58-
opal_output(1, "OUT OF MEMORY\n");
60+
opal_output (1,"mca_fbtl_posix_ipwritev: could not allocate memory\n");
5961
free(data);
6062
return 0;
6163
}
6264

6365
data->aio_req_status = (int *) malloc (sizeof(int) * fh->f_num_of_io_entries);
6466
if (NULL == data->aio_req_status) {
65-
opal_output(1, "OUT OF MEMORY\n");
67+
opal_output (1,"mca_fbtl_posix_ipwritev: could not allocate memory\n");
6668
free(data->aio_reqs);
6769
free(data);
6870
return 0;
@@ -102,7 +104,15 @@ ssize_t mca_fbtl_posix_ipwritev (ompio_file_t *fh,
102104
}
103105

104106
for (i=0; i < data->aio_last_active_req; i++) {
105-
if (-1 == aio_write(&data->aio_reqs[i])) {
107+
int counter=0;
108+
while ( MAX_ATTEMPTS > counter ) {
109+
if (-1 != aio_write(&data->aio_reqs[i])) {
110+
break;
111+
}
112+
counter++;
113+
mca_common_ompio_progress();
114+
}
115+
if ( MAX_ATTEMPTS == counter ) {
106116
opal_output(1, "mca_fbtl_posix_ipwritev: error in aio_write(): %s", strerror(errno));
107117
mca_fbtl_posix_unlock ( &data->aio_lock, data->aio_fh );
108118
free(data->aio_req_status);

0 commit comments

Comments
 (0)