Skip to content

Commit 1a41482

Browse files
ggouaillardet authored and hjelmn committed
coll/libnbc: do not recursively call opal_progress()
Instead of invoking ompi_request_test_all(), which ends up calling opal_progress() recursively, manually check the status of the requests. The same method is used in ompi_comm_request_progress().

Refs #3901

Signed-off-by: Gilles Gouaillardet <gilles@rist.or.jp>
1 parent 9a8797a commit 1a41482

File tree

1 file changed

+14
-29
lines changed
  • ompi/mca/coll/libnbc

1 file changed

+14
-29
lines changed

ompi/mca/coll/libnbc/nbc.c

Lines changed: 14 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
* rights reserved.
1111
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
1212
* reserved.
13-
* Copyright (c) 2015-2017 Research Organization for Information Science
13+
* Copyright (c) 2015-2018 Research Organization for Information Science
1414
* and Technology (RIST). All rights reserved.
1515
*
1616
* Author(s): Torsten Hoefler <htor@cs.indiana.edu>
@@ -315,7 +315,8 @@ static inline void NBC_Free (NBC_Handle* handle) {
315315
*
316316
* to be called *only* from the progress thread !!! */
317317
int NBC_Progress(NBC_Handle *handle) {
318-
int flag, res, ret=NBC_CONTINUE;
318+
int res, ret=NBC_CONTINUE;
319+
bool flag;
319320
unsigned long size = 0;
320321
char *delim;
321322
int i;
@@ -325,43 +326,27 @@ int NBC_Progress(NBC_Handle *handle) {
325326
return NBC_OK;
326327
}
327328

329+
flag = true;
330+
328331
if ((handle->req_count > 0) && (handle->req_array != NULL)) {
329332
NBC_DEBUG(50, "NBC_Progress: testing for %i requests\n", handle->req_count);
330333
#ifdef NBC_TIMING
331334
Test_time -= MPI_Wtime();
332335
#endif
333-
res = ompi_request_test_all(handle->req_count, handle->req_array, &flag, MPI_STATUSES_IGNORE);
334-
if(res != OMPI_SUCCESS) {
335-
// Attempt to cancel outstanding requests
336-
for(i = 0; i < handle->req_count; ++i ) {
337-
// If the request is complete, then try to report the error code
338-
if( handle->req_array[i]->req_complete ) {
339-
if( OMPI_SUCCESS != handle->req_array[i]->req_status.MPI_ERROR ) {
340-
NBC_Error ("MPI Error in MPI_Testall() (req %d = %d)", i, handle->req_array[i]->req_status.MPI_ERROR);
341-
}
342-
}
343-
else {
344-
ompi_request_cancel(handle->req_array[i]);
345-
// If the PML actually canceled the request, then wait on it
346-
if( handle->req_array[i]->req_status._cancelled) {
347-
ompi_request_wait(&handle->req_array[i], &status);
348-
}
349-
// Warn the user that we had to leave a PML message outstanding so
350-
// bad things could happen if they continue using nonblocking collectives
351-
else {
352-
NBC_Error ("MPI Error: Not able to cancel the internal request %d. "
353-
"Be aware that continuing to use nonblocking collectives on this communicator may result in undefined behavior.", i);
354-
}
336+
/* don't call ompi_request_test_all as it causes a recursive call into opal_progress */
337+
while (handle->req_count) {
338+
ompi_request_t *subreq = handle->req_array[handle->req_count - 1];
339+
if (REQUEST_COMPLETE(subreq)) {
340+
ompi_request_free(&subreq);
341+
handle->req_count--;
342+
} else {
343+
flag = false;
344+
break;
355345
}
356-
}
357-
358-
return OMPI_ERROR;
359346
}
360347
#ifdef NBC_TIMING
361348
Test_time += MPI_Wtime();
362349
#endif
363-
} else {
364-
flag = 1; /* we had no open requests -> proceed to next round */
365350
}
366351

367352
/* a round is finished */

0 commit comments

Comments
 (0)