Skip to content

Commit bd0d2b8

Browse files
authored
Merge pull request #6086 from ICLDisco/export/errors_nbc
Manage errors in NBC collective ops
2 parents 1be5358 + 65660e5 commit bd0d2b8

File tree

1 file changed

+29
-10
lines changed
  • ompi/mca/coll/libnbc

1 file changed

+29
-10
lines changed

ompi/mca/coll/libnbc/nbc.c

Lines changed: 29 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@
33
* Copyright (c) 2006 The Trustees of Indiana University and Indiana
44
* University Research and Technology
55
* Corporation. All rights reserved.
6-
* Copyright (c) 2013 The University of Tennessee and The University
6+
* Copyright (c) 2013-2018 The University of Tennessee and The University
77
* of Tennessee Research Foundation. All rights
88
* reserved.
99
* Copyright (c) 2006 The Technical University of Chemnitz. All
@@ -335,8 +335,14 @@ int NBC_Progress(NBC_Handle *handle) {
335335
while (handle->req_count) {
336336
ompi_request_t *subreq = handle->req_array[handle->req_count - 1];
337337
if (REQUEST_COMPLETE(subreq)) {
338-
ompi_request_free(&subreq);
338+
if(OPAL_UNLIKELY( OMPI_SUCCESS != subreq->req_status.MPI_ERROR )) {
339+
NBC_Error ("MPI Error in NBC subrequest %p : %d", subreq, subreq->req_status.MPI_ERROR);
340+
/* copy the error code from the underlying request and let the
341+
* round finish */
342+
handle->super.req_status.MPI_ERROR = subreq->req_status.MPI_ERROR;
343+
}
339344
handle->req_count--;
345+
ompi_request_free(&subreq);
340346
} else {
341347
flag = false;
342348
break;
@@ -349,6 +355,26 @@ int NBC_Progress(NBC_Handle *handle) {
349355

350356
/* a round is finished */
351357
if (flag) {
358+
/* reset handle for next round */
359+
if (NULL != handle->req_array) {
360+
/* free request array */
361+
free (handle->req_array);
362+
handle->req_array = NULL;
363+
}
364+
365+
handle->req_count = 0;
366+
367+
/* previous round had an error */
368+
if (OPAL_UNLIKELY(OMPI_SUCCESS != handle->super.req_status.MPI_ERROR)) {
369+
res = handle->super.req_status.MPI_ERROR;
370+
NBC_Error("NBC_Progress: an error %d was found during schedule %p at row-offset %li - aborting the schedule\n", res, handle->schedule, handle->row_offset);
371+
handle->nbc_complete = true;
372+
if (!handle->super.req_persistent) {
373+
NBC_Free(handle);
374+
}
375+
return res;
376+
}
377+
352378
/* adjust delim to start of current round */
353379
NBC_DEBUG(5, "NBC_Progress: going in schedule %p to row-offset: %li\n", handle->schedule, handle->row_offset);
354380
delim = handle->schedule->data + handle->row_offset;
@@ -358,14 +384,6 @@ int NBC_Progress(NBC_Handle *handle) {
358384
/* adjust delim to end of current round -> delimiter */
359385
delim = delim + size;
360386

361-
if (NULL != handle->req_array) {
362-
/* free request array */
363-
free (handle->req_array);
364-
handle->req_array = NULL;
365-
}
366-
367-
handle->req_count = 0;
368-
369387
if (*delim == 0) {
370388
/* this was the last round - we're done */
371389
NBC_DEBUG(5, "NBC_Progress last round finished - we're done\n");
@@ -638,6 +656,7 @@ int NBC_Start(NBC_Handle *handle) {
638656

639657
/* kick off first round */
640658
handle->super.req_state = OMPI_REQUEST_ACTIVE;
659+
handle->super.req_status.MPI_ERROR = OMPI_SUCCESS;
641660
res = NBC_Start_round(handle);
642661
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
643662
return res;

0 commit comments

Comments
 (0)