You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>
Correctly bubble up errors in NBC collective operations
Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>
The error field of requests needs to be rearmed at start, not at create
Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>
@@ -349,6 +355,26 @@ int NBC_Progress(NBC_Handle *handle) {
349
355
350
356
/* a round is finished */
351
357
if (flag) {
358
+
/* reset handle for next round */
359
+
if (NULL!=handle->req_array) {
360
+
/* free request array */
361
+
free (handle->req_array);
362
+
handle->req_array=NULL;
363
+
}
364
+
365
+
handle->req_count=0;
366
+
367
+
/* previous round had an error */
368
+
if (OPAL_UNLIKELY(OMPI_SUCCESS!=handle->super.req_status.MPI_ERROR)) {
369
+
res=handle->super.req_status.MPI_ERROR;
370
+
NBC_Error("NBC_Progress: an error %d was found during schedule %p at row-offset %li - aborting the schedule\n", res, handle->schedule, handle->row_offset);
371
+
handle->nbc_complete= true;
372
+
if (!handle->super.req_persistent) {
373
+
NBC_Free(handle);
374
+
}
375
+
return res;
376
+
}
377
+
352
378
/* adjust delim to start of current round */
353
379
NBC_DEBUG(5, "NBC_Progress: going in schedule %p to row-offset: %li\n", handle->schedule, handle->row_offset);
354
380
delim=handle->schedule->data+handle->row_offset;
@@ -358,14 +384,6 @@ int NBC_Progress(NBC_Handle *handle) {
358
384
/* adjust delim to end of current round -> delimiter */
359
385
delim=delim+size;
360
386
361
-
if (NULL!=handle->req_array) {
362
-
/* free request array */
363
-
free (handle->req_array);
364
-
handle->req_array=NULL;
365
-
}
366
-
367
-
handle->req_count=0;
368
-
369
387
if (*delim==0) {
370
388
/* this was the last round - we're done */
371
389
NBC_DEBUG(5, "NBC_Progress last round finished - we're done\n");
@@ -638,6 +656,7 @@ int NBC_Start(NBC_Handle *handle) {
0 commit comments