Skip to content

Commit 98c6319

Browse files
committed
sendrecv with errors: a second attempt at resolving issue #9160
Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>
1 parent ec297b0 commit 98c6319

File tree

1 file changed

+13
-18
lines changed

1 file changed

+13
-18
lines changed

ompi/mpi/c/sendrecv.c

Lines changed: 13 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -92,30 +92,27 @@ int MPI_Sendrecv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
9292
if (dest != MPI_PROC_NULL) { /* send */
9393
rc = MCA_PML_CALL(send(sendbuf, sendcount, sendtype, dest,
9494
sendtag, MCA_PML_BASE_SEND_STANDARD, comm));
95-
#if OPAL_ENABLE_FT_MPI
96-
if (OPAL_UNLIKELY(MPI_ERR_PROC_FAILED == rc)) {
97-
/* If this is a recoverable error (e.g., ULFM error class),
98-
* we need to wait for the posted receive to complete so that the
99-
* receive buffer doesn't get updated after the completion of the call.
100-
* Hence we cannot return immediately, we need to wait on the recv
101-
* req first. */
95+
if (OPAL_UNLIKELY(MPI_SUCCESS != rc)) {
10296
rcs = rc;
103-
}
104-
else /* else intentionally spills outside ifdef */
97+
#if OPAL_ENABLE_FT_MPI
98+
/* If this is a PROC_FAILED error, we still need to proceed with
99+
* the receive, so that we do not propagate errors to the sender in
100+
* the case src != dst, and only dst is dead. In this case the
101+
* recv is guaranteed to complete (either in error if the source is
102+
* dead, or successfully if the source is live). */
103+
if (OPAL_UNLIKELY(MPI_ERR_PROC_FAILED != rc))
104+
/* if intentionally spills outside ifdef */
105105
#endif
106-
/* If the error semantic does not guarantee the completion of the wait on
107-
* the recv-req for that error class, we just invoke the errhandler asap
108-
* to avoid hanging. Note that in this case we are returning the recv
109-
* buffer in an undefined state and the application may not recover. */
110-
OMPI_ERRHANDLER_CHECK(rc, comm, rc, FUNC_NAME);
106+
ompi_request_cancel(req);
107+
}
111108
}
112109

113110
if (source != MPI_PROC_NULL) { /* wait for recv */
114111
rc = ompi_request_wait(&req, status);
115112
#if OPAL_ENABLE_FT_MPI
116113
/* Sendrecv never returns ERR_PROC_FAILED_PENDING because it is
117-
* blocking. Let's complete that irecv now and promote the error
118-
* to ERR_PROC_FAILED */
114+
* blocking. Let's cancel that irecv to complete it NOW and promote
115+
* the error to ERR_PROC_FAILED */
119116
if( OPAL_UNLIKELY(MPI_ERR_PROC_FAILED_PENDING == rc) ) {
120117
ompi_request_cancel(req);
121118
ompi_request_wait(&req, MPI_STATUS_IGNORE);
@@ -128,11 +125,9 @@ int MPI_Sendrecv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
128125
}
129126
rc = MPI_SUCCESS;
130127
}
131-
#if OPAL_ENABLE_FT_MPI
132128
if( OPAL_UNLIKELY(MPI_SUCCESS != rcs && MPI_SUCCESS == rc) ) {
133129
rc = rcs;
134130
}
135-
#endif
136131

137132
OMPI_ERRHANDLER_RETURN(rc, comm, rc, FUNC_NAME);
138133
}

0 commit comments

Comments
 (0)