@@ -92,30 +92,27 @@ int MPI_Sendrecv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
92
92
if (dest != MPI_PROC_NULL ) { /* send */
93
93
rc = MCA_PML_CALL (send (sendbuf , sendcount , sendtype , dest ,
94
94
sendtag , MCA_PML_BASE_SEND_STANDARD , comm ));
95
- #if OPAL_ENABLE_FT_MPI
96
- if (OPAL_UNLIKELY (MPI_ERR_PROC_FAILED == rc )) {
97
- /* If this is a recoverable error (e.g., ULFM error class),
98
- * we need to wait for the posted receive to complete so that the
99
- * receive buffer doesn't get updated after the completion of the call.
100
- * Hence we cannot return immediately, we need to wait on the recv
101
- * req first. */
95
+ if (OPAL_UNLIKELY (MPI_SUCCESS != rc )) {
102
96
rcs = rc ;
103
- }
104
- else /* else intentionally spills outside ifdef */
97
+ #if OPAL_ENABLE_FT_MPI
98
+ /* If this is a PROC_FAILED error, we still need to proceed with
99
+ * the receive, so that we do not propagate errors to the sender in
100
+ * the case src != dst, and only dst is dead. In this case the
101
+ * recv is guaranteed to complete (either in error if the source is
102
+ * dead, or successfully if the source is live). */
103
+ if (OPAL_UNLIKELY (MPI_ERR_PROC_FAILED != rc ))
104
+ /* if intentionally spills outside ifdef */
105
105
#endif
106
- /* If the error semantic does not garantee the completion of the wait on
107
- * the recv-req for that error class, we just invoke the errhandler asap
108
- * to avoid hanging. Note that in this case we are returning the recv
109
- * buffer in an undefined state and the application may not recover. */
110
- OMPI_ERRHANDLER_CHECK (rc , comm , rc , FUNC_NAME );
106
+ ompi_request_cancel (req );
107
+ }
111
108
}
112
109
113
110
if (source != MPI_PROC_NULL ) { /* wait for recv */
114
111
rc = ompi_request_wait (& req , status );
115
112
#if OPAL_ENABLE_FT_MPI
116
113
/* Sendrecv never returns ERR_PROC_FAILED_PENDING because it is
117
- * blocking. Lets complete now that irecv and promote the error
118
- * to ERR_PROC_FAILED */
114
+ * blocking. Let's cancel that irecv to complete it NOW and promote
115
+ * the error to ERR_PROC_FAILED */
119
116
if ( OPAL_UNLIKELY (MPI_ERR_PROC_FAILED_PENDING == rc ) ) {
120
117
ompi_request_cancel (req );
121
118
ompi_request_wait (& req , MPI_STATUS_IGNORE );
@@ -128,11 +125,9 @@ int MPI_Sendrecv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
128
125
}
129
126
rc = MPI_SUCCESS ;
130
127
}
131
- #if OPAL_ENABLE_FT_MPI
132
128
if ( OPAL_UNLIKELY (MPI_SUCCESS != rcs && MPI_SUCCESS == rc ) ) {
133
129
rc = rcs ;
134
130
}
135
- #endif
136
131
137
132
OMPI_ERRHANDLER_RETURN (rc , comm , rc , FUNC_NAME );
138
133
}
0 commit comments