Commit 3ae6dfe

Merge pull request #9296 from abouteiller/bugfix/sendrecv-err
Prevent deadlock after an unmanaged error in MPI_SENDRECV
2 parents f882930 + 98c6319
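
Context for the fix: MPI_Sendrecv posts an internal irecv, then performs a blocking send. If the send fails with an unmanaged error (for example, the destination process has died in a ULFM fault-tolerant run), the old code path could return or stall with the posted receive still pending, deadlocking the call. The sketch below is a minimal reproduction scenario, not part of the commit: it assumes a two-rank ULFM-enabled job in which the partner rank has already failed; ranks, tags, and buffers are illustrative.

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int rank, out = 42, in = -1;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);

    /* Suppose the partner rank has already failed. The send half of
     * the sendrecv then fails, but the internally posted receive must
     * still be completed (or cancelled) so the call can return to the
     * application instead of waiting forever. */
    int partner = (rank + 1) % 2;
    int rc = MPI_Sendrecv(&out, 1, MPI_INT, partner, 0,
                          &in, 1, MPI_INT, partner, 0,
                          MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    if (MPI_SUCCESS != rc) {
        printf("rank %d: MPI_Sendrecv returned error code %d\n", rank, rc);
    }

    MPI_Finalize();
    return 0;
}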

File tree

3 files changed: +21 -16 lines changed

ompi/mca/pml/ob1/pml_ob1_isend.c (3 additions, 2 deletions)

@@ -206,7 +206,6 @@ int mca_pml_ob1_isend(const void *buf,
 
 #if OPAL_ENABLE_FT_MPI
  alloc_ft_req:
-#endif /* OPAL_ENABLE_FT_MPI */
     MCA_PML_OB1_SEND_REQUEST_ALLOC(comm, dst, sendreq);
     if (NULL == sendreq)
         return OMPI_ERR_OUT_OF_RESOURCE;
@@ -224,10 +223,12 @@ int mca_pml_ob1_isend(const void *buf,
 
     /* No point in starting the request, it won't go through, mark completed
      * in error for collection in future wait */
-    sendreq->req_send.req_base.req_ompi.req_status.MPI_ERROR = MPI_ERR_PROC_FAILED;
+    sendreq->req_send.req_base.req_ompi.req_status.MPI_ERROR = ompi_comm_is_revoked(comm)? MPI_ERR_REVOKED: MPI_ERR_PROC_FAILED;
     MCA_PML_OB1_SEND_REQUEST_MPI_COMPLETE(sendreq, false);
+    OPAL_OUTPUT_VERBOSE((2, "Allocating request in error %s (peer %d, seq %d) with error code %d", sendreq, dst, sendreq->req_send.req_base.req_sequence, sendreq->req_send.req_base.req_ompi.req_status.MPI_ERROR));
     *request = (ompi_request_t *) sendreq;
     return OMPI_SUCCESS;
+#endif /* OPAL_ENABLE_FT_MPI */
 }
 
 int mca_pml_ob1_send(const void *buf,
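
What this hunk changes: with fault tolerance enabled, an isend on a dead or revoked path now allocates a request that is already completed in error, and the error code distinguishes a revoked communicator (MPI_ERR_REVOKED) from a failed peer (MPI_ERR_PROC_FAILED); the relocated #endif keeps the whole alloc_ft_req early-return path inside the FT build. A hedged sketch, not part of the commit, of how such a pre-completed request surfaces to a caller; the helper name and its parameters (buf, dead_peer, tag, comm) are hypothetical:

#include <mpi.h>

/* Returns MPI_SUCCESS, MPI_ERR_REVOKED, or MPI_ERR_PROC_FAILED. */
static int try_isend_to_dead_peer(void *buf, int dead_peer, int tag,
                                  MPI_Comm comm)
{
    MPI_Request req;
    int rc = MPI_Isend(buf, 1, MPI_INT, dead_peer, tag, comm, &req);
    if (MPI_SUCCESS != rc) {
        return rc;
    }
    /* After this commit the FT path hands back a request that is
     * already complete, so this wait returns immediately with the
     * error recorded at allocation time instead of blocking. */
    return MPI_Wait(&req, MPI_STATUS_IGNORE);
}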

ompi/mca/pml/ob1/pml_ob1_recvfrag.c (4 additions, 4 deletions)

@@ -419,7 +419,7 @@ int mca_pml_ob1_revoke_comm( struct ompi_communicator_t* ompi_comm, bool coll_on
             assert( MCA_PML_OB1_HDR_TYPE_RGET == hdr->hdr_common.hdr_type ||
                     MCA_PML_OB1_HDR_TYPE_RNDV == hdr->hdr_common.hdr_type );
             OPAL_OUTPUT_VERBOSE((2, ompi_ftmpi_output_handle,
-                                 "ob1_revoke_comm: sending NACK to %d", hdr->hdr_rndv.hdr_match.hdr_src));
+                                 "ob1_revoke_comm: sending NACK to %d for seq %d", hdr->hdr_rndv.hdr_match.hdr_src, hdr->hdr_rndv.hdr_match.hdr_seq));
             /* Send a ACK with a NULL request to signify revocation */
             proc = mca_pml_ob1_peer_lookup(ompi_comm, hdr->hdr_rndv.hdr_match.hdr_src);
             mca_pml_ob1_recv_request_ack_send(NULL, proc->ompi_proc, hdr->hdr_rndv.hdr_src_req.lval, NULL, 0, 0, false);
@@ -428,7 +428,7 @@ int mca_pml_ob1_revoke_comm( struct ompi_communicator_t* ompi_comm, bool coll_on
             /* if it's a TYPE_MATCH, the sender is not expecting anything
              * from us. So we are done. */
             OPAL_OUTPUT_VERBOSE((15, ompi_ftmpi_output_handle,
-                                 "ob1_revoke_comm: dropping silently frag from %d", hdr->hdr_rndv.hdr_match.hdr_src));
+                                 "ob1_revoke_comm: dropping silently frag from %d for seq %d", hdr->hdr_rndv.hdr_match.hdr_src, hdr->hdr_rndv.hdr_match.hdr_seq));
         }
         MCA_PML_OB1_RECV_FRAG_RETURN(frag);
     }
@@ -681,7 +681,7 @@ void mca_pml_ob1_recv_frag_callback_ack (mca_btl_base_module_t *btl,
 #if OPAL_ENABLE_FT_MPI
     /* if the req_recv is NULL, the comm has been revoked at the receiver */
     if( OPAL_UNLIKELY(NULL == sendreq->req_recv.pval) ) {
-        OPAL_OUTPUT_VERBOSE((2, ompi_ftmpi_output_handle, "Recvfrag: Received a NACK to the RDV/RGET match to %d on comm %d\n", sendreq->req_send.req_base.req_peer, sendreq->req_send.req_base.req_comm->c_contextid));
+        OPAL_OUTPUT_VERBOSE((2, ompi_ftmpi_output_handle, "Recvfrag: Received a NACK to the RDV/RGET match to %d for seq %d on comm %d\n", sendreq->req_send.req_base.req_peer, sendreq->req_send.req_base.req_sequence, sendreq->req_send.req_base.req_comm->c_contextid));
         if (NULL != sendreq->rdma_frag) {
             MCA_PML_OB1_RDMA_FRAG_RETURN(sendreq->rdma_frag);
             sendreq->rdma_frag = NULL;
@@ -1074,7 +1074,7 @@ static int mca_pml_ob1_recv_frag_match (mca_btl_base_module_t *btl,
         /* Send a ACK with a NULL request to signify revocation */
         mca_pml_ob1_rendezvous_hdr_t* hdr_rndv = (mca_pml_ob1_rendezvous_hdr_t*) hdr;
         mca_pml_ob1_recv_request_ack_send(NULL, proc->ompi_proc, hdr_rndv->hdr_src_req.lval, NULL, 0, 0, false);
-        OPAL_OUTPUT_VERBOSE((2, ompi_ftmpi_output_handle, "Recvfrag: comm %d is revoked or collectives force errors, sending a NACK to the RDV/RGET match from %d\n", hdr->hdr_ctx, hdr->hdr_src));
+        OPAL_OUTPUT_VERBOSE((2, ompi_ftmpi_output_handle, "Recvfrag: comm %d is revoked or collectives force errors, sending a NACK to the RDV/RGET match from %d for seq %d\n", hdr->hdr_ctx, hdr->hdr_src, hdr->hdr_seq));
     }
     else {
         OPAL_OUTPUT_VERBOSE((15, ompi_ftmpi_output_handle,
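
These recvfrag.c hunks only extend the fault-tolerance verbose logging: each NACK or drop message now carries the PML sequence number, so a NACK observed at the sender can be correlated with the exact fragment the receiver revoked or dropped. With the format string above, an emitted line would look like (values hypothetical): "ob1_revoke_comm: sending NACK to 3 for seq 17".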

ompi/mpi/c/sendrecv.c (14 additions, 10 deletions)

@@ -92,21 +92,27 @@ int MPI_Sendrecv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
     if (dest != MPI_PROC_NULL) { /* send */
         rc = MCA_PML_CALL(send(sendbuf, sendcount, sendtype, dest,
                                sendtag, MCA_PML_BASE_SEND_STANDARD, comm));
+        if (OPAL_UNLIKELY(MPI_SUCCESS != rc)) {
+            rcs = rc;
 #if OPAL_ENABLE_FT_MPI
-        /* If ULFM is enabled we need to wait for the posted receive to
-         * complete, hence we cannot return here */
-        rcs = rc;
-#else
-        OMPI_ERRHANDLER_CHECK(rc, comm, rc, FUNC_NAME);
-#endif /* OPAL_ENABLE_FT_MPI */
+            /* If this is a PROC_FAILED error, we still need to proceed with
+             * the receive, so that we do not propagate errors to the sender in
+             * the case src != dst, and only dst is dead. In this case the
+             * recv is garanteed to complete (either in error if the source is
+             * dead, or successfully if the source is live). */
+            if (OPAL_UNLIKELY(MPI_ERR_PROC_FAILED != rc))
+            /* if intentionally spills outside ifdef */
+#endif
+            ompi_request_cancel(req);
+        }
     }
 
     if (source != MPI_PROC_NULL) { /* wait for recv */
         rc = ompi_request_wait(&req, status);
 #if OPAL_ENABLE_FT_MPI
         /* Sendrecv never returns ERR_PROC_FAILED_PENDING because it is
-         * blocking. Lets complete now that irecv and promote the error
-         * to ERR_PROC_FAILED */
+         * blocking. Lets cancel that irecv to complete it NOW and promote
+         * the error to ERR_PROC_FAILED */
         if( OPAL_UNLIKELY(MPI_ERR_PROC_FAILED_PENDING == rc) ) {
             ompi_request_cancel(req);
             ompi_request_wait(&req, MPI_STATUS_IGNORE);
@@ -119,11 +125,9 @@ int MPI_Sendrecv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
         }
         rc = MPI_SUCCESS;
     }
-#if OPAL_ENABLE_FT_MPI
     if( OPAL_UNLIKELY(MPI_SUCCESS != rcs && MPI_SUCCESS == rc) ) {
         rc = rcs;
     }
-#endif
 
     OMPI_ERRHANDLER_RETURN(rc, comm, rc, FUNC_NAME);
 }
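
Net effect in sendrecv.c: a failed send now records its error in rcs and, unless the error is MPI_ERR_PROC_FAILED in an FT build, cancels the posted receive so the subsequent ompi_request_wait is guaranteed to return. A PROC_FAILED send deliberately leaves the receive alone: when only dest is dead and source is alive, the receive can still complete normally, so cancelling it would wrongly propagate the failure. The send error is then merged back into rc just before OMPI_ERRHANDLER_RETURN, and that merge now runs in non-FT builds as well, since the early OMPI_ERRHANDLER_CHECK return (which previously skipped the wait and could leave the posted receive pending) has been removed.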
