Skip to content

Commit b7f8cae

Browse files
committed
pml/ob1: fix potential double return of RDMA fragment on get operation failure
The mca_pml_ob1_recv_request_get_frag_failed function is responsible for returning or queueing the fragment, but mca_pml_ob1_rget_completion was freeing it unconditionally. This leads to a double return of the fragment to the free list and may cause other errors if the fragment was queued for retry. This commit fixes the issue by returning the fragment only if the get operation did not fail.
Signed-off-by: Nathan Hjelm <hjelmn@google.com>
1 parent 27efeb9 commit b7f8cae

File tree

1 file changed

+3
-2
lines changed

1 file changed

+3
-2
lines changed

ompi/mca/pml/ob1/pml_ob1_recvreq.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -413,6 +413,7 @@ static void mca_pml_ob1_rget_completion (mca_btl_base_module_t* btl, struct mca_
413413
/* check completion status */
414414
if (OPAL_UNLIKELY(OMPI_SUCCESS != status)) {
415415
status = mca_pml_ob1_recv_request_get_frag_failed (frag, status);
416+
/* fragment was returned or queued by the above call */
416417
if (OPAL_UNLIKELY(OMPI_SUCCESS != status)) {
417418
size_t skipped_bytes = recvreq->req_send_offset - recvreq->req_rdma_offset;
418419
opal_output_verbose(mca_pml_ob1_output, 1, "pml:ob1: %s: operation failed with code %d", __func__, status);
@@ -435,12 +436,12 @@ static void mca_pml_ob1_rget_completion (mca_btl_base_module_t* btl, struct mca_
435436
mca_pml_ob1_send_fin (recvreq->req_recv.req_base.req_proc,
436437
bml_btl, frag->rdma_hdr.hdr_rget.hdr_frag,
437438
frag->rdma_length, 0, 0);
439+
440+
MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
438441
}
439442

440443
recv_request_pml_complete_check(recvreq);
441444

442-
MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
443-
444445
MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
445446
}
446447

0 commit comments

Comments
 (0)