Skip to content

Commit c522de1

Browse files
committed
BTL/OFI: retry posting receive buffer
Under heavy load (at least with the HPE CXI provider), an attempt to post a receive buffer can return -FI_EAGAIN. This PR uses the OFI_RETRY_UNTIL_DONE macro to retry posting the receive buffer when the fi_recv call returns -FI_EAGAIN. Signed-off-by: Howard Pritchard <hppritcha@gmail.com>
1 parent 6e99e02 commit c522de1

File tree

1 file changed

+4
-4
lines changed

1 file changed

+4
-4
lines changed

opal/mca/btl/ofi/btl_ofi_module.c

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@
1616
*
1717
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
1818
* Copyright (c) 2020 Google, LLC. All rights reserved.
19-
* Copyright (c) 2022-2023 Triad National Security, LLC. All rights
19+
* Copyright (c) 2022-2024 Triad National Security, LLC. All rights
2020
* reserved.
2121
* $COPYRIGHT$
2222
*
@@ -31,6 +31,7 @@
3131
#include "opal/mca/accelerator/accelerator.h"
3232
#include "opal/mca/accelerator/base/base.h"
3333
#include "opal/mca/btl/btl.h"
34+
#include "opal/mca/common/ofi/common_ofi.h"
3435
#include "opal/mca/mpool/base/base.h"
3536
#include "opal/mca/mpool/mpool.h"
3637
#include "opal/util/printf.h"
@@ -412,9 +413,8 @@ int mca_btl_ofi_post_recvs(mca_btl_base_module_t *module, mca_btl_ofi_context_t
412413

413414
comp = mca_btl_ofi_frag_completion_alloc(module, context, frag, MCA_BTL_OFI_TYPE_RECV);
414415

415-
rc = fi_recv(context->rx_ctx, &frag->hdr, MCA_BTL_OFI_RECV_SIZE, NULL, FI_ADDR_UNSPEC,
416-
&comp->comp_ctx);
417-
416+
OFI_RETRY_UNTIL_DONE(fi_recv(context->rx_ctx, &frag->hdr, MCA_BTL_OFI_RECV_SIZE, NULL, FI_ADDR_UNSPEC,
417+
&comp->comp_ctx), rc);
418418
if (FI_SUCCESS != rc) {
419419
BTL_ERROR(("cannot post recvs"));
420420
return OPAL_ERROR;

0 commit comments

Comments
 (0)