Skip to content

Commit 2e5303f

Browse files
authored
Merge pull request #8124 from rajachan/peek-probe-errorflow
mtl/ofi: Fix erroneous FI_PEEK/FI_CLAIM usage
2 parents: ce97090 + 39f8a86 · commit 2e5303f

File tree

1 file changed

+14
-19
lines changed

1 file changed

+14
-19
lines changed

ompi/mca/mtl/ofi/mtl_ofi.h

Lines changed: 14 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -991,10 +991,20 @@ __opal_attribute_always_inline__ static inline int
991991
ompi_mtl_ofi_probe_error_callback(struct fi_cq_err_entry *error,
992992
ompi_mtl_ofi_request_t *ofi_req)
993993
{
994-
ofi_req->status.MPI_ERROR = MPI_ERR_INTERN;
995994
ofi_req->completion_count--;
996995

997-
return OMPI_SUCCESS;
996+
/*
997+
* Receives posted with FI_PEEK and friends will get an error
998+
* completion with FI_ENOMSG. This just indicates the lack of a match for
999+
* the probe and is not an error case. All other error cases are
1000+
* provider-internal errors and should be flagged as such.
1001+
*/
1002+
if (error->err == FI_ENOMSG)
1003+
return OMPI_SUCCESS;
1004+
1005+
ofi_req->status.MPI_ERROR = MPI_ERR_INTERN;
1006+
1007+
return OMPI_ERROR;
9981008
}
9991009

10001010
__opal_attribute_always_inline__ static inline int
@@ -1039,7 +1049,6 @@ ompi_mtl_ofi_iprobe_generic(struct mca_mtl_base_module_t *mtl,
10391049
/**
10401050
* fi_trecvmsg with FI_PEEK:
10411051
* Initiate a search for a match in the hardware or software queue.
1042-
* The search can complete immediately with -ENOMSG.
10431052
* If successful, libfabric will enqueue a context entry into the completion
10441053
* queue to make the search nonblocking. This code will poll until the
10451054
* entry is enqueued.
@@ -1060,13 +1069,7 @@ ompi_mtl_ofi_iprobe_generic(struct mca_mtl_base_module_t *mtl,
10601069
ofi_req.match_state = 0;
10611070

10621071
MTL_OFI_RETRY_UNTIL_DONE(fi_trecvmsg(ompi_mtl_ofi.ofi_ctxt[ctxt_id].rx_ep, &msg, msgflags), ret);
1063-
if (-FI_ENOMSG == ret) {
1064-
/**
1065-
* The search request completed but no matching message was found.
1066-
*/
1067-
*flag = 0;
1068-
return OMPI_SUCCESS;
1069-
} else if (OPAL_UNLIKELY(0 > ret)) {
1072+
if (OPAL_UNLIKELY(0 > ret)) {
10701073
MTL_OFI_LOG_FI_ERR(ret, "fi_trecvmsg failed");
10711074
return ompi_mtl_ofi_get_error(ret);
10721075
}
@@ -1136,7 +1139,6 @@ ompi_mtl_ofi_improbe_generic(struct mca_mtl_base_module_t *mtl,
11361139
/**
11371140
* fi_trecvmsg with FI_PEEK and FI_CLAIM:
11381141
* Initiate a search for a match in the hardware or software queue.
1139-
* The search can complete immediately with -ENOMSG.
11401142
* If successful, libfabric will enqueue a context entry into the completion
11411143
* queue to make the search nonblocking. This code will poll until the
11421144
* entry is enqueued.
@@ -1158,14 +1160,7 @@ ompi_mtl_ofi_improbe_generic(struct mca_mtl_base_module_t *mtl,
11581160
ofi_req->mask_bits = mask_bits;
11591161

11601162
MTL_OFI_RETRY_UNTIL_DONE(fi_trecvmsg(ompi_mtl_ofi.ofi_ctxt[ctxt_id].rx_ep, &msg, msgflags), ret);
1161-
if (-FI_ENOMSG == ret) {
1162-
/**
1163-
* The search request completed but no matching message was found.
1164-
*/
1165-
*matched = 0;
1166-
free(ofi_req);
1167-
return OMPI_SUCCESS;
1168-
} else if (OPAL_UNLIKELY(0 > ret)) {
1163+
if (OPAL_UNLIKELY(0 > ret)) {
11691164
MTL_OFI_LOG_FI_ERR(ret, "fi_trecvmsg failed");
11701165
free(ofi_req);
11711166
return ompi_mtl_ofi_get_error(ret);

0 commit comments

Comments
 (0)