Skip to content

Commit eb9405d

Browse files
Sergey Oblomov
authored and committed
PML/UCX: improved error processing in MPI_Recv
- improved error processing in MPI_Recv implementation of pml UCX
- added error handling for pml_ucx_mrecv call

Signed-off-by: Sergey Oblomov <sergeyo@nvidia.com>
1 parent 487bbf3 commit eb9405d

File tree

2 files changed

+17
-9
lines changed

2 files changed

+17
-9
lines changed

ompi/mca/pml/ucx/pml_ucx.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -611,6 +611,7 @@ int mca_pml_ucx_recv(void *buf, size_t count, ompi_datatype_t *datatype, int src
611611
ucp_tag_t ucp_tag, ucp_tag_mask;
612612
ucp_tag_recv_info_t info;
613613
ucs_status_t status;
614+
int result;
614615

615616
PML_UCX_TRACE_RECV("%s", buf, count, datatype, src, tag, comm, "recv");
616617

@@ -627,15 +628,15 @@ int mca_pml_ucx_recv(void *buf, size_t count, ompi_datatype_t *datatype, int src
627628
MCA_COMMON_UCX_PROGRESS_LOOP(ompi_pml_ucx.ucp_worker) {
628629
status = ucp_request_test(req, &info);
629630
if (status != UCS_INPROGRESS) {
630-
mca_pml_ucx_set_recv_status_safe(mpi_status, status, &info);
631+
result = mca_pml_ucx_set_recv_status_safe(mpi_status, status, &info);
631632

632633
#if SPC_ENABLE == 1
633634
size_t dt_size;
634635
ompi_datatype_type_size(datatype, &dt_size);
635636
SPC_USER_OR_MPI(tag, dt_size*count,
636637
OMPI_SPC_BYTES_RECEIVED_USER, OMPI_SPC_BYTES_RECEIVED_MPI);
637638
#endif
638-
return OMPI_SUCCESS;
639+
return result;
639640
}
640641
}
641642
}
@@ -1093,8 +1094,7 @@ int mca_pml_ucx_mrecv(void *buf, size_t count, ompi_datatype_t *datatype,
10931094

10941095
PML_UCX_MESSAGE_RELEASE(message);
10951096

1096-
ompi_request_wait(&req, status);
1097-
return OMPI_SUCCESS;
1097+
return ompi_request_wait(&req, status);
10981098
}
10991099

11001100
int mca_pml_ucx_start(size_t count, ompi_request_t** requests)

ompi/mca/pml/ucx/pml_ucx_request.h

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,7 @@ static inline void mca_pml_ucx_set_send_status(ompi_status_public_t* mpi_status,
165165
}
166166
}
167167

168-
static inline void mca_pml_ucx_set_recv_status(ompi_status_public_t* mpi_status,
168+
static inline int mca_pml_ucx_set_recv_status(ompi_status_public_t* mpi_status,
169169
ucs_status_t ucp_status,
170170
const ucp_tag_recv_info_t *info)
171171
{
@@ -186,15 +186,23 @@ static inline void mca_pml_ucx_set_recv_status(ompi_status_public_t* mpi_status,
186186
} else {
187187
mpi_status->MPI_ERROR = MPI_ERR_INTERN;
188188
}
189+
190+
return mpi_status->MPI_ERROR;
189191
}
190192

191-
static inline void mca_pml_ucx_set_recv_status_safe(ompi_status_public_t* mpi_status,
192-
ucs_status_t ucp_status,
193-
const ucp_tag_recv_info_t *info)
193+
static inline int mca_pml_ucx_set_recv_status_safe(ompi_status_public_t* mpi_status,
194+
ucs_status_t ucp_status,
195+
const ucp_tag_recv_info_t *info)
194196
{
195197
if (mpi_status != MPI_STATUS_IGNORE) {
196-
mca_pml_ucx_set_recv_status(mpi_status, ucp_status, info);
198+
return mca_pml_ucx_set_recv_status(mpi_status, ucp_status, info);
199+
} else if (OPAL_LIKELY(ucp_status == UCS_OK) || (ucp_status == UCS_ERR_CANCELED)) {
200+
return UCS_OK;
201+
} else if (ucp_status == UCS_ERR_MESSAGE_TRUNCATED) {
202+
return MPI_ERR_TRUNCATE;
197203
}
204+
205+
return MPI_ERR_INTERN;
198206
}
199207

200208
OBJ_CLASS_DECLARATION(mca_pml_ucx_persistent_request_t);

0 commit comments

Comments
 (0)