Skip to content

Commit 9bcf213

Browse files
committed
Correct PML UCX setting values in status MPI_ERROR
Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>
1 parent 3d59d84 commit 9bcf213

File tree

2 files changed

+51
-11
lines changed

2 files changed

+51
-11
lines changed

ompi/mca/pml/ucx/pml_ucx.c

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
22
* Copyright (C) 2001-2011 Mellanox Technologies Ltd. 2001-2011. ALL RIGHTS RESERVED.
3-
* Copyright (c) 2016-2020 The University of Tennessee and The University
3+
* Copyright (c) 2016-2021 The University of Tennessee and The University
44
* of Tennessee Research Foundation. All rights
55
* reserved.
66
* Copyright (c) 2018-2019 Research Organization for Information Science
@@ -631,7 +631,7 @@ int mca_pml_ucx_recv(void *buf, size_t count, ompi_datatype_t *datatype, int src
631631
MCA_COMMON_UCX_PROGRESS_LOOP(ompi_pml_ucx.ucp_worker) {
632632
status = ucp_request_test(req, &info);
633633
if (status != UCS_INPROGRESS) {
634-
result = mca_pml_ucx_set_recv_status_safe(mpi_status, status, &info);
634+
result = mca_pml_ucx_set_recv_status_public(mpi_status, status, &info);
635635

636636
#if SPC_ENABLE == 1
637637
size_t dt_size;
@@ -974,7 +974,7 @@ int mca_pml_ucx_iprobe(int src, int tag, struct ompi_communicator_t* comm,
974974
0, &info);
975975
if (ucp_msg != NULL) {
976976
*matched = 1;
977-
mca_pml_ucx_set_recv_status_safe(mpi_status, UCS_OK, &info);
977+
mca_pml_ucx_set_recv_status_public(mpi_status, UCS_OK, &info);
978978
} else {
979979
(++progress_count % opal_common_ucx.progress_iterations) ?
980980
(void)ucp_worker_progress(ompi_pml_ucx.ucp_worker) : opal_progress();
@@ -998,7 +998,7 @@ int mca_pml_ucx_probe(int src, int tag, struct ompi_communicator_t* comm,
998998
ucp_msg = ucp_tag_probe_nb(ompi_pml_ucx.ucp_worker, ucp_tag,
999999
ucp_tag_mask, 0, &info);
10001000
if (ucp_msg != NULL) {
1001-
mca_pml_ucx_set_recv_status_safe(mpi_status, UCS_OK, &info);
1001+
mca_pml_ucx_set_recv_status_public(mpi_status, UCS_OK, &info);
10021002
return OMPI_SUCCESS;
10031003
}
10041004
}
@@ -1023,7 +1023,7 @@ int mca_pml_ucx_improbe(int src, int tag, struct ompi_communicator_t* comm,
10231023
PML_UCX_MESSAGE_NEW(comm, ucp_msg, &info, message);
10241024
PML_UCX_VERBOSE(8, "got message %p (%p)", (void*)*message, (void*)ucp_msg);
10251025
*matched = 1;
1026-
mca_pml_ucx_set_recv_status_safe(mpi_status, UCS_OK, &info);
1026+
mca_pml_ucx_set_recv_status_public(mpi_status, UCS_OK, &info);
10271027
} else {
10281028
(++progress_count % opal_common_ucx.progress_iterations) ?
10291029
(void)ucp_worker_progress(ompi_pml_ucx.ucp_worker) : opal_progress();
@@ -1049,7 +1049,7 @@ int mca_pml_ucx_mprobe(int src, int tag, struct ompi_communicator_t* comm,
10491049
if (ucp_msg != NULL) {
10501050
PML_UCX_MESSAGE_NEW(comm, ucp_msg, &info, message);
10511051
PML_UCX_VERBOSE(8, "got message %p (%p)", (void*)*message, (void*)ucp_msg);
1052-
mca_pml_ucx_set_recv_status_safe(mpi_status, UCS_OK, &info);
1052+
mca_pml_ucx_set_recv_status_public(mpi_status, UCS_OK, &info);
10531053
return OMPI_SUCCESS;
10541054
}
10551055
}

ompi/mca/pml/ucx/pml_ucx_request.h

Lines changed: 45 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
22
* Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED.
3-
* Copyright (c) 2016 The University of Tennessee and The University
3+
* Copyright (c) 2016-2021 The University of Tennessee and The University
44
* of Tennessee Research Foundation. All rights
55
* reserved.
66
* $COPYRIGHT$
@@ -151,6 +151,12 @@ static inline void mca_pml_ucx_request_reset(ompi_request_t *req)
151151
req->req_complete = REQUEST_PENDING;
152152
}
153153

154+
/* Use when setting a request's status field.
155+
* Note that a new function 'mca_mpl_ucx_set_send_status_public' shall
156+
* be created and used instead if updating a publicly visible status becomes
157+
* necessary (i.e., the status argument in an user-visible procedure), see the
158+
* recv_status case below for rationale.
159+
*/
154160
__opal_attribute_always_inline__
155161
static inline void mca_pml_ucx_set_send_status(ompi_status_public_t* mpi_status,
156162
ucs_status_t status)
@@ -165,6 +171,11 @@ static inline void mca_pml_ucx_set_send_status(ompi_status_public_t* mpi_status,
165171
}
166172
}
167173

174+
/* Use when setting a request's status field.
175+
* Note that the next function 'mca_mpl_ucx_set_recv_status_public' shall
176+
* be used instead when updating a publicly visible status (i.e., the
177+
* status argument in an user-visible procedure).
178+
*/
168179
static inline int mca_pml_ucx_set_recv_status(ompi_status_public_t* mpi_status,
169180
ucs_status_t ucp_status,
170181
const ucp_tag_recv_info_t *info)
@@ -180,6 +191,10 @@ static inline int mca_pml_ucx_set_recv_status(ompi_status_public_t* mpi_status,
180191
mpi_status->_ucount = info->length;
181192
} else if (ucp_status == UCS_ERR_MESSAGE_TRUNCATED) {
182193
mpi_status->MPI_ERROR = MPI_ERR_TRUNCATE;
194+
mpi_status->MPI_SOURCE = PML_UCX_TAG_GET_SOURCE(tag);
195+
mpi_status->MPI_TAG = PML_UCX_TAG_GET_MPI_TAG(tag);
196+
mpi_status->_cancelled = false;
197+
mpi_status->_ucount = info->length;
183198
} else if (ucp_status == UCS_ERR_CANCELED) {
184199
mpi_status->MPI_ERROR = MPI_SUCCESS;
185200
mpi_status->_cancelled = true;
@@ -190,16 +205,41 @@ static inline int mca_pml_ucx_set_recv_status(ompi_status_public_t* mpi_status,
190205
return mpi_status->MPI_ERROR;
191206
}
192207

193-
static inline int mca_pml_ucx_set_recv_status_safe(ompi_status_public_t* mpi_status,
208+
/* Use when setting a publicly visible status (i.e., the status argument in an
209+
* user-visible procedure).
210+
* Except in procedures that return MPI_ERR_IN_STATUS, the MPI_ERROR
211+
* field of a status object shall never be modified
212+
* See MPI-1.1 doc, sec 3.2.5, p.22
213+
*/
214+
static inline int mca_pml_ucx_set_recv_status_public(ompi_status_public_t* mpi_status,
194215
ucs_status_t ucp_status,
195216
const ucp_tag_recv_info_t *info)
196217
{
197218
if (mpi_status != MPI_STATUS_IGNORE) {
198-
return mca_pml_ucx_set_recv_status(mpi_status, ucp_status, info);
199-
} else if (OPAL_LIKELY(ucp_status == UCS_OK) || (ucp_status == UCS_ERR_CANCELED)) {
200-
return UCS_OK;
219+
if (OPAL_LIKELY(ucp_status == UCS_OK)) {
220+
uint64_t tag = info->sender_tag;
221+
mpi_status->MPI_SOURCE = PML_UCX_TAG_GET_SOURCE(tag);
222+
mpi_status->MPI_TAG = PML_UCX_TAG_GET_MPI_TAG(tag);
223+
mpi_status->_cancelled = false;
224+
mpi_status->_ucount = info->length;
225+
return MPI_SUCCESS;
226+
} else if (ucp_status == UCS_ERR_MESSAGE_TRUNCATED) {
227+
uint64_t tag = info->sender_tag;
228+
mpi_status->MPI_SOURCE = PML_UCX_TAG_GET_SOURCE(tag);
229+
mpi_status->MPI_TAG = PML_UCX_TAG_GET_MPI_TAG(tag);
230+
mpi_status->_cancelled = false;
231+
mpi_status->_ucount = info->length;
232+
return MPI_ERR_TRUNCATE;
233+
} else if (ucp_status == UCS_ERR_CANCELED) {
234+
mpi_status->_cancelled = true;
235+
return MPI_SUCCESS;
236+
} else {
237+
return MPI_ERR_INTERN;
238+
}
201239
} else if (ucp_status == UCS_ERR_MESSAGE_TRUNCATED) {
202240
return MPI_ERR_TRUNCATE;
241+
} else if (OPAL_LIKELY(ucp_status == UCS_OK) || (ucp_status == UCS_ERR_CANCELED)) {
242+
return MPI_SUCCESS;
203243
}
204244

205245
return MPI_ERR_INTERN;

0 commit comments

Comments
 (0)