Skip to content

Commit 12edc34

Browse files
author
Sergey Oblomov
committed
PML/UCX/DATATYPE: fixed potential race
- In multithreaded applications, a datatype may be initialized by multiple threads simultaneously. To prevent this race, a lock was added around the initialization. The solution is not optimal, but it eliminates the crash. Signed-off-by: Sergey Oblomov <sergeyo@nvidia.com>
1 parent 8c1495d commit 12edc34

File tree

2 files changed

+25
-12
lines changed

2 files changed

+25
-12
lines changed

ompi/mca/pml/ucx/pml_ucx_datatype.c

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -214,11 +214,19 @@ pml_ucx_datatype_t *mca_pml_ucx_init_nbx_datatype(ompi_datatype_t *datatype,
214214

215215
ucp_datatype_t mca_pml_ucx_init_datatype(ompi_datatype_t *datatype)
216216
{
217+
static opal_thread_internal_mutex_t lock = OPAL_THREAD_INTERNAL_MUTEX_INITIALIZER;
217218
size_t size = 0; /* init to suppress compiler warning */
218219
ucp_datatype_t ucp_datatype;
219220
ucs_status_t status;
220221
int ret;
221222

223+
opal_thread_internal_mutex_lock(&lock);
224+
225+
if (datatype->pml_data != PML_UCX_DATATYPE_INVALID) {
226+
/* datatype is already initialized in concurrent thread */
227+
goto out;
228+
}
229+
222230
if (mca_pml_ucx_datatype_is_contig(datatype)) {
223231
ompi_datatype_type_size(datatype, &size);
224232
ucp_datatype = ucp_dt_make_contig(size);
@@ -271,7 +279,10 @@ ucp_datatype_t mca_pml_ucx_init_datatype(ompi_datatype_t *datatype)
271279
datatype->pml_data = ucp_datatype;
272280
#endif
273281

274-
return ucp_datatype;
282+
out:
283+
opal_thread_internal_mutex_unlock(&lock);
284+
285+
return mca_pml_ucx_from_ompi_datatype(datatype);
275286
}
276287

277288
static void mca_pml_ucx_convertor_construct(mca_pml_ucx_convertor_t *convertor)

ompi/mca/pml/ucx/pml_ucx_datatype.h

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -44,23 +44,25 @@ OBJ_CLASS_DECLARATION(mca_pml_ucx_convertor_t);
4444

4545

4646
__opal_attribute_always_inline__
47-
static inline ucp_datatype_t mca_pml_ucx_get_datatype(ompi_datatype_t *datatype)
47+
static inline ucp_datatype_t
48+
mca_pml_ucx_from_ompi_datatype(ompi_datatype_t *datatype)
4849
{
4950
#ifdef HAVE_UCP_REQUEST_PARAM_T
50-
pml_ucx_datatype_t *ucp_type = (pml_ucx_datatype_t*)datatype->pml_data;
51-
52-
if (OPAL_LIKELY(ucp_type != PML_UCX_DATATYPE_INVALID)) {
53-
return ucp_type->datatype;
54-
}
51+
return ((pml_ucx_datatype_t*)datatype->pml_data)->datatype;
5552
#else
56-
ucp_datatype_t ucp_type = datatype->pml_data;
53+
return (ucp_datatype_t)datatype->pml_data;
54+
#endif
55+
}
5756

58-
if (OPAL_LIKELY(ucp_type != PML_UCX_DATATYPE_INVALID)) {
59-
return ucp_type;
57+
58+
__opal_attribute_always_inline__
59+
static inline ucp_datatype_t mca_pml_ucx_get_datatype(ompi_datatype_t *datatype)
60+
{
61+
if (OPAL_UNLIKELY(datatype->pml_data == PML_UCX_DATATYPE_INVALID)) {
62+
return mca_pml_ucx_init_datatype(datatype);
6063
}
61-
#endif
6264

63-
return mca_pml_ucx_init_datatype(datatype);
65+
return mca_pml_ucx_from_ompi_datatype(datatype);
6466
}
6567

6668
#ifdef HAVE_UCP_REQUEST_PARAM_T

0 commit comments

Comments
 (0)