Skip to content

Commit 0bd8ccc

Browse files
committed
UCX/common: fixes for issue 8860
This patch fixes the UCX common code so that it works correctly when the MCA var base for UCX components (btl, pml) is loaded/unloaded multiple times. Related to issue #8860. Signed-off-by: Howard Pritchard <howardp@lanl.gov>
1 parent 02b2010 commit 0bd8ccc

File tree

1 file changed

+43
-5
lines changed

1 file changed

+43
-5
lines changed

opal/mca/common/ucx/common_ucx.c

Lines changed: 43 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33
* Copyright (c) 2019 Intel, Inc. All rights reserved.
44
* Copyright (c) 2019 Research Organization for Information Science
55
* and Technology (RIST). All rights reserved.
6+
* Copyright (c) 2021 Triad National Security, LLC. All rights
7+
* reserved.
8+
*
69
* $COPYRIGHT$
710
*
811
* Additional copyrights may follow
@@ -33,6 +36,8 @@ opal_common_ucx_module_t opal_common_ucx = {.verbose = 0,
3336
.opal_mem_hooks = 0,
3437
.tls = NULL};
3538

39+
static opal_mutex_t opal_common_ucx_mutex = OPAL_MUTEX_STATIC_INIT;
40+
3641
static void opal_common_ucx_mem_release_cb(void *buf, size_t length, void *cbdata, bool from_alloc)
3742
{
3843
ucm_vm_munmap(buf, length);
@@ -42,34 +47,58 @@ OPAL_DECLSPEC void opal_common_ucx_mca_var_register(const mca_base_component_t *
4247
{
4348
static const char *default_tls = "rc_verbs,ud_verbs,rc_mlx5,dc_mlx5,cuda_ipc,rocm_ipc";
4449
static const char *default_devices = "mlx*";
45-
static int registered = 0;
4650
static int hook_index;
4751
static int verbose_index;
4852
static int progress_index;
4953
static int tls_index;
5054
static int devices_index;
55+
int param;
5156

52-
if (!registered) {
57+
OPAL_THREAD_LOCK(&opal_common_ucx_mutex);
58+
59+
param = mca_base_var_find("opal", "opal_common", "ucx", "verbose");
60+
if (0 > param) {
5361
verbose_index = mca_base_var_register("opal", "opal_common", "ucx", "verbose",
5462
"Verbose level of the UCX components",
5563
MCA_BASE_VAR_TYPE_INT, NULL, 0,
5664
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
5765
MCA_BASE_VAR_SCOPE_LOCAL, &opal_common_ucx.verbose);
66+
}
67+
68+
param = mca_base_var_find("opal", "opal_common", "ucx", "progress_iterations");
69+
if (0 > param) {
5870
progress_index = mca_base_var_register("opal", "opal_common", "ucx", "progress_iterations",
5971
"Set number of calls of internal UCX progress "
6072
"calls per opal_progress call",
6173
MCA_BASE_VAR_TYPE_INT, NULL, 0,
6274
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
6375
MCA_BASE_VAR_SCOPE_LOCAL,
6476
&opal_common_ucx.progress_iterations);
77+
}
78+
79+
param = mca_base_var_find("opal", "opal_common", "ucx", "opal_mem_hooks");
80+
if (0 > param) {
6581
hook_index = mca_base_var_register("opal", "opal_common", "ucx", "opal_mem_hooks",
6682
"Use OPAL memory hooks, instead of UCX internal "
6783
"memory hooks",
6884
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_3,
6985
MCA_BASE_VAR_SCOPE_LOCAL,
7086
&opal_common_ucx.opal_mem_hooks);
87+
}
7188

72-
opal_common_ucx.tls = malloc(sizeof(*opal_common_ucx.tls));
89+
param = mca_base_var_find("opal", "opal_common", "ucx", "tls");
90+
if (0 > param) {
91+
92+
/*
93+
* this monkey business is needed because of the way the MCA VARs framework tries to handle pointers to strings
94+
* when destructing the MCA var database. If you don't do something like this, the MCA var framework will try
95+
* to dereference a pointer which itself is no longer a valid address owing to having been previously dlclosed.
96+
* Same for the devices pointer below.
97+
*/
98+
if (NULL == opal_common_ucx.tls) {
99+
opal_common_ucx.tls = malloc(sizeof(*opal_common_ucx.tls));
100+
assert(NULL != opal_common_ucx.tls);
101+
}
73102
*opal_common_ucx.tls = strdup(default_tls);
74103
tls_index = mca_base_var_register(
75104
"opal", "opal_common", "ucx", "tls",
@@ -80,17 +109,24 @@ OPAL_DECLSPEC void opal_common_ucx_mca_var_register(const mca_base_component_t *
80109
"please set to '^posix,sysv,self,tcp,cma,knem,xpmem'.",
81110
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL,
82111
opal_common_ucx.tls);
112+
}
113+
114+
param = mca_base_var_find("opal", "opal_common", "ucx", "devices");
115+
if (0 > param) {
83116

84-
opal_common_ucx.devices = malloc(sizeof(*opal_common_ucx.devices));
117+
if (NULL == opal_common_ucx.devices) {
118+
opal_common_ucx.devices = malloc(sizeof(*opal_common_ucx.devices));
119+
assert(NULL != opal_common_ucx.devices);
120+
}
85121
*opal_common_ucx.devices = strdup(default_devices);
86122
devices_index = mca_base_var_register(
87123
"opal", "opal_common", "ucx", "devices",
88124
"List of device driver pattern names, which, if supported by UCX, will "
89125
"bump its priority above ob1. Special values: any (any available)",
90126
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL,
91127
opal_common_ucx.devices);
92-
registered = 1;
93128
}
129+
94130
if (component) {
95131
mca_base_var_register_synonym(verbose_index, component->mca_project_name,
96132
component->mca_type_name, component->mca_component_name,
@@ -108,6 +144,8 @@ OPAL_DECLSPEC void opal_common_ucx_mca_var_register(const mca_base_component_t *
108144
component->mca_type_name, component->mca_component_name,
109145
"devices", 0);
110146
}
147+
148+
OPAL_THREAD_UNLOCK(&opal_common_ucx_mutex);
111149
}
112150

113151
OPAL_DECLSPEC void opal_common_ucx_mca_register(void)

0 commit comments

Comments
 (0)