Skip to content

Commit 5ad772c

Browse files
committed
common/ucx: fix variable registration
The registration code in common/ucx was overly complex. It assumed that re-registration is not desireable when it essentially does the same as the find before the registration. The only thing that needs protection is resetting the char * variables before they are registered. The primary purpose of this commit to fix two memory leaks caused by the extra allocations to hold the strings. These allocations are unnecessary. Signed-off-by: Nathan Hjelm <hjelmn@google.com>
1 parent 3990b3f commit 5ad772c

File tree

2 files changed

+62
-88
lines changed

2 files changed

+62
-88
lines changed

opal/mca/common/ucx/common_ucx.c

Lines changed: 60 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
1+
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
12
/*
23
* Copyright (C) Mellanox Technologies Ltd. 2018. ALL RIGHTS RESERVED.
34
* Copyright (c) 2019 Intel, Inc. All rights reserved.
45
* Copyright (c) 2019 Research Organization for Information Science
56
* and Technology (RIST). All rights reserved.
67
* Copyright (c) 2021 Triad National Security, LLC. All rights
78
* reserved.
9+
* Copyright (c) 2022 Google, LLC. All rights reserved.
810
*
911
* $COPYRIGHT$
1012
*
@@ -33,11 +35,11 @@ static int use_safety_valve = 0;
3335

3436
extern mca_base_framework_t opal_memory_base_framework;
3537

36-
opal_common_ucx_module_t opal_common_ucx = {.verbose = 0,
37-
.progress_iterations = 100,
38-
.registered = 0,
39-
.opal_mem_hooks = 1,
40-
.tls = NULL};
38+
opal_common_ucx_module_t opal_common_ucx =
39+
{
40+
.progress_iterations = 100,
41+
.opal_mem_hooks = 1,
42+
};
4143

4244
static opal_mutex_t opal_common_ucx_mutex = OPAL_MUTEX_STATIC_INIT;
4345

@@ -48,87 +50,59 @@ static void opal_common_ucx_mem_release_cb(void *buf, size_t length, void *cbdat
4850

4951
OPAL_DECLSPEC void opal_common_ucx_mca_var_register(const mca_base_component_t *component)
5052
{
51-
static const char *default_tls = "rc_verbs,ud_verbs,rc_mlx5,dc_mlx5,ud_mlx5,cuda_ipc,rocm_ipc";
52-
static const char *default_devices = "mlx*";
53-
static int hook_index;
54-
static int verbose_index;
55-
static int progress_index;
56-
static int tls_index;
57-
static int devices_index;
58-
int param;
53+
char *default_tls = "rc_verbs,ud_verbs,rc_mlx5,dc_mlx5,ud_mlx5,cuda_ipc,rocm_ipc";
54+
char *default_devices = "mlx*";
55+
int hook_index;
56+
int verbose_index;
57+
int progress_index;
58+
int tls_index;
59+
int devices_index;
5960

6061
OPAL_THREAD_LOCK(&opal_common_ucx_mutex);
6162

62-
param = mca_base_var_find("opal", "opal_common", "ucx", "verbose");
63-
if (0 > param) {
64-
verbose_index = mca_base_var_register("opal", "opal_common", "ucx", "verbose",
65-
"Verbose level of the UCX components",
66-
MCA_BASE_VAR_TYPE_INT, NULL, 0,
67-
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
68-
MCA_BASE_VAR_SCOPE_LOCAL, &opal_common_ucx.verbose);
69-
}
70-
71-
param = mca_base_var_find("opal", "opal_common", "ucx", "progress_iterations");
72-
if (0 > param) {
73-
progress_index = mca_base_var_register("opal", "opal_common", "ucx", "progress_iterations",
74-
"Set number of calls of internal UCX progress "
75-
"calls per opal_progress call",
76-
MCA_BASE_VAR_TYPE_INT, NULL, 0,
77-
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
78-
MCA_BASE_VAR_SCOPE_LOCAL,
79-
&opal_common_ucx.progress_iterations);
80-
}
81-
82-
param = mca_base_var_find("opal", "opal_common", "ucx", "opal_mem_hooks");
83-
if (0 > param) {
84-
hook_index = mca_base_var_register("opal", "opal_common", "ucx", "opal_mem_hooks",
85-
"Use OPAL memory hooks, instead of UCX internal "
86-
"memory hooks",
87-
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_3,
63+
/* It is harmless to re-register variables so go ahead an re-register. */
64+
verbose_index = mca_base_var_register("opal", "opal_common", "ucx", "verbose",
65+
"Verbose level of the UCX components",
66+
MCA_BASE_VAR_TYPE_INT, NULL, 0,
67+
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
68+
MCA_BASE_VAR_SCOPE_LOCAL, &opal_common_ucx.verbose);
69+
progress_index = mca_base_var_register("opal", "opal_common", "ucx", "progress_iterations",
70+
"Set number of calls of internal UCX progress "
71+
"calls per opal_progress call",
72+
MCA_BASE_VAR_TYPE_INT, NULL, 0,
73+
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
8874
MCA_BASE_VAR_SCOPE_LOCAL,
89-
&opal_common_ucx.opal_mem_hooks);
75+
&opal_common_ucx.progress_iterations);
76+
hook_index = mca_base_var_register("opal", "opal_common", "ucx", "opal_mem_hooks",
77+
"Use OPAL memory hooks, instead of UCX internal "
78+
"memory hooks",
79+
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_3,
80+
MCA_BASE_VAR_SCOPE_LOCAL,
81+
&opal_common_ucx.opal_mem_hooks);
82+
83+
if (NULL == opal_common_ucx.tls) {
84+
opal_common_ucx.tls = default_tls;
9085
}
9186

92-
param = mca_base_var_find("opal", "opal_common", "ucx", "tls");
93-
if (0 > param) {
94-
95-
/*
96-
* this monkey business is needed because of the way the MCA VARs framework tries to handle pointers to strings
97-
* when destructing the MCA var database. If you don't do something like this,the MCA var framework will try
98-
* to dereference a pointer which itself is no longer a valid address owing to having been previously dlclosed.
99-
* Same for the devices pointer below.
100-
*/
101-
if (NULL == opal_common_ucx.tls) {
102-
opal_common_ucx.tls = malloc(sizeof(*opal_common_ucx.tls));
103-
assert(NULL != opal_common_ucx.tls);
104-
}
105-
*opal_common_ucx.tls = strdup(default_tls);
106-
tls_index = mca_base_var_register(
107-
"opal", "opal_common", "ucx", "tls",
108-
"List of UCX transports which should be supported on the system, to enable "
109-
"selecting the UCX component. Special values: any (any available). "
110-
"A '^' prefix negates the list. "
111-
"For example, in order to exclude on shared memory and TCP transports, "
112-
"please set to '^posix,sysv,self,tcp,cma,knem,xpmem'.",
113-
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL,
114-
opal_common_ucx.tls);
115-
}
116-
117-
param = mca_base_var_find("opal", "opal_common", "ucx", "devices");
118-
if (0 > param) {
119-
120-
if (NULL == opal_common_ucx.devices) {
121-
opal_common_ucx.devices = malloc(sizeof(*opal_common_ucx.devices));
122-
assert(NULL != opal_common_ucx.devices);
123-
}
124-
*opal_common_ucx.devices = strdup(default_devices);
125-
devices_index = mca_base_var_register(
126-
"opal", "opal_common", "ucx", "devices",
127-
"List of device driver pattern names, which, if supported by UCX, will "
128-
"bump its priority above ob1. Special values: any (any available)",
129-
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL,
130-
opal_common_ucx.devices);
87+
tls_index = mca_base_var_register(
88+
"opal", "opal_common", "ucx", "tls",
89+
"List of UCX transports which should be supported on the system, to enable "
90+
"selecting the UCX component. Special values: any (any available). "
91+
"A '^' prefix negates the list. "
92+
"For example, in order to exclude on shared memory and TCP transports, "
93+
"please set to '^posix,sysv,self,tcp,cma,knem,xpmem'.",
94+
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL,
95+
&opal_common_ucx.tls);
96+
97+
if (NULL == opal_common_ucx.devices) {
98+
opal_common_ucx.devices = default_devices;
13199
}
100+
devices_index = mca_base_var_register(
101+
"opal", "opal_common", "ucx", "devices",
102+
"List of device driver pattern names, which, if supported by UCX, will "
103+
"bump its priority above ob1. Special values: any (any available)",
104+
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL,
105+
&opal_common_ucx.devices);
132106

133107
if (component) {
134108
mca_base_var_register_synonym(verbose_index, component->mca_project_name,
@@ -261,8 +235,8 @@ OPAL_DECLSPEC opal_common_ucx_support_level_t opal_common_ucx_support_level(ucp_
261235
int ret;
262236
#endif
263237

264-
is_any_tl = !strcmp(*opal_common_ucx.tls, "any");
265-
is_any_device = !strcmp(*opal_common_ucx.devices, "any");
238+
is_any_tl = !strcmp(opal_common_ucx.tls, "any");
239+
is_any_device = !strcmp(opal_common_ucx.devices, "any");
266240

267241
/* Check for special value "any" */
268242
if (is_any_tl && is_any_device) {
@@ -273,19 +247,19 @@ OPAL_DECLSPEC opal_common_ucx_support_level_t opal_common_ucx_support_level(ucp_
273247

274248
#if HAVE_DECL_OPEN_MEMSTREAM
275249
/* Split transports list */
276-
negate = ('^' == (*opal_common_ucx.tls)[0]);
277-
tl_list = opal_argv_split(*opal_common_ucx.tls + (negate ? 1 : 0), ',');
250+
negate = ('^' == (opal_common_ucx.tls)[0]);
251+
tl_list = opal_argv_split(opal_common_ucx.tls + (negate ? 1 : 0), ',');
278252
if (tl_list == NULL) {
279253
MCA_COMMON_UCX_VERBOSE(1, "failed to split tl list '%s', ucx is disabled",
280-
*opal_common_ucx.tls);
254+
opal_common_ucx.tls);
281255
goto out;
282256
}
283257

284258
/* Split devices list */
285-
device_list = opal_argv_split(*opal_common_ucx.devices, ',');
259+
device_list = opal_argv_split(opal_common_ucx.devices, ',');
286260
if (device_list == NULL) {
287261
MCA_COMMON_UCX_VERBOSE(1, "failed to split devices list '%s', ucx is disabled",
288-
*opal_common_ucx.devices);
262+
opal_common_ucx.devices);
289263
goto out_free_tl_list;
290264
}
291265

opal/mca/common/ucx/common_ucx.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,8 +90,8 @@ typedef struct opal_common_ucx_module {
9090
int progress_iterations;
9191
int registered;
9292
bool opal_mem_hooks;
93-
char **tls;
94-
char **devices;
93+
char *tls;
94+
char *devices;
9595
} opal_common_ucx_module_t;
9696

9797
typedef struct opal_common_ucx_del_proc {

0 commit comments

Comments
 (0)