3
3
* Copyright (c) 2019 Intel, Inc. All rights reserved.
4
4
* Copyright (c) 2019 Research Organization for Information Science
5
5
* and Technology (RIST). All rights reserved.
6
+ * Copyright (c) 2021 Triad National Security, LLC. All rights
7
+ * reserved.
8
+ *
6
9
* $COPYRIGHT$
7
10
*
8
11
* Additional copyrights may follow
@@ -33,6 +36,8 @@ opal_common_ucx_module_t opal_common_ucx = {.verbose = 0,
33
36
.opal_mem_hooks = 0 ,
34
37
.tls = NULL };
35
38
39
+ static opal_mutex_t opal_common_ucx_mutex = OPAL_MUTEX_STATIC_INIT ;
40
+
36
41
static void opal_common_ucx_mem_release_cb (void * buf , size_t length , void * cbdata , bool from_alloc )
37
42
{
38
43
ucm_vm_munmap (buf , length );
@@ -42,34 +47,58 @@ OPAL_DECLSPEC void opal_common_ucx_mca_var_register(const mca_base_component_t *
42
47
{
43
48
static const char * default_tls = "rc_verbs,ud_verbs,rc_mlx5,dc_mlx5,cuda_ipc,rocm_ipc" ;
44
49
static const char * default_devices = "mlx*" ;
45
- static int registered = 0 ;
46
50
static int hook_index ;
47
51
static int verbose_index ;
48
52
static int progress_index ;
49
53
static int tls_index ;
50
54
static int devices_index ;
55
+ int param ;
51
56
52
- if (!registered ) {
57
+ OPAL_THREAD_LOCK (& opal_common_ucx_mutex );
58
+
59
+ param = mca_base_var_find ("opal" , "opal_common" , "ucx" , "verbose" );
60
+ if (0 > param ) {
53
61
verbose_index = mca_base_var_register ("opal" , "opal_common" , "ucx" , "verbose" ,
54
62
"Verbose level of the UCX components" ,
55
63
MCA_BASE_VAR_TYPE_INT , NULL , 0 ,
56
64
MCA_BASE_VAR_FLAG_SETTABLE , OPAL_INFO_LVL_3 ,
57
65
MCA_BASE_VAR_SCOPE_LOCAL , & opal_common_ucx .verbose );
66
+ }
67
+
68
+ param = mca_base_var_find ("opal" , "opal_common" , "ucx" , "progress_iterations" );
69
+ if (0 > param ) {
58
70
progress_index = mca_base_var_register ("opal" , "opal_common" , "ucx" , "progress_iterations" ,
59
71
"Set number of calls of internal UCX progress "
60
72
"calls per opal_progress call" ,
61
73
MCA_BASE_VAR_TYPE_INT , NULL , 0 ,
62
74
MCA_BASE_VAR_FLAG_SETTABLE , OPAL_INFO_LVL_3 ,
63
75
MCA_BASE_VAR_SCOPE_LOCAL ,
64
76
& opal_common_ucx .progress_iterations );
77
+ }
78
+
79
+ param = mca_base_var_find ("opal" , "opal_common" , "ucx" , "opal_mem_hooks" );
80
+ if (0 > param ) {
65
81
hook_index = mca_base_var_register ("opal" , "opal_common" , "ucx" , "opal_mem_hooks" ,
66
82
"Use OPAL memory hooks, instead of UCX internal "
67
83
"memory hooks" ,
68
84
MCA_BASE_VAR_TYPE_BOOL , NULL , 0 , 0 , OPAL_INFO_LVL_3 ,
69
85
MCA_BASE_VAR_SCOPE_LOCAL ,
70
86
& opal_common_ucx .opal_mem_hooks );
87
+ }
71
88
72
- opal_common_ucx .tls = malloc (sizeof (* opal_common_ucx .tls ));
89
+ param = mca_base_var_find ("opal" , "opal_common" , "ucx" , "tls" );
90
+ if (0 > param ) {
91
+
92
+ /*
93
+ * this monkey business is needed because of the way the MCA VARs framework tries to handle pointers to strings
94
+ * when destructing the MCA var database. If you don't do something like this, the MCA var framework will try
95
+ * to dereference a pointer which itself is no longer a valid address owing to having been previously dlclosed.
96
+ * Same for the devices pointer below.
97
+ */
98
+ if (NULL == opal_common_ucx .tls ) {
99
+ opal_common_ucx .tls = malloc (sizeof (* opal_common_ucx .tls ));
100
+ assert (NULL != opal_common_ucx .tls );
101
+ }
73
102
* opal_common_ucx .tls = strdup (default_tls );
74
103
tls_index = mca_base_var_register (
75
104
"opal" , "opal_common" , "ucx" , "tls" ,
@@ -80,17 +109,24 @@ OPAL_DECLSPEC void opal_common_ucx_mca_var_register(const mca_base_component_t *
80
109
"please set to '^posix,sysv,self,tcp,cma,knem,xpmem'." ,
81
110
MCA_BASE_VAR_TYPE_STRING , NULL , 0 , 0 , OPAL_INFO_LVL_3 , MCA_BASE_VAR_SCOPE_LOCAL ,
82
111
opal_common_ucx .tls );
112
+ }
113
+
114
+ param = mca_base_var_find ("opal" , "opal_common" , "ucx" , "devices" );
115
+ if (0 > param ) {
83
116
84
- opal_common_ucx .devices = malloc (sizeof (* opal_common_ucx .devices ));
117
+ if (NULL == opal_common_ucx .devices ) {
118
+ opal_common_ucx .devices = malloc (sizeof (* opal_common_ucx .devices ));
119
+ assert (NULL != opal_common_ucx .devices );
120
+ }
85
121
* opal_common_ucx .devices = strdup (default_devices );
86
122
devices_index = mca_base_var_register (
87
123
"opal" , "opal_common" , "ucx" , "devices" ,
88
124
"List of device driver pattern names, which, if supported by UCX, will "
89
125
"bump its priority above ob1. Special values: any (any available)" ,
90
126
MCA_BASE_VAR_TYPE_STRING , NULL , 0 , 0 , OPAL_INFO_LVL_3 , MCA_BASE_VAR_SCOPE_LOCAL ,
91
127
opal_common_ucx .devices );
92
- registered = 1 ;
93
128
}
129
+
94
130
if (component ) {
95
131
mca_base_var_register_synonym (verbose_index , component -> mca_project_name ,
96
132
component -> mca_type_name , component -> mca_component_name ,
@@ -108,6 +144,8 @@ OPAL_DECLSPEC void opal_common_ucx_mca_var_register(const mca_base_component_t *
108
144
component -> mca_type_name , component -> mca_component_name ,
109
145
"devices" , 0 );
110
146
}
147
+
148
+ OPAL_THREAD_UNLOCK (& opal_common_ucx_mutex );
111
149
}
112
150
113
151
OPAL_DECLSPEC void opal_common_ucx_mca_register (void )
0 commit comments