Skip to content

Commit 86b3d10

Browse files
markalleawlauria
authored andcommitted
Safety valve for callbacks
When an mca_foo.so gets unloaded any callbacks it registered become invalid and can segv if called. This happened in an oshemem testcase that did call shmem_init() and shmem_finalize(). But for whatever reason oshmem unloads the MCAs before they call MPI_Finalize() which they have in a conditional so they don't necessarily always call MPI_Finalize(). Anyway the failing path goes from oshmem_shmem_finalize() through _shmem_finalize() which unloads the MCAs (mca_base_framework_close which dlcloses them) and then after that it will call MPI_Finalize() which will deregister the callback That could be considered a bug, maybe the reference counters should be set up so the MCA's .fini call is always called before dlclose() and make sure all such .fini calls deregister the MCA's callbacks. But this checkin takes the conservative approach of assuming that bugs like that are always a possibility and inserts a safety valve around callback usage, so when an MCA gets unloaded it deregisters any callbacks rather than only having them deregistered if we go through a proper sequence of finalize calls. Signed-off-by: Austen Lauria <awlauria@us.ibm.com>
1 parent 252b6bf commit 86b3d10

File tree

4 files changed

+39
-2
lines changed

4 files changed

+39
-2
lines changed

ompi/mca/coll/hcoll/coll_hcoll_module.c

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
#include "coll_hcoll.h"
1717
#include "coll_hcoll_dtypes.h"
1818

19+
static int use_safety_valve = 0;
20+
1921
int hcoll_comm_attr_keyval;
2022
int hcoll_type_attr_keyval;
2123
mca_coll_hcoll_dtype_t zero_dte_mapping;
@@ -327,6 +329,7 @@ mca_coll_hcoll_comm_query(struct ompi_communicator_t *comm, int *priority)
327329
cm->using_mem_hooks = 1;
328330
opal_mem_hooks_register_release(mca_coll_hcoll_mem_release_cb, NULL);
329331
setenv("MXM_HCOLL_MEM_ON_DEMAND_MAP", "y", 0);
332+
use_safety_valve = 1;
330333
}
331334
}
332335
} else {
@@ -447,5 +450,9 @@ OBJ_CLASS_INSTANCE(mca_coll_hcoll_module_t,
447450
mca_coll_hcoll_module_construct,
448451
mca_coll_hcoll_module_destruct);
449452

450-
451-
453+
static void safety_valve(void) __attribute__((destructor));
454+
void safety_valve(void) {
455+
if (use_safety_valve) {
456+
opal_mem_hooks_unregister_release(mca_coll_hcoll_mem_release_cb);
457+
}
458+
}

opal/mca/btl/uct/btl_uct_component.c

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,8 @@
4343
#include "btl_uct_am.h"
4444
#include "btl_uct_device_context.h"
4545

46+
static int use_safety_valve = 0;
47+
4648
static int mca_btl_uct_component_register(void)
4749
{
4850
mca_btl_uct_module_t *module = &mca_btl_uct_module_template;
@@ -143,6 +145,7 @@ static int mca_btl_uct_component_open(void)
143145
& opal_mem_hooks_support_level()))) {
144146
ucm_set_external_event(UCM_EVENT_VM_UNMAPPED);
145147
opal_mem_hooks_register_release(mca_btl_uct_mem_release_cb, NULL);
148+
use_safety_valve = 1;
146149
}
147150

148151
return OPAL_SUCCESS;
@@ -667,3 +670,10 @@ mca_btl_uct_component_t mca_btl_uct_component = {
667670
.btl_init = mca_btl_uct_component_init,
668671
.btl_progress = mca_btl_uct_component_progress,
669672
}};
673+
674+
static void safety_valve(void) __attribute__((destructor));
675+
void safety_valve(void) {
676+
if (use_safety_valve) {
677+
opal_mem_hooks_unregister_release(mca_btl_uct_mem_release_cb);
678+
}
679+
}

opal/mca/common/ucx/common_ucx.c

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@
2626
#include <stdio.h>
2727
#include <ucm/api/ucm.h>
2828

29+
static int use_safety_valve = 0;
30+
2931
/***********************************************************************/
3032

3133
extern mca_base_framework_t opal_memory_base_framework;
@@ -179,6 +181,7 @@ OPAL_DECLSPEC void opal_common_ucx_mca_register(void)
179181
MCA_COMMON_UCX_VERBOSE(1, "%s", "using OPAL memory hooks as external events");
180182
ucm_set_external_event(UCM_EVENT_VM_UNMAPPED);
181183
opal_mem_hooks_register_release(opal_common_ucx_mem_release_cb, NULL);
184+
use_safety_valve = 1;
182185
}
183186
}
184187
}
@@ -499,3 +502,10 @@ OPAL_DECLSPEC int opal_common_ucx_del_procs(opal_common_ucx_del_proc_t *procs, s
499502

500503
return opal_common_ucx_mca_pmix_fence(worker);
501504
}
505+
506+
static void safety_valve(void) __attribute__((destructor));
507+
void safety_valve(void) {
508+
if (use_safety_valve) {
509+
opal_mem_hooks_unregister_release(opal_common_ucx_mem_release_cb);
510+
}
511+
}

opal/mca/rcache/base/rcache_base_create.c

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@
3737
#include "opal/memoryhooks/memory.h"
3838
#include "opal/runtime/opal_params.h"
3939

40+
static int use_safety_valve = 0;
41+
4042
mca_rcache_base_module_t *
4143
mca_rcache_base_module_create(const char *name, void *user_data,
4244
struct mca_rcache_base_resources_t *resources)
@@ -70,6 +72,7 @@ mca_rcache_base_module_create(const char *name, void *user_data,
7072
opal_leave_pinned = !opal_leave_pinned_pipeline;
7173
}
7274
opal_mem_hooks_register_release(mca_rcache_base_mem_cb, NULL);
75+
use_safety_valve = 1;
7376
} else if (1 == opal_leave_pinned || opal_leave_pinned_pipeline) {
7477
opal_show_help("help-rcache-base.txt", "leave pinned failed", true, name,
7578
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), opal_process_info.nodename);
@@ -121,3 +124,10 @@ int mca_rcache_base_module_destroy(mca_rcache_base_module_t *module)
121124

122125
return OPAL_ERR_NOT_FOUND;
123126
}
127+
128+
static void safety_valve(void) __attribute__((destructor));
129+
void safety_valve(void) {
130+
if (use_safety_valve) {
131+
opal_mem_hooks_unregister_release(mca_rcache_base_mem_cb);
132+
}
133+
}

0 commit comments

Comments
 (0)