Skip to content

Commit 51e6944

Browse files
committed
update to safety valve for MCA callbacks
I used to have a "use_safety_valve" that was meant to conditionally turn on the extra "unregister" call at MCA .so unload time. Reviewers became conserned about whether this was module-safe and wanted locks, but really the entire "use_safety_valve" bookkeeping is unnecessary. The logic of a safety valve is to assume bad code (like the oshmem mismatch) could be using the register/unregister system and thus to always unconditionally call unregister at unload time, regardless of whether a nice matched pair of register/deregister already happened or if we're in the oshmem case where they failed to make such a match. With that being the design it was just necessary to update the opal_mem_hooks_unregister_release() function to be safe regardless of what context it's being called from (eg, if the callback is already unregistered, or if it was never registered in the first place). Also for compatibility I switched from directly using __attribute__((destructor)) to __opal_attribute_destructor__ Signed-off-by: Mark Allen <markalle@us.ibm.com>
1 parent 5571dba commit 51e6944

File tree

5 files changed

+45
-30
lines changed

5 files changed

+45
-30
lines changed

ompi/mca/coll/hcoll/coll_hcoll_module.c

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/**
22
* Copyright (c) 2011 Mellanox Technologies. All rights reserved.
3-
* Copyright (c) 2016 IBM Corporation. All rights reserved.
3+
* Copyright (c) 2016-2022 IBM Corporation. All rights reserved.
44
* Copyright (c) 2017 The University of Tennessee and The University
55
* of Tennessee Research Foundation. All rights
66
* reserved.
@@ -18,8 +18,6 @@
1818
#include "coll_hcoll.h"
1919
#include "coll_hcoll_dtypes.h"
2020

21-
static int use_safety_valve = 0;
22-
2321
int hcoll_comm_attr_keyval;
2422
int hcoll_type_attr_keyval;
2523
mca_coll_hcoll_dtype_t zero_dte_mapping;
@@ -331,7 +329,6 @@ mca_coll_hcoll_comm_query(struct ompi_communicator_t *comm, int *priority)
331329
cm->using_mem_hooks = 1;
332330
opal_mem_hooks_register_release(mca_coll_hcoll_mem_release_cb, NULL);
333331
setenv("MXM_HCOLL_MEM_ON_DEMAND_MAP", "y", 0);
334-
use_safety_valve = 1;
335332
}
336333
}
337334
} else {
@@ -452,9 +449,7 @@ OBJ_CLASS_INSTANCE(mca_coll_hcoll_module_t,
452449
mca_coll_hcoll_module_construct,
453450
mca_coll_hcoll_module_destruct);
454451

455-
static void safety_valve(void) __attribute__((destructor));
452+
static void safety_valve(void) __opal_attribute_destructor__;
456453
void safety_valve(void) {
457-
if (use_safety_valve) {
458-
opal_mem_hooks_unregister_release(mca_coll_hcoll_mem_release_cb);
459-
}
454+
opal_mem_hooks_unregister_release(mca_coll_hcoll_mem_release_cb);
460455
}

opal/mca/btl/uct/btl_uct_component.c

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
* reserved.
2020
* Copyright (c) 2019-2021 Google, LLC. All rights reserved.
2121
* Copyright (c) 2019 Intel, Inc. All rights reserved.
22+
* Copyright (c) 2022 IBM Corporation. All rights reserved.
2223
* $COPYRIGHT$
2324
*
2425
* Additional copyrights may follow
@@ -43,8 +44,6 @@
4344
#include "btl_uct_am.h"
4445
#include "btl_uct_device_context.h"
4546

46-
static int use_safety_valve = 0;
47-
4847
static int mca_btl_uct_component_register(void)
4948
{
5049
mca_btl_uct_module_t *module = &mca_btl_uct_module_template;
@@ -147,7 +146,6 @@ static int mca_btl_uct_component_open(void)
147146
& opal_mem_hooks_support_level()))) {
148147
ucm_set_external_event(UCM_EVENT_VM_UNMAPPED);
149148
opal_mem_hooks_register_release(mca_btl_uct_mem_release_cb, NULL);
150-
use_safety_valve = 1;
151149
}
152150

153151
return OPAL_SUCCESS;
@@ -673,9 +671,7 @@ mca_btl_uct_component_t mca_btl_uct_component = {
673671
.btl_progress = mca_btl_uct_component_progress,
674672
}};
675673

676-
static void safety_valve(void) __attribute__((destructor));
674+
static void safety_valve(void) __opal_attribute_destructor__;
677675
void safety_valve(void) {
678-
if (use_safety_valve) {
679-
opal_mem_hooks_unregister_release(mca_btl_uct_mem_release_cb);
680-
}
676+
opal_mem_hooks_unregister_release(mca_btl_uct_mem_release_cb);
681677
}

opal/mca/common/ucx/common_ucx.c

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
* Copyright (c) 2021 Triad National Security, LLC. All rights
88
* reserved.
99
* Copyright (c) 2022 Google, LLC. All rights reserved.
10+
* Copyright (c) 2022 IBM Corporation. All rights reserved.
1011
*
1112
* $COPYRIGHT$
1213
*
@@ -29,8 +30,6 @@
2930
#include <stdio.h>
3031
#include <ucm/api/ucm.h>
3132

32-
static int use_safety_valve = 0;
33-
3433
/***********************************************************************/
3534

3635
extern mca_base_framework_t opal_memory_base_framework;
@@ -156,7 +155,6 @@ OPAL_DECLSPEC void opal_common_ucx_mca_register(void)
156155
MCA_COMMON_UCX_VERBOSE(1, "%s", "using OPAL memory hooks as external events");
157156
ucm_set_external_event(UCM_EVENT_VM_UNMAPPED);
158157
opal_mem_hooks_register_release(opal_common_ucx_mem_release_cb, NULL);
159-
use_safety_valve = 1;
160158
}
161159
}
162160
}
@@ -478,9 +476,7 @@ OPAL_DECLSPEC int opal_common_ucx_del_procs(opal_common_ucx_del_proc_t *procs, s
478476
return opal_common_ucx_mca_pmix_fence(worker);
479477
}
480478

481-
static void safety_valve(void) __attribute__((destructor));
479+
static void safety_valve(void) __opal_attribute_destructor__;
482480
void safety_valve(void) {
483-
if (use_safety_valve) {
484-
opal_mem_hooks_unregister_release(opal_common_ucx_mem_release_cb);
485-
}
481+
opal_mem_hooks_unregister_release(opal_common_ucx_mem_release_cb);
486482
}

opal/mca/rcache/base/rcache_base_create.c

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
1414
* reserved.
1515
* Copyright (c) 2020 Intel, Inc. All rights reserved.
16+
* Copyright (c) 2022 IBM Corporation. All rights reserved.
1617
* $COPYRIGHT$
1718
*
1819
* Additional copyrights may follow
@@ -37,8 +38,6 @@
3738
#include "opal/memoryhooks/memory.h"
3839
#include "opal/runtime/opal_params.h"
3940

40-
static int use_safety_valve = 0;
41-
4241
mca_rcache_base_module_t *
4342
mca_rcache_base_module_create(const char *name, void *user_data,
4443
struct mca_rcache_base_resources_t *resources)
@@ -72,7 +71,6 @@ mca_rcache_base_module_create(const char *name, void *user_data,
7271
opal_leave_pinned = !opal_leave_pinned_pipeline;
7372
}
7473
opal_mem_hooks_register_release(mca_rcache_base_mem_cb, NULL);
75-
use_safety_valve = 1;
7674
} else if (1 == opal_leave_pinned || opal_leave_pinned_pipeline) {
7775
opal_show_help("help-rcache-base.txt", "leave pinned failed", true, name,
7876
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), opal_process_info.nodename);
@@ -125,9 +123,7 @@ int mca_rcache_base_module_destroy(mca_rcache_base_module_t *module)
125123
return OPAL_ERR_NOT_FOUND;
126124
}
127125

128-
static void safety_valve(void) __attribute__((destructor));
126+
static void safety_valve(void) __opal_attribute_destructor__;
129127
void safety_valve(void) {
130-
if (use_safety_valve) {
131-
opal_mem_hooks_unregister_release(mca_rcache_base_mem_cb);
132-
}
128+
opal_mem_hooks_unregister_release(mca_rcache_base_mem_cb);
133129
}

opal/memoryhooks/memory.c

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
* All rights reserved.
1313
* Copyright (c) 2017 Los Alamos National Security, LLC. All rights
1414
* reserved.
15+
* Copyright (c) 2022 IBM Corporation. All rights reserved.
1516
* $COPYRIGHT$
1617
*
1718
* Additional copyrights may follow
@@ -55,6 +56,7 @@ static int hooks_support = 0;
5556
static opal_list_t release_cb_list;
5657
static opal_atomic_lock_t release_lock;
5758
static int release_run_callbacks;
59+
static int is_initialized = false;
5860

5961
/**
6062
* Finalize the memory hooks subsystem
@@ -93,6 +95,7 @@ int opal_mem_hooks_init(void)
9395
OBJ_CONSTRUCT(&release_cb_list, opal_list_t);
9496

9597
opal_atomic_lock_init(&release_lock, OPAL_ATOMIC_LOCK_UNLOCKED);
98+
is_initialized = true;
9699

97100
/* delay running callbacks until there is something in the
98101
registration */
@@ -196,11 +199,40 @@ int opal_mem_hooks_unregister_release(opal_mem_hooks_callback_fn_t *func)
196199
callback_list_item_t *cbitem, *found_item = NULL;
197200
int ret = OPAL_ERR_NOT_FOUND;
198201

202+
// I've added "is_initialized" to allow this call to be safe even if
203+
// a memory hooks .so was merely loaded but never used so this file's
204+
// init function was never called. I'll give more context, first
205+
// describing a bug hit in open-shmem:
206+
//
207+
// Ordinarily the expected behavior of memhook users is they'd register a
208+
// callback and then deregister it in a nice matched pair. And I think
209+
// most OMPI code does, but the open-shmem code isn't as clear and it
210+
// was loading a callback and just leaving it loaded after open-shmem was
211+
// unloaded. This was a problem because upon every malloc/free/etc we're
212+
// going to keep trying to call their callback, and the function pointer
213+
// itself became illegal as soon as the open-shmem shared lib was unloaded.
214+
// So I figured the best solution was to add a "safety valve" to the callback
215+
// users where they would have a library-level destructor that un-conditionally
216+
// adds an extra unregister call regardless of whether the code is already
217+
// matched or not.
218+
//
219+
// With that happening, it's necessary to make sure
220+
// opal_mem_hooks_unregister_release() is safe when the system isn't
221+
// initialized and/or if it was initialized but the specified callback
222+
// is already removed.
223+
//
224+
// Note also, the reason for checking "cbitem" before looking at cbitem->cbfunc
225+
// is when the list is empty the OPAL_LIST_FOREACH() empirically still iterates
226+
// once and gives a null cbitem. I'm not sure I like that, but that's what
227+
// it did so I needed to make that case safe too.
228+
if (!is_initialized) {
229+
return 0;
230+
}
199231
opal_atomic_lock(&release_lock);
200232

201233
/* make sure the callback isn't already in the list */
202234
OPAL_LIST_FOREACH (cbitem, &release_cb_list, callback_list_item_t) {
203-
if (cbitem->cbfunc == func) {
235+
if (cbitem && cbitem->cbfunc == func) {
204236
opal_list_remove_item(&release_cb_list, (opal_list_item_t *) cbitem);
205237
found_item = cbitem;
206238
ret = OPAL_SUCCESS;

0 commit comments

Comments
 (0)