Skip to content

Commit 50dbcb6

Browse files
authored
Merge pull request #10728 from markalle/safety_valve_update
update to safety valve for MCA callbacks
2 parents 5571dba + 51e6944 commit 50dbcb6

File tree

5 files changed

+45
-30
lines changed

5 files changed

+45
-30
lines changed

ompi/mca/coll/hcoll/coll_hcoll_module.c

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/**
22
* Copyright (c) 2011 Mellanox Technologies. All rights reserved.
3-
* Copyright (c) 2016 IBM Corporation. All rights reserved.
3+
* Copyright (c) 2016-2022 IBM Corporation. All rights reserved.
44
* Copyright (c) 2017 The University of Tennessee and The University
55
* of Tennessee Research Foundation. All rights
66
* reserved.
@@ -18,8 +18,6 @@
1818
#include "coll_hcoll.h"
1919
#include "coll_hcoll_dtypes.h"
2020

21-
static int use_safety_valve = 0;
22-
2321
int hcoll_comm_attr_keyval;
2422
int hcoll_type_attr_keyval;
2523
mca_coll_hcoll_dtype_t zero_dte_mapping;
@@ -331,7 +329,6 @@ mca_coll_hcoll_comm_query(struct ompi_communicator_t *comm, int *priority)
331329
cm->using_mem_hooks = 1;
332330
opal_mem_hooks_register_release(mca_coll_hcoll_mem_release_cb, NULL);
333331
setenv("MXM_HCOLL_MEM_ON_DEMAND_MAP", "y", 0);
334-
use_safety_valve = 1;
335332
}
336333
}
337334
} else {
@@ -452,9 +449,7 @@ OBJ_CLASS_INSTANCE(mca_coll_hcoll_module_t,
452449
mca_coll_hcoll_module_construct,
453450
mca_coll_hcoll_module_destruct);
454451

455-
static void safety_valve(void) __attribute__((destructor));
452+
static void safety_valve(void) __opal_attribute_destructor__;
456453
void safety_valve(void) {
457-
if (use_safety_valve) {
458-
opal_mem_hooks_unregister_release(mca_coll_hcoll_mem_release_cb);
459-
}
454+
opal_mem_hooks_unregister_release(mca_coll_hcoll_mem_release_cb);
460455
}

opal/mca/btl/uct/btl_uct_component.c

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
* reserved.
2020
* Copyright (c) 2019-2021 Google, LLC. All rights reserved.
2121
* Copyright (c) 2019 Intel, Inc. All rights reserved.
22+
* Copyright (c) 2022 IBM Corporation. All rights reserved.
2223
* $COPYRIGHT$
2324
*
2425
* Additional copyrights may follow
@@ -43,8 +44,6 @@
4344
#include "btl_uct_am.h"
4445
#include "btl_uct_device_context.h"
4546

46-
static int use_safety_valve = 0;
47-
4847
static int mca_btl_uct_component_register(void)
4948
{
5049
mca_btl_uct_module_t *module = &mca_btl_uct_module_template;
@@ -147,7 +146,6 @@ static int mca_btl_uct_component_open(void)
147146
& opal_mem_hooks_support_level()))) {
148147
ucm_set_external_event(UCM_EVENT_VM_UNMAPPED);
149148
opal_mem_hooks_register_release(mca_btl_uct_mem_release_cb, NULL);
150-
use_safety_valve = 1;
151149
}
152150

153151
return OPAL_SUCCESS;
@@ -673,9 +671,7 @@ mca_btl_uct_component_t mca_btl_uct_component = {
673671
.btl_progress = mca_btl_uct_component_progress,
674672
}};
675673

676-
static void safety_valve(void) __attribute__((destructor));
674+
static void safety_valve(void) __opal_attribute_destructor__;
677675
void safety_valve(void) {
678-
if (use_safety_valve) {
679-
opal_mem_hooks_unregister_release(mca_btl_uct_mem_release_cb);
680-
}
676+
opal_mem_hooks_unregister_release(mca_btl_uct_mem_release_cb);
681677
}

opal/mca/common/ucx/common_ucx.c

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
* Copyright (c) 2021 Triad National Security, LLC. All rights
88
* reserved.
99
* Copyright (c) 2022 Google, LLC. All rights reserved.
10+
* Copyright (c) 2022 IBM Corporation. All rights reserved.
1011
*
1112
* $COPYRIGHT$
1213
*
@@ -29,8 +30,6 @@
2930
#include <stdio.h>
3031
#include <ucm/api/ucm.h>
3132

32-
static int use_safety_valve = 0;
33-
3433
/***********************************************************************/
3534

3635
extern mca_base_framework_t opal_memory_base_framework;
@@ -156,7 +155,6 @@ OPAL_DECLSPEC void opal_common_ucx_mca_register(void)
156155
MCA_COMMON_UCX_VERBOSE(1, "%s", "using OPAL memory hooks as external events");
157156
ucm_set_external_event(UCM_EVENT_VM_UNMAPPED);
158157
opal_mem_hooks_register_release(opal_common_ucx_mem_release_cb, NULL);
159-
use_safety_valve = 1;
160158
}
161159
}
162160
}
@@ -478,9 +476,7 @@ OPAL_DECLSPEC int opal_common_ucx_del_procs(opal_common_ucx_del_proc_t *procs, s
478476
return opal_common_ucx_mca_pmix_fence(worker);
479477
}
480478

481-
static void safety_valve(void) __attribute__((destructor));
479+
static void safety_valve(void) __opal_attribute_destructor__;
482480
void safety_valve(void) {
483-
if (use_safety_valve) {
484-
opal_mem_hooks_unregister_release(opal_common_ucx_mem_release_cb);
485-
}
481+
opal_mem_hooks_unregister_release(opal_common_ucx_mem_release_cb);
486482
}

opal/mca/rcache/base/rcache_base_create.c

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
1414
* reserved.
1515
* Copyright (c) 2020 Intel, Inc. All rights reserved.
16+
* Copyright (c) 2022 IBM Corporation. All rights reserved.
1617
* $COPYRIGHT$
1718
*
1819
* Additional copyrights may follow
@@ -37,8 +38,6 @@
3738
#include "opal/memoryhooks/memory.h"
3839
#include "opal/runtime/opal_params.h"
3940

40-
static int use_safety_valve = 0;
41-
4241
mca_rcache_base_module_t *
4342
mca_rcache_base_module_create(const char *name, void *user_data,
4443
struct mca_rcache_base_resources_t *resources)
@@ -72,7 +71,6 @@ mca_rcache_base_module_create(const char *name, void *user_data,
7271
opal_leave_pinned = !opal_leave_pinned_pipeline;
7372
}
7473
opal_mem_hooks_register_release(mca_rcache_base_mem_cb, NULL);
75-
use_safety_valve = 1;
7674
} else if (1 == opal_leave_pinned || opal_leave_pinned_pipeline) {
7775
opal_show_help("help-rcache-base.txt", "leave pinned failed", true, name,
7876
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), opal_process_info.nodename);
@@ -125,9 +123,7 @@ int mca_rcache_base_module_destroy(mca_rcache_base_module_t *module)
125123
return OPAL_ERR_NOT_FOUND;
126124
}
127125

128-
static void safety_valve(void) __attribute__((destructor));
126+
static void safety_valve(void) __opal_attribute_destructor__;
129127
void safety_valve(void) {
130-
if (use_safety_valve) {
131-
opal_mem_hooks_unregister_release(mca_rcache_base_mem_cb);
132-
}
128+
opal_mem_hooks_unregister_release(mca_rcache_base_mem_cb);
133129
}

opal/memoryhooks/memory.c

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
* All rights reserved.
1313
* Copyright (c) 2017 Los Alamos National Security, LLC. All rights
1414
* reserved.
15+
* Copyright (c) 2022 IBM Corporation. All rights reserved.
1516
* $COPYRIGHT$
1617
*
1718
* Additional copyrights may follow
@@ -55,6 +56,7 @@ static int hooks_support = 0;
5556
static opal_list_t release_cb_list;
5657
static opal_atomic_lock_t release_lock;
5758
static int release_run_callbacks;
59+
static int is_initialized = false;
5860

5961
/**
6062
* Finalize the memory hooks subsystem
@@ -93,6 +95,7 @@ int opal_mem_hooks_init(void)
9395
OBJ_CONSTRUCT(&release_cb_list, opal_list_t);
9496

9597
opal_atomic_lock_init(&release_lock, OPAL_ATOMIC_LOCK_UNLOCKED);
98+
is_initialized = true;
9699

97100
/* delay running callbacks until there is something in the
98101
registration */
@@ -196,11 +199,40 @@ int opal_mem_hooks_unregister_release(opal_mem_hooks_callback_fn_t *func)
196199
callback_list_item_t *cbitem, *found_item = NULL;
197200
int ret = OPAL_ERR_NOT_FOUND;
198201

202+
// I've added "is_initialized" to allow this call to be safe even if
203+
// a memory hooks .so was merely loaded but never used so this file's
204+
// init function was never called. I'll give more context, first
205+
// describing a bug hit in open-shmem:
206+
//
207+
// Ordinarily the expected behavior of memhook users is they'd register a
208+
// callback and then deregister it in a nice matched pair. And I think
209+
// most OMPI code does, but the open-shmem code isn't as clear and it
210+
// was loading a callback and just leaving it loaded after open-shmem was
211+
// unloaded. This was a problem because upon every malloc/free/etc we're
212+
// going to keep trying to call their callback, and the function pointer
213+
// itself became illegal as soon as the open-shmem shared lib was unloaded.
214+
// So I figured the best solution was to add a "safety valve" to the callback
215+
// users where they would have a library-level destructor that un-conditionally
216+
// adds an extra unregister call regardless of whether the code is already
217+
// matched or not.
218+
//
219+
// With that happening, it's necessary to make sure
220+
// opal_mem_hooks_unregister_release() is safe when the system isn't
221+
// initialized and/or if it was initialized but the specified callback
222+
// is already removed.
223+
//
224+
// Note also, the reason for checking "cbitem" before looking at cbitem->cbfunc
225+
// is when the list is empty the OPAL_LIST_FOREACH() empirically still iterates
226+
// once and gives a null cbitem. I'm not sure I like that, but that's what
227+
// it did so I needed to make that case safe too.
228+
if (!is_initialized) {
229+
return 0;
230+
}
199231
opal_atomic_lock(&release_lock);
200232

201233
/* make sure the callback isn't already in the list */
202234
OPAL_LIST_FOREACH (cbitem, &release_cb_list, callback_list_item_t) {
203-
if (cbitem->cbfunc == func) {
235+
if (cbitem && cbitem->cbfunc == func) {
204236
opal_list_remove_item(&release_cb_list, (opal_list_item_t *) cbitem);
205237
found_item = cbitem;
206238
ret = OPAL_SUCCESS;

0 commit comments

Comments
 (0)