Skip to content

Commit 2349f6b

Browse files
committed
ofi: Delay monitor initialization
Delay initializing the import memory monitor from component open to a function which should be called immediately before the Libfabric domain is initialized. Registering the import memory monitor requires initializing the patcher memory hooks, which is not necessary if the OFI fabrics were not selected. Signed-off-by: Brian Barrett <bbarrett@amazon.com>
1 parent bf36d24 commit 2349f6b

File tree

4 files changed

+136
-42
lines changed

4 files changed

+136
-42
lines changed

ompi/mca/mtl/ofi/mtl_ofi_component.c

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -887,6 +887,20 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
887887
}
888888
}
889889

890+
/* this must be called during single threaded part of the code and
891+
* before Libfabric configures its memory monitors. Easiest to do
892+
* that before domain open. Silently ignore not-supported errors,
893+
* as they are not critical to program correctness, but only
894+
* indicate that Libfabric will have to pick a different, possibly
895+
* less optimal, monitor. */
896+
ret = opal_common_ofi_export_memory_monitor();
897+
if (0 != ret && -FI_ENOSYS != ret) {
898+
opal_output_verbose(1, opal_common_ofi.output,
899+
"Failed to inject Libfabric memory monitor: %s",
900+
fi_strerror(-ret));
901+
}
902+
903+
890904
/**
891905
* Open fabric
892906
* The getinfo struct returns a fabric attribute struct that can be used to

opal/mca/btl/ofi/btl_ofi_component.c

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -446,6 +446,19 @@ static int mca_btl_ofi_init_device(struct fi_info *info)
446446
* to prevent races. */
447447
mca_btl_ofi_rcache_init(module);
448448

449+
/* for similar reasons to the rcache call, this must be called
450+
* during single threaded part of the code and before Libfabric
451+
* configures its memory monitors. Easiest to do that before
452+
* domain open. Silently ignore not-supported errors, as they
453+
* are not critical to program correctness, but only indicate
454+
* that Libfabric will have to pick a different, possibly less
455+
* optimal, monitor. */
456+
rc = opal_common_ofi_export_memory_monitor();
457+
if (0 != rc && -FI_ENOSYS != rc) {
458+
BTL_VERBOSE(("Failed to inject Libfabric memory monitor: %s",
459+
fi_strerror(-rc)));
460+
}
461+
449462
linux_device_name = info->domain_attr->name;
450463
BTL_VERBOSE(
451464
("initializing dev:%s provider:%s", linux_device_name, info->fabric_attr->prov_name));

opal/mca/common/ofi/common_ofi.c

Lines changed: 97 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -41,12 +41,21 @@ opal_common_ofi_module_t opal_common_ofi = {.prov_include = NULL,
4141
static const char default_prov_exclude_list[] = "shm,sockets,tcp,udp,rstream,usnic";
4242
static opal_mutex_t opal_common_ofi_mutex = OPAL_MUTEX_STATIC_INIT;
4343
static int opal_common_ofi_init_ref_cnt = 0;
44+
static bool opal_common_ofi_installed_memory_monitor = false;
4445

4546
#ifdef HAVE_STRUCT_FI_OPS_MEM_MONITOR
4647

4748
/*
48-
* These no-op functions are necessary since libfabric does not allow null
49-
* function pointers here.
49+
* Monitor object to export into Libfabric to provide memory release
50+
* notifications using our own memory hooks framework. Monitors may
51+
* use the subscribe/unsubscribe notifications to reduce unnecessary
52+
* notifications, but are not required to do so. Because patcher
53+
* notifies about all releases, it is cheaper for us to not filter and
54+
* this monitor can safely ignore subscribe/unsubscribe notifications.
55+
*
56+
* Libfabric requires the object to be fully defined. Unlike most of
57+
* Open MPI, it does not have NULL function pointer checks in calling
58+
* code.
5059
*/
5160
static int opal_common_ofi_monitor_start(struct fid_mem_monitor *monitor)
5261
{
@@ -76,8 +85,8 @@ static bool opal_common_ofi_monitor_valid(struct fid_mem_monitor *monitor,
7685
return true;
7786
}
7887

79-
static struct fid_mem_monitor *opal_common_ofi_monitor;
80-
static struct fid *opal_common_ofi_cache_fid;
88+
static struct fid_mem_monitor *opal_common_ofi_monitor = NULL;
89+
static struct fid *opal_common_ofi_cache_fid = NULL;
8190
static struct fi_ops_mem_monitor opal_common_ofi_export_ops = {
8291
.size = sizeof(struct fi_ops_mem_monitor),
8392
.start = opal_common_ofi_monitor_start,
@@ -87,6 +96,12 @@ static struct fi_ops_mem_monitor opal_common_ofi_export_ops = {
8796
.valid = opal_common_ofi_monitor_valid,
8897
};
8998

99+
/**
100+
* Callback function from Open MPI memory monitor
101+
*
102+
* Translation function between the callback function from Open MPI's
103+
* memory notifier to the Libfabric memory monitor.
104+
*/
90105
static void opal_common_ofi_mem_release_cb(void *buf, size_t length,
91106
void *cbdata, bool from_alloc)
92107
{
@@ -96,68 +111,110 @@ static void opal_common_ofi_mem_release_cb(void *buf, size_t length,
96111

97112
#endif /* HAVE_STRUCT_FI_OPS_MEM_MONITOR */
98113

99-
int opal_common_ofi_open(void)
114+
int opal_common_ofi_export_memory_monitor(void)
100115
{
101-
int ret;
116+
int ret = -FI_ENOSYS;
102117

103-
if ((opal_common_ofi_init_ref_cnt++) > 0) {
104-
return OPAL_SUCCESS;
105-
}
106118
#ifdef HAVE_STRUCT_FI_OPS_MEM_MONITOR
119+
OPAL_THREAD_LOCK(&opal_common_ofi_mutex);
120+
121+
if (NULL != opal_common_ofi_cache_fid) {
122+
return 0;
123+
}
124+
125+
/*
126+
* While the memory import functionality was introduced in 1.13,
127+
* some deadlock bugs exist in the 1.13 series. Require version
128+
* 1.14 before this code is activated. Not activating the code
129+
* should not break any functionality directly, but may lead to
130+
* sub-optimal memory monitors being used in Libfabric, as Open
131+
* MPI will almost certainly install a patcher first.
132+
*/
133+
if (FI_VERSION_LT(fi_version(), FI_VERSION(1, 14))) {
134+
ret = -FI_ENOSYS;
135+
goto err;
136+
}
107137

108-
mca_base_framework_open(&opal_memory_base_framework, 0);
138+
ret = mca_base_framework_open(&opal_memory_base_framework, 0);
139+
if (OPAL_SUCCESS != ret) {
140+
ret = -FI_ENOSYS;
141+
goto err;
142+
}
109143
if ((OPAL_MEMORY_FREE_SUPPORT | OPAL_MEMORY_MUNMAP_SUPPORT)
110144
!= (((OPAL_MEMORY_FREE_SUPPORT | OPAL_MEMORY_MUNMAP_SUPPORT))
111145
& opal_mem_hooks_support_level())) {
112-
return OPAL_SUCCESS;
146+
ret = -FI_ENOSYS;
147+
goto err;
113148
}
114149

115150
/*
116-
* This cache object doesn't do much, but is necessary for the API to work.
117-
* It is required to call the fi_import_fid API. This API was introduced in
118-
* libfabric version 1.13.0 and "mr_cache" is a "well known" name (documented
119-
* in libfabric) to indicate the type of object that we are trying to open.
151+
* The monitor import object has the well known name "mr_cache"
152+
* and was introduced in Libfabric 1.13
120153
*/
121-
ret = fi_open(FI_VERSION(1,13), "mr_cache", NULL, 0, 0, &opal_common_ofi_cache_fid, NULL);
122-
if (ret) {
154+
ret = fi_open(FI_VERSION(1,13), "mr_cache", NULL, 0, 0,
155+
&opal_common_ofi_cache_fid, NULL);
156+
if (0 != ret) {
123157
goto err;
124158
}
125159

126160
opal_common_ofi_monitor = calloc(1, sizeof(*opal_common_ofi_monitor));
127-
if (!opal_common_ofi_monitor) {
161+
if (NULL == opal_common_ofi_monitor) {
162+
ret = -FI_ENOMEM;
128163
goto err;
129164
}
130165

131166
opal_common_ofi_monitor->fid.fclass = FI_CLASS_MEM_MONITOR;
132167
opal_common_ofi_monitor->export_ops = &opal_common_ofi_export_ops;
133-
/*
134-
* This import_fid call must occur before the libfabric provider creates
135-
* its memory registration cache. This will typically occur during domain
136-
* open as it is a domain level object. We put it early in initialization
137-
* to guarantee this and share the import monitor between the ofi btl
138-
* and ofi mtl.
139-
*/
140-
ret = fi_import_fid(opal_common_ofi_cache_fid, &opal_common_ofi_monitor->fid, 0);
141-
if (ret) {
168+
ret = fi_import_fid(opal_common_ofi_cache_fid,
169+
&opal_common_ofi_monitor->fid, 0);
170+
if (0 != ret) {
142171
goto err;
143172
}
144173
opal_mem_hooks_register_release(opal_common_ofi_mem_release_cb, NULL);
174+
opal_common_ofi_installed_memory_monitor = true;
175+
176+
ret = 0;
145177

146-
return OPAL_SUCCESS;
147178
err:
148-
if (opal_common_ofi_cache_fid) {
149-
fi_close(opal_common_ofi_cache_fid);
179+
if (0 != ret) {
180+
if (NULL != opal_common_ofi_cache_fid) {
181+
fi_close(opal_common_ofi_cache_fid);
182+
}
183+
if (NULL != opal_common_ofi_monitor) {
184+
free(opal_common_ofi_monitor);
185+
}
150186
}
151-
if (opal_common_ofi_monitor) {
187+
188+
opal_common_ofi_installed_memory_monitor = false;
189+
190+
OPAL_THREAD_UNLOCK(&opal_common_ofi_mutex);
191+
#endif
192+
193+
return ret;
194+
}
195+
196+
static int opal_common_ofi_remove_memory_monitor(void)
197+
{
198+
#ifdef HAVE_STRUCT_FI_OPS_MEM_MONITOR
199+
if (opal_common_ofi_installed_memory_monitor) {
200+
opal_mem_hooks_unregister_release(opal_common_ofi_mem_release_cb);
201+
fi_close(opal_common_ofi_cache_fid);
202+
fi_close(&opal_common_ofi_monitor->fid);
152203
free(opal_common_ofi_monitor);
204+
opal_common_ofi_installed_memory_monitor = false;
153205
}
206+
#endif
154207

155-
opal_common_ofi_init_ref_cnt--;
208+
return OPAL_SUCCESS;
209+
}
210+
211+
int opal_common_ofi_open(void)
212+
{
213+
if ((opal_common_ofi_init_ref_cnt++) > 0) {
214+
return OPAL_SUCCESS;
215+
}
156216

157-
return OPAL_ERROR;
158-
#else
159217
return OPAL_SUCCESS;
160-
#endif
161218
}
162219

163220
int opal_common_ofi_close(void)
@@ -168,14 +225,12 @@ int opal_common_ofi_close(void)
168225
return OPAL_SUCCESS;
169226
}
170227

171-
#ifdef HAVE_STRUCT_FI_OPS_MEM_MONITOR
172-
opal_mem_hooks_unregister_release(opal_common_ofi_mem_release_cb);
173-
fi_close(opal_common_ofi_cache_fid);
174-
fi_close(&opal_common_ofi_monitor->fid);
175-
free(opal_common_ofi_monitor);
176-
#endif
228+
ret = opal_common_ofi_remove_memory_monitor();
229+
if (OPAL_SUCCESS != ret) {
230+
return ret;
231+
}
177232

178-
if (opal_common_ofi.output != -1) {
233+
if (-1 != opal_common_ofi.output) {
179234
opal_output_close(opal_common_ofi.output);
180235
opal_common_ofi.output = -1;
181236
if (OPAL_SUCCESS != ret) {

opal/mca/common/ofi/common_ofi.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,18 @@ OPAL_DECLSPEC int opal_common_ofi_open(void);
6767
*/
6868
OPAL_DECLSPEC int opal_common_ofi_close(void);
6969

70+
/**
71+
* Export our memory hooks into Libfabric monitor
72+
*
73+
* Use Open MPI's memory hooks to provide monitor notifications to
74+
* Libfabric via the external mr_cache facility. This must be called
75+
* before any domain is initialized (i.e., before any Libfabric memory
76+
* monitor is configured).
77+
*
78+
* @returns 0 on success, or a negative Libfabric error code on failure (-FI_ENOSYS if exporting the monitor is not supported)
79+
*/
80+
OPAL_DECLSPEC int opal_common_ofi_export_memory_monitor(void);
81+
7082
/**
7183
* Search function for provider names
7284
*

0 commit comments

Comments
 (0)