Skip to content

Commit 8f3c171

Browse files
authored
Merge pull request #13327 from bwbarrett/ofi-hmem
Fix Libfabric MR caching issues
2 parents 0ff5e81 + 72a7e0e commit 8f3c171

File tree

9 files changed

+202
-95
lines changed

9 files changed

+202
-95
lines changed

ompi/mca/mtl/ofi/Makefile.am

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
# and Technology (RIST). All rights reserved.
1010
# Copyright (c) 2020 Triad National Security, LLC. All rights
1111
# reserved.
12-
# Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All Rights reserved.
12+
# Copyright (c) 2022-2025 Amazon.com, Inc. or its affiliates. All Rights reserved.
1313
# Copyright (c) 2025 Jeffrey M. Squyres. All rights reserved.
1414
# $COPYRIGHT$
1515
#
@@ -48,6 +48,7 @@ mtl_ofi_sources = \
4848
mtl_ofi_component.c \
4949
mtl_ofi_endpoint.h \
5050
mtl_ofi_endpoint.c \
51+
mtl_ofi_mr.c \
5152
mtl_ofi_request.h \
5253
mtl_ofi_types.h \
5354
mtl_ofi_opt.h \

ompi/mca/mtl/ofi/mtl_ofi.h

Lines changed: 20 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,7 @@
44
* reserved.
55
* Copyright (c) 2019-2024 Triad National Security, LLC. All rights
66
* reserved.
7-
* Copyright (c) 2018-2023 Amazon.com, Inc. or its affiliates. All Rights reserved.
8-
* reserved.
7+
* Copyright (c) 2018-2025 Amazon.com, Inc. or its affiliates. All Rights reserved.
98
* Copyright (c) 2021 Cisco Systems, Inc. All rights reserved
109
* Copyright (c) 2021 The University of Tennessee and The University
1110
* of Tennessee Research Foundation. All rights
@@ -73,6 +72,8 @@ extern int ompi_mtl_ofi_del_comm(struct mca_mtl_base_module_t *mtl,
7372

7473
int ompi_mtl_ofi_progress_no_inline(void);
7574

75+
int ompi_mtl_ofi_rcache_init(void);
76+
7677
#if OPAL_HAVE_THREAD_LOCAL
7778
extern opal_thread_local int ompi_mtl_ofi_per_thread_ctx;
7879
#endif
@@ -291,78 +292,37 @@ ompi_mtl_ofi_set_mr_null(ompi_mtl_ofi_request_t *ofi_req) {
291292
static
292293
int ompi_mtl_ofi_register_buffer(struct opal_convertor_t *convertor,
293294
ompi_mtl_ofi_request_t *ofi_req,
294-
void* buffer) {
295+
void* buffer)
296+
{
297+
int ret;
298+
uint32_t cache_flags = 0;
299+
295300
ofi_req->mr = NULL;
296301
if (ofi_req->length <= 0 || NULL == buffer) {
297302
return OMPI_SUCCESS;
298303
}
299304

300-
#if OPAL_OFI_HAVE_FI_MR_IFACE
301-
302-
if ((convertor->flags & CONVERTOR_ACCELERATOR) && ompi_mtl_ofi.hmem_needs_reg) {
303-
/* Register buffer */
304-
int ret;
305-
struct fi_mr_attr attr = {0};
306-
struct iovec iov = {0};
307-
308-
iov.iov_base = buffer;
309-
iov.iov_len = ofi_req->length;
310-
attr.mr_iov = &iov;
311-
attr.iov_count = 1;
312-
attr.access = FI_SEND | FI_RECV;
313-
attr.offset = 0;
314-
attr.context = NULL;
315-
if (false == ompi_mtl_base_selected_component->accelerator_support) {
316-
goto reg;
317-
} else if (0 == strcmp(opal_accelerator_base_selected_component.base_version.mca_component_name, "cuda")) {
318-
attr.iface = FI_HMEM_CUDA;
319-
opal_accelerator.get_device(&attr.device.cuda);
320-
#if OPAL_OFI_HAVE_FI_HMEM_ROCR
321-
} else if (0 == strcmp(opal_accelerator_base_selected_component.base_version.mca_component_name, "rocm")) {
322-
attr.iface = FI_HMEM_ROCR;
323-
opal_accelerator.get_device(&attr.device.cuda);
324-
#endif
325-
#if OPAL_OFI_HAVE_FI_HMEM_ZE
326-
} else if (0 == strcmp(opal_accelerator_base_selected_component.base_version.mca_component_name, "ze")) {
327-
attr.iface = FI_HMEM_ZE;
328-
opal_accelerator.get_device(&attr.device.ze);
329-
#endif
330-
} else {
331-
return OPAL_ERROR;
332-
}
333-
reg:
334-
ret = fi_mr_regattr(ompi_mtl_ofi.domain, &attr, 0, &ofi_req->mr);
335-
336-
if (ret) {
337-
opal_show_help("help-mtl-ofi.txt", "Buffer Memory Registration Failed", true,
338-
opal_accelerator_base_selected_component.base_version.mca_component_name,
339-
buffer, ofi_req->length,
340-
fi_strerror(-ret), ret);
341-
ofi_req->mr = NULL;
342-
return OMPI_ERROR;
343-
}
305+
if (! ((convertor->flags & CONVERTOR_ACCELERATOR) && ompi_mtl_ofi.hmem_needs_reg)) {
306+
return OMPI_SUCCESS;
344307
}
345308

346-
#endif
347-
348-
return OMPI_SUCCESS;
309+
/* note - the cache access flags are a little broken, because rcache doesn't
310+
* understand send/recv requirements. Since this rcache is only used in the
311+
* MTL, that isn't a problem and we fix it in the underlying register call.
312+
*/
313+
ret = ompi_mtl_ofi.rcache->rcache_register(ompi_mtl_ofi.rcache, buffer, ofi_req->length,
314+
cache_flags, MCA_RCACHE_ACCESS_ANY,
315+
(mca_rcache_base_registration_t **) &ofi_req->mr);
316+
return ret;
349317
}
350318

351319
/** Deregister buffer */
352320
__opal_attribute_always_inline__ static inline int
353321
ompi_mtl_ofi_deregister_buffer(ompi_mtl_ofi_request_t *ofi_req) {
354322
if (ofi_req->mr) {
355-
int ret;
356-
ret = fi_close(&ofi_req->mr->fid);
357-
if (ret) {
358-
opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
359-
"fi_close",
360-
ompi_process_info.nodename, __FILE__, __LINE__,
361-
fi_strerror(-ret), ofi_req->mr->fid);
362-
return OMPI_ERROR;
363-
}
364-
ofi_req->mr = NULL;
323+
(void)ompi_mtl_ofi.rcache->rcache_deregister(ompi_mtl_ofi.rcache, &ofi_req->mr->base);
365324
}
325+
366326
return OMPI_SUCCESS;
367327
}
368328

ompi/mca/mtl/ofi/mtl_ofi_component.c

Lines changed: 20 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
* Copyright (c) 2014-2021 Cisco Systems, Inc. All rights reserved
66
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
77
* reserved.
8-
* Copyright (c) 2018-2022 Amazon.com, Inc. or its affiliates. All Rights reserved.
8+
* Copyright (c) 2018-2025 Amazon.com, Inc. or its affiliates. All Rights reserved.
99
* Copyright (c) 2020-2023 Triad National Security, LLC. All rights
1010
* reserved.
1111
* $COPYRIGHT$
@@ -823,27 +823,28 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
823823
}
824824
} else {
825825
*accelerator_support = true;
826-
ompi_mtl_ofi.hmem_needs_reg = true;
827-
/*
828-
* Workaround for the fact that the CXI provider actually doesn't need for accelerator memory to be registered
829-
* for local buffers, but if one does do so using fi_mr_regattr, one actually needs to manage the
830-
* requested_key field in the fi_mr_attr attr argument, and the OFI MTL doesn't track which requested_keys
831-
* have already been registered. So just set a flag to disable local registration. Note the OFI BTL doesn't
832-
* have a problem here since it uses fi_mr_regattr only within the context of an rcache, and manages the
833-
* requested_key field in this way.
834-
*/
835-
if ((NULL != strstr(prov->fabric_attr->prov_name, "cxi")) ||
836-
(NULL != strstr(prov->fabric_attr->prov_name, "CXI")) ) {
837-
ompi_mtl_ofi.hmem_needs_reg = false;
838-
}
839826

827+
/* Only explicitly register device buffers if the provider requires it.
828+
For example, CXI does not require it but EFA does require it. */
829+
if ((prov->domain_attr->mr_mode & FI_MR_HMEM) != 0) {
830+
ompi_mtl_ofi.hmem_needs_reg = true;
831+
opal_output_verbose(50, opal_common_ofi.output,
832+
"Support for device buffers enabled with explicit registration");
833+
} else {
834+
opal_output_verbose(50, opal_common_ofi.output,
835+
"Support for device buffers enabled with implicit registration");
836+
}
840837
}
841838
#else
842839
opal_output_verbose(50, opal_common_ofi.output,
843840
"%s:%d: Libfabric provider does not support device buffers. Continuing with device to host copies.\n",
844841
__FILE__, __LINE__);
845842
#endif
846843

844+
if (ompi_mtl_ofi.hmem_needs_reg) {
845+
ompi_mtl_ofi_rcache_init();
846+
}
847+
847848
/**
848849
* Select the format of the OFI tag
849850
*/
@@ -1177,6 +1178,11 @@ ompi_mtl_ofi_finalize(struct mca_mtl_base_module_t *mtl)
11771178
{
11781179
ssize_t ret;
11791180

1181+
if (NULL != ompi_mtl_ofi.rcache) {
1182+
mca_rcache_base_module_destroy(ompi_mtl_ofi.rcache);
1183+
ompi_mtl_ofi.rcache = NULL;
1184+
}
1185+
11801186
opal_progress_unregister(ompi_mtl_ofi_progress_no_inline);
11811187

11821188
/* Close all the OFI objects */

ompi/mca/mtl/ofi/mtl_ofi_mr.c

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
/*
2+
* Copyright (c) 2025 Amazon.com, Inc. or its affiliates. All Rights reserved.
3+
*
4+
* $COPYRIGHT$
5+
*
6+
* Additional copyrights may follow
7+
*
8+
* $HEADER$
9+
*/
10+
11+
#include "opal_config.h"
12+
13+
#include "mtl_ofi.h"
14+
15+
static int
16+
ompi_mtl_ofi_reg_mem(void *reg_data, void *base, size_t size,
17+
mca_rcache_base_registration_t *reg)
18+
{
19+
int ret;
20+
struct fi_mr_attr attr = {0};
21+
struct iovec iov = {0};
22+
ompi_mtl_ofi_reg_t *mtl_reg = (ompi_mtl_ofi_reg_t *)reg;
23+
int dev_id;
24+
uint64_t flags;
25+
26+
iov.iov_base = base;
27+
iov.iov_len = size;
28+
attr.mr_iov = &iov;
29+
attr.iov_count = 1;
30+
attr.access = FI_SEND | FI_RECV;
31+
attr.offset = 0;
32+
attr.context = NULL;
33+
34+
#if OPAL_OFI_HAVE_FI_MR_IFACE
35+
if (OPAL_LIKELY(NULL != base)) {
36+
ret = opal_accelerator.check_addr(base, &dev_id, &flags);
37+
if (ret < 0) {
38+
return ret;
39+
} else if (ret > 0 ) {
40+
if (0 == strcmp(opal_accelerator_base_selected_component.base_version.mca_component_name, "cuda")) {
41+
attr.iface = FI_HMEM_CUDA;
42+
opal_accelerator.get_device(&attr.device.cuda);
43+
#if OPAL_OFI_HAVE_FI_HMEM_ROCR
44+
} else if (0 == strcmp(opal_accelerator_base_selected_component.base_version.mca_component_name, "rocm")) {
45+
attr.iface = FI_HMEM_ROCR;
46+
opal_accelerator.get_device(&attr.device.cuda);
47+
#endif
48+
#if OPAL_OFI_HAVE_FI_HMEM_ZE
49+
} else if (0 == strcmp(opal_accelerator_base_selected_component.base_version.mca_component_name, "ze")) {
50+
attr.iface = FI_HMEM_ZE;
51+
opal_accelerator.get_device(&attr.device.ze);
52+
#endif
53+
} else {
54+
return OPAL_ERROR;
55+
}
56+
}
57+
}
58+
#endif
59+
60+
ret = fi_mr_regattr(ompi_mtl_ofi.domain, &attr, 0, &mtl_reg->ofi_mr);
61+
if (0 != ret) {
62+
opal_show_help("help-mtl-ofi.txt", "Buffer Memory Registration Failed", true,
63+
opal_accelerator_base_selected_component.base_version.mca_component_name,
64+
base, size, fi_strerror(-ret), ret);
65+
mtl_reg->ofi_mr = NULL;
66+
return OPAL_ERR_OUT_OF_RESOURCE;
67+
}
68+
69+
mtl_reg->mem_desc = fi_mr_desc(mtl_reg->ofi_mr);
70+
71+
return OPAL_SUCCESS;
72+
}
73+
74+
75+
static int
76+
ompi_mtl_ofi_dereg_mem(void *reg_data, mca_rcache_base_registration_t *reg)
77+
{
78+
ompi_mtl_ofi_reg_t *mtl_reg = (ompi_mtl_ofi_reg_t *)reg;
79+
int ret;
80+
81+
if (mtl_reg->ofi_mr != NULL) {
82+
ret = fi_close(&mtl_reg->ofi_mr->fid);
83+
if (0 != ret) {
84+
opal_output_verbose(1, opal_common_ofi.output,
85+
"%s: error unpinning memory mr=%p: %s",
86+
__func__, (void *)mtl_reg->ofi_mr,
87+
fi_strerror(-ret));
88+
return OPAL_ERROR;
89+
}
90+
}
91+
92+
return OPAL_SUCCESS;
93+
}
94+
95+
96+
int
97+
ompi_mtl_ofi_rcache_init(void)
98+
{
99+
mca_rcache_base_resources_t rcache_resources;
100+
char *tmp;
101+
102+
if (NULL != ompi_mtl_ofi.rcache) {
103+
return OMPI_SUCCESS;
104+
}
105+
106+
tmp = strdup("mtl-ofi");
107+
rcache_resources.cache_name = tmp;
108+
rcache_resources.reg_data = NULL;
109+
rcache_resources.sizeof_reg = sizeof(ompi_mtl_ofi_reg_t);
110+
rcache_resources.register_mem = ompi_mtl_ofi_reg_mem;
111+
rcache_resources.deregister_mem = ompi_mtl_ofi_dereg_mem;
112+
113+
ompi_mtl_ofi.rcache = mca_rcache_base_module_create("grdma", &ompi_mtl_ofi, &rcache_resources);
114+
free(tmp);
115+
116+
if (NULL == ompi_mtl_ofi.rcache) {
117+
/* something when horribly wrong */
118+
opal_output_verbose(1, opal_common_ofi.output,
119+
"creating rcache failed");
120+
return OMPI_ERROR;
121+
}
122+
123+
return OMPI_SUCCESS;
124+
}

ompi/mca/mtl/ofi/mtl_ofi_request.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved
33
* Copyright (c) 2017 Los Alamos National Security, LLC. All rights
44
* reserved.
5+
* Copyright (c) 2025 Amazon.com, Inc. or its affiliates. All Rights reserved.
56
*
67
* $COPYRIGHT$
78
*
@@ -25,6 +26,7 @@ typedef enum {
2526
OMPI_MTL_OFI_PROBE
2627
} ompi_mtl_ofi_request_type_t;
2728

29+
struct ompi_mtl_ofi_reg_t;
2830
struct ompi_mtl_ofi_request_t;
2931

3032
struct ompi_mtl_ofi_request_t {
@@ -89,8 +91,9 @@ struct ompi_mtl_ofi_request_t {
8991
struct mca_mtl_request_t *mrecv_req;
9092

9193
/** Stores reference to memory region from registration */
92-
/* Set to NULL if memory not registered or if non accelerator buffer */
93-
struct fid_mr *mr;
94+
95+
/* Set to NULL if memory not registered */
96+
struct ompi_mtl_ofi_reg_t *mr;
9497
};
9598
typedef struct ompi_mtl_ofi_request_t ompi_mtl_ofi_request_t;
9699

ompi/mca/mtl/ofi/mtl_ofi_types.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
55
* Copyright (c) 2022-2023 Triad National Security, LLC. All rights
66
* reserved.
7+
* Copyright (c) 2025 Amazon.com, Inc. or its affiliates. All Rights reserved.
78
* $COPYRIGHT$
89
*
910
* Additional copyrights may follow
@@ -16,6 +17,9 @@
1617

1718
#include "mtl_ofi.h"
1819

20+
#include "opal/mca/rcache/base/base.h"
21+
22+
1923
BEGIN_C_DECLS
2024

2125
/**
@@ -102,6 +106,8 @@ typedef struct mca_mtl_ofi_module_t {
102106
bool has_posted_initial_buffer;
103107
bool hmem_needs_reg;
104108

109+
/** registration cache */
110+
mca_rcache_base_module_t *rcache;
105111
} mca_mtl_ofi_module_t;
106112

107113
extern mca_mtl_ofi_module_t ompi_mtl_ofi;
@@ -116,6 +122,14 @@ typedef enum {
116122
OFI_SCALABLE_EP,
117123
} mca_mtl_ofi_ep_type;
118124

125+
struct ompi_mtl_ofi_reg_t {
126+
mca_rcache_base_registration_t base;
127+
struct fid_mr *ofi_mr;
128+
void *mem_desc;
129+
};
130+
typedef struct ompi_mtl_ofi_reg_t ompi_mtl_ofi_reg_t;
131+
132+
119133
/*
120134
* Define upper limit for number of events read from a CQ.
121135
* Setting this to 100 as this was deemed optimal from empirical data.

0 commit comments

Comments
 (0)