Skip to content

Commit 1a00a89

Browse files
committed
OFI: call fi_getname twice
Call fi_getname first to get the length of the buffer needed for the endpoint name, then a second time with a properly sized buffer to receive the endpoint name. The FI_NAME_MAX enum was never supposed to have been exposed to libfabric consumers. See ofiwg/libfabric#7898. Related to #10617. Signed-off-by: Howard Pritchard <howardp@lanl.gov>
1 parent d5a8c20 commit 1a00a89

File tree

4 files changed

+113
-20
lines changed

4 files changed

+113
-20
lines changed

ompi/mca/mtl/ofi/mtl_ofi_component.c

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -604,7 +604,7 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
604604
struct fi_info *providers = NULL;
605605
struct fi_info *prov = NULL;
606606
struct fi_info *prov_cq_data = NULL;
607-
char ep_name[FI_NAME_MAX] = {0};
607+
void *ep_name;
608608
size_t namelen;
609609
int universe_size;
610610
char *univ_size_str;
@@ -1070,15 +1070,11 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
10701070
fi_freeinfo(providers);
10711071
providers = NULL;
10721072

1073-
/**
1074-
* Get our address and publish it with modex.
1075-
*/
1076-
namelen = sizeof(ep_name);
1077-
ret = fi_getname((fid_t)ompi_mtl_ofi.sep,
1078-
&ep_name[0],
1079-
&namelen);
1080-
if (ret) {
1081-
MTL_OFI_LOG_FI_ERR(ret, "fi_getname failed");
1073+
ret = opal_common_ofi_fi_getname((fid_t)ompi_mtl_ofi.sep,
1074+
&ep_name,
1075+
&namelen);
1076+
if (OMPI_SUCCESS != ret) {
1077+
MTL_OFI_LOG_FI_ERR(ret, "opal_common_ofi_fi_getname failed");
10821078
goto error;
10831079
}
10841080

@@ -1094,6 +1090,7 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
10941090
}
10951091

10961092
ompi_mtl_ofi.epnamelen = namelen;
1093+
free(ep_name);
10971094

10981095
/**
10991096
* Set the ANY_SRC address.
@@ -1153,6 +1150,9 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
11531150
if (ompi_mtl_ofi.ofi_ctxt) {
11541151
free(ompi_mtl_ofi.ofi_ctxt);
11551152
}
1153+
if (ep_name) {
1154+
free(ep_name);
1155+
}
11561156

11571157
return NULL;
11581158
}

opal/mca/btl/ofi/btl_ofi_component.c

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -419,7 +419,7 @@ static int mca_btl_ofi_init_device(struct fi_info *info)
419419
size_t num_contexts_to_create;
420420

421421
char *linux_device_name;
422-
char ep_name[FI_NAME_MAX];
422+
void *ep_name;
423423

424424
struct fi_info *ofi_info;
425425
struct fi_ep_attr *ep_attr;
@@ -596,15 +596,14 @@ static int mca_btl_ofi_init_device(struct fi_info *info)
596596
goto fail;
597597
}
598598

599-
/* create and send the modex for this device */
600-
namelen = sizeof(ep_name);
601-
rc = fi_getname((fid_t) ep, &ep_name[0], &namelen);
602-
if (0 != rc) {
603-
BTL_VERBOSE(("%s failed fi_getname with err=%s", linux_device_name, fi_strerror(-rc)));
599+
rc = opal_common_ofi_fi_getname((fid_t)ep,
600+
&ep_name,
601+
&namelen);
602+
if (OPAL_SUCCESS != rc) {
603+
BTL_VERBOSE(("%s failed opal_common_ofi_fi_getname with err=%d", linux_device_name, rc));
604604
goto fail;
605605
}
606606

607-
/* If we have two-sided support. */
608607
if (TWO_SIDED_ENABLED) {
609608

610609
/* post wildcard recvs */
@@ -618,8 +617,9 @@ static int mca_btl_ofi_init_device(struct fi_info *info)
618617
}
619618

620619
/* post our endpoint name so peer can use it to connect to us */
621-
OPAL_MODEX_SEND(rc, PMIX_GLOBAL, &mca_btl_ofi_component.super.btl_version, &ep_name, namelen);
620+
OPAL_MODEX_SEND(rc, PMIX_GLOBAL, &mca_btl_ofi_component.super.btl_version, ep_name, namelen);
622621
mca_btl_ofi_component.namelen = namelen;
622+
free(ep_name);
623623

624624
/* add this module to the list */
625625
mca_btl_ofi_component.modules[(*module_count)++] = module;
@@ -663,6 +663,10 @@ static int mca_btl_ofi_init_device(struct fi_info *info)
663663
}
664664
free(module);
665665

666+
if (NULL != ep_name) {
667+
free(ep_name);
668+
}
669+
666670
/* not really a failure. just skip this device. */
667671
return OPAL_ERR_OUT_OF_RESOURCE;
668672
}

opal/mca/common/ofi/common_ofi.c

Lines changed: 70 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
* Copyright (c) 2015-2020 Intel, Inc. All rights reserved.
33
* Copyright (c) 2017 Los Alamos National Security, LLC. All rights
44
* reserved.
5-
* Copyright (c) 2020-2021 Triad National Security, LLC. All rights
5+
* Copyright (c) 2020-2022 Triad National Security, LLC. All rights
66
* reserved.
77
* Copyright (c) 2020-2021 Cisco Systems, Inc. All rights reserved
88
* Copyright (c) 2021 Nanook Consulting. All rights reserved.
@@ -22,6 +22,7 @@
2222
#include <unistd.h>
2323
#include <rdma/fabric.h>
2424
#include <rdma/fi_errno.h>
25+
#include <rdma/fi_cm.h>
2526
#ifdef HAVE_RDMA_FI_EXT_H
2627
#include <rdma/fi_ext.h>
2728
#endif
@@ -685,3 +686,71 @@ struct fi_info *opal_mca_common_ofi_select_provider(struct fi_info *provider_lis
685686
free(provider_table);
686687
return provider;
687688
}
689+
690+
/**
691+
* Obtain EP endpoint name
692+
*
693+
* Obtain the EP endpoint name and length for the supplied endpoint fid.
694+
*
695+
* @param fid (IN) fid of (S)EP endpoint
696+
* @param addr (OUT) buffer containing endpoint name
697+
* @param addrlen (OUT) length of allocated buffer in bytes
698+
*
699+
* @return OPAL_SUCCESS or OPAL error code
700+
*
701+
* The caller is responsible for freeing the buffer allocated to
702+
* contain the endpoint name.
703+
*
704+
*/
705+
OPAL_DECLSPEC int opal_common_ofi_fi_getname(fid_t fid, void **addr, size_t *addrlen)
706+
{
707+
int ret=OPAL_SUCCESS;
708+
size_t namelen = 0;
709+
char *ep_name = NULL;
710+
711+
/**
712+
* Get our address and publish it with modex.
713+
* Use the two step process of first getting the required
714+
* buffer size, then allocating the memory and calling
715+
* fi_getname again.
716+
*/
717+
namelen = 0;
718+
ret = fi_getname(fid,
719+
NULL,
720+
&namelen);
721+
if ((FI_SUCCESS != ret) && (-FI_ETOOSMALL != ret)) {
722+
opal_output_verbose(1, opal_common_ofi.output, "%s:%d:fi_endpoint (namelen) returned %s\n",
723+
__FILE__, __LINE__, fi_strerror(-ret));
724+
ret = OPAL_ERROR;
725+
goto error;
726+
}
727+
728+
ep_name = (char *)malloc(namelen);
729+
if (NULL == ep_name) {
730+
ret = OPAL_ERR_OUT_OF_RESOURCE;
731+
goto error;
732+
}
733+
734+
ret = fi_getname(fid,
735+
ep_name,
736+
&namelen);
737+
if (ret) {
738+
opal_output_verbose(1, opal_common_ofi.output, "%s:%d:fi_endpoint (ep_name) returned %s\n",
739+
__FILE__, __LINE__, fi_strerror(-ret));
740+
ret = OPAL_ERROR;
741+
goto error;
742+
}
743+
744+
*addr = ep_name;
745+
*addrlen = namelen;
746+
747+
return ret;
748+
749+
error:
750+
if (NULL != ep_name) {
751+
free(ep_name);
752+
}
753+
return ret;
754+
}
755+
756+

opal/mca/common/ofi/common_ofi.h

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
* Copyright (c) 2015 Intel, Inc. All rights reserved.
44
* Copyright (c) 2017 Los Alamos National Security, LLC. All rights
55
* reserved.
6-
* Copyright (c) 2020 Triad National Security, LLC. All rights
6+
* Copyright (c) 2020-2022 Triad National Security, LLC. All rights
77
* reserved.
88
* Copyright (c) 2021 Amazon.com, Inc. or its affiliates. All rights
99
* reserved.
@@ -21,6 +21,9 @@
2121
#include "opal/util/proc.h"
2222
#include "opal/memoryhooks/memory.h"
2323

24+
#include <rdma/fabric.h>
25+
#include <rdma/fi_cm.h>
26+
2427
BEGIN_C_DECLS
2528

2629
typedef struct opal_common_ofi_module {
@@ -151,6 +154,23 @@ OPAL_DECLSPEC int opal_common_ofi_is_in_list(char **list, char *item);
151154
OPAL_DECLSPEC struct fi_info *opal_mca_common_ofi_select_provider(struct fi_info *provider_list,
152155
opal_process_info_t *process_info);
153156

157+
/**
158+
* Obtain EP endpoint name
159+
*
160+
* Obtain the EP endpoint name and length for the supplied endpoint fid.
161+
*
162+
* @param fid (IN) fid of (S)EP endpoint
163+
* @param addr (OUT) buffer containing endpoint name
164+
* @param addrlen (OUT) length of allocated buffer in bytes
165+
*
166+
* @return OPAL_SUCCESS or OPAL error code
167+
*
168+
* The caller is responsible for freeing the buffer allocated to
169+
* contain the endpoint name.
170+
*
171+
*/
172+
OPAL_DECLSPEC int opal_common_ofi_fi_getname(fid_t fid, void **addr, size_t *addrlen);
173+
154174
END_C_DECLS
155175

156176
#endif /* OPAL_MCA_COMMON_OFI_H */

0 commit comments

Comments
 (0)