Skip to content

Commit 167d75b

Browse files
committed
common/ofi: Added multi-NIC support to provider selection
Adds the capability to select a NIC based on hardware locality. Creates a list of NICs that share the same cpuset as the process, then selects the NIC based on (local rank) % (number of NICs). If no NICs share the same cpuset as the process, the selection instead builds a list of all available NICs and selects from it using (local rank) % (number of NICs). Signed-off-by: Nikola Dancejic <dancejic@amazon.com>
1 parent 5929ee5 commit 167d75b

File tree

6 files changed

+305
-4
lines changed

6 files changed

+305
-4
lines changed

ompi/mca/mtl/ofi/Makefile.am

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ mca_mtl_ofi_la_LDFLAGS = \
7878
$(opal_ofi_LDFLAGS) \
7979
-module -avoid-version
8080
mca_mtl_ofi_la_LIBADD = $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la \
81+
$(OPAL_TOP_BUILDDIR)/opal/mca/common/ofi/lib@OPAL_LIB_PREFIX@mca_common_ofi.la \
8182
$(opal_ofi_LIBS)
8283

8384
noinst_LTLIBRARIES = $(component_noinst)

ompi/mca/mtl/ofi/mtl_ofi_component.c

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#include "mtl_ofi.h"
1717
#include "opal/util/argv.h"
1818
#include "opal/util/printf.h"
19+
#include "opal/mca/common/ofi/common_ofi.h"
1920

2021
static int ompi_mtl_ofi_component_open(void);
2122
static int ompi_mtl_ofi_component_query(mca_base_module_t **module, int *priority);
@@ -371,6 +372,28 @@ select_ofi_provider(struct fi_info *providers,
371372
__FILE__, __LINE__,
372373
(prov ? prov->fabric_attr->prov_name : "none"));
373374

375+
/* The initial fi_getinfo() call will return a list of providers
376+
* available for this process. once a provider is selected from the
377+
* list, we will cycle through the remaining list to identify NICs
378+
* serviced by this provider, and try to pick one on the same NUMA
379+
* node as this process. If there are no NICs on the same NUMA node,
380+
* we pick one in a manner which allows all ranks to make balanced
381+
* use of available NICs on the system.
382+
*
383+
* Most providers give a separate fi_info object for each NIC,
384+
* however some may have multiple info objects with different
385+
* attributes for the same NIC. The initial provider attributes
386+
* are used to ensure that all NICs we return provide the same
387+
* capabilities as the initial one.
388+
*/
389+
if (NULL != prov) {
390+
prov = opal_mca_common_ofi_select_provider(prov, ompi_process_info.my_local_rank);
391+
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
392+
"%s:%d: mtl:ofi:provider: %s\n",
393+
__FILE__, __LINE__,
394+
(prov ? prov->domain_attr->name : "none"));
395+
}
396+
374397
return prov;
375398
}
376399

opal/mca/btl/ofi/Makefile.am

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ mca_btl_ofi_la_SOURCES = $(component_sources)
5858
mca_btl_ofi_la_LDFLAGS = -module -avoid-version \
5959
$(opal_ofi_LDFLAGS)
6060
mca_btl_ofi_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la \
61+
$(OPAL_TOP_BUILDDIR)/opal/mca/common/ofi/lib@OPAL_LIB_PREFIX@mca_common_ofi.la \
6162
$(opal_ofi_LIBS)
6263

6364
noinst_LTLIBRARIES = $(lib)

opal/mca/btl/ofi/btl_ofi_component.c

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
#include "opal/mca/btl/btl.h"
3131
#include "opal/mca/btl/base/base.h"
3232
#include "opal/mca/hwloc/base/base.h"
33+
#include "opal/mca/common/ofi/common_ofi.h"
3334

3435
#include <string.h>
3536

@@ -240,7 +241,7 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init (int *num_btl_modules,
240241
return NULL;
241242
}
242243

243-
struct fi_info *info, *info_list;
244+
struct fi_info *info, *info_list, *selected_info;
244245
struct fi_info hints = {0};
245246
struct fi_ep_attr ep_attr = {0};
246247
struct fi_rx_attr rx_attr = {0};
@@ -331,10 +332,27 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init (int *num_btl_modules,
331332
rc = validate_info(info, required_caps);
332333
if (OPAL_SUCCESS == rc) {
333334
/* Device passed sanity check, let's make a module.
334-
* We only pick the first device we found valid */
335-
rc = mca_btl_ofi_init_device(info);
336-
if (OPAL_SUCCESS == rc)
335+
*
336+
* The initial fi_getinfo() call will return a list of providers
337+
* available for this process. once a provider is selected from the
338+
* list, we will cycle through the remaining list to identify NICs
339+
* serviced by this provider, and try to pick one on the same NUMA
340+
* node as this process. If there are no NICs on the same NUMA node,
341+
* we pick one in a manner which allows all ranks to make balanced
342+
* use of available NICs on the system.
343+
*
344+
* Most providers give a separate fi_info object for each NIC,
345+
* however some may have multiple info objects with different
346+
* attributes for the same NIC. The initial provider attributes
347+
* are used to ensure that all NICs we return provide the same
348+
* capabilities as the initial one.
349+
*/
350+
selected_info = opal_mca_common_ofi_select_provider(info, opal_process_info.my_local_rank);
351+
rc = mca_btl_ofi_init_device(selected_info);
352+
if (OPAL_SUCCESS == rc) {
353+
info = selected_info;
337354
break;
355+
}
338356
}
339357
info = info->next;
340358
}

opal/mca/common/ofi/common_ofi.c

Lines changed: 256 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
#include "opal_config.h"
1313
#include "opal/constants.h"
14+
#include "opal/mca/hwloc/base/base.h"
1415

1516
#include <errno.h>
1617
#include <unistd.h>
@@ -25,3 +26,258 @@ int mca_common_ofi_register_mca_variables(void)
2526
return OPAL_ERROR;
2627
}
2728
}
29+
30+
/* check that the tx attributes match */
31+
static int
32+
check_tx_attr(struct fi_tx_attr *provider_info,
33+
struct fi_tx_attr *provider)
34+
{
35+
if (!(provider->msg_order & ~(provider_info->msg_order)) &&
36+
!(provider->op_flags & ~(provider_info->op_flags)) &&
37+
(provider->inject_size == provider_info->inject_size)) {
38+
return 0;
39+
} else {
40+
return OPAL_ERROR;
41+
}
42+
}
43+
44+
/* check that the rx attributes match */
45+
static int
46+
check_rx_attr(struct fi_rx_attr *provider_info,
47+
struct fi_rx_attr *provider)
48+
{
49+
if (!(provider->msg_order & ~(provider_info->msg_order)) &&
50+
!(provider->op_flags & ~(provider_info->op_flags))) {
51+
return 0;
52+
} else {
53+
return OPAL_ERROR;
54+
}
55+
}
56+
57+
/* check that the ep attributes match */
58+
static int
59+
check_ep_attr(struct fi_ep_attr *provider_info,
60+
struct fi_ep_attr *provider)
61+
{
62+
if (!(provider->type & ~(provider_info->type)) &&
63+
!(provider->mem_tag_format & ~(provider_info->mem_tag_format)) &&
64+
(provider->max_msg_size == provider_info->max_msg_size) &&
65+
(provider->tx_ctx_cnt == provider_info->tx_ctx_cnt) &&
66+
(provider->rx_ctx_cnt == provider_info->rx_ctx_cnt)) {
67+
return 0;
68+
} else {
69+
return OPAL_ERROR;
70+
}
71+
}
72+
73+
/* check that the provider attributes match */
74+
static int
75+
check_provider_attr(struct fi_info *provider_info,
76+
struct fi_info *provider)
77+
{
78+
/* make sure both info are the same provider and provide the same attributes */
79+
if (0 == strcmp(provider_info->fabric_attr->prov_name, provider->fabric_attr->prov_name) &&
80+
!check_tx_attr(provider_info->tx_attr, provider->tx_attr) &&
81+
!check_rx_attr(provider_info->rx_attr, provider->rx_attr) &&
82+
!check_ep_attr(provider_info->ep_attr, provider->ep_attr) &&
83+
!(provider_info->caps & ~(provider->caps)) &&
84+
!(provider_info->mode & ~(provider->mode))) {
85+
return 0;
86+
} else {
87+
return OPAL_ERROR;
88+
}
89+
}
90+
91+
/* Check if a process and a pci device share the same cpuset
92+
* @param (IN) pci struct fi_pci_attr pci device attributes,
93+
* used to find hwloc object for device.
94+
*
95+
* @param (IN) topology hwloc_topology_t topology to get the cpusets
96+
* from
97+
*
98+
* @param (OUT) returns true if cpusets match and false if
99+
* cpusets do not match or an error prevents comparison
100+
*
101+
* Uses a pci device to find an ancestor that contains a cpuset, and
102+
* determines if it intersects with the cpuset that the process is bound to.
103+
* if the process is not bound, or if a cpuset is unavailable for whatever
104+
* reason, returns false. Otherwise, returns the result of
105+
* hwloc_cpuset_intersects()
106+
*/
107+
static bool
108+
compare_cpusets(hwloc_topology_t topology, struct fi_pci_attr pci)
109+
{
110+
bool result = false;
111+
int ret;
112+
hwloc_bitmap_t proc_cpuset;
113+
hwloc_obj_t obj = NULL;
114+
115+
/* Cannot find topology info if no topology is found */
116+
if (NULL == topology) {
117+
return false;
118+
}
119+
120+
/* Allocate memory for proc_cpuset */
121+
proc_cpuset = hwloc_bitmap_alloc();
122+
if (NULL == proc_cpuset) {
123+
return false;
124+
}
125+
126+
/* Fill cpuset with the collection of cpu cores that the process runs on */
127+
ret = hwloc_get_cpubind(topology, proc_cpuset, HWLOC_CPUBIND_PROCESS);
128+
if (0 > ret) {
129+
goto error;
130+
}
131+
132+
/* Get the pci device from bdf */
133+
obj = hwloc_get_pcidev_by_busid(topology, pci.domain_id, pci.bus_id,
134+
pci.device_id, pci.function_id);
135+
if (NULL == obj) {
136+
goto error;
137+
}
138+
139+
/* pcidev objects don't have cpusets so find the first non-io object above */
140+
obj = hwloc_get_non_io_ancestor_obj(topology, obj);
141+
if (NULL != obj) {
142+
result = hwloc_bitmap_intersects(proc_cpuset, obj->cpuset);
143+
}
144+
145+
error:
146+
hwloc_bitmap_free(proc_cpuset);
147+
return result;
148+
}
149+
150+
/* Count providers returns the number of providers present in an fi_info list
151+
* @param (IN) provider_list struct fi_info* list of providers available
152+
*
153+
* @param (OUT) int number of providers present in the list
154+
*
155+
* returns 0 if the list is NULL
156+
*/
157+
static int
158+
count_providers(struct fi_info* provider_list)
159+
{
160+
struct fi_info* dev = provider_list;
161+
int num_provider = 0;
162+
163+
while (NULL != dev) {
164+
num_provider++;
165+
dev = dev->next;
166+
}
167+
168+
return num_provider;
169+
}
170+
171+
/* Selects a NIC based on hardware locality to process cpuset and device BDF.
172+
*
173+
* @param provider_list (IN) struct fi_info* An initially selected
174+
* provider NIC. The provider name and
175+
* attributes are used to restrict NIC
176+
* selection. This provider is returned if the
177+
* NIC selection fails.
178+
*
179+
* @param local_index (IN) int The local rank of the process. Used to
180+
* select one valid NIC if there is a case
181+
* where more than one can be selected. This
182+
* could occur when more than one provider
183+
* shares the same cpuset as the process.
184+
*
185+
* @param provider (OUT) struct fi_info* object with the selected
186+
* provider if the selection succeeds
187+
* if the selection fails, returns the fi_info
188+
* object that was initially provided.
189+
*
190+
* If there is more than one provider that shares the same cpuset, we use
191+
* (local rank % number of valid providers that share the process cpuset)
192+
* to select one of the local providers.
193+
*
194+
* Likewise, If no providers share the same cpuset as the process, we use
195+
* (local rank % number of valid providers that share the process cpuset)
196+
* to select one of the valid providers.
197+
*
198+
* Initializes opal_hwloc_topology to access hardware topology if not previously
199+
* initialized
200+
*
201+
* If a provider does not provide a BDF, the locality can't be determined and it
202+
* is treated as though it does not share the same cpuset as the process.
203+
*
204+
* All errors should be recoverable and will return the initially provided
205+
* provider. However, if an error occurs this will no longer guarantee
206+
* that the provider returned is local to the process or that the processes will
207+
* balance across available NICs.
208+
*/
209+
struct fi_info*
210+
opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int local_index)
211+
{
212+
struct fi_info *provider = provider_list, *current_provider = provider_list;
213+
struct fi_info **provider_table;
214+
struct fi_pci_attr pci;
215+
int ret;
216+
unsigned int num_provider = 0, provider_limit = 0;
217+
bool provider_found = false, cpusets_match = false;
218+
219+
/* Initialize opal_hwloc_topology if it is not already */
220+
ret = opal_hwloc_base_get_topology();
221+
if (0 > ret) {
222+
/* Provider selection can continue but there is no guarantee of locality */
223+
opal_output(1, "%s:%d:Failed to initialize topology\n", __FILE__, __LINE__);
224+
}
225+
226+
provider_limit = count_providers(provider_list);
227+
228+
/* Allocate memory for provider table */
229+
provider_table = calloc(provider_limit, sizeof(struct fi_info*));
230+
if (NULL == provider_table) {
231+
opal_output(1, "%s:%d:Failed to allocate memory for provider table\n", __FILE__, __LINE__);
232+
return provider_list;
233+
}
234+
235+
current_provider = provider;
236+
237+
/* Cycle through remaining fi_info objects, looking for alike providers */
238+
while (NULL != current_provider) {
239+
if (!check_provider_attr(provider, current_provider)) {
240+
cpusets_match = false;
241+
if (NULL != current_provider->nic) {
242+
pci = current_provider->nic->bus_attr->attr.pci;
243+
cpusets_match = compare_cpusets(opal_hwloc_topology, pci);
244+
}
245+
246+
/* Reset the list if the cpusets match and no other provider was
247+
* found on the same cpuset as the process.
248+
*/
249+
if (cpusets_match && !provider_found) {
250+
provider_found = true;
251+
num_provider = 0;
252+
}
253+
254+
/* Add the provider to the provider list if the cpusets match or if
255+
* no other provider was found on the same cpuset as the process.
256+
*/
257+
if (cpusets_match || !provider_found) {
258+
provider_table[num_provider] = current_provider;
259+
num_provider++;
260+
}
261+
}
262+
current_provider = current_provider->next;
263+
}
264+
265+
/* Select provider from local rank % number of providers */
266+
if (num_provider > 0) {
267+
provider = provider_table[local_index % num_provider];
268+
}
269+
270+
#if OPAL_DEBUG_ENABLE
271+
if (NULL != provider->nic) {
272+
pci = provider->nic->bus_attr->attr.pci;
273+
cpusets_match = compare_cpusets(opal_hwloc_topology, pci);
274+
}
275+
276+
opal_output(10, "local rank: %d device: %s cpusets match: %s\n",
277+
local_index, provider->domain_attr->name, cpusets_match ? "true" : "false");
278+
#endif
279+
280+
err_free_table:
281+
free(provider_table);
282+
return provider;
283+
}

opal/mca/common/ofi/common_ofi.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,4 +16,6 @@
1616

1717
OPAL_DECLSPEC int mca_common_ofi_register_mca_variables(void);
1818

19+
/* Select the fi_info entry (NIC) this process should use.
 *
 * @param provider_list (IN) initially selected provider; the remainder of
 *                           its list is searched for matching NICs
 * @param rank          (IN) local rank of the process, used to spread
 *                           processes across the eligible NICs
 *
 * @return the selected fi_info entry, or provider_list itself when no
 *         selection could be made.
 *
 * NOTE(review): unlike mca_common_ofi_register_mca_variables() in this
 * header, this declaration is not marked OPAL_DECLSPEC -- confirm the symbol
 * is visible to the components that call it.
 */
struct fi_info* opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int rank);
20+
1921
#endif /* OPAL_MCA_COMMON_OFI_H */

0 commit comments

Comments
 (0)