Skip to content

Commit 8017f12

Browse files
committed
Using package_rank to select between NICs of equal distance from the process.
If PMIX_PACKAGE_RANK is available, uses this value to select between multiple NICs of equal distance from the current process. If this value is not available, try to calculate it by getting the locality string from each local process and assign a package_rank. If everything fails, fall back to using process_id.rank to select the NIC. This last case is not ideal, but has a small chance of occurring, and causes an output to be displayed to notify that this is occurring. Signed-off-by: Nikola Dancejic <dancejic@amazon.com>
1 parent 3ba35c9 commit 8017f12

File tree

7 files changed

+121
-15
lines changed

7 files changed

+121
-15
lines changed

config/opal_check_ofi.m4

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,11 @@ AC_DEFUN([_OPAL_CHECK_OFI],[
133133
[$opal_check_fi_info_pci],
134134
[check if pci data is available in ofi])
135135

136+
AC_CHECK_DECLS([PMIX_PACKAGE_RANK],
137+
[],
138+
[],
139+
[#include <pmix.h>])
140+
136141
CPPFLAGS=$opal_check_ofi_save_CPPFLAGS
137142
LDFLAGS=$opal_check_ofi_save_LDFLAGS
138143
LIBS=$opal_check_ofi_save_LIBS

ompi/mca/mtl/ofi/mtl_ofi_component.c

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
* $HEADER$
1616
*/
1717

18+
#include "opal_config.h"
1819
#include "mtl_ofi.h"
1920
#include "opal/util/argv.h"
2021
#include "opal/util/printf.h"
@@ -337,7 +338,7 @@ select_ofi_provider(struct fi_info *providers,
337338
__FILE__, __LINE__,
338339
(prov ? prov->fabric_attr->prov_name : "none"));
339340

340-
/* The initial fi_getinfo() call will return a list of providers
341+
/** The initial provider selection will return a list of providers
341342
* available for this process. once a provider is selected from the
342343
* list, we will cycle through the remaining list to identify NICs
343344
* serviced by this provider, and try to pick one on the same NUMA
@@ -350,9 +351,13 @@ select_ofi_provider(struct fi_info *providers,
350351
* attributes for the same NIC. The initial provider attributes
351352
* are used to ensure that all NICs we return provide the same
352353
* capabilities as the initial one.
354+
*
355+
* We use package rank to select between NICs of equal distance
356+
* if we cannot calculate a package_rank, we fall back to using the
357+
* process id.
353358
*/
354359
if (NULL != prov) {
355-
prov = opal_mca_common_ofi_select_provider(prov, ompi_process_info.my_local_rank);
360+
prov = opal_mca_common_ofi_select_provider(prov, ompi_process_info);
356361
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
357362
"%s:%d: mtl:ofi:provider: %s\n",
358363
__FILE__, __LINE__,
@@ -1170,6 +1175,3 @@ ompi_mtl_ofi_finalize(struct mca_mtl_base_module_t *mtl)
11701175

11711176
return OMPI_ERROR;
11721177
}
1173-
1174-
1175-

opal/mca/btl/ofi/btl_ofi_component.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -391,7 +391,7 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init (int *num_btl_modules,
391391
* are used to ensure that all NICs we return provide the same
392392
* capabilities as the initial one.
393393
*/
394-
selected_info = opal_mca_common_ofi_select_provider(info, opal_process_info.my_local_rank);
394+
selected_info = opal_mca_common_ofi_select_provider(info, opal_process_info);
395395
rc = mca_btl_ofi_init_device(selected_info);
396396
if (OPAL_SUCCESS == rc) {
397397
info = selected_info;

opal/mca/common/ofi/Makefile.am

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@
3131

3232
AM_CPPFLAGS = $(opal_ofi_CPPFLAGS)
3333

34+
dist_opaldata_DATA = help-common-ofi.txt
35+
3436
# Header files
3537

3638
headers = \

opal/mca/common/ofi/common_ofi.c

Lines changed: 89 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,16 @@
1616
#include <errno.h>
1717
#include <unistd.h>
1818

19+
#include "opal_config.h"
1920
#include "common_ofi.h"
2021
#include "opal_config.h"
2122
#include "opal/constants.h"
2223
#include "opal/util/argv.h"
2324
#include "opal/mca/base/mca_base_var.h"
2425
#include "opal/mca/base/mca_base_framework.h"
2526
#include "opal/mca/hwloc/base/base.h"
27+
#include "opal/mca/pmix/base/base.h"
28+
#include "opal/util/show_help.h"
2629

2730
OPAL_DECLSPEC opal_common_ofi_module_t opal_common_ofi = {
2831
.prov_include = NULL,
@@ -281,6 +284,79 @@ count_providers(struct fi_info* provider_list)
281284
return num_provider;
282285
}
283286

287+
/* Calculate the currrent process package rank.
288+
* @param (IN) process_info struct opal_process_info_t information
289+
* about the current process. used to get
290+
* num_local_peers, myprocid.rank, and
291+
* my_local_rank.
292+
*
293+
* @param (OUT) uint32_t package rank or myprocid.rank
294+
*
295+
* If successful, returns PMIX_PACKAGE_RANK, or an
296+
* equivalent calculated package rank.
297+
* otherwise falls back to using opal_process_info.myprocid.rank
298+
* this can affect performance, but is unlikely to happen.
299+
*/
300+
static uint32_t get_package_rank(opal_process_info_t process_info)
301+
{
302+
int i;
303+
uint16_t relative_locality, *package_rank_ptr;
304+
uint16_t current_package_rank = 0;
305+
uint16_t package_ranks[process_info.num_local_peers];
306+
opal_process_name_t pname;
307+
opal_status_t rc;
308+
char **peers = NULL;
309+
char *local_peers = NULL;
310+
char *locality_string = NULL;
311+
312+
pname.jobid = OPAL_PROC_MY_NAME.jobid;
313+
pname.vpid = OPAL_VPID_WILDCARD;
314+
315+
#if HAVE_DECL_PMIX_PACKAGE_RANK
316+
// Try to get the PACKAGE_RANK from PMIx
317+
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_PACKAGE_RANK,
318+
&pname, &package_rank_ptr, PMIX_UINT16);
319+
if (PMIX_SUCCESS == rc) {
320+
return (uint32_t)*package_rank_ptr;
321+
}
322+
#endif
323+
324+
// Get the local peers
325+
OPAL_MODEX_RECV_VALUE(rc, PMIX_LOCAL_PEERS,
326+
&pname, &local_peers, PMIX_STRING);
327+
if (PMIX_SUCCESS != rc || NULL == local_peers) {
328+
// We can't find package_rank, fall back to procid
329+
opal_show_help("help-common-ofi.txt", "package_rank failed", true);
330+
return (uint32_t)process_info.myprocid.rank;
331+
}
332+
peers = opal_argv_split(local_peers, ',');
333+
free(local_peers);
334+
335+
for (i = 0; NULL != peers[i]; i++) {
336+
pname.vpid = strtoul(peers[i], NULL, 10);
337+
locality_string = NULL;
338+
// Get the LOCALITY_STRING for process[i]
339+
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY_STRING,
340+
&pname, &locality_string, PMIX_STRING);
341+
if (PMIX_SUCCESS != rc || NULL == locality_string) {
342+
// If we don't have information about locality, fall back to procid
343+
opal_show_help("help-common-ofi.txt", "package_rank failed", true);
344+
return (uint32_t)process_info.myprocid.rank;
345+
}
346+
347+
// compute relative locality
348+
relative_locality = opal_hwloc_compute_relative_locality(process_info.cpuset, locality_string);
349+
free(locality_string);
350+
351+
if (relative_locality & OPAL_PROC_ON_SOCKET) {
352+
package_ranks[i] = current_package_rank;
353+
current_package_rank++;
354+
}
355+
}
356+
357+
return (uint32_t)package_ranks[process_info.my_local_rank];
358+
}
359+
284360
/* Selects a NIC based on hardware locality between process cpuset and device BDF.
285361
*
286362
* Initializes opal_hwloc_topology to access hardware topology if not previously
@@ -318,11 +394,13 @@ count_providers(struct fi_info* provider_list)
318394
* selection. This provider is returned if the
319395
* NIC selection fails.
320396
*
321-
* @param local_index (IN) int The local rank of the process. Used to
397+
* @param package_rank (IN) uint32_t The rank of the process. Used to
322398
* select one valid NIC if there is a case
323399
* where more than one can be selected. This
324400
* could occur when more than one provider
325401
* shares the same cpuset as the process.
402+
* This could either be a package_rank if one is
403+
* successfully calculated, or the process id.
326404
*
327405
* @param provider (OUT) struct fi_info* object with the selected
328406
* provider if the selection succeeds
@@ -335,14 +413,15 @@ count_providers(struct fi_info* provider_list)
335413
* balance across available NICs.
336414
*/
337415
struct fi_info*
338-
opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int local_index)
416+
opal_mca_common_ofi_select_provider(struct fi_info *provider_list, opal_process_info_t process_info)
339417
{
340418
struct fi_info *provider = provider_list, *current_provider = provider_list;
341419
struct fi_info **provider_table;
342420
#if OPAL_OFI_PCI_DATA_AVAILABLE
343421
struct fi_pci_attr pci;
344422
#endif
345423
int ret;
424+
uint32_t package_rank;
346425
unsigned int num_provider = 0, provider_limit = 0;
347426
bool provider_found = false, cpusets_match = false;
348427

@@ -399,8 +478,12 @@ opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int local_ind
399478
}
400479

401480
/* Select provider from local rank % number of providers */
402-
if (num_provider > 0) {
403-
provider = provider_table[local_index % num_provider];
481+
if (num_provider >= 2) {
482+
// If there are multiple NICs "close" to the process, try to calculate package_rank
483+
package_rank = get_package_rank(process_info);
484+
provider = provider_table[package_rank % num_provider];
485+
} else if (num_provider == 1) {
486+
provider = provider_table[num_provider - 1];
404487
}
405488

406489
#if OPAL_OFI_PCI_DATA_AVAILABLE
@@ -412,8 +495,8 @@ opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int local_ind
412495

413496
#if OPAL_ENABLE_DEBUG
414497
opal_output_verbose(1, opal_common_ofi.output,
415-
"local rank: %d device: %s cpusets match: %s\n",
416-
local_index, provider->domain_attr->name,
498+
"package rank: %d device: %s cpusets match: %s\n",
499+
package_rank, provider->domain_attr->name,
417500
cpusets_match ? "true" : "false");
418501
#endif
419502

opal/mca/common/ofi/common_ofi.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include "opal_config.h"
2020
#include "opal/mca/base/mca_base_var.h"
2121
#include "opal/mca/base/mca_base_framework.h"
22+
#include "opal/util/proc.h"
2223
#include <rdma/fabric.h>
2324

2425
BEGIN_C_DECLS
@@ -36,8 +37,7 @@ extern opal_common_ofi_module_t opal_common_ofi;
3637
OPAL_DECLSPEC int opal_common_ofi_register_mca_variables(const mca_base_component_t *component);
3738
OPAL_DECLSPEC void opal_common_ofi_mca_register(void);
3839
OPAL_DECLSPEC void opal_common_ofi_mca_deregister(void);
39-
OPAL_DECLSPEC struct fi_info* opal_common_ofi_select_ofi_provider(struct fi_info *providers,
40-
char *framework_name);
40+
4141
/*
4242
* @param list (IN) List of strings corresponding to lower providers.
4343
* @param item (IN) Single string corresponding to a provider.
@@ -56,6 +56,6 @@ OPAL_DECLSPEC int opal_common_ofi_is_in_list(char **list, char *item);
5656

5757
END_C_DECLS
5858

59-
struct fi_info* opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int rank);
59+
struct fi_info* opal_mca_common_ofi_select_provider(struct fi_info *provider_list, opal_process_info_t process_info);
6060

6161
#endif /* OPAL_MCA_COMMON_OFI_H */
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# -*- text -*-
2+
#
3+
# Copyright (c) 2020 Amazon.com, Inc. or its affiliates. All Rights reserved.
4+
# $COPYRIGHT$
5+
#
6+
# Additional copyrights may follow
7+
#
8+
# $HEADER$
9+
#
10+
[package_rank failed]
11+
Open MPI's OFI driver detected multiple equidistant NICs from the current process,
12+
but had insufficient information to ensure MPI processes fairly pick a NIC for use.
13+
This may negatively impact performance. A more modern PMIx server is necessary to
14+
resolve this issue.

0 commit comments

Comments
 (0)