Skip to content

Commit e9e5dab

Browse files
authored
Merge pull request #8153 from dancejic/multi
Using package_rank to select between NIC of equal distance from the process.
2 parents 5b25a06 + 8017f12 commit e9e5dab

File tree

7 files changed

+121
-15
lines changed

7 files changed

+121
-15
lines changed

config/opal_check_ofi.m4

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,11 @@ AC_DEFUN([_OPAL_CHECK_OFI],[
133133
[$opal_check_fi_info_pci],
134134
[check if pci data is available in ofi])
135135

136+
AC_CHECK_DECLS([PMIX_PACKAGE_RANK],
137+
[],
138+
[],
139+
[#include <pmix.h>])
140+
136141
CPPFLAGS=$opal_check_ofi_save_CPPFLAGS
137142
LDFLAGS=$opal_check_ofi_save_LDFLAGS
138143
LIBS=$opal_check_ofi_save_LIBS

ompi/mca/mtl/ofi/mtl_ofi_component.c

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
* $HEADER$
1616
*/
1717

18+
#include "opal_config.h"
1819
#include "mtl_ofi.h"
1920
#include "opal/util/argv.h"
2021
#include "opal/util/printf.h"
@@ -337,7 +338,7 @@ select_ofi_provider(struct fi_info *providers,
337338
__FILE__, __LINE__,
338339
(prov ? prov->fabric_attr->prov_name : "none"));
339340

340-
/* The initial fi_getinfo() call will return a list of providers
341+
/** The initial provider selection will return a list of providers
341342
* available for this process. once a provider is selected from the
342343
* list, we will cycle through the remaining list to identify NICs
343344
* serviced by this provider, and try to pick one on the same NUMA
@@ -350,9 +351,13 @@ select_ofi_provider(struct fi_info *providers,
350351
* attributes for the same NIC. The initial provider attributes
351352
* are used to ensure that all NICs we return provide the same
352353
* capabilities as the initial one.
354+
*
355+
* We use package rank to select between NICs of equal distance.
356+
* If we cannot calculate a package_rank, we fall back to using the
357+
* process id.
353358
*/
354359
if (NULL != prov) {
355-
prov = opal_mca_common_ofi_select_provider(prov, ompi_process_info.my_local_rank);
360+
prov = opal_mca_common_ofi_select_provider(prov, ompi_process_info);
356361
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
357362
"%s:%d: mtl:ofi:provider: %s\n",
358363
__FILE__, __LINE__,
@@ -1170,6 +1175,3 @@ ompi_mtl_ofi_finalize(struct mca_mtl_base_module_t *mtl)
11701175

11711176
return OMPI_ERROR;
11721177
}
1173-
1174-
1175-

opal/mca/btl/ofi/btl_ofi_component.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -391,7 +391,7 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init (int *num_btl_modules,
391391
* are used to ensure that all NICs we return provide the same
392392
* capabilities as the initial one.
393393
*/
394-
selected_info = opal_mca_common_ofi_select_provider(info, opal_process_info.my_local_rank);
394+
selected_info = opal_mca_common_ofi_select_provider(info, opal_process_info);
395395
rc = mca_btl_ofi_init_device(selected_info);
396396
if (OPAL_SUCCESS == rc) {
397397
info = selected_info;

opal/mca/common/ofi/Makefile.am

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@
3131

3232
AM_CPPFLAGS = $(opal_ofi_CPPFLAGS)
3333

34+
dist_opaldata_DATA = help-common-ofi.txt
35+
3436
# Header files
3537

3638
headers = \

opal/mca/common/ofi/common_ofi.c

Lines changed: 89 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,16 @@
1616
#include <errno.h>
1717
#include <unistd.h>
1818

19+
#include "opal_config.h"
1920
#include "common_ofi.h"
2021
#include "opal_config.h"
2122
#include "opal/constants.h"
2223
#include "opal/util/argv.h"
2324
#include "opal/mca/base/mca_base_var.h"
2425
#include "opal/mca/base/mca_base_framework.h"
2526
#include "opal/mca/hwloc/base/base.h"
27+
#include "opal/mca/pmix/base/base.h"
28+
#include "opal/util/show_help.h"
2629

2730
OPAL_DECLSPEC opal_common_ofi_module_t opal_common_ofi = {
2831
.prov_include = NULL,
@@ -281,6 +284,79 @@ count_providers(struct fi_info* provider_list)
281284
return num_provider;
282285
}
283286

287+
/* Calculate the currrent process package rank.
288+
* @param (IN) process_info struct opal_process_info_t information
289+
* about the current process. used to get
290+
* num_local_peers, myprocid.rank, and
291+
* my_local_rank.
292+
*
293+
* @param (OUT) uint32_t package rank or myprocid.rank
294+
*
295+
* If successful, returns PMIX_PACKAGE_RANK, or an
296+
* equivalent calculated package rank.
297+
* otherwise falls back to using opal_process_info.myprocid.rank
298+
* this can affect performance, but is unlikely to happen.
299+
*/
300+
static uint32_t get_package_rank(opal_process_info_t process_info)
301+
{
302+
int i;
303+
uint16_t relative_locality, *package_rank_ptr;
304+
uint16_t current_package_rank = 0;
305+
uint16_t package_ranks[process_info.num_local_peers];
306+
opal_process_name_t pname;
307+
opal_status_t rc;
308+
char **peers = NULL;
309+
char *local_peers = NULL;
310+
char *locality_string = NULL;
311+
312+
pname.jobid = OPAL_PROC_MY_NAME.jobid;
313+
pname.vpid = OPAL_VPID_WILDCARD;
314+
315+
#if HAVE_DECL_PMIX_PACKAGE_RANK
316+
// Try to get the PACKAGE_RANK from PMIx
317+
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_PACKAGE_RANK,
318+
&pname, &package_rank_ptr, PMIX_UINT16);
319+
if (PMIX_SUCCESS == rc) {
320+
return (uint32_t)*package_rank_ptr;
321+
}
322+
#endif
323+
324+
// Get the local peers
325+
OPAL_MODEX_RECV_VALUE(rc, PMIX_LOCAL_PEERS,
326+
&pname, &local_peers, PMIX_STRING);
327+
if (PMIX_SUCCESS != rc || NULL == local_peers) {
328+
// We can't find package_rank, fall back to procid
329+
opal_show_help("help-common-ofi.txt", "package_rank failed", true);
330+
return (uint32_t)process_info.myprocid.rank;
331+
}
332+
peers = opal_argv_split(local_peers, ',');
333+
free(local_peers);
334+
335+
for (i = 0; NULL != peers[i]; i++) {
336+
pname.vpid = strtoul(peers[i], NULL, 10);
337+
locality_string = NULL;
338+
// Get the LOCALITY_STRING for process[i]
339+
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY_STRING,
340+
&pname, &locality_string, PMIX_STRING);
341+
if (PMIX_SUCCESS != rc || NULL == locality_string) {
342+
// If we don't have information about locality, fall back to procid
343+
opal_show_help("help-common-ofi.txt", "package_rank failed", true);
344+
return (uint32_t)process_info.myprocid.rank;
345+
}
346+
347+
// compute relative locality
348+
relative_locality = opal_hwloc_compute_relative_locality(process_info.cpuset, locality_string);
349+
free(locality_string);
350+
351+
if (relative_locality & OPAL_PROC_ON_SOCKET) {
352+
package_ranks[i] = current_package_rank;
353+
current_package_rank++;
354+
}
355+
}
356+
357+
return (uint32_t)package_ranks[process_info.my_local_rank];
358+
}
359+
284360
/* Selects a NIC based on hardware locality between process cpuset and device BDF.
285361
*
286362
* Initializes opal_hwloc_topology to access hardware topology if not previously
@@ -318,11 +394,13 @@ count_providers(struct fi_info* provider_list)
318394
* selection. This provider is returned if the
319395
* NIC selection fails.
320396
*
321-
* @param local_index (IN) int The local rank of the process. Used to
397+
* @param process_info (IN) opal_process_info_t Process information. Its package rank is used to
322398
* select one valid NIC if there is a case
323399
* where more than one can be selected. This
324400
* could occur when more than one provider
325401
* shares the same cpuset as the process.
402+
* This could either be a package_rank if one is
403+
* successfully calculated, or the process id.
326404
*
327405
* @param provider (OUT) struct fi_info* object with the selected
328406
* provider if the selection succeeds
@@ -335,14 +413,15 @@ count_providers(struct fi_info* provider_list)
335413
* balance across available NICs.
336414
*/
337415
struct fi_info*
338-
opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int local_index)
416+
opal_mca_common_ofi_select_provider(struct fi_info *provider_list, opal_process_info_t process_info)
339417
{
340418
struct fi_info *provider = provider_list, *current_provider = provider_list;
341419
struct fi_info **provider_table;
342420
#if OPAL_OFI_PCI_DATA_AVAILABLE
343421
struct fi_pci_attr pci;
344422
#endif
345423
int ret;
424+
uint32_t package_rank;
346425
unsigned int num_provider = 0, provider_limit = 0;
347426
bool provider_found = false, cpusets_match = false;
348427

@@ -399,8 +478,12 @@ opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int local_ind
399478
}
400479

401480
/* Select provider from package rank % number of providers */
402-
if (num_provider > 0) {
403-
provider = provider_table[local_index % num_provider];
481+
if (num_provider >= 2) {
482+
// If there are multiple NICs "close" to the process, try to calculate package_rank
483+
package_rank = get_package_rank(process_info);
484+
provider = provider_table[package_rank % num_provider];
485+
} else if (num_provider == 1) {
486+
provider = provider_table[num_provider - 1];
404487
}
405488

406489
#if OPAL_OFI_PCI_DATA_AVAILABLE
@@ -412,8 +495,8 @@ opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int local_ind
412495

413496
#if OPAL_ENABLE_DEBUG
414497
opal_output_verbose(1, opal_common_ofi.output,
415-
"local rank: %d device: %s cpusets match: %s\n",
416-
local_index, provider->domain_attr->name,
498+
"package rank: %d device: %s cpusets match: %s\n",
499+
package_rank, provider->domain_attr->name,
417500
cpusets_match ? "true" : "false");
418501
#endif
419502

opal/mca/common/ofi/common_ofi.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include "opal_config.h"
2020
#include "opal/mca/base/mca_base_var.h"
2121
#include "opal/mca/base/mca_base_framework.h"
22+
#include "opal/util/proc.h"
2223
#include <rdma/fabric.h>
2324

2425
BEGIN_C_DECLS
@@ -36,8 +37,7 @@ extern opal_common_ofi_module_t opal_common_ofi;
3637
OPAL_DECLSPEC int opal_common_ofi_register_mca_variables(const mca_base_component_t *component);
3738
OPAL_DECLSPEC void opal_common_ofi_mca_register(void);
3839
OPAL_DECLSPEC void opal_common_ofi_mca_deregister(void);
39-
OPAL_DECLSPEC struct fi_info* opal_common_ofi_select_ofi_provider(struct fi_info *providers,
40-
char *framework_name);
40+
4141
/*
4242
* @param list (IN) List of strings corresponding to lower providers.
4343
* @param item (IN) Single string corresponding to a provider.
@@ -56,6 +56,6 @@ OPAL_DECLSPEC int opal_common_ofi_is_in_list(char **list, char *item);
5656

5757
END_C_DECLS
5858

59-
struct fi_info* opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int rank);
59+
struct fi_info* opal_mca_common_ofi_select_provider(struct fi_info *provider_list, opal_process_info_t process_info);
6060

6161
#endif /* OPAL_MCA_COMMON_OFI_H */
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# -*- text -*-
2+
#
3+
# Copyright (c) 2020 Amazon.com, Inc. or its affiliates. All Rights reserved.
4+
# $COPYRIGHT$
5+
#
6+
# Additional copyrights may follow
7+
#
8+
# $HEADER$
9+
#
10+
[package_rank failed]
11+
Open MPI's OFI driver detected multiple equidistant NICs from the current process,
12+
but had insufficient information to ensure MPI processes fairly pick a NIC for use.
13+
This may negatively impact performance. A more modern PMIx server is necessary to
14+
resolve this issue.

0 commit comments

Comments
 (0)