Skip to content

Commit 223ed58

Browse files
authored
Merge pull request #12144 from wenduwan/nic_selection_doc
ofi nic selection bugfix and document update
2 parents af0c640 + 3aba0bb commit 223ed58

File tree

3 files changed

+98
-55
lines changed

3 files changed

+98
-55
lines changed

opal/mca/common/ofi/common_ofi.c

Lines changed: 50 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -623,10 +623,10 @@ static int get_provider_distance(struct fi_info *provider, hwloc_topology_t topo
623623
/**
624624
* @brief Get the nearest device to the current thread
625625
*
626-
* Use the PMIx server or calculate the device distances, then out of the set of
627-
* returned distances find the subset of the nearest devices. This can be
628-
* 0 or more.
629-
* If there are multiple equidistant devices, break the tie using the rank.
626+
* Compute the distances from the current thread to each NIC in provider_list,
627+
* and select the NIC with the shortest distance.
628+
* If there are multiple equidistant devices, break the tie using local rank
629+
* to balance NIC utilization.
630630
*
631631
* @param[in] topology hwloc topology
632632
* @param[in] provider_list List of providers to select from
@@ -724,16 +724,46 @@ static int get_nearest_nic(hwloc_topology_t topology, struct fi_info *provider_l
724724
return ret;
725725
}
726726

727-
static struct fi_info *select_provider_round_robin(struct fi_info *provider_list, uint32_t rank,
728-
size_t num_providers)
727+
/**
728+
* @brief Selects a provider from the list in a round-robin fashion
729+
*
730+
* This function implements a round-robin algorithm to select a provider from
731+
* the provided list based on a rank. Only providers of the same type as the
732+
* first provider are eligible for selection.
733+
*
734+
* @param[in] provider_list A list of providers to select from.
735+
* @param[in] rank A rank metric for the current process, such as
736+
* the rank on the same node or CPU package.
737+
* @return Pointer to the selected provider
738+
*/
739+
static struct fi_info *select_provider_round_robin(struct fi_info *provider_list, uint32_t rank)
729740
{
730-
uint32_t provider_rank = rank % num_providers;
731-
struct fi_info *current_provider = provider_list;
741+
uint32_t provider_rank = 0, current_rank = 0;
742+
size_t num_providers = 0;
743+
struct fi_info *current_provider = NULL;
732744

733-
for (uint32_t i = 0; i < provider_rank; ++i) {
745+
for (current_provider = provider_list; NULL != current_provider;) {
746+
if (OPAL_SUCCESS == check_provider_attr(provider_list, current_provider)) {
747+
++num_providers;
748+
}
734749
current_provider = current_provider->next;
735750
}
736751

752+
current_provider = provider_list;
753+
if (2 > num_providers) {
754+
goto out;
755+
}
756+
757+
provider_rank = rank % num_providers;
758+
759+
while (NULL != current_provider) {
760+
if (OPAL_SUCCESS == check_provider_attr(provider_list, current_provider)
761+
&& provider_rank == current_rank++) {
762+
break;
763+
}
764+
current_provider = current_provider->next;
765+
}
766+
out:
737767
return current_provider;
738768
}
739769

@@ -850,7 +880,7 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
850880
{
851881
int ret, num_providers = 0;
852882
struct fi_info *provider = NULL;
853-
uint32_t package_rank = 0;
883+
uint32_t package_rank = process_info->my_local_rank;
854884

855885
num_providers = count_providers(provider_list);
856886
if (!process_info->proc_is_bound || 2 > num_providers) {
@@ -868,6 +898,10 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
868898
package_rank = get_package_rank(process_info);
869899

870900
#if OPAL_OFI_PCI_DATA_AVAILABLE
901+
/**
902+
* If provider PCI BDF information is available, we calculate its physical distance
903+
* to the current process, and select the provider with the shortest distance.
904+
*/
871905
ret = get_nearest_nic(opal_hwloc_topology, provider_list, num_providers, package_rank,
872906
&provider);
873907
if (OPAL_SUCCESS == ret) {
@@ -876,7 +910,12 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
876910
#endif /* OPAL_OFI_PCI_DATA_AVAILABLE */
877911

878912
round_robin:
879-
provider = select_provider_round_robin(provider_list, package_rank, num_providers);
913+
if (!process_info->proc_is_bound && 1 < num_providers
914+
&& opal_output_get_verbosity(opal_common_ofi.output) >= 1) {
915+
opal_show_help("help-common-ofi.txt", "unbound_process", true, 1);
916+
}
917+
918+
provider = select_provider_round_robin(provider_list, package_rank);
880919
out:
881920
#if OPAL_ENABLE_DEBUG
882921
opal_output_verbose(1, opal_common_ofi.output, "package rank: %d device: %s", package_rank,
@@ -950,5 +989,3 @@ OPAL_DECLSPEC int opal_common_ofi_fi_getname(fid_t fid, void **addr, size_t *add
950989
}
951990
return ret;
952991
}
953-
954-

opal/mca/common/ofi/common_ofi.h

Lines changed: 42 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -103,47 +103,47 @@ OPAL_DECLSPEC int opal_common_ofi_is_in_list(char **list, char *item);
103103
/**
104104
* Selects NIC (provider) based on hardware locality
105105
*
106-
* In multi-nic situations, use hardware topology to pick the "best"
107-
* of the selected NICs.
108-
* There are 3 main cases that this covers:
109-
*
110-
* 1. If the first provider passed into this function is the only valid
111-
* provider, this provider is returned.
112-
*
113-
* 2. If there is more than 1 provider that matches the type of the first
114-
* provider in the list, and the BDF data
115-
* is available then a provider is selected based on locality of device
116-
* cpuset and process cpuset and tries to ensure that processes
117-
* are distributed evenly across NICs. This has two separate
118-
* cases:
119-
*
120-
* i. There is one or more provider local to the process:
121-
*
122-
* (local rank % number of providers of the same type
123-
* that share the process cpuset) is used to select one
124-
* of these providers.
125-
*
126-
* ii. There is no provider that is local to the process:
127-
*
128-
* (local rank % number of providers of the same type)
129-
* is used to select one of these providers
130-
*
131-
* 3. If there is more than 1 providers of the same type in the
132-
* list, and the BDF data is not available (the ofi version does
133-
* not support fi_info.nic or the provider does not support BDF)
134-
* then (local rank % number of providers of the same type) is
135-
* used to select one of these providers
136-
*
137-
* @param provider_list (IN) struct fi_info* An initially selected
138-
* provider NIC. The provider name and
139-
* attributes are used to restrict NIC
140-
* selection. This provider is returned if the
141-
* NIC selection fails.
142-
*
143-
* @param provider (OUT) struct fi_info* object with the selected
144-
* provider if the selection succeeds
145-
* if the selection fails, returns the fi_info
146-
* object that was initially provided.
106+
* The selection is based on the following priority:
107+
*
108+
* Single-NIC:
109+
*
110+
* If only 1 provider is available, always return that provider.
111+
*
112+
* Multi-NIC:
113+
*
114+
* 1. If the process is NOT bound, pick a NIC using (local rank % number
115+
* of providers of the same type). This gives a fair chance to each
116+
* qualified NIC and balances overall utilization.
117+
*
118+
* 2. If the process is bound, we compare providers in the list that have
119+
* the same type as the first provider, and find the provider with the
120+
* shortest distance to the current process.
121+
*
122+
* i. If the provider has PCI BDF data, we attempt to compute the
123+
* distance between the NIC and the current process cpuset. The NIC
124+
* with the shortest distance is returned.
125+
*
126+
* * For equidistant NICs, we select a NIC in round-robin fashion
127+
* using the package rank of the current process, i.e. (package
128+
* rank % number of providers with the same distance).
129+
*
130+
* ii. If we cannot compute the distance between the NIC and the
131+
* current process, e.g. PCI BDF data is not available, a NIC will be
132+
* selected in a round-robin fashion using package rank, i.e. (package
133+
* rank % number of providers of the same type).
134+
*
135+
* @param[in] provider_list struct fi_info* An initially selected
136+
* provider NIC. The provider name and
137+
* attributes are used to restrict NIC
138+
* selection. This provider is returned if the
139+
* NIC selection fails.
140+
*
141+
* @param[in] process_info opal_process_info_t* The current process info
142+
*
143+
* @return struct fi_info* object with the selected
144+
* provider if the selection succeeds;
145+
* if the selection fails, returns the fi_info
146+
* object that was initially provided.
147147
*
148148
* All errors should be recoverable and will return the initially provided
149149
* provider. However, if an error occurs we can no longer guarantee
@@ -152,7 +152,7 @@ OPAL_DECLSPEC int opal_common_ofi_is_in_list(char **list, char *item);
152152
*
153153
*/
154154
OPAL_DECLSPEC struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
155-
opal_process_info_t *process_info);
155+
opal_process_info_t *process_info);
156156

157157
/**
158158
* Obtain EP endpoint name

opal/mca/common/ofi/help-common-ofi.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,12 @@
77
#
88
# $HEADER$
99
#
10+
[unbound_process]
11+
Open MPI's OFI driver detected multiple NICs on the system but cannot select an
12+
optimal device because the current process is not bound. This may negatively
13+
impact performance. This can be resolved by specifying "--bind-to ..." on
14+
the command line.
15+
1016
[package_rank failed]
1117
Open MPI's OFI driver detected multiple equidistant NICs from the current process,
1218
but had insufficient information to ensure MPI processes fairly pick a NIC for use.

0 commit comments

Comments
 (0)