Skip to content

Commit b061f96

Browse files
committed
opal/ofi: fix round-robin selection logic
This change fixes current round-robin selection logic: - Only providers of the same type should be considered, i.e. providers that match the head of the list. This deviates from the documented behavior. - For unbound process the selection should be based on its local rank, i.e. rank among processes on the same node. Currently only the first NIC will be selected. Signed-off-by: Wenduo Wang <wenduwan@amazon.com>
1 parent 43a3f42 commit b061f96

File tree

2 files changed

+48
-9
lines changed

2 files changed

+48
-9
lines changed

opal/mca/common/ofi/common_ofi.c

Lines changed: 42 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -724,16 +724,46 @@ static int get_nearest_nic(hwloc_topology_t topology, struct fi_info *provider_l
724724
return ret;
725725
}
726726

727-
static struct fi_info *select_provider_round_robin(struct fi_info *provider_list, uint32_t rank,
728-
size_t num_providers)
727+
/**
728+
* @brief Selects a provider from the list in a round-robin fashion
729+
*
730+
* This function implements a round-robin algorithm to select a provider from
731+
* the provided list based on a rank. Only providers of the same type as the
732+
* first provider are eligible for selection.
733+
*
734+
* @param[in] provider_list A list of providers to select from.
735+
* @param[out] rank A rank metric for the current process, such as
736+
* the rank on the same node or CPU package.
737+
* @return Pointer to the selected provider
738+
*/
739+
static struct fi_info *select_provider_round_robin(struct fi_info *provider_list, uint32_t rank)
729740
{
730-
uint32_t provider_rank = rank % num_providers;
731-
struct fi_info *current_provider = provider_list;
741+
uint32_t provider_rank = 0, current_rank = 0;
742+
size_t num_providers = 0;
743+
struct fi_info *current_provider = NULL;
732744

733-
for (uint32_t i = 0; i < provider_rank; ++i) {
745+
for (current_provider = provider_list; NULL != current_provider;) {
746+
if (OPAL_SUCCESS == check_provider_attr(provider_list, current_provider)) {
747+
++num_providers;
748+
}
734749
current_provider = current_provider->next;
735750
}
736751

752+
current_provider = provider_list;
753+
if (2 > num_providers) {
754+
goto out;
755+
}
756+
757+
provider_rank = rank % num_providers;
758+
759+
while (NULL != current_provider) {
760+
if (OPAL_SUCCESS == check_provider_attr(provider_list, current_provider)
761+
&& provider_rank == current_rank++) {
762+
break;
763+
}
764+
current_provider = current_provider->next;
765+
}
766+
out:
737767
return current_provider;
738768
}
739769

@@ -850,7 +880,7 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
850880
{
851881
int ret, num_providers = 0;
852882
struct fi_info *provider = NULL;
853-
uint32_t package_rank = 0;
883+
uint32_t package_rank = process_info->my_local_rank;
854884

855885
num_providers = count_providers(provider_list);
856886
if (!process_info->proc_is_bound || 2 > num_providers) {
@@ -876,7 +906,12 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
876906
#endif /* OPAL_OFI_PCI_DATA_AVAILABLE */
877907

878908
round_robin:
879-
provider = select_provider_round_robin(provider_list, package_rank, num_providers);
909+
if (!process_info->proc_is_bound && 1 < num_providers
910+
&& opal_output_get_verbosity(opal_common_ofi.output) >= 1) {
911+
opal_show_help("help-common-ofi.txt", "unbound_process", true, 1);
912+
}
913+
914+
provider = select_provider_round_robin(provider_list, package_rank);
880915
out:
881916
#if OPAL_ENABLE_DEBUG
882917
opal_output_verbose(1, opal_common_ofi.output, "package rank: %d device: %s", package_rank,
@@ -950,5 +985,3 @@ OPAL_DECLSPEC int opal_common_ofi_fi_getname(fid_t fid, void **addr, size_t *add
950985
}
951986
return ret;
952987
}
953-
954-

opal/mca/common/ofi/help-common-ofi.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,12 @@
77
#
88
# $HEADER$
99
#
10+
[unbound_process]
11+
Open MPI's OFI driver detected multiple NICs on the system but cannot select an
12+
optimal device because the current process is not bound. This may negatively
13+
impact performance. This can be resolved by specifying "--bind-to ..." on
14+
command line.
15+
1016
[package_rank failed]
1117
Open MPI's OFI driver detected multiple equidistant NICs from the current process,
1218
but had insufficient information to ensure MPI processes fairly pick a NIC for use.

0 commit comments

Comments
 (0)