@@ -623,10 +623,10 @@ static int get_provider_distance(struct fi_info *provider, hwloc_topology_t topo
623
623
/**
624
624
* @brief Get the nearest device to the current thread
625
625
*
626
- * Use the PMIx server or calculate the device distances, then out of the set of
627
- * returned distances find the subset of the nearest devices. This can be
628
- * 0 or more.
629
- * If there are multiple equidistant devices, break the tie using the rank .
626
+ * Compute the distances from the current thread to each NIC in provider_list,
627
+ * and select the NIC with the shortest distance.
628
+ * If there are multiple equidistant devices, break the tie using local rank
629
+ * to balance NIC utilization.
630
630
*
631
631
* @param[in] topology hwloc topology
632
632
* @param[in] provider_list List of providers to select from
@@ -724,16 +724,46 @@ static int get_nearest_nic(hwloc_topology_t topology, struct fi_info *provider_l
724
724
return ret ;
725
725
}
726
726
727
- static struct fi_info * select_provider_round_robin (struct fi_info * provider_list , uint32_t rank ,
728
- size_t num_providers )
727
+ /**
728
+ * @brief Selects a provider from the list in a round-robin fashion
729
+ *
730
+ * This function implements a round-robin algorithm to select a provider from
731
+ * the provided list based on a rank. Only providers of the same type as the
732
+ * first provider are eligible for selection.
733
+ *
734
+ * @param[in] provider_list A list of providers to select from.
735
+ * @param[in] rank A rank metric for the current process, such as
736
+ * the rank on the same node or CPU package.
737
+ * @return Pointer to the selected provider
738
+ */
739
+ static struct fi_info * select_provider_round_robin (struct fi_info * provider_list , uint32_t rank )
729
740
{
730
- uint32_t provider_rank = rank % num_providers ;
731
- struct fi_info * current_provider = provider_list ;
741
+ uint32_t provider_rank = 0 , current_rank = 0 ;
742
+ size_t num_providers = 0 ;
743
+ struct fi_info * current_provider = NULL ;
732
744
733
- for (uint32_t i = 0 ; i < provider_rank ; ++ i ) {
745
+ for (current_provider = provider_list ; NULL != current_provider ;) {
746
+ if (OPAL_SUCCESS == check_provider_attr (provider_list , current_provider )) {
747
+ ++ num_providers ;
748
+ }
734
749
current_provider = current_provider -> next ;
735
750
}
736
751
752
+ current_provider = provider_list ;
753
+ if (2 > num_providers ) {
754
+ goto out ;
755
+ }
756
+
757
+ provider_rank = rank % num_providers ;
758
+
759
+ while (NULL != current_provider ) {
760
+ if (OPAL_SUCCESS == check_provider_attr (provider_list , current_provider )
761
+ && provider_rank == current_rank ++ ) {
762
+ break ;
763
+ }
764
+ current_provider = current_provider -> next ;
765
+ }
766
+ out :
737
767
return current_provider ;
738
768
}
739
769
@@ -850,7 +880,7 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
850
880
{
851
881
int ret , num_providers = 0 ;
852
882
struct fi_info * provider = NULL ;
853
- uint32_t package_rank = 0 ;
883
+ uint32_t package_rank = process_info -> my_local_rank ;
854
884
855
885
num_providers = count_providers (provider_list );
856
886
if (!process_info -> proc_is_bound || 2 > num_providers ) {
@@ -868,6 +898,10 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
868
898
package_rank = get_package_rank (process_info );
869
899
870
900
#if OPAL_OFI_PCI_DATA_AVAILABLE
901
+ /**
902
+ * If provider PCI BDF information is available, we calculate its physical distance
903
+ * to the current process, and select the provider with the shortest distance.
904
+ */
871
905
ret = get_nearest_nic (opal_hwloc_topology , provider_list , num_providers , package_rank ,
872
906
& provider );
873
907
if (OPAL_SUCCESS == ret ) {
@@ -876,7 +910,12 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
876
910
#endif /* OPAL_OFI_PCI_DATA_AVAILABLE */
877
911
878
912
round_robin :
879
- provider = select_provider_round_robin (provider_list , package_rank , num_providers );
913
+ if (!process_info -> proc_is_bound && 1 < num_providers
914
+ && opal_output_get_verbosity (opal_common_ofi .output ) >= 1 ) {
915
+ opal_show_help ("help-common-ofi.txt" , "unbound_process" , true, 1 );
916
+ }
917
+
918
+ provider = select_provider_round_robin (provider_list , package_rank );
880
919
out :
881
920
#if OPAL_ENABLE_DEBUG
882
921
opal_output_verbose (1 , opal_common_ofi .output , "package rank: %d device: %s" , package_rank ,
@@ -950,5 +989,3 @@ OPAL_DECLSPEC int opal_common_ofi_fi_getname(fid_t fid, void **addr, size_t *add
950
989
}
951
990
return ret ;
952
991
}
953
-
954
-
0 commit comments