@@ -724,16 +724,46 @@ static int get_nearest_nic(hwloc_topology_t topology, struct fi_info *provider_l
724
724
return ret ;
725
725
}
726
726
727
- static struct fi_info * select_provider_round_robin (struct fi_info * provider_list , uint32_t rank ,
728
- size_t num_providers )
727
+ /**
728
+ * @brief Selects a provider from the list in a round-robin fashion
729
+ *
730
+ * This function implements a round-robin algorithm to select a provider from
731
+ * the provided list based on a rank. Only providers of the same type as the
732
+ * first provider are eligible for selection.
733
+ *
734
+ * @param[in] provider_list A list of providers to select from.
735
+ * @param[in] rank A rank metric for the current process, such as
736
+ * the rank on the same node or CPU package.
737
+ * @return Pointer to the selected provider, or the head of provider_list when fewer than two providers are eligible
738
+ */
739
+ static struct fi_info * select_provider_round_robin (struct fi_info * provider_list , uint32_t rank )
729
740
{
730
- uint32_t provider_rank = rank % num_providers ;
731
- struct fi_info * current_provider = provider_list ;
741
+ uint32_t provider_rank = 0 , current_rank = 0 ;
742
+ size_t num_providers = 0 ;
743
+ struct fi_info * current_provider = NULL ;
732
744
733
- for (uint32_t i = 0 ; i < provider_rank ; ++ i ) {
745
+ for (current_provider = provider_list ; NULL != current_provider ;) {
746
+ if (OPAL_SUCCESS == check_provider_attr (provider_list , current_provider )) {
747
+ ++ num_providers ;
748
+ }
734
749
current_provider = current_provider -> next ;
735
750
}
736
751
752
+ current_provider = provider_list ;
753
+ if (2 > num_providers ) {
754
+ goto out ;
755
+ }
756
+
757
+ provider_rank = rank % num_providers ;
758
+
759
+ while (NULL != current_provider ) {
760
+ if (OPAL_SUCCESS == check_provider_attr (provider_list , current_provider )
761
+ && provider_rank == current_rank ++ ) {
762
+ break ;
763
+ }
764
+ current_provider = current_provider -> next ;
765
+ }
766
+ out :
737
767
return current_provider ;
738
768
}
739
769
@@ -850,7 +880,7 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
850
880
{
851
881
int ret , num_providers = 0 ;
852
882
struct fi_info * provider = NULL ;
853
- uint32_t package_rank = 0 ;
883
+ uint32_t package_rank = process_info -> my_local_rank ;
854
884
855
885
num_providers = count_providers (provider_list );
856
886
if (!process_info -> proc_is_bound || 2 > num_providers ) {
@@ -876,7 +906,12 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
876
906
#endif /* OPAL_OFI_PCI_DATA_AVAILABLE */
877
907
878
908
round_robin :
879
- provider = select_provider_round_robin (provider_list , package_rank , num_providers );
909
+ if (!process_info -> proc_is_bound && 1 < num_providers
910
+ && opal_output_get_verbosity (opal_common_ofi .output ) >= 1 ) {
911
+ opal_show_help ("help-common-ofi.txt" , "unbound_process" , true, 1 );
912
+ }
913
+
914
+ provider = select_provider_round_robin (provider_list , package_rank );
880
915
out :
881
916
#if OPAL_ENABLE_DEBUG
882
917
opal_output_verbose (1 , opal_common_ofi .output , "package rank: %d device: %s" , package_rank ,
@@ -950,5 +985,3 @@ OPAL_DECLSPEC int opal_common_ofi_fi_getname(fid_t fid, void **addr, size_t *add
950
985
}
951
986
return ret ;
952
987
}
953
-
954
-
0 commit comments