@@ -103,47 +103,47 @@ OPAL_DECLSPEC int opal_common_ofi_is_in_list(char **list, char *item);
103
103
/**
104
104
* Selects NIC (provider) based on hardware locality
105
105
*
106
- * In multi-nic situations, use hardware topology to pick the "best"
107
- * of the selected NICs.
108
- * There are 3 main cases that this covers :
109
- *
110
- * 1. If the first provider passed into this function is the only valid
111
- * provider, this provider is returned.
112
- *
113
- * 2. If there is more than 1 provider that matches the type of the first
114
- * provider in the list, and the BDF data
115
- * is available then a provider is selected based on locality of device
116
- * cpuset and process cpuset and tries to ensure that processes
117
- * are distributed evenly across NICs. This has two separate
118
- * cases:
119
- *
120
- * i. There is one or more provider local to the process:
121
- *
122
- * (local rank % number of providers of the same type
123
- * that share the process cpuset) is used to select one
124
- * of these providers .
125
- *
126
- * ii. There is no provider that is local to the process:
127
- *
128
- * (local rank % number of providers of the same type)
129
- * is used to select one of these providers
130
- *
131
- * 3. If there is more than 1 providers of the same type in the
132
- * list, and the BDF data is not available (the ofi version does
133
- * not support fi_info.nic or the provider does not support BDF)
134
- * then (local rank % number of providers of the same type) is
135
- * used to select one of these providers
136
- *
137
- * @param provider_list (IN) struct fi_info* An initially selected
138
- * provider NIC. The provider name and
139
- * attributes are used to restrict NIC
140
- * selection. This provider is returned if the
141
- * NIC selection fails.
142
- *
143
- * @param provider (OUT) struct fi_info* object with the selected
144
- * provider if the selection succeeds
145
- * if the selection fails, returns the fi_info
146
- * object that was initially provided.
106
+ * The selection is based on the following priority:
107
+ *
108
+ * Single-NIC:
109
+ *
110
+ * If only 1 provider is available, always return that provider.
111
+ *
112
+ * Multi-NIC:
113
+ *
114
+ * 1. If the process is NOT bound, pick a NIC using (local rank % number
115
+ * of providers of the same type). This gives a fair chance to each
116
+ * qualified NIC and balances overall utilization.
117
+ *
118
+ * 2. If the process is bound, we compare providers in the list that have
119
+ * the same type as the first provider, and find the provider with the
120
+ * shortest distance to the current process.
121
+ *
122
+ * i. If the provider has PCI BDF data, we attempt to compute the
123
+ * distance between the NIC and the current process cpuset. The NIC
124
+ * with the shortest distance is returned.
125
+ *
126
+ * * For equidistant NICs, we select a NIC in round-robin fashion
127
+ * using the package rank of the current process, i.e. (package
128
+ * rank % number of providers with the same distance).
129
+ *
130
+ * ii. If we cannot compute the distance between the NIC and the
131
+ * current process, e.g. PCI BDF data is not available, a NIC will be
132
+ * selected in a round-robin fashion using package rank, i.e. (package
133
+ * rank % number of providers of the same type).
134
+ *
135
+ * @param[in] provider_list struct fi_info* An initially selected
136
+ * provider NIC. The provider name and
137
+ * attributes are used to restrict NIC
138
+ * selection. This provider is returned if the
139
+ * NIC selection fails.
140
+ *
141
+ * @param[in] process_info opal_process_info_t* The current process info
142
+ *
143
+ * @return struct fi_info* object with the selected
144
+ * provider if the selection succeeds;
145
+ * if the selection fails, returns the fi_info
146
+ * object that was initially provided.
147
147
*
148
148
* All errors should be recoverable and will return the initially provided
149
149
* provider. However, if an error occurs we can no longer guarantee
@@ -152,7 +152,7 @@ OPAL_DECLSPEC int opal_common_ofi_is_in_list(char **list, char *item);
152
152
*
153
153
*/
154
154
OPAL_DECLSPEC struct fi_info * opal_common_ofi_select_provider (struct fi_info * provider_list ,
155
- opal_process_info_t * process_info );
155
+ opal_process_info_t * process_info );
156
156
157
157
/**
158
158
* Obtain EP endpoint name
0 commit comments