11
11
12
12
#include "opal_config.h"
13
13
#include "opal/constants.h"
14
+ #include "opal/mca/hwloc/base/base.h"
14
15
15
16
#include <errno.h>
16
17
#include <unistd.h>
@@ -25,3 +26,258 @@ int mca_common_ofi_register_mca_variables(void)
25
26
return OPAL_ERROR ;
26
27
}
27
28
}
29
+
30
+ /* check that the tx attributes match */
31
+ static int
32
+ check_tx_attr (struct fi_tx_attr * provider_info ,
33
+ struct fi_tx_attr * provider )
34
+ {
35
+ if (!(provider -> msg_order & ~(provider_info -> msg_order )) &&
36
+ !(provider -> op_flags & ~(provider_info -> op_flags )) &&
37
+ (provider -> inject_size == provider_info -> inject_size )) {
38
+ return 0 ;
39
+ } else {
40
+ return OPAL_ERROR ;
41
+ }
42
+ }
43
+
44
+ /* check that the rx attributes match */
45
+ static int
46
+ check_rx_attr (struct fi_rx_attr * provider_info ,
47
+ struct fi_rx_attr * provider )
48
+ {
49
+ if (!(provider -> msg_order & ~(provider_info -> msg_order )) &&
50
+ !(provider -> op_flags & ~(provider_info -> op_flags ))) {
51
+ return 0 ;
52
+ } else {
53
+ return OPAL_ERROR ;
54
+ }
55
+ }
56
+
57
+ /* check that the ep attributes match */
58
+ static int
59
+ check_ep_attr (struct fi_ep_attr * provider_info ,
60
+ struct fi_ep_attr * provider )
61
+ {
62
+ if (!(provider -> type & ~(provider_info -> type )) &&
63
+ !(provider -> mem_tag_format & ~(provider_info -> mem_tag_format )) &&
64
+ (provider -> max_msg_size == provider_info -> max_msg_size ) &&
65
+ (provider -> tx_ctx_cnt == provider_info -> tx_ctx_cnt ) &&
66
+ (provider -> rx_ctx_cnt == provider_info -> rx_ctx_cnt )) {
67
+ return 0 ;
68
+ } else {
69
+ return OPAL_ERROR ;
70
+ }
71
+ }
72
+
73
+ /* check that the provider attributes match */
74
+ static int
75
+ check_provider_attr (struct fi_info * provider_info ,
76
+ struct fi_info * provider )
77
+ {
78
+ /* make sure both info are the same provider and provide the same attributes */
79
+ if (0 == strcmp (provider_info -> fabric_attr -> prov_name , provider -> fabric_attr -> prov_name ) &&
80
+ !check_tx_attr (provider_info -> tx_attr , provider -> tx_attr ) &&
81
+ !check_rx_attr (provider_info -> rx_attr , provider -> rx_attr ) &&
82
+ !check_ep_attr (provider_info -> ep_attr , provider -> ep_attr ) &&
83
+ !(provider_info -> caps & ~(provider -> caps )) &&
84
+ !(provider_info -> mode & ~(provider -> mode ))) {
85
+ return 0 ;
86
+ } else {
87
+ return OPAL_ERROR ;
88
+ }
89
+ }
90
+
91
+ /* Check if a process and a pci device share the same cpuset
92
+ * @param (IN) pci struct fi_pci_attr pci device attributes,
93
+ * used to find hwloc object for device.
94
+ *
95
+ * @param (IN) topology hwloc_topology_t topology to get the cpusets
96
+ * from
97
+ *
98
+ * @param (OUT) returns true if cpusets match and false if
99
+ * cpusets do not match or an error prevents comparison
100
+ *
101
+ * Uses a pci device to find an ancestor that contains a cpuset, and
102
+ * determines if it intersects with the cpuset that the process is bound to.
103
+ * if the process is not bound, or if a cpuset is unavailable for whatever
104
+ * reason, returns false. Otherwise, returns the result of
105
+ * hwloc_cpuset_intersects()
106
+ */
107
+ static bool
108
+ compare_cpusets (hwloc_topology_t topology , struct fi_pci_attr pci )
109
+ {
110
+ bool result = false;
111
+ int ret ;
112
+ hwloc_bitmap_t proc_cpuset ;
113
+ hwloc_obj_t obj = NULL ;
114
+
115
+ /* Cannot find topology info if no topology is found */
116
+ if (NULL == topology ) {
117
+ return false;
118
+ }
119
+
120
+ /* Allocate memory for proc_cpuset */
121
+ proc_cpuset = hwloc_bitmap_alloc ();
122
+ if (NULL == proc_cpuset ) {
123
+ return false;
124
+ }
125
+
126
+ /* Fill cpuset with the collection of cpu cores that the process runs on */
127
+ ret = hwloc_get_cpubind (topology , proc_cpuset , HWLOC_CPUBIND_PROCESS );
128
+ if (0 > ret ) {
129
+ goto error ;
130
+ }
131
+
132
+ /* Get the pci device from bdf */
133
+ obj = hwloc_get_pcidev_by_busid (topology , pci .domain_id , pci .bus_id ,
134
+ pci .device_id , pci .function_id );
135
+ if (NULL == obj ) {
136
+ goto error ;
137
+ }
138
+
139
+ /* pcidev objects don't have cpusets so find the first non-io object above */
140
+ obj = hwloc_get_non_io_ancestor_obj (topology , obj );
141
+ if (NULL != obj ) {
142
+ result = hwloc_bitmap_intersects (proc_cpuset , obj -> cpuset );
143
+ }
144
+
145
+ error :
146
+ hwloc_bitmap_free (proc_cpuset );
147
+ return result ;
148
+ }
149
+
150
+ /* Count providers returns the number of providers present in an fi_info list
151
+ * @param (IN) provider_list struct fi_info* list of providers available
152
+ *
153
+ * @param (OUT) int number of providers present in the list
154
+ *
155
+ * returns 0 if the list is NULL
156
+ */
157
+ static int
158
+ count_providers (struct fi_info * provider_list )
159
+ {
160
+ struct fi_info * dev = provider_list ;
161
+ int num_provider = 0 ;
162
+
163
+ while (NULL != dev ) {
164
+ num_provider ++ ;
165
+ dev = dev -> next ;
166
+ }
167
+
168
+ return num_provider ;
169
+ }
170
+
171
+ /* Selects a NIC based on hardware locality to process cpuset and device BDF.
172
+ *
173
+ * @param provider_list (IN) struct fi_info* An initially selected
174
+ * provider NIC. The provider name and
175
+ * attributes are used to restrict NIC
176
+ * selection. This provider is returned if the
177
+ * NIC selection fails.
178
+ *
179
+ * @param local_index (IN) int The local rank of the process. Used to
180
+ * select one valid NIC if there is a case
181
+ * where more than one can be selected. This
182
+ * could occur when more than one provider
183
+ * shares the same cpuset as the process.
184
+ *
185
+ * @param provider (OUT) struct fi_info* object with the selected
186
+ * provider if the selection succeeds
187
+ * if the selection fails, returns the fi_info
188
+ * object that was initially provided.
189
+ *
190
+ * If there is more than one provider that shares the same cpuset, we use
191
+ * (local rank % number of valid providers that share the process cpuset)
192
+ * to select one of the local providers.
193
+ *
194
+ * Likewise, If no providers share the same cpuset as the process, we use
195
+ * (local rank % number of valid providers that share the process cpuset)
196
+ * to select one of the valid providers.
197
+ *
198
+ * Initializes opal_hwloc_topology to access hardware topology if not previously
199
+ * initialized
200
+ *
201
+ * If a provider does not provide a BDF, the locality can't be determined and it
202
+ * is treated as though it does not share the same cpuset as the process.
203
+ *
204
+ * All errors should be recoverable and will return the initially provided
205
+ * provider. However, if an error occurs this will no longer guarantee
206
+ * that the provider returned is local to the process or that the processes will
207
+ * balance across available NICs.
208
+ */
209
+ struct fi_info *
210
+ opal_mca_common_ofi_select_provider (struct fi_info * provider_list , int local_index )
211
+ {
212
+ struct fi_info * provider = provider_list , * current_provider = provider_list ;
213
+ struct fi_info * * provider_table ;
214
+ struct fi_pci_attr pci ;
215
+ int ret ;
216
+ unsigned int num_provider = 0 , provider_limit = 0 ;
217
+ bool provider_found = false, cpusets_match = false;
218
+
219
+ /* Initialize opal_hwloc_topology if it is not already */
220
+ ret = opal_hwloc_base_get_topology ();
221
+ if (0 > ret ) {
222
+ /* Provider selection can continue but there is no guarantee of locality */
223
+ opal_output (1 , "%s:%d:Failed to initialize topology\n" , __FILE__ , __LINE__ );
224
+ }
225
+
226
+ provider_limit = count_providers (provider_list );
227
+
228
+ /* Allocate memory for provider table */
229
+ provider_table = calloc (provider_limit , sizeof (struct fi_info * ));
230
+ if (NULL == provider_table ) {
231
+ opal_output (1 , "%s:%d:Failed to allocate memory for provider table\n" , __FILE__ , __LINE__ );
232
+ return provider_list ;
233
+ }
234
+
235
+ current_provider = provider ;
236
+
237
+ /* Cycle through remaining fi_info objects, looking for alike providers */
238
+ while (NULL != current_provider ) {
239
+ if (!check_provider_attr (provider , current_provider )) {
240
+ cpusets_match = false;
241
+ if (NULL != current_provider -> nic ) {
242
+ pci = current_provider -> nic -> bus_attr -> attr .pci ;
243
+ cpusets_match = compare_cpusets (opal_hwloc_topology , pci );
244
+ }
245
+
246
+ /* Reset the list if the cpusets match and no other provider was
247
+ * found on the same cpuset as the process.
248
+ */
249
+ if (cpusets_match && !provider_found ) {
250
+ provider_found = true;
251
+ num_provider = 0 ;
252
+ }
253
+
254
+ /* Add the provider to the provider list if the cpusets match or if
255
+ * no other provider was found on the same cpuset as the process.
256
+ */
257
+ if (cpusets_match || !provider_found ) {
258
+ provider_table [num_provider ] = current_provider ;
259
+ num_provider ++ ;
260
+ }
261
+ }
262
+ current_provider = current_provider -> next ;
263
+ }
264
+
265
+ /* Select provider from local rank % number of providers */
266
+ if (num_provider > 0 ) {
267
+ provider = provider_table [local_index % num_provider ];
268
+ }
269
+
270
+ #if OPAL_DEBUG_ENABLE
271
+ if (NULL != provider -> nic ) {
272
+ pci = provider -> nic -> bus_attr -> attr .pci ;
273
+ cpusets_match = compare_cpusets (opal_hwloc_topology , pci );
274
+ }
275
+
276
+ opal_output (10 , "local rank: %d device: %s cpusets match: %s\n" ,
277
+ local_index , provider -> domain_attr -> name , cpusets_match ? "true" : "false" );
278
+ #endif
279
+
280
+ err_free_table :
281
+ free (provider_table );
282
+ return provider ;
283
+ }
0 commit comments