16
16
#include <errno.h>
17
17
#include <unistd.h>
18
18
19
+ #include "opal_config.h"
19
20
#include "common_ofi.h"
20
21
#include "opal_config.h"
21
22
#include "opal/constants.h"
22
23
#include "opal/util/argv.h"
23
24
#include "opal/mca/base/mca_base_var.h"
24
25
#include "opal/mca/base/mca_base_framework.h"
25
26
#include "opal/mca/hwloc/base/base.h"
27
+ #include "opal/mca/pmix/base/base.h"
28
+ #include "opal/util/show_help.h"
26
29
27
30
OPAL_DECLSPEC opal_common_ofi_module_t opal_common_ofi = {
28
31
.prov_include = NULL ,
@@ -281,6 +284,79 @@ count_providers(struct fi_info* provider_list)
281
284
return num_provider ;
282
285
}
283
286
287
+ /* Calculate the currrent process package rank.
288
+ * @param (IN) process_info struct opal_process_info_t information
289
+ * about the current process. used to get
290
+ * num_local_peers, myprocid.rank, and
291
+ * my_local_rank.
292
+ *
293
+ * @param (OUT) uint32_t package rank or myprocid.rank
294
+ *
295
+ * If successful, returns PMIX_PACKAGE_RANK, or an
296
+ * equivalent calculated package rank.
297
+ * otherwise falls back to using opal_process_info.myprocid.rank
298
+ * this can affect performance, but is unlikely to happen.
299
+ */
300
+ static uint32_t get_package_rank (opal_process_info_t process_info )
301
+ {
302
+ int i ;
303
+ uint16_t relative_locality , * package_rank_ptr ;
304
+ uint16_t current_package_rank = 0 ;
305
+ uint16_t package_ranks [process_info .num_local_peers ];
306
+ opal_process_name_t pname ;
307
+ opal_status_t rc ;
308
+ char * * peers = NULL ;
309
+ char * local_peers = NULL ;
310
+ char * locality_string = NULL ;
311
+
312
+ pname .jobid = OPAL_PROC_MY_NAME .jobid ;
313
+ pname .vpid = OPAL_VPID_WILDCARD ;
314
+
315
+ #if HAVE_DECL_PMIX_PACKAGE_RANK
316
+ // Try to get the PACKAGE_RANK from PMIx
317
+ OPAL_MODEX_RECV_VALUE_OPTIONAL (rc , PMIX_PACKAGE_RANK ,
318
+ & pname , & package_rank_ptr , PMIX_UINT16 );
319
+ if (PMIX_SUCCESS == rc ) {
320
+ return (uint32_t )* package_rank_ptr ;
321
+ }
322
+ #endif
323
+
324
+ // Get the local peers
325
+ OPAL_MODEX_RECV_VALUE (rc , PMIX_LOCAL_PEERS ,
326
+ & pname , & local_peers , PMIX_STRING );
327
+ if (PMIX_SUCCESS != rc || NULL == local_peers ) {
328
+ // We can't find package_rank, fall back to procid
329
+ opal_show_help ("help-common-ofi.txt" , "package_rank failed" , true);
330
+ return (uint32_t )process_info .myprocid .rank ;
331
+ }
332
+ peers = opal_argv_split (local_peers , ',' );
333
+ free (local_peers );
334
+
335
+ for (i = 0 ; NULL != peers [i ]; i ++ ) {
336
+ pname .vpid = strtoul (peers [i ], NULL , 10 );
337
+ locality_string = NULL ;
338
+ // Get the LOCALITY_STRING for process[i]
339
+ OPAL_MODEX_RECV_VALUE_OPTIONAL (rc , PMIX_LOCALITY_STRING ,
340
+ & pname , & locality_string , PMIX_STRING );
341
+ if (PMIX_SUCCESS != rc || NULL == locality_string ) {
342
+ // If we don't have information about locality, fall back to procid
343
+ opal_show_help ("help-common-ofi.txt" , "package_rank failed" , true);
344
+ return (uint32_t )process_info .myprocid .rank ;
345
+ }
346
+
347
+ // compute relative locality
348
+ relative_locality = opal_hwloc_compute_relative_locality (process_info .cpuset , locality_string );
349
+ free (locality_string );
350
+
351
+ if (relative_locality & OPAL_PROC_ON_SOCKET ) {
352
+ package_ranks [i ] = current_package_rank ;
353
+ current_package_rank ++ ;
354
+ }
355
+ }
356
+
357
+ return (uint32_t )package_ranks [process_info .my_local_rank ];
358
+ }
359
+
284
360
/* Selects a NIC based on hardware locality between process cpuset and device BDF.
285
361
*
286
362
* Initializes opal_hwloc_topology to access hardware topology if not previously
@@ -318,11 +394,13 @@ count_providers(struct fi_info* provider_list)
318
394
* selection. This provider is returned if the
319
395
* NIC selection fails.
320
396
*
321
- * @param local_index (IN) int The local rank of the process. Used to
397
+ * @param process_info (IN) opal_process_info_t The current process info, whose rank is used to
322
398
* select one valid NIC if there is a case
323
399
* where more than one can be selected. This
324
400
* could occur when more than one provider
325
401
* shares the same cpuset as the process.
402
+ * This could either be a package_rank if one is
403
+ * successfully calculated, or the process id.
326
404
*
327
405
* @param provider (OUT) struct fi_info* object with the selected
328
406
* provider if the selection succeeds
@@ -335,14 +413,15 @@ count_providers(struct fi_info* provider_list)
335
413
* balance across available NICs.
336
414
*/
337
415
struct fi_info *
338
- opal_mca_common_ofi_select_provider (struct fi_info * provider_list , int local_index )
416
+ opal_mca_common_ofi_select_provider (struct fi_info * provider_list , opal_process_info_t process_info )
339
417
{
340
418
struct fi_info * provider = provider_list , * current_provider = provider_list ;
341
419
struct fi_info * * provider_table ;
342
420
#if OPAL_OFI_PCI_DATA_AVAILABLE
343
421
struct fi_pci_attr pci ;
344
422
#endif
345
423
int ret ;
424
+ uint32_t package_rank ;
346
425
unsigned int num_provider = 0 , provider_limit = 0 ;
347
426
bool provider_found = false, cpusets_match = false;
348
427
@@ -399,8 +478,12 @@ opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int local_ind
399
478
}
400
479
401
480
/* Select provider from package rank % number of providers */
402
- if (num_provider > 0 ) {
403
- provider = provider_table [local_index % num_provider ];
481
+ if (num_provider >= 2 ) {
482
+ // If there are multiple NICs "close" to the process, try to calculate package_rank
483
+ package_rank = get_package_rank (process_info );
484
+ provider = provider_table [package_rank % num_provider ];
485
+ } else if (num_provider == 1 ) {
486
+ provider = provider_table [num_provider - 1 ];
404
487
}
405
488
406
489
#if OPAL_OFI_PCI_DATA_AVAILABLE
@@ -412,8 +495,8 @@ opal_mca_common_ofi_select_provider(struct fi_info *provider_list, int local_ind
412
495
413
496
#if OPAL_ENABLE_DEBUG
414
497
opal_output_verbose (1 , opal_common_ofi .output ,
415
- "local rank: %d device: %s cpusets match: %s\n" ,
416
- local_index , provider -> domain_attr -> name ,
498
+ "package rank: %d device: %s cpusets match: %s\n" ,
499
+ package_rank , provider -> domain_attr -> name ,
417
500
cpusets_match ? "true" : "false" );
418
501
#endif
419
502
0 commit comments