@@ -445,214 +445,63 @@ static int check_provider_attr(struct fi_info *provider_info, struct fi_info *pr
445
445
}
446
446
}
447
447
448
- /**
449
- * Calculate device distances
450
- *
451
- * Calculate the distances between the current thread and all devices of
452
- * type OPENFABRICS or NETWORK.
453
- *
454
- * The shortest distances are the nearest and therefore most efficient
455
- * devices to use.
456
- *
457
- * Return an array of all the distances. Each entry is of type
458
- * pmix_device_distance_t
459
- *
460
- * This function is used if there is no PMIx server running.
461
- *
462
- * @param distances (OUT) distances array
463
- * @param ndist (OUT) number of entries in the distances array
464
- *
465
- * @return 0 on success. Error otherwise.
466
- *
467
- */
468
- static int compute_dev_distances (pmix_device_distance_t * * distances ,
469
- size_t * ndist )
470
- {
471
- int ret = 0 ;
472
- size_t ninfo ;
473
- pmix_info_t * info ;
474
- pmix_cpuset_t cpuset ;
475
- pmix_topology_t * pmix_topo ;
476
- pmix_device_type_t type = PMIX_DEVTYPE_OPENFABRICS |
477
- PMIX_DEVTYPE_NETWORK ;
478
-
479
- PMIX_CPUSET_CONSTRUCT (& cpuset );
480
- ret = PMIx_Get_cpuset (& cpuset , PMIX_CPUBIND_THREAD );
481
- if (PMIX_SUCCESS != ret ) {
482
- goto out ;
483
- }
484
-
485
- /* load the PMIX topology */
486
- PMIX_TOPOLOGY_CREATE (pmix_topo , 1 );
487
- ret = PMIx_Load_topology (pmix_topo );
488
- if (PMIX_SUCCESS != ret ) {
489
- goto out ;
490
- }
491
-
492
- ninfo = 1 ;
493
- PMIX_INFO_CREATE (info , ninfo );
494
- PMIX_INFO_LOAD (& info [0 ], PMIX_DEVICE_TYPE , & type , PMIX_DEVTYPE );
495
- ret = PMIx_Compute_distances (pmix_topo , & cpuset , info , ninfo , distances ,
496
- ndist );
497
- PMIX_INFO_FREE (info , ninfo );
498
-
499
- PMIX_TOPOLOGY_FREE (pmix_topo , 1 );
500
- out :
501
- return ret ;
502
- }
503
-
504
- /**
505
- * Find the nearest devices to the current thread
506
- *
507
- * Use the PMIx server or calculate the device distances, then out of the set of
508
- * returned distances find the subset of the nearest devices. This can be
509
- * 1 or more.
510
- *
511
- * @param num_distances (OUT) number of entries in the returned array
512
- *
513
- * @return An array of device distances which are nearest this thread
514
- * or NULL if we fail to get the distances. In this case we will just
515
- * revert to round robin.
516
- *
517
- */
518
- static pmix_device_distance_t * get_nearest_nics (int * num_distances )
519
- {
520
- size_t ndist ;
521
- pmix_topology_t * topo ;
522
- int ret , i , idx = 0 ;
523
- pmix_data_array_t * dptr ;
524
- uint16_t near = USHRT_MAX ;
525
- pmix_info_t directive ;
526
- pmix_value_t * val = NULL ;
527
- pmix_device_distance_t * distances , * nearest = NULL ;
528
-
529
- PMIX_INFO_LOAD (& directive , PMIX_OPTIONAL , NULL , PMIX_BOOL );
530
- ret = PMIx_Get (& opal_process_info .myprocid ,
531
- PMIX_DEVICE_DISTANCES , & directive , 1 , & val );
532
- PMIX_INFO_DESTRUCT (& directive );
533
- if (ret != PMIX_SUCCESS || !val ) {
534
- ret = compute_dev_distances (& distances , & ndist );
535
- if (ret )
536
- goto out ;
537
- goto find_nearest ;
538
- }
539
-
540
- if (PMIX_DATA_ARRAY != val -> type ) {
541
- goto out ;
542
- }
543
- dptr = val -> data .darray ;
544
- if (NULL == dptr ) {
545
- goto out ;
546
- }
547
- if (PMIX_DEVICE_DIST != dptr -> type ) {
548
- goto out ;
549
- }
550
-
551
- distances = (pmix_device_distance_t * )dptr -> array ;
552
- ndist = dptr -> size ;
553
-
554
- find_nearest :
555
- nearest = calloc (sizeof (* distances ), ndist );
556
- if (!nearest )
557
- goto out ;
558
-
559
- for (i = 0 ; i < ndist ; i ++ ) {
560
- if (distances [i ].mindist < near ) {
561
- idx = 0 ;
562
- near = distances [i ].mindist ;
563
- nearest [idx ] = distances [i ];
564
- idx ++ ;
565
- } else if (distances [i ].mindist == near ) {
566
- nearest [idx ] = distances [i ];
567
- idx ++ ;
568
- }
569
- }
570
-
571
- * num_distances = idx ;
572
-
573
- out :
574
- if (val )
575
- PMIX_VALUE_RELEASE (val );
576
- return nearest ;
577
- }
578
-
579
448
#if OPAL_OFI_PCI_DATA_AVAILABLE
580
- /**
581
- * Determine if a device is nearest
582
- *
583
- * Given a device distances array of the nearest pci devices,
584
- * determine if one of these device distances refers to the pci
585
- * device passed in
449
+ /* Check if a process and a pci device share the same cpuset
450
+ * @param (IN) pci struct fi_pci_attr pci device attributes,
451
+ * used to find hwloc object for device.
586
452
*
587
- * @param distances (IN) distances array
588
- * @param num_distances (IN) number of entries in the distances array
589
- * @param topology (IN) topology of the node
590
- * @param pci (IN) PCI device being examined
453
+ * @param (IN) topology hwloc_topology_t topology to get the cpusets
454
+ * from
591
455
*
592
- * @return true if the PCI device is in the distances array or if the
593
- * distances array is not provided. False otherwise.
456
+ * @return true if cpusets match and false if
457
+ * cpusets do not match or an error prevents comparison
594
458
*
459
+ * Uses a pci device to find an ancestor that contains a cpuset, and
460
+ * determines if it intersects with the cpuset that the process is bound to.
461
+ * If the process is not bound, or if a cpuset is unavailable for whatever
462
+ * reason, returns false. Otherwise, returns the result of
463
+ * hwloc_bitmap_intersects().
595
464
*/
596
- static bool is_near (pmix_device_distance_t * distances ,
597
- int num_distances ,
598
- hwloc_topology_t topology ,
599
- struct fi_pci_attr pci )
465
+ static bool compare_cpusets (hwloc_topology_t topology , struct fi_pci_attr pci )
600
466
{
601
- hwloc_obj_t pcidev , osdev ;
602
-
603
- /* if we failed to find any distances, then we consider all interfaces
604
- * to be of equal distances and let the caller decide how to handle
605
- * them
606
- */
607
- if (!distances )
608
- return true;
467
+ bool result = false;
468
+ int ret ;
469
+ hwloc_bitmap_t proc_cpuset ;
470
+ hwloc_obj_t obj = NULL ;
609
471
610
- pcidev = hwloc_get_pcidev_by_busid (topology , pci .domain_id ,
611
- pci .bus_id , pci .device_id ,
612
- pci .function_id );
613
- if (!pcidev )
472
+ /* Cannot find topology info if no topology is found */
473
+ if (NULL == topology ) {
614
474
return false;
475
+ }
615
476
616
- for (osdev = pcidev -> io_first_child ; osdev != NULL ; osdev = osdev -> next_sibling ) {
617
- int i ;
618
-
619
- if (osdev -> attr -> osdev .type == HWLOC_OBJ_OSDEV_OPENFABRICS ) {
620
- const char * nguid = hwloc_obj_get_info_by_name (osdev ,"NodeGUID" );
621
- const char * sguid = hwloc_obj_get_info_by_name (osdev , "SysImageGUID" );
477
+ /* Allocate memory for proc_cpuset */
478
+ proc_cpuset = hwloc_bitmap_alloc ();
479
+ if (NULL == proc_cpuset ) {
480
+ return false;
481
+ }
622
482
623
- if (!nguid && !sguid )
624
- continue ;
483
+ /* Fill cpuset with the collection of cpu cores that the process runs on */
484
+ ret = hwloc_get_cpubind (topology , proc_cpuset , HWLOC_CPUBIND_PROCESS );
485
+ if (0 > ret ) {
486
+ goto error ;
487
+ }
625
488
626
- for (i = 0 ; i < num_distances ; i ++ ) {
627
- char lsguid [256 ], lnguid [256 ];
628
- int ret ;
489
+ /* Get the pci device from bdf */
490
+ obj = hwloc_get_pcidev_by_busid (topology , pci .domain_id , pci .bus_id , pci .device_id ,
491
+ pci .function_id );
492
+ if (NULL == obj ) {
493
+ goto error ;
494
+ }
629
495
630
- ret = sscanf (distances [i ].uuid , "fab://%256s::%256s" , lnguid , lsguid );
631
- if (ret != 2 )
632
- continue ;
633
- if (0 == strcasecmp (lnguid , nguid )) {
634
- return true;
635
- } else if (0 == strcasecmp (lsguid , sguid )) {
636
- return true;
637
- }
638
- }
639
- } else if (osdev -> attr -> osdev .type == HWLOC_OBJ_OSDEV_NETWORK ) {
640
- const char * address = hwloc_obj_get_info_by_name (osdev , "Address" );
641
- if (!address )
642
- continue ;
643
- for (i = 0 ; i < num_distances ; i ++ ) {
644
- char * addr = strstr (distances [i ].uuid , "://" );
645
- if (!addr || addr + 3 > distances [i ].uuid
646
- + strlen (distances [i ].uuid ))
647
- continue ;
648
- if (!strcmp (addr + 3 , address )) {
649
- return true;
650
- }
651
- }
652
- }
496
+ /* pcidev objects don't have cpusets so find the first non-io object above */
497
+ obj = hwloc_get_non_io_ancestor_obj (topology , obj );
498
+ if (NULL != obj ) {
499
+ result = hwloc_bitmap_intersects (proc_cpuset , obj -> cpuset );
653
500
}
654
501
655
- return false;
502
+ error :
503
+ hwloc_bitmap_free (proc_cpuset );
504
+ return result ;
656
505
}
657
506
#endif
658
507
@@ -765,10 +614,7 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
765
614
struct fi_info * provider = provider_list , * current_provider = provider_list ;
766
615
struct fi_info * * provider_table ;
767
616
#if OPAL_OFI_PCI_DATA_AVAILABLE
768
- pmix_device_distance_t * distances = NULL ;
769
617
struct fi_pci_attr pci ;
770
- int num_distances = 0 ;
771
- bool near ;
772
618
#endif
773
619
int ret ;
774
620
unsigned int num_provider = 0 , provider_limit = 0 ;
@@ -793,38 +639,33 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
793
639
return provider_list ;
794
640
}
795
641
796
- #if OPAL_OFI_PCI_DATA_AVAILABLE
797
- /* find all the nearest devices to this thread, then out of these
798
- * determine which device we should bind to.
799
- */
800
- distances = get_nearest_nics (& num_distances );
801
- #endif
802
-
803
642
current_provider = provider ;
804
643
805
644
/* Cycle through remaining fi_info objects, looking for alike providers */
806
645
while (NULL != current_provider ) {
807
646
if (!check_provider_attr (provider , current_provider )) {
808
- near = false;
647
+ cpusets_match = false;
809
648
#if OPAL_OFI_PCI_DATA_AVAILABLE
810
649
if (NULL != current_provider -> nic
811
650
&& NULL != current_provider -> nic -> bus_attr
812
651
&& current_provider -> nic -> bus_attr -> bus_type == FI_BUS_PCI ) {
813
652
pci = current_provider -> nic -> bus_attr -> attr .pci ;
814
- near = is_near (distances , num_distances ,
815
- opal_hwloc_topology , pci );
653
+ cpusets_match = compare_cpusets (opal_hwloc_topology , pci );
816
654
}
817
655
#endif
818
- /* We could have multiple near providers */
819
- if (near && !provider_found ) {
656
+
657
+ /* Reset the list if the cpusets match and no other provider was
658
+ * found on the same cpuset as the process.
659
+ */
660
+ if (cpusets_match && !provider_found ) {
820
661
provider_found = true;
821
662
num_provider = 0 ;
822
663
}
823
664
824
665
/* Add the provider to the provider list if the cpusets match or if
825
666
* no other provider was found on the same cpuset as the process.
826
667
*/
827
- if (near || !provider_found ) {
668
+ if (cpusets_match || !provider_found ) {
828
669
provider_table [num_provider ] = current_provider ;
829
670
num_provider ++ ;
830
671
}
@@ -846,22 +687,17 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
846
687
&& NULL != provider -> nic -> bus_attr
847
688
&& provider -> nic -> bus_attr -> bus_type == FI_BUS_PCI ) {
848
689
pci = provider -> nic -> bus_attr -> attr .pci ;
849
- near = is_near (distances , num_distances ,
850
- opal_hwloc_topology , pci );
690
+ cpusets_match = compare_cpusets (opal_hwloc_topology , pci );
851
691
}
852
692
#endif
853
693
854
694
#if OPAL_ENABLE_DEBUG
855
695
opal_output_verbose (1 , opal_common_ofi .output ,
856
- "package rank: %d device: %s near : %s\n" , package_rank ,
857
- provider -> domain_attr -> name , near ? "true" : "false" );
696
+ "package rank: %d device: %s cpusets match : %s\n" , package_rank ,
697
+ provider -> domain_attr -> name , cpusets_match ? "true" : "false" );
858
698
#endif
859
699
860
700
free (provider_table );
861
- #if OPAL_OFI_PCI_DATA_AVAILABLE
862
- if (distances )
863
- free (distances );
864
- #endif
865
701
return provider ;
866
702
}
867
703
0 commit comments