4
4
* reserved.
5
5
* Copyright (c) 2020-2022 Triad National Security, LLC. All rights
6
6
* reserved.
7
- * Copyright (c) 2020-2021 Cisco Systems, Inc. All rights reserved
7
+ * Copyright (c) 2020-2021 Cisco Systems, Inc. All rights reserved.
8
8
* Copyright (c) 2021 Nanook Consulting. All rights reserved.
9
9
* Copyright (c) 2021 Amazon.com, Inc. or its affiliates. All rights
10
10
* reserved.
11
+ * Copyright (c) 2023 UT-Battelle, LLC. All rights reserved.
11
12
* $COPYRIGHT$
12
13
*
13
14
* Additional copyrights may follow
@@ -445,63 +446,216 @@ static int check_provider_attr(struct fi_info *provider_info, struct fi_info *pr
445
446
}
446
447
}
447
448
448
- #if OPAL_OFI_PCI_DATA_AVAILABLE
449
- /* Check if a process and a pci device share the same cpuset
450
- * @param (IN) pci struct fi_pci_attr pci device attributes,
451
- * used to find hwloc object for device.
449
+ /**
450
+ * Calculate device distances
451
+ *
452
+ * Calculate the distances between the current thread and all devices of
453
+ * type OPENFABRICS or NETWORK.
454
+ *
455
+ * The shortest distances are the nearest and therefore most efficient
456
+ * devices to use.
452
457
*
453
- * @param (IN) topology hwloc_topology_t topology to get the cpusets
454
- * from
458
+ * Return an array of all the distances. Each entry is of type
459
+ * pmix_device_distance_t
455
460
*
456
- * @param (OUT) returns true if cpusets match and false if
457
- * cpusets do not match or an error prevents comparison
461
+ * This function is the fallback used when PMIx_Get does not return the device distances (e.g. there is no PMIx server running).
462
+ *
463
+ * @param distances (OUT) distances array
464
+ * @param ndist (OUT) number of entries in the distances array
465
+ *
466
+ * @return 0 on success. Error otherwise.
458
467
*
459
- * Uses a pci device to find an ancestor that contains a cpuset, and
460
- * determines if it intersects with the cpuset that the process is bound to.
461
- * if the process is not bound, or if a cpuset is unavailable for whatever
462
- * reason, returns false. Otherwise, returns the result of
463
- * hwloc_cpuset_intersects()
464
468
*/
465
- static bool compare_cpusets (hwloc_topology_t topology , struct fi_pci_attr pci )
469
+ static int compute_dev_distances (pmix_device_distance_t * * distances ,
470
+ size_t * ndist )
466
471
{
467
- bool result = false;
468
- int ret ;
469
- hwloc_bitmap_t proc_cpuset ;
470
- hwloc_obj_t obj = NULL ;
472
+ int ret = 0 ;
473
+ size_t ninfo ;
474
+ pmix_info_t * info ;
475
+ pmix_cpuset_t cpuset ;
476
+ pmix_topology_t * pmix_topo ;
477
+ pmix_device_type_t type = PMIX_DEVTYPE_OPENFABRICS |
478
+ PMIX_DEVTYPE_NETWORK ;
479
+
480
+ PMIX_CPUSET_CONSTRUCT (& cpuset );
481
+ ret = PMIx_Get_cpuset (& cpuset , PMIX_CPUBIND_THREAD );
482
+ if (PMIX_SUCCESS != ret ) {
483
+ goto out ;
484
+ }
485
+
486
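+ /* load the PMIx topology. NOTE(review): pmix_topo has no initializer, yet it is passed to PMIx_Topology_free() and PMIx_Load_topology() below -- it presumably has to be allocated first; confirm */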
487
+ PMIx_Topology_free (pmix_topo , 1 );
488
+ ret = PMIx_Load_topology (pmix_topo );
489
+ if (PMIX_SUCCESS != ret ) {
490
+ goto out ;
491
+ }
492
+
493
+ ninfo = 1 ;
494
+ info = PMIx_Info_create (ninfo );
495
+ PMIx_Info_load (& info [0 ], PMIX_DEVICE_TYPE , & type , PMIX_DEVTYPE );
496
+ ret = PMIx_Compute_distances (pmix_topo , & cpuset , info , ninfo , distances ,
497
+ ndist );
498
+ PMIx_Info_free (info , ninfo );
499
+
500
+ PMIx_Topology_free (pmix_topo , 1 );
501
+ out :
502
+ return ret ;
503
+ }
471
504
472
- /* Cannot find topology info if no topology is found */
473
- if (NULL == topology ) {
474
- return false;
505
+ /**
506
+ * Find the nearest devices to the current thread
507
+ *
508
+ * Use the PMIx server or calculate the device distances, then out of the set of
509
+ * returned distances find the subset of the nearest devices. This can be
510
+ * 1 or more.
511
+ *
512
+ * @param num_distances (OUT) number of entries in the returned array
513
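+ * @param valin (OUT) value obtained from PMIx_Get, or NULL if none was returned; the caller releases it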
514
+ * @return An array of device distances which are nearest this thread
515
+ * or NULL if we fail to get the distances. In this case we will just
516
+ * revert to round robin.
517
+ *
518
+ */
519
+ static pmix_device_distance_t *
520
+ get_nearest_nics (int * num_distances , pmix_value_t * * valin )
521
+ {
522
+ size_t ndist , i ;
523
+ int ret , idx = 0 ;
524
+ pmix_data_array_t * dptr ;
525
+ uint16_t near = USHRT_MAX ;
526
+ pmix_info_t directive ;
527
+ pmix_value_t * val = NULL ;
528
+ pmix_device_distance_t * distances , * nearest = NULL ;
529
+
530
+ PMIx_Info_load (& directive , PMIX_OPTIONAL , NULL , PMIX_BOOL );
531
+ ret = PMIx_Get (& opal_process_info .myprocid ,
532
+ PMIX_DEVICE_DISTANCES , & directive , 1 , & val );
533
+ PMIx_Info_destruct (& directive );
534
+ if (ret != PMIX_SUCCESS || !val ) {
535
+ ret = compute_dev_distances (& distances , & ndist );
536
+ if (ret )
537
+ goto out ;
538
+ goto find_nearest ;
539
+ }
540
+
541
+ if (PMIX_DATA_ARRAY != val -> type ) {
542
+ goto out ;
543
+ }
544
+ dptr = val -> data .darray ;
545
+ if (NULL == dptr ) {
546
+ goto out ;
547
+ }
548
+ if (PMIX_DEVICE_DIST != dptr -> type ) {
549
+ goto out ;
550
+ }
551
+
552
+ distances = (pmix_device_distance_t * )dptr -> array ;
553
+ ndist = dptr -> size ;
554
+
555
+ find_nearest :
556
+ nearest = calloc (sizeof (* distances ), ndist );
557
+ if (!nearest )
558
+ goto out ;
559
+
560
+ for (i = 0 ; i < ndist ; i ++ ) {
561
+ if (distances [i ].type != PMIX_DEVTYPE_NETWORK &&
562
+ distances [i ].type != PMIX_DEVTYPE_OPENFABRICS )
563
+ continue ;
564
+ if (distances [i ].mindist < near ) {
565
+ idx = 0 ;
566
+ near = distances [i ].mindist ;
567
+ nearest [idx ] = distances [i ];
568
+ idx ++ ;
569
+ } else if (distances [i ].mindist == near ) {
570
+ nearest [idx ] = distances [i ];
571
+ idx ++ ;
572
+ }
475
573
}
476
574
477
- /* Allocate memory for proc_cpuset */
478
- proc_cpuset = hwloc_bitmap_alloc ();
479
- if (NULL == proc_cpuset ) {
575
+ * num_distances = idx ;
576
+
577
+ out :
578
+ * valin = val ;
579
+ return nearest ;
580
+ }
581
+
582
+ #if OPAL_OFI_PCI_DATA_AVAILABLE
583
+ /**
584
+ * Determine if a device is nearest
585
+ *
586
+ * Given a device distances array of the nearest pci devices,
587
+ * determine if one of these device distances refers to the pci
588
+ * device passed in
589
+ *
590
+ * @param distances (IN) distances array
591
+ * @param num_distances (IN) number of entries in the distances array
592
+ * @param topology (IN) topology of the node
593
+ * @param pci (IN) PCI device being examined
594
+ *
595
+ * @return true if the PCI device is in the distances array or if the
596
+ * distances array is not provided. False otherwise.
597
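+ * NOTE(review): the sscanf() field widths of 256 equal the sizes of the lnguid/lsguid buffers, leaving no room for the terminating NUL, and only one of nguid/sguid is guaranteed non-NULL before both are handed to strcasecmp() -- confirm and tighten.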
598
+ */
599
+ static bool is_near (pmix_device_distance_t * distances ,
600
+ int num_distances ,
601
+ hwloc_topology_t topology ,
602
+ struct fi_pci_attr pci )
603
+ {
604
+ hwloc_obj_t pcidev , osdev ;
605
+
606
+ /* if we failed to find any distances, then we consider all interfaces
607
+ * to be of equal distances and let the caller decide how to handle
608
+ * them
609
+ */
610
+ if (!distances )
611
+ return true;
612
+
613
+ pcidev = hwloc_get_pcidev_by_busid (topology , pci .domain_id ,
614
+ pci .bus_id , pci .device_id ,
615
+ pci .function_id );
616
+ if (!pcidev )
480
617
return false;
481
- }
482
618
483
- /* Fill cpuset with the collection of cpu cores that the process runs on */
484
- ret = hwloc_get_cpubind (topology , proc_cpuset , HWLOC_CPUBIND_PROCESS );
485
- if (0 > ret ) {
486
- goto error ;
487
- }
619
+ for (osdev = pcidev -> io_first_child ; osdev != NULL ; osdev = osdev -> next_sibling ) {
620
+ int i ;
488
621
489
- /* Get the pci device from bdf */
490
- obj = hwloc_get_pcidev_by_busid ( topology , pci . domain_id , pci . bus_id , pci . device_id ,
491
- pci . function_id );
492
- if ( NULL == obj ) {
493
- goto error ;
494
- }
622
+ if ( osdev -> attr -> osdev . type == HWLOC_OBJ_OSDEV_OPENFABRICS ) {
623
+ const char * nguid = hwloc_obj_get_info_by_name ( osdev , "NodeGUID" );
624
+ const char * sguid = hwloc_obj_get_info_by_name ( osdev , "SysImageGUID" );
625
+
626
+ if (! nguid && ! sguid )
627
+ continue ;
495
628
496
- /* pcidev objects don't have cpusets so find the first non-io object above */
497
- obj = hwloc_get_non_io_ancestor_obj (topology , obj );
498
- if (NULL != obj ) {
499
- result = hwloc_bitmap_intersects (proc_cpuset , obj -> cpuset );
629
+ for (i = 0 ; i < num_distances ; i ++ ) {
630
+ char lsguid [256 ], lnguid [256 ];
631
+ int ret ;
632
+
633
+ ret = sscanf (distances [i ].uuid , "fab://%256s::%256s" , lnguid , lsguid );
634
+ if (ret != 2 )
635
+ continue ;
636
+ if (0 == strcasecmp (lnguid , nguid )) {
637
+ return true;
638
+ } else if (0 == strcasecmp (lsguid , sguid )) {
639
+ return true;
640
+ }
641
+ }
642
+ } else if (osdev -> attr -> osdev .type == HWLOC_OBJ_OSDEV_NETWORK ) {
643
+ const char * address = hwloc_obj_get_info_by_name (osdev , "Address" );
644
+ if (!address )
645
+ continue ;
646
+ for (i = 0 ; i < num_distances ; i ++ ) {
647
+ char * addr = strstr (distances [i ].uuid , "://" );
648
+ if (!addr || addr + 3 > distances [i ].uuid
649
+ + strlen (distances [i ].uuid ))
650
+ continue ;
651
+ if (!strcmp (addr + 3 , address )) {
652
+ return true;
653
+ }
654
+ }
655
+ }
500
656
}
501
657
502
- error :
503
- hwloc_bitmap_free (proc_cpuset );
504
- return result ;
658
+ return false;
505
659
}
506
660
#endif
507
661
@@ -614,11 +768,15 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
614
768
struct fi_info * provider = provider_list , * current_provider = provider_list ;
615
769
struct fi_info * * provider_table ;
616
770
#if OPAL_OFI_PCI_DATA_AVAILABLE
771
+ pmix_device_distance_t * distances = NULL ;
772
+ pmix_value_t * pmix_val ;
617
773
struct fi_pci_attr pci ;
774
+ int num_distances = 0 ;
775
+ bool near ;
618
776
#endif
619
777
int ret ;
620
778
unsigned int num_provider = 0 , provider_limit = 0 ;
621
- bool provider_found = false, cpusets_match = false ;
779
+ bool provider_found = false;
622
780
623
781
/* Initialize opal_hwloc_topology if it is not already */
624
782
ret = opal_hwloc_base_get_topology ();
@@ -639,33 +797,38 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
639
797
return provider_list ;
640
798
}
641
799
800
+ #if OPAL_OFI_PCI_DATA_AVAILABLE
801
+ /* find all the nearest devices to this thread, then out of these
802
+ * determine which device we should bind to.
803
+ */
804
+ distances = get_nearest_nics (& num_distances , & pmix_val );
805
+ #endif
806
+
642
807
current_provider = provider ;
643
808
644
809
/* Cycle through remaining fi_info objects, looking for alike providers */
645
810
while (NULL != current_provider ) {
646
811
if (!check_provider_attr (provider , current_provider )) {
647
- cpusets_match = false;
812
+ near = false;
648
813
#if OPAL_OFI_PCI_DATA_AVAILABLE
649
814
if (NULL != current_provider -> nic
650
815
&& NULL != current_provider -> nic -> bus_attr
651
816
&& current_provider -> nic -> bus_attr -> bus_type == FI_BUS_PCI ) {
652
817
pci = current_provider -> nic -> bus_attr -> attr .pci ;
653
- cpusets_match = compare_cpusets (opal_hwloc_topology , pci );
818
+ near = is_near (distances , num_distances ,
819
+ opal_hwloc_topology , pci );
654
820
}
655
821
#endif
656
-
657
- /* Reset the list if the cpusets match and no other provider was
658
- * found on the same cpuset as the process.
659
- */
660
- if (cpusets_match && !provider_found ) {
822
+ /* We could have multiple near providers */
823
+ if (near && !provider_found ) {
661
824
provider_found = true;
662
825
num_provider = 0 ;
663
826
}
664
827
665
828
/* Add the provider to the provider list if it is near or if
666
829
* no near provider has been found yet.
667
830
*/
668
- if (cpusets_match || !provider_found ) {
831
+ if (near || !provider_found ) {
669
832
provider_table [num_provider ] = current_provider ;
670
833
num_provider ++ ;
671
834
}
@@ -687,17 +850,22 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
687
850
&& NULL != provider -> nic -> bus_attr
688
851
&& provider -> nic -> bus_attr -> bus_type == FI_BUS_PCI ) {
689
852
pci = provider -> nic -> bus_attr -> attr .pci ;
690
- cpusets_match = compare_cpusets (opal_hwloc_topology , pci );
853
+ near = is_near (distances , num_distances ,
854
+ opal_hwloc_topology , pci );
691
855
}
692
856
#endif
693
857
694
858
#if OPAL_ENABLE_DEBUG
695
859
opal_output_verbose (1 , opal_common_ofi .output ,
696
- "package rank: %d device: %s cpusets match : %s\n" , package_rank ,
697
- provider -> domain_attr -> name , cpusets_match ? "true" : "false" );
860
+ "package rank: %d device: %s near : %s\n" , package_rank ,
861
+ provider -> domain_attr -> name , near ? "true" : "false" );
698
862
#endif
699
863
700
864
free (provider_table );
865
+ #if OPAL_OFI_PCI_DATA_AVAILABLE
866
+ if (pmix_val )
867
+ PMIx_Value_free (pmix_val , 1 );
868
+ #endif
701
869
return provider ;
702
870
}
703
871
0 commit comments