Skip to content

Commit d4e1ae5

Browse files
ofi: NIC selection update
The existing code in compare_cpusets() assumed that some non-I/O ancestor of a PCI object would intersect with the cpuset of the process. However, this is not always true. The non-I/O ancestor can be an L3 cache: if there are two L3s on the same NUMA node and the process is bound to one L3 while the PCI object is connected to the other L3, then compare_cpusets() returns false. A better way to determine the optimal interface is to find the distance of each interface from the current process, then select the interface that is nearest to the process. Use the PMIx distance generation for this purpose. Move away from using deprecated PMIx macros and use the functions directly instead. Signed-off-by: Amir Shehata <shehataa@ornl.gov>
1 parent df7cf53 commit d4e1ae5

File tree

1 file changed

+224
-56
lines changed

1 file changed

+224
-56
lines changed

opal/mca/common/ofi/common_ofi.c

Lines changed: 224 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,11 @@
44
* reserved.
55
* Copyright (c) 2020-2022 Triad National Security, LLC. All rights
66
* reserved.
7-
* Copyright (c) 2020-2021 Cisco Systems, Inc. All rights reserved
7+
* Copyright (c) 2020-2021 Cisco Systems, Inc. All rights reserved.
88
* Copyright (c) 2021 Nanook Consulting. All rights reserved.
99
* Copyright (c) 2021 Amazon.com, Inc. or its affiliates. All rights
1010
* reserved.
11+
* Copyright (c) 2023 UT-Battelle, LLC. All rights reserved.
1112
* $COPYRIGHT$
1213
*
1314
* Additional copyrights may follow
@@ -445,63 +446,216 @@ static int check_provider_attr(struct fi_info *provider_info, struct fi_info *pr
445446
}
446447
}
447448

448-
#if OPAL_OFI_PCI_DATA_AVAILABLE
449-
/* Check if a process and a pci device share the same cpuset
450-
* @param (IN) pci struct fi_pci_attr pci device attributes,
451-
* used to find hwloc object for device.
449+
/**
450+
* Calculate device distances
451+
*
452+
* Calculate the distances between the current thread and all devices of
453+
* type OPENFABRICS or NETWORK.
454+
*
455+
* The shortest distances are the nearest and therefore most efficient
456+
* devices to use.
452457
*
453-
* @param (IN) topology hwloc_topology_t topology to get the cpusets
454-
* from
458+
* Return an array of all the distances. Each entry is of type
459+
* pmix_device_distance_t
455460
*
456-
* @param (OUT) returns true if cpusets match and false if
457-
* cpusets do not match or an error prevents comparison
461+
* This function is used if there is no PMIx server running.
462+
*
463+
* @param distances (OUT) distances array
464+
* @param ndist (OUT) number of entries in the distances array
465+
*
466+
* @return 0 on success. Error otherwise.
458467
*
459-
* Uses a pci device to find an ancestor that contains a cpuset, and
460-
* determines if it intersects with the cpuset that the process is bound to.
461-
* if the process is not bound, or if a cpuset is unavailable for whatever
462-
* reason, returns false. Otherwise, returns the result of
463-
* hwloc_cpuset_intersects()
464468
*/
465-
static bool compare_cpusets(hwloc_topology_t topology, struct fi_pci_attr pci)
469+
static int compute_dev_distances(pmix_device_distance_t **distances,
470+
size_t *ndist)
466471
{
467-
bool result = false;
468-
int ret;
469-
hwloc_bitmap_t proc_cpuset;
470-
hwloc_obj_t obj = NULL;
472+
int ret = 0;
473+
size_t ninfo;
474+
pmix_info_t *info;
475+
pmix_cpuset_t cpuset;
476+
pmix_topology_t *pmix_topo;
477+
pmix_device_type_t type = PMIX_DEVTYPE_OPENFABRICS |
478+
PMIX_DEVTYPE_NETWORK;
479+
480+
PMIX_CPUSET_CONSTRUCT(&cpuset);
481+
ret = PMIx_Get_cpuset(&cpuset, PMIX_CPUBIND_THREAD);
482+
if (PMIX_SUCCESS != ret) {
483+
goto out;
484+
}
485+
486+
/* load the PMIX topology */
487+
PMIx_Topology_free(pmix_topo, 1);
488+
ret = PMIx_Load_topology(pmix_topo);
489+
if (PMIX_SUCCESS != ret) {
490+
goto out;
491+
}
492+
493+
ninfo = 1;
494+
info = PMIx_Info_create(ninfo);
495+
PMIx_Info_load(&info[0], PMIX_DEVICE_TYPE, &type, PMIX_DEVTYPE);
496+
ret = PMIx_Compute_distances(pmix_topo, &cpuset, info, ninfo, distances,
497+
ndist);
498+
PMIx_Info_free(info, ninfo);
499+
500+
PMIx_Topology_free(pmix_topo, 1);
501+
out:
502+
return ret;
503+
}
471504

472-
/* Cannot find topology info if no topology is found */
473-
if (NULL == topology) {
474-
return false;
505+
/**
506+
* Find the nearest devices to the current thread
507+
*
508+
* Use the PMIx server or calculate the device distances, then out of the set of
509+
* returned distances find the subset of the nearest devices. This can be
510+
* 1 or more.
511+
*
512+
* @param num_distances (OUT) number of entries in the returned array
513+
*
514+
* @return An array of device distances which are nearest this thread
515+
* or NULL if we fail to get the distances. In this case we will just
516+
* revert to round robin.
517+
*
518+
*/
519+
static pmix_device_distance_t *
520+
get_nearest_nics(int *num_distances, pmix_value_t **valin)
521+
{
522+
size_t ndist, i;
523+
int ret, idx = 0;
524+
pmix_data_array_t *dptr;
525+
uint16_t near = USHRT_MAX;
526+
pmix_info_t directive;
527+
pmix_value_t *val = NULL;
528+
pmix_device_distance_t *distances, *nearest = NULL;
529+
530+
PMIx_Info_load(&directive, PMIX_OPTIONAL, NULL, PMIX_BOOL);
531+
ret = PMIx_Get(&opal_process_info.myprocid,
532+
PMIX_DEVICE_DISTANCES, &directive, 1, &val);
533+
PMIx_Info_destruct(&directive);
534+
if (ret != PMIX_SUCCESS || !val) {
535+
ret = compute_dev_distances(&distances, &ndist);
536+
if (ret)
537+
goto out;
538+
goto find_nearest;
539+
}
540+
541+
if (PMIX_DATA_ARRAY != val->type) {
542+
goto out;
543+
}
544+
dptr = val->data.darray;
545+
if (NULL == dptr) {
546+
goto out;
547+
}
548+
if (PMIX_DEVICE_DIST != dptr->type) {
549+
goto out;
550+
}
551+
552+
distances = (pmix_device_distance_t*)dptr->array;
553+
ndist = dptr->size;
554+
555+
find_nearest:
556+
nearest = calloc(sizeof(*distances), ndist);
557+
if (!nearest)
558+
goto out;
559+
560+
for (i = 0; i < ndist; i++) {
561+
if (distances[i].type != PMIX_DEVTYPE_NETWORK &&
562+
distances[i].type != PMIX_DEVTYPE_OPENFABRICS)
563+
continue;
564+
if (distances[i].mindist < near) {
565+
idx = 0;
566+
near = distances[i].mindist;
567+
nearest[idx] = distances[i];
568+
idx++;
569+
} else if (distances[i].mindist == near) {
570+
nearest[idx] = distances[i];
571+
idx++;
572+
}
475573
}
476574

477-
/* Allocate memory for proc_cpuset */
478-
proc_cpuset = hwloc_bitmap_alloc();
479-
if (NULL == proc_cpuset) {
575+
*num_distances = idx;
576+
577+
out:
578+
*valin = val;
579+
return nearest;
580+
}
581+
582+
#if OPAL_OFI_PCI_DATA_AVAILABLE
583+
/**
584+
* Determine if a device is nearest
585+
*
586+
* Given a device distances array of the nearest pci devices,
587+
* determine if one of these device distances refers to the pci
588+
* device passed in
589+
*
590+
* @param distances (IN) distances array
591+
* @param num_distances (IN) number of entries in the distances array
592+
* @param topology (IN) topology of the node
593+
* @param pci (IN) PCI device being examined
594+
*
595+
* @return true if the PCI device is in the distances array or if the
596+
* distances array is not provided. False otherwise.
597+
*
598+
*/
599+
static bool is_near(pmix_device_distance_t *distances,
600+
int num_distances,
601+
hwloc_topology_t topology,
602+
struct fi_pci_attr pci)
603+
{
604+
hwloc_obj_t pcidev, osdev;
605+
606+
/* if we failed to find any distances, then we consider all interfaces
607+
* to be of equal distances and let the caller decide how to handle
608+
* them
609+
*/
610+
if (!distances)
611+
return true;
612+
613+
pcidev = hwloc_get_pcidev_by_busid(topology, pci.domain_id,
614+
pci.bus_id, pci.device_id,
615+
pci.function_id);
616+
if (!pcidev)
480617
return false;
481-
}
482618

483-
/* Fill cpuset with the collection of cpu cores that the process runs on */
484-
ret = hwloc_get_cpubind(topology, proc_cpuset, HWLOC_CPUBIND_PROCESS);
485-
if (0 > ret) {
486-
goto error;
487-
}
619+
for(osdev = pcidev->io_first_child; osdev != NULL; osdev = osdev->next_sibling) {
620+
int i;
488621

489-
/* Get the pci device from bdf */
490-
obj = hwloc_get_pcidev_by_busid(topology, pci.domain_id, pci.bus_id, pci.device_id,
491-
pci.function_id);
492-
if (NULL == obj) {
493-
goto error;
494-
}
622+
if (osdev->attr->osdev.type == HWLOC_OBJ_OSDEV_OPENFABRICS) {
623+
const char *nguid = hwloc_obj_get_info_by_name(osdev,"NodeGUID");
624+
const char *sguid = hwloc_obj_get_info_by_name(osdev, "SysImageGUID");
625+
626+
if (!nguid && !sguid)
627+
continue;
495628

496-
/* pcidev objects don't have cpusets so find the first non-io object above */
497-
obj = hwloc_get_non_io_ancestor_obj(topology, obj);
498-
if (NULL != obj) {
499-
result = hwloc_bitmap_intersects(proc_cpuset, obj->cpuset);
629+
for (i = 0; i < num_distances; i++) {
630+
char lsguid[256], lnguid[256];
631+
int ret;
632+
633+
ret = sscanf(distances[i].uuid, "fab://%256s::%256s", lnguid, lsguid);
634+
if (ret != 2)
635+
continue;
636+
if (0 == strcasecmp(lnguid, nguid)) {
637+
return true;
638+
} else if (0 == strcasecmp(lsguid, sguid)) {
639+
return true;
640+
}
641+
}
642+
} else if (osdev->attr->osdev.type == HWLOC_OBJ_OSDEV_NETWORK) {
643+
const char *address = hwloc_obj_get_info_by_name(osdev, "Address");
644+
if (!address)
645+
continue;
646+
for (i = 0; i < num_distances; i++) {
647+
char *addr = strstr(distances[i].uuid, "://");
648+
if (!addr || addr + 3 > distances[i].uuid
649+
+ strlen(distances[i].uuid))
650+
continue;
651+
if (!strcmp(addr+3, address)) {
652+
return true;
653+
}
654+
}
655+
}
500656
}
501657

502-
error:
503-
hwloc_bitmap_free(proc_cpuset);
504-
return result;
658+
return false;
505659
}
506660
#endif
507661

@@ -614,11 +768,15 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
614768
struct fi_info *provider = provider_list, *current_provider = provider_list;
615769
struct fi_info **provider_table;
616770
#if OPAL_OFI_PCI_DATA_AVAILABLE
771+
pmix_device_distance_t *distances = NULL;
772+
pmix_value_t *pmix_val;
617773
struct fi_pci_attr pci;
774+
int num_distances = 0;
775+
bool near;
618776
#endif
619777
int ret;
620778
unsigned int num_provider = 0, provider_limit = 0;
621-
bool provider_found = false, cpusets_match = false;
779+
bool provider_found = false;
622780

623781
/* Initialize opal_hwloc_topology if it is not already */
624782
ret = opal_hwloc_base_get_topology();
@@ -639,33 +797,38 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
639797
return provider_list;
640798
}
641799

800+
#if OPAL_OFI_PCI_DATA_AVAILABLE
801+
/* find all the nearest devices to this thread, then out of these
802+
* determine which device we should bind to.
803+
*/
804+
distances = get_nearest_nics(&num_distances, &pmix_val);
805+
#endif
806+
642807
current_provider = provider;
643808

644809
/* Cycle through remaining fi_info objects, looking for alike providers */
645810
while (NULL != current_provider) {
646811
if (!check_provider_attr(provider, current_provider)) {
647-
cpusets_match = false;
812+
near = false;
648813
#if OPAL_OFI_PCI_DATA_AVAILABLE
649814
if (NULL != current_provider->nic
650815
&& NULL != current_provider->nic->bus_attr
651816
&& current_provider->nic->bus_attr->bus_type == FI_BUS_PCI) {
652817
pci = current_provider->nic->bus_attr->attr.pci;
653-
cpusets_match = compare_cpusets(opal_hwloc_topology, pci);
818+
near = is_near(distances, num_distances,
819+
opal_hwloc_topology, pci);
654820
}
655821
#endif
656-
657-
/* Reset the list if the cpusets match and no other provider was
658-
* found on the same cpuset as the process.
659-
*/
660-
if (cpusets_match && !provider_found) {
822+
/* We could have multiple near providers */
823+
if (near && !provider_found) {
661824
provider_found = true;
662825
num_provider = 0;
663826
}
664827

665828
/* Add the provider to the provider list if the cpusets match or if
666829
* no other provider was found on the same cpuset as the process.
667830
*/
668-
if (cpusets_match || !provider_found) {
831+
if (near || !provider_found) {
669832
provider_table[num_provider] = current_provider;
670833
num_provider++;
671834
}
@@ -687,17 +850,22 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
687850
&& NULL != provider->nic->bus_attr
688851
&& provider->nic->bus_attr->bus_type == FI_BUS_PCI) {
689852
pci = provider->nic->bus_attr->attr.pci;
690-
cpusets_match = compare_cpusets(opal_hwloc_topology, pci);
853+
near = is_near(distances, num_distances,
854+
opal_hwloc_topology, pci);
691855
}
692856
#endif
693857

694858
#if OPAL_ENABLE_DEBUG
695859
opal_output_verbose(1, opal_common_ofi.output,
696-
"package rank: %d device: %s cpusets match: %s\n", package_rank,
697-
provider->domain_attr->name, cpusets_match ? "true" : "false");
860+
"package rank: %d device: %s near: %s\n", package_rank,
861+
provider->domain_attr->name, near ? "true" : "false");
698862
#endif
699863

700864
free(provider_table);
865+
#if OPAL_OFI_PCI_DATA_AVAILABLE
866+
if (pmix_val)
867+
PMIx_Value_free(pmix_val, 1);
868+
#endif
701869
return provider;
702870
}
703871

0 commit comments

Comments
 (0)