Skip to content

Commit 42e577f

Browse files
authored
Merge pull request #11565 from amirshehataornl/distances
ofi: NIC selection update
2 parents 7ab9e3b + d4e1ae5 commit 42e577f

File tree

1 file changed

+224
-56
lines changed

1 file changed

+224
-56
lines changed

opal/mca/common/ofi/common_ofi.c

Lines changed: 224 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,11 @@
44
* reserved.
55
* Copyright (c) 2020-2022 Triad National Security, LLC. All rights
66
* reserved.
7-
* Copyright (c) 2020-2021 Cisco Systems, Inc. All rights reserved
7+
* Copyright (c) 2020-2021 Cisco Systems, Inc. All rights reserved.
88
* Copyright (c) 2021 Nanook Consulting. All rights reserved.
99
* Copyright (c) 2021 Amazon.com, Inc. or its affiliates. All rights
1010
* reserved.
11+
* Copyright (c) 2023 UT-Battelle, LLC. All rights reserved.
1112
* $COPYRIGHT$
1213
*
1314
* Additional copyrights may follow
@@ -445,63 +446,216 @@ static int check_provider_attr(struct fi_info *provider_info, struct fi_info *pr
445446
}
446447
}
447448

448-
#if OPAL_OFI_PCI_DATA_AVAILABLE
449-
/* Check if a process and a pci device share the same cpuset
450-
* @param (IN) pci struct fi_pci_attr pci device attributes,
451-
* used to find hwloc object for device.
449+
/**
450+
* Calculate device distances
451+
*
452+
* Calculate the distances between the current thread and all devices of
453+
* type OPENFABRICS or NETWORK.
454+
*
455+
* The shortest distances are the nearest and therefore most efficient
456+
* devices to use.
452457
*
453-
* @param (IN) topology hwloc_topology_t topology to get the cpusets
454-
* from
458+
* Return an array of all the distances. Each entry is of type
459+
* pmix_device_distance_t
455460
*
456-
* @param (OUT) returns true if cpusets match and false if
457-
* cpusets do not match or an error prevents comparison
461+
* This function is used if there is no PMIx server running.
462+
*
463+
* @param distances (OUT) distances array
464+
* @param ndist (OUT) number of entries in the distances array
465+
*
466+
* @return 0 on success. Error otherwise.
458467
*
459-
* Uses a pci device to find an ancestor that contains a cpuset, and
460-
* determines if it intersects with the cpuset that the process is bound to.
461-
* if the process is not bound, or if a cpuset is unavailable for whatever
462-
* reason, returns false. Otherwise, returns the result of
463-
* hwloc_cpuset_intersects()
464468
*/
465-
static bool compare_cpusets(hwloc_topology_t topology, struct fi_pci_attr pci)
469+
static int compute_dev_distances(pmix_device_distance_t **distances,
470+
size_t *ndist)
466471
{
467-
bool result = false;
468-
int ret;
469-
hwloc_bitmap_t proc_cpuset;
470-
hwloc_obj_t obj = NULL;
472+
int ret = 0;
473+
size_t ninfo;
474+
pmix_info_t *info;
475+
pmix_cpuset_t cpuset;
476+
pmix_topology_t *pmix_topo;
477+
pmix_device_type_t type = PMIX_DEVTYPE_OPENFABRICS |
478+
PMIX_DEVTYPE_NETWORK;
479+
480+
PMIX_CPUSET_CONSTRUCT(&cpuset);
481+
ret = PMIx_Get_cpuset(&cpuset, PMIX_CPUBIND_THREAD);
482+
if (PMIX_SUCCESS != ret) {
483+
goto out;
484+
}
485+
486+
/* load the PMIX topology */
487+
PMIx_Topology_free(pmix_topo, 1);
488+
ret = PMIx_Load_topology(pmix_topo);
489+
if (PMIX_SUCCESS != ret) {
490+
goto out;
491+
}
492+
493+
ninfo = 1;
494+
info = PMIx_Info_create(ninfo);
495+
PMIx_Info_load(&info[0], PMIX_DEVICE_TYPE, &type, PMIX_DEVTYPE);
496+
ret = PMIx_Compute_distances(pmix_topo, &cpuset, info, ninfo, distances,
497+
ndist);
498+
PMIx_Info_free(info, ninfo);
499+
500+
PMIx_Topology_free(pmix_topo, 1);
501+
out:
502+
return ret;
503+
}
471504

472-
/* Cannot find topology info if no topology is found */
473-
if (NULL == topology) {
474-
return false;
505+
/**
506+
* Find the nearest devices to the current thread
507+
*
508+
* Use the PMIx server or calculate the device distances, then out of the set of
509+
* returned distances find the subset of the nearest devices. This can be
510+
* 1 or more.
511+
*
512+
* @param num_distances (OUT) number of entries in the returned array
513+
*
514+
* @return An array of device distances which are nearest this thread
515+
* or NULL if we fail to get the distances. In this case we will just
516+
* revert to round robin.
517+
*
518+
*/
519+
static pmix_device_distance_t *
520+
get_nearest_nics(int *num_distances, pmix_value_t **valin)
521+
{
522+
size_t ndist, i;
523+
int ret, idx = 0;
524+
pmix_data_array_t *dptr;
525+
uint16_t near = USHRT_MAX;
526+
pmix_info_t directive;
527+
pmix_value_t *val = NULL;
528+
pmix_device_distance_t *distances, *nearest = NULL;
529+
530+
PMIx_Info_load(&directive, PMIX_OPTIONAL, NULL, PMIX_BOOL);
531+
ret = PMIx_Get(&opal_process_info.myprocid,
532+
PMIX_DEVICE_DISTANCES, &directive, 1, &val);
533+
PMIx_Info_destruct(&directive);
534+
if (ret != PMIX_SUCCESS || !val) {
535+
ret = compute_dev_distances(&distances, &ndist);
536+
if (ret)
537+
goto out;
538+
goto find_nearest;
539+
}
540+
541+
if (PMIX_DATA_ARRAY != val->type) {
542+
goto out;
543+
}
544+
dptr = val->data.darray;
545+
if (NULL == dptr) {
546+
goto out;
547+
}
548+
if (PMIX_DEVICE_DIST != dptr->type) {
549+
goto out;
550+
}
551+
552+
distances = (pmix_device_distance_t*)dptr->array;
553+
ndist = dptr->size;
554+
555+
find_nearest:
556+
nearest = calloc(sizeof(*distances), ndist);
557+
if (!nearest)
558+
goto out;
559+
560+
for (i = 0; i < ndist; i++) {
561+
if (distances[i].type != PMIX_DEVTYPE_NETWORK &&
562+
distances[i].type != PMIX_DEVTYPE_OPENFABRICS)
563+
continue;
564+
if (distances[i].mindist < near) {
565+
idx = 0;
566+
near = distances[i].mindist;
567+
nearest[idx] = distances[i];
568+
idx++;
569+
} else if (distances[i].mindist == near) {
570+
nearest[idx] = distances[i];
571+
idx++;
572+
}
475573
}
476574

477-
/* Allocate memory for proc_cpuset */
478-
proc_cpuset = hwloc_bitmap_alloc();
479-
if (NULL == proc_cpuset) {
575+
*num_distances = idx;
576+
577+
out:
578+
*valin = val;
579+
return nearest;
580+
}
581+
582+
#if OPAL_OFI_PCI_DATA_AVAILABLE
583+
/**
584+
* Determine if a device is nearest
585+
*
586+
* Given a device distances array of the nearest pci devices,
587+
* determine if one of these device distances refers to the pci
588+
* device passed in
589+
*
590+
* @param distances (IN) distances array
591+
* @param num_distances (IN) number of entries in the distances array
592+
* @param topology (IN) topology of the node
593+
* @param pci (IN) PCI device being examined
594+
*
595+
* @return true if the PCI device is in the distances array or if the
596+
* distances array is not provided. False otherwise.
597+
*
598+
*/
599+
static bool is_near(pmix_device_distance_t *distances,
600+
int num_distances,
601+
hwloc_topology_t topology,
602+
struct fi_pci_attr pci)
603+
{
604+
hwloc_obj_t pcidev, osdev;
605+
606+
/* if we failed to find any distances, then we consider all interfaces
607+
* to be of equal distances and let the caller decide how to handle
608+
* them
609+
*/
610+
if (!distances)
611+
return true;
612+
613+
pcidev = hwloc_get_pcidev_by_busid(topology, pci.domain_id,
614+
pci.bus_id, pci.device_id,
615+
pci.function_id);
616+
if (!pcidev)
480617
return false;
481-
}
482618

483-
/* Fill cpuset with the collection of cpu cores that the process runs on */
484-
ret = hwloc_get_cpubind(topology, proc_cpuset, HWLOC_CPUBIND_PROCESS);
485-
if (0 > ret) {
486-
goto error;
487-
}
619+
for(osdev = pcidev->io_first_child; osdev != NULL; osdev = osdev->next_sibling) {
620+
int i;
488621

489-
/* Get the pci device from bdf */
490-
obj = hwloc_get_pcidev_by_busid(topology, pci.domain_id, pci.bus_id, pci.device_id,
491-
pci.function_id);
492-
if (NULL == obj) {
493-
goto error;
494-
}
622+
if (osdev->attr->osdev.type == HWLOC_OBJ_OSDEV_OPENFABRICS) {
623+
const char *nguid = hwloc_obj_get_info_by_name(osdev,"NodeGUID");
624+
const char *sguid = hwloc_obj_get_info_by_name(osdev, "SysImageGUID");
625+
626+
if (!nguid && !sguid)
627+
continue;
495628

496-
/* pcidev objects don't have cpusets so find the first non-io object above */
497-
obj = hwloc_get_non_io_ancestor_obj(topology, obj);
498-
if (NULL != obj) {
499-
result = hwloc_bitmap_intersects(proc_cpuset, obj->cpuset);
629+
for (i = 0; i < num_distances; i++) {
630+
char lsguid[256], lnguid[256];
631+
int ret;
632+
633+
ret = sscanf(distances[i].uuid, "fab://%256s::%256s", lnguid, lsguid);
634+
if (ret != 2)
635+
continue;
636+
if (0 == strcasecmp(lnguid, nguid)) {
637+
return true;
638+
} else if (0 == strcasecmp(lsguid, sguid)) {
639+
return true;
640+
}
641+
}
642+
} else if (osdev->attr->osdev.type == HWLOC_OBJ_OSDEV_NETWORK) {
643+
const char *address = hwloc_obj_get_info_by_name(osdev, "Address");
644+
if (!address)
645+
continue;
646+
for (i = 0; i < num_distances; i++) {
647+
char *addr = strstr(distances[i].uuid, "://");
648+
if (!addr || addr + 3 > distances[i].uuid
649+
+ strlen(distances[i].uuid))
650+
continue;
651+
if (!strcmp(addr+3, address)) {
652+
return true;
653+
}
654+
}
655+
}
500656
}
501657

502-
error:
503-
hwloc_bitmap_free(proc_cpuset);
504-
return result;
658+
return false;
505659
}
506660
#endif
507661

@@ -614,11 +768,15 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
614768
struct fi_info *provider = provider_list, *current_provider = provider_list;
615769
struct fi_info **provider_table;
616770
#if OPAL_OFI_PCI_DATA_AVAILABLE
771+
pmix_device_distance_t *distances = NULL;
772+
pmix_value_t *pmix_val;
617773
struct fi_pci_attr pci;
774+
int num_distances = 0;
775+
bool near;
618776
#endif
619777
int ret;
620778
unsigned int num_provider = 0, provider_limit = 0;
621-
bool provider_found = false, cpusets_match = false;
779+
bool provider_found = false;
622780

623781
/* Initialize opal_hwloc_topology if it is not already */
624782
ret = opal_hwloc_base_get_topology();
@@ -639,33 +797,38 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
639797
return provider_list;
640798
}
641799

800+
#if OPAL_OFI_PCI_DATA_AVAILABLE
801+
/* find all the nearest devices to this thread, then out of these
802+
* determine which device we should bind to.
803+
*/
804+
distances = get_nearest_nics(&num_distances, &pmix_val);
805+
#endif
806+
642807
current_provider = provider;
643808

644809
/* Cycle through remaining fi_info objects, looking for alike providers */
645810
while (NULL != current_provider) {
646811
if (!check_provider_attr(provider, current_provider)) {
647-
cpusets_match = false;
812+
near = false;
648813
#if OPAL_OFI_PCI_DATA_AVAILABLE
649814
if (NULL != current_provider->nic
650815
&& NULL != current_provider->nic->bus_attr
651816
&& current_provider->nic->bus_attr->bus_type == FI_BUS_PCI) {
652817
pci = current_provider->nic->bus_attr->attr.pci;
653-
cpusets_match = compare_cpusets(opal_hwloc_topology, pci);
818+
near = is_near(distances, num_distances,
819+
opal_hwloc_topology, pci);
654820
}
655821
#endif
656-
657-
/* Reset the list if the cpusets match and no other provider was
658-
* found on the same cpuset as the process.
659-
*/
660-
if (cpusets_match && !provider_found) {
822+
/* We could have multiple near providers */
823+
if (near && !provider_found) {
661824
provider_found = true;
662825
num_provider = 0;
663826
}
664827

665828
/* Add the provider to the provider list if the cpusets match or if
666829
* no other provider was found on the same cpuset as the process.
667830
*/
668-
if (cpusets_match || !provider_found) {
831+
if (near || !provider_found) {
669832
provider_table[num_provider] = current_provider;
670833
num_provider++;
671834
}
@@ -687,17 +850,22 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
687850
&& NULL != provider->nic->bus_attr
688851
&& provider->nic->bus_attr->bus_type == FI_BUS_PCI) {
689852
pci = provider->nic->bus_attr->attr.pci;
690-
cpusets_match = compare_cpusets(opal_hwloc_topology, pci);
853+
near = is_near(distances, num_distances,
854+
opal_hwloc_topology, pci);
691855
}
692856
#endif
693857

694858
#if OPAL_ENABLE_DEBUG
695859
opal_output_verbose(1, opal_common_ofi.output,
696-
"package rank: %d device: %s cpusets match: %s\n", package_rank,
697-
provider->domain_attr->name, cpusets_match ? "true" : "false");
860+
"package rank: %d device: %s near: %s\n", package_rank,
861+
provider->domain_attr->name, near ? "true" : "false");
698862
#endif
699863

700864
free(provider_table);
865+
#if OPAL_OFI_PCI_DATA_AVAILABLE
866+
if (pmix_val)
867+
PMIx_Value_free(pmix_val, 1);
868+
#endif
701869
return provider;
702870
}
703871

0 commit comments

Comments
 (0)