Skip to content

Commit ac2cfc1

Browse files
authored
Merge pull request #11338 from amirshehataornl/distances
mtl/ofi: NIC selection update
2 parents 619e5c6 + d919232 commit ac2cfc1

File tree

1 file changed

+218
-54
lines changed

1 file changed

+218
-54
lines changed

opal/mca/common/ofi/common_ofi.c

Lines changed: 218 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -445,63 +445,214 @@ static int check_provider_attr(struct fi_info *provider_info, struct fi_info *pr
445445
}
446446
}
447447

448-
#if OPAL_OFI_PCI_DATA_AVAILABLE
449-
/* Check if a process and a pci device share the same cpuset
450-
* @param (IN) pci struct fi_pci_attr pci device attributes,
451-
* used to find hwloc object for device.
448+
/**
449+
* Calculate device distances
450+
*
451+
* Calculate the distances between the current thread and all devices of
452+
* type OPENFABRICS or NETWORK.
453+
*
454+
* The shortest distances are the nearest and therefore most efficient
455+
* devices to use.
452456
*
453-
* @param (IN) topology hwloc_topology_t topology to get the cpusets
454-
* from
457+
* Return an array of all the distances. Each entry is of type
458+
* pmix_device_distance_t
455459
*
456-
* @param (OUT) returns true if cpusets match and false if
457-
* cpusets do not match or an error prevents comparison
460+
* This function is used if there is no PMIx server running.
461+
*
462+
* @param distances (OUT) distances array
463+
* @param ndist (OUT) number of entries in the distances array
464+
*
465+
* @return 0 on success. Error otherwise.
458466
*
459-
* Uses a pci device to find an ancestor that contains a cpuset, and
460-
* determines if it intersects with the cpuset that the process is bound to.
461-
* if the process is not bound, or if a cpuset is unavailable for whatever
462-
* reason, returns false. Otherwise, returns the result of
463-
* hwloc_cpuset_intersects()
464467
*/
465-
static bool compare_cpusets(hwloc_topology_t topology, struct fi_pci_attr pci)
468+
static int compute_dev_distances(pmix_device_distance_t **distances,
                                 size_t *ndist)
{
    /*
     * Contract notes:
     *  - Returns PMIX_SUCCESS (0) when PMIx_Compute_distances succeeded,
     *    otherwise the PMIx status of the step that failed.
     *  - On failure *distances is NULL and *ndist is 0.
     *  - Every temporary created here (cpuset, topology, info array) is
     *    released on all paths; only the distances array outlives the call.
     *    NOTE(review): per the PMIx documentation that array is expected to
     *    be released by the caller with PMIX_DEVICE_DIST_FREE.
     */
    int ret;
    size_t ninfo = 1;
    pmix_info_t *info = NULL;
    pmix_cpuset_t cpuset;
    pmix_topology_t *pmix_topo = NULL;
    pmix_device_type_t type = PMIX_DEVTYPE_OPENFABRICS |
                              PMIX_DEVTYPE_NETWORK;

    *distances = NULL;
    *ndist = 0;

    /* distances are measured from the cpus this thread is bound to */
    PMIX_CPUSET_CONSTRUCT(&cpuset);
    ret = PMIx_Get_cpuset(&cpuset, PMIX_CPUBIND_THREAD);
    if (PMIX_SUCCESS != ret) {
        goto out;
    }

    /* load the PMIX topology */
    PMIX_TOPOLOGY_CREATE(pmix_topo, 1);
    if (NULL == pmix_topo) {
        ret = PMIX_ERR_NOMEM;
        goto out;
    }
    ret = PMIx_Load_topology(pmix_topo);
    if (PMIX_SUCCESS != ret) {
        goto out;
    }

    /* restrict the computation to fabric and network devices */
    PMIX_INFO_CREATE(info, ninfo);
    if (NULL == info) {
        ret = PMIX_ERR_NOMEM;
        goto out;
    }
    PMIX_INFO_LOAD(&info[0], PMIX_DEVICE_TYPE, &type, PMIX_DEVTYPE);
    ret = PMIx_Compute_distances(pmix_topo, &cpuset, info, ninfo, distances,
                                 ndist);

out:
    /* single exit: release whatever was acquired before the failure point */
    if (NULL != info) {
        PMIX_INFO_FREE(info, ninfo);
    }
    if (NULL != pmix_topo) {
        PMIX_TOPOLOGY_FREE(pmix_topo, 1);
    }
    PMIX_CPUSET_DESTRUCT(&cpuset);
    return ret;
}
471503

472-
/* Cannot find topology info if no topology is found */
473-
if (NULL == topology) {
474-
return false;
504+
/**
505+
* Find the nearest devices to the current thread
506+
*
507+
* Use the PMIx server or calculate the device distances, then out of the set of
508+
* returned distances find the subset of the nearest devices. This can be
509+
* 1 or more.
510+
*
511+
* @param num_distances (OUT) number of entries in the returned array
512+
*
513+
* @return An array of device distances which are nearest this thread
514+
* or NULL if we fail to get the distances. In this case we will just
515+
* revert to round robin.
516+
*
517+
*/
518+
static pmix_device_distance_t *get_nearest_nics(int *num_distances)
519+
{
520+
size_t ndist;
521+
pmix_topology_t *topo;
522+
int ret, i, idx = 0;
523+
pmix_data_array_t *dptr;
524+
uint16_t near = USHRT_MAX;
525+
pmix_info_t directive;
526+
pmix_value_t *val = NULL;
527+
pmix_device_distance_t *distances, *nearest = NULL;
528+
529+
PMIX_INFO_LOAD(&directive, PMIX_OPTIONAL, NULL, PMIX_BOOL);
530+
ret = PMIx_Get(&opal_process_info.myprocid,
531+
PMIX_DEVICE_DISTANCES, &directive, 1, &val);
532+
PMIX_INFO_DESTRUCT(&directive);
533+
if (ret != PMIX_SUCCESS || !val) {
534+
ret = compute_dev_distances(&distances, &ndist);
535+
if (ret)
536+
goto out;
537+
goto find_nearest;
538+
}
539+
540+
if (PMIX_DATA_ARRAY != val->type) {
541+
goto out;
542+
}
543+
dptr = val->data.darray;
544+
if (NULL == dptr) {
545+
goto out;
546+
}
547+
if (PMIX_DEVICE_DIST != dptr->type) {
548+
goto out;
549+
}
550+
551+
distances = (pmix_device_distance_t*)dptr->array;
552+
ndist = dptr->size;
553+
554+
find_nearest:
555+
nearest = calloc(sizeof(*distances), ndist);
556+
if (!nearest)
557+
goto out;
558+
559+
for (i = 0; i < ndist; i++) {
560+
if (distances[i].mindist < near) {
561+
idx = 0;
562+
near = distances[i].mindist;
563+
nearest[idx] = distances[i];
564+
idx++;
565+
} else if (distances[i].mindist == near) {
566+
nearest[idx] = distances[i];
567+
idx++;
568+
}
475569
}
476570

477-
/* Allocate memory for proc_cpuset */
478-
proc_cpuset = hwloc_bitmap_alloc();
479-
if (NULL == proc_cpuset) {
571+
*num_distances = idx;
572+
573+
out:
574+
if (val)
575+
PMIX_VALUE_RELEASE(val);
576+
return nearest;
577+
}
578+
579+
#if OPAL_OFI_PCI_DATA_AVAILABLE
580+
/**
581+
* Determine if a device is nearest
582+
*
583+
* Given a device distances array of the nearest pci devices,
584+
* determine if one of these device distances refers to the pci
585+
* device passed in
586+
*
587+
* @param distances (IN) distances array
588+
* @param num_distances (IN) number of entries in the distances array
589+
* @param topology (IN) topology of the node
590+
* @param pci (IN) PCI device being examined
591+
*
592+
* @return true if the PCI device is in the distances array or if the
593+
* distances array is not provided. False otherwise.
594+
*
595+
*/
596+
static bool is_near(pmix_device_distance_t *distances,
597+
int num_distances,
598+
hwloc_topology_t topology,
599+
struct fi_pci_attr pci)
600+
{
601+
hwloc_obj_t pcidev, osdev;
602+
603+
/* if we failed to find any distances, then we consider all interfaces
604+
* to be of equal distances and let the caller decide how to handle
605+
* them
606+
*/
607+
if (!distances)
608+
return true;
609+
610+
pcidev = hwloc_get_pcidev_by_busid(topology, pci.domain_id,
611+
pci.bus_id, pci.device_id,
612+
pci.function_id);
613+
if (!pcidev)
480614
return false;
481-
}
482615

483-
/* Fill cpuset with the collection of cpu cores that the process runs on */
484-
ret = hwloc_get_cpubind(topology, proc_cpuset, HWLOC_CPUBIND_PROCESS);
485-
if (0 > ret) {
486-
goto error;
487-
}
616+
for(osdev = pcidev->io_first_child; osdev != NULL; osdev = osdev->next_sibling) {
617+
int i;
488618

489-
/* Get the pci device from bdf */
490-
obj = hwloc_get_pcidev_by_busid(topology, pci.domain_id, pci.bus_id, pci.device_id,
491-
pci.function_id);
492-
if (NULL == obj) {
493-
goto error;
494-
}
619+
if (osdev->attr->osdev.type == HWLOC_OBJ_OSDEV_OPENFABRICS) {
620+
const char *nguid = hwloc_obj_get_info_by_name(osdev,"NodeGUID");
621+
const char *sguid = hwloc_obj_get_info_by_name(osdev, "SysImageGUID");
622+
623+
if (!nguid && !sguid)
624+
continue;
495625

496-
/* pcidev objects don't have cpusets so find the first non-io object above */
497-
obj = hwloc_get_non_io_ancestor_obj(topology, obj);
498-
if (NULL != obj) {
499-
result = hwloc_bitmap_intersects(proc_cpuset, obj->cpuset);
626+
for (i = 0; i < num_distances; i++) {
627+
char lsguid[256], lnguid[256];
628+
int ret;
629+
630+
ret = sscanf(distances[i].uuid, "fab://%256s::%256s", lnguid, lsguid);
631+
if (ret != 2)
632+
continue;
633+
if (0 == strcasecmp(lnguid, nguid)) {
634+
return true;
635+
} else if (0 == strcasecmp(lsguid, sguid)) {
636+
return true;
637+
}
638+
}
639+
} else if (osdev->attr->osdev.type == HWLOC_OBJ_OSDEV_NETWORK) {
640+
const char *address = hwloc_obj_get_info_by_name(osdev, "Address");
641+
if (!address)
642+
continue;
643+
for (i = 0; i < num_distances; i++) {
644+
char *addr = strstr(distances[i].uuid, "://");
645+
if (!addr || addr + 3 > distances[i].uuid
646+
+ strlen(distances[i].uuid))
647+
continue;
648+
if (!strcmp(addr+3, address)) {
649+
return true;
650+
}
651+
}
652+
}
500653
}
501654

502-
error:
503-
hwloc_bitmap_free(proc_cpuset);
504-
return result;
655+
return false;
505656
}
506657
#endif
507658

@@ -614,7 +765,10 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
614765
struct fi_info *provider = provider_list, *current_provider = provider_list;
615766
struct fi_info **provider_table;
616767
#if OPAL_OFI_PCI_DATA_AVAILABLE
768+
pmix_device_distance_t *distances = NULL;
617769
struct fi_pci_attr pci;
770+
int num_distances = 0;
771+
bool near;
618772
#endif
619773
int ret;
620774
unsigned int num_provider = 0, provider_limit = 0;
@@ -639,33 +793,38 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
639793
return provider_list;
640794
}
641795

796+
#if OPAL_OFI_PCI_DATA_AVAILABLE
797+
/* find all the nearest devices to this thread, then out of these
798+
* determine which device we should bind to.
799+
*/
800+
distances = get_nearest_nics(&num_distances);
801+
#endif
802+
642803
current_provider = provider;
643804

644805
/* Cycle through remaining fi_info objects, looking for alike providers */
645806
while (NULL != current_provider) {
646807
if (!check_provider_attr(provider, current_provider)) {
647-
cpusets_match = false;
808+
near = false;
648809
#if OPAL_OFI_PCI_DATA_AVAILABLE
649810
if (NULL != current_provider->nic
650811
&& NULL != current_provider->nic->bus_attr
651812
&& current_provider->nic->bus_attr->bus_type == FI_BUS_PCI) {
652813
pci = current_provider->nic->bus_attr->attr.pci;
653-
cpusets_match = compare_cpusets(opal_hwloc_topology, pci);
814+
near = is_near(distances, num_distances,
815+
opal_hwloc_topology, pci);
654816
}
655817
#endif
656-
657-
/* Reset the list if the cpusets match and no other provider was
658-
* found on the same cpuset as the process.
659-
*/
660-
if (cpusets_match && !provider_found) {
818+
/* We could have multiple near providers */
819+
if (near && !provider_found) {
661820
provider_found = true;
662821
num_provider = 0;
663822
}
664823

665824
/* Add the provider to the provider list if the cpusets match or if
666825
* no other provider was found on the same cpuset as the process.
667826
*/
668-
if (cpusets_match || !provider_found) {
827+
if (near || !provider_found) {
669828
provider_table[num_provider] = current_provider;
670829
num_provider++;
671830
}
@@ -687,17 +846,22 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
687846
&& NULL != provider->nic->bus_attr
688847
&& provider->nic->bus_attr->bus_type == FI_BUS_PCI) {
689848
pci = provider->nic->bus_attr->attr.pci;
690-
cpusets_match = compare_cpusets(opal_hwloc_topology, pci);
849+
near = is_near(distances, num_distances,
850+
opal_hwloc_topology, pci);
691851
}
692852
#endif
693853

694854
#if OPAL_ENABLE_DEBUG
695855
opal_output_verbose(1, opal_common_ofi.output,
696-
"package rank: %d device: %s cpusets match: %s\n", package_rank,
697-
provider->domain_attr->name, cpusets_match ? "true" : "false");
856+
"package rank: %d device: %s near: %s\n", package_rank,
857+
provider->domain_attr->name, near ? "true" : "false");
698858
#endif
699859

700860
free(provider_table);
861+
#if OPAL_OFI_PCI_DATA_AVAILABLE
862+
if (distances)
863+
free(distances);
864+
#endif
701865
return provider;
702866
}
703867

0 commit comments

Comments
 (0)