Skip to content

Commit d919232

Browse files
mtl/ofi: NIC selection update
The existing code in compare_cpusets() assumed that some non-I/O ancestor of a PCI object always intersects with the cpuset of the process. However, this is not true. The non-I/O ancestor can be an L3 cache: if there are two L3s on the same NUMA node and the process is bound to one L3 while the PCI object is connected to the other L3, then compare_cpusets() returns false. A better way to determine the optimal interface is to compute the distance of each interface from the current process, find which of these interfaces are nearest to the process, and select one of them. Use the PMIx distance generation for this purpose. Signed-off-by: Amir Shehata <shehataa@ornl.gov>
1 parent dd6b875 commit d919232

File tree

1 file changed

+218
-54
lines changed

1 file changed

+218
-54
lines changed

opal/mca/common/ofi/common_ofi.c

Lines changed: 218 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -445,63 +445,214 @@ static int check_provider_attr(struct fi_info *provider_info, struct fi_info *pr
445445
}
446446
}
447447

448-
#if OPAL_OFI_PCI_DATA_AVAILABLE
449-
/* Check if a process and a pci device share the same cpuset
450-
* @param (IN) pci struct fi_pci_attr pci device attributes,
451-
* used to find hwloc object for device.
448+
/**
449+
* Calculate device distances
450+
*
451+
* Calculate the distances between the current thread and all devices of
452+
* type OPENFABRICS or NETWORK.
453+
*
454+
* The shortest distances are the nearest and therefore most efficient
455+
* devices to use.
452456
*
453-
* @param (IN) topology hwloc_topology_t topology to get the cpusets
454-
* from
457+
* Return an array of all the distances. Each entry is of type
458+
* pmix_device_distance_t
455459
*
456-
* @param (OUT) returns true if cpusets match and false if
457-
* cpusets do not match or an error prevents comparison
460+
* This function is used if there is no PMIx server running.
461+
*
462+
* @param distances (OUT) distances array
463+
* @param ndist (OUT) number of entries in the distances array
464+
*
465+
* @return 0 on success. Error otherwise.
458466
*
459-
* Uses a pci device to find an ancestor that contains a cpuset, and
460-
* determines if it intersects with the cpuset that the process is bound to.
461-
* if the process is not bound, or if a cpuset is unavailable for whatever
462-
* reason, returns false. Otherwise, returns the result of
463-
* hwloc_cpuset_intersects()
464467
*/
465-
static bool compare_cpusets(hwloc_topology_t topology, struct fi_pci_attr pci)
468+
static int compute_dev_distances(pmix_device_distance_t **distances,
469+
size_t *ndist)
466470
{
467-
bool result = false;
468-
int ret;
469-
hwloc_bitmap_t proc_cpuset;
470-
hwloc_obj_t obj = NULL;
471+
int ret = 0;
472+
size_t ninfo;
473+
pmix_info_t *info;
474+
pmix_cpuset_t cpuset;
475+
pmix_topology_t *pmix_topo;
476+
pmix_device_type_t type = PMIX_DEVTYPE_OPENFABRICS |
477+
PMIX_DEVTYPE_NETWORK;
478+
479+
PMIX_CPUSET_CONSTRUCT(&cpuset);
480+
ret = PMIx_Get_cpuset(&cpuset, PMIX_CPUBIND_THREAD);
481+
if (PMIX_SUCCESS != ret) {
482+
goto out;
483+
}
484+
485+
/* load the PMIX topology */
486+
PMIX_TOPOLOGY_CREATE(pmix_topo, 1);
487+
ret = PMIx_Load_topology(pmix_topo);
488+
if (PMIX_SUCCESS != ret) {
489+
goto out;
490+
}
491+
492+
ninfo = 1;
493+
PMIX_INFO_CREATE(info, ninfo);
494+
PMIX_INFO_LOAD(&info[0], PMIX_DEVICE_TYPE, &type, PMIX_DEVTYPE);
495+
ret = PMIx_Compute_distances(pmix_topo, &cpuset, info, ninfo, distances,
496+
ndist);
497+
PMIX_INFO_FREE(info, ninfo);
498+
499+
PMIX_TOPOLOGY_FREE(pmix_topo, 1);
500+
out:
501+
return ret;
502+
}
471503

472-
/* Cannot find topology info if no topology is found */
473-
if (NULL == topology) {
474-
return false;
504+
/**
505+
* Find the nearest devices to the current thread
506+
*
507+
* Use the PMIx server or calculate the device distances, then out of the set of
508+
* returned distances find the subset of the nearest devices. This can be
509+
* 1 or more.
510+
*
511+
* @param num_distances (OUT) number of entries in the returned array
512+
*
513+
* @return An array of device distances which are nearest this thread
514+
* or NULL if we fail to get the distances. In this case we will just
515+
* revert to round robin.
516+
*
517+
*/
518+
static pmix_device_distance_t *get_nearest_nics(int *num_distances)
519+
{
520+
size_t ndist;
521+
pmix_topology_t *topo;
522+
int ret, i, idx = 0;
523+
pmix_data_array_t *dptr;
524+
uint16_t near = USHRT_MAX;
525+
pmix_info_t directive;
526+
pmix_value_t *val = NULL;
527+
pmix_device_distance_t *distances, *nearest = NULL;
528+
529+
PMIX_INFO_LOAD(&directive, PMIX_OPTIONAL, NULL, PMIX_BOOL);
530+
ret = PMIx_Get(&opal_process_info.myprocid,
531+
PMIX_DEVICE_DISTANCES, &directive, 1, &val);
532+
PMIX_INFO_DESTRUCT(&directive);
533+
if (ret != PMIX_SUCCESS || !val) {
534+
ret = compute_dev_distances(&distances, &ndist);
535+
if (ret)
536+
goto out;
537+
goto find_nearest;
538+
}
539+
540+
if (PMIX_DATA_ARRAY != val->type) {
541+
goto out;
542+
}
543+
dptr = val->data.darray;
544+
if (NULL == dptr) {
545+
goto out;
546+
}
547+
if (PMIX_DEVICE_DIST != dptr->type) {
548+
goto out;
549+
}
550+
551+
distances = (pmix_device_distance_t*)dptr->array;
552+
ndist = dptr->size;
553+
554+
find_nearest:
555+
nearest = calloc(sizeof(*distances), ndist);
556+
if (!nearest)
557+
goto out;
558+
559+
for (i = 0; i < ndist; i++) {
560+
if (distances[i].mindist < near) {
561+
idx = 0;
562+
near = distances[i].mindist;
563+
nearest[idx] = distances[i];
564+
idx++;
565+
} else if (distances[i].mindist == near) {
566+
nearest[idx] = distances[i];
567+
idx++;
568+
}
475569
}
476570

477-
/* Allocate memory for proc_cpuset */
478-
proc_cpuset = hwloc_bitmap_alloc();
479-
if (NULL == proc_cpuset) {
571+
*num_distances = idx;
572+
573+
out:
574+
if (val)
575+
PMIX_VALUE_RELEASE(val);
576+
return nearest;
577+
}
578+
579+
#if OPAL_OFI_PCI_DATA_AVAILABLE
580+
/**
581+
* Determine if a device is nearest
582+
*
583+
* Given a device distances array of the nearest pci devices,
584+
* determine if one of these device distances refers to the pci
585+
* device passed in
586+
*
587+
* @param distances (IN) distances array
588+
* @param num_distances (IN) number of entries in the distances array
589+
* @param topology (IN) topology of the node
590+
* @param pci (IN) PCI device being examined
591+
*
592+
* @return true if the PCI device is in the distances array or if the
593+
* distances array is not provided. False otherwise.
594+
*
595+
*/
596+
static bool is_near(pmix_device_distance_t *distances,
597+
int num_distances,
598+
hwloc_topology_t topology,
599+
struct fi_pci_attr pci)
600+
{
601+
hwloc_obj_t pcidev, osdev;
602+
603+
/* if we failed to find any distances, then we consider all interfaces
604+
* to be of equal distances and let the caller decide how to handle
605+
* them
606+
*/
607+
if (!distances)
608+
return true;
609+
610+
pcidev = hwloc_get_pcidev_by_busid(topology, pci.domain_id,
611+
pci.bus_id, pci.device_id,
612+
pci.function_id);
613+
if (!pcidev)
480614
return false;
481-
}
482615

483-
/* Fill cpuset with the collection of cpu cores that the process runs on */
484-
ret = hwloc_get_cpubind(topology, proc_cpuset, HWLOC_CPUBIND_PROCESS);
485-
if (0 > ret) {
486-
goto error;
487-
}
616+
for(osdev = pcidev->io_first_child; osdev != NULL; osdev = osdev->next_sibling) {
617+
int i;
488618

489-
/* Get the pci device from bdf */
490-
obj = hwloc_get_pcidev_by_busid(topology, pci.domain_id, pci.bus_id, pci.device_id,
491-
pci.function_id);
492-
if (NULL == obj) {
493-
goto error;
494-
}
619+
if (osdev->attr->osdev.type == HWLOC_OBJ_OSDEV_OPENFABRICS) {
620+
const char *nguid = hwloc_obj_get_info_by_name(osdev,"NodeGUID");
621+
const char *sguid = hwloc_obj_get_info_by_name(osdev, "SysImageGUID");
622+
623+
if (!nguid && !sguid)
624+
continue;
495625

496-
/* pcidev objects don't have cpusets so find the first non-io object above */
497-
obj = hwloc_get_non_io_ancestor_obj(topology, obj);
498-
if (NULL != obj) {
499-
result = hwloc_bitmap_intersects(proc_cpuset, obj->cpuset);
626+
for (i = 0; i < num_distances; i++) {
627+
char lsguid[256], lnguid[256];
628+
int ret;
629+
630+
ret = sscanf(distances[i].uuid, "fab://%256s::%256s", lnguid, lsguid);
631+
if (ret != 2)
632+
continue;
633+
if (0 == strcasecmp(lnguid, nguid)) {
634+
return true;
635+
} else if (0 == strcasecmp(lsguid, sguid)) {
636+
return true;
637+
}
638+
}
639+
} else if (osdev->attr->osdev.type == HWLOC_OBJ_OSDEV_NETWORK) {
640+
const char *address = hwloc_obj_get_info_by_name(osdev, "Address");
641+
if (!address)
642+
continue;
643+
for (i = 0; i < num_distances; i++) {
644+
char *addr = strstr(distances[i].uuid, "://");
645+
if (!addr || addr + 3 > distances[i].uuid
646+
+ strlen(distances[i].uuid))
647+
continue;
648+
if (!strcmp(addr+3, address)) {
649+
return true;
650+
}
651+
}
652+
}
500653
}
501654

502-
error:
503-
hwloc_bitmap_free(proc_cpuset);
504-
return result;
655+
return false;
505656
}
506657
#endif
507658

@@ -614,7 +765,10 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
614765
struct fi_info *provider = provider_list, *current_provider = provider_list;
615766
struct fi_info **provider_table;
616767
#if OPAL_OFI_PCI_DATA_AVAILABLE
768+
pmix_device_distance_t *distances = NULL;
617769
struct fi_pci_attr pci;
770+
int num_distances = 0;
771+
bool near;
618772
#endif
619773
int ret;
620774
unsigned int num_provider = 0, provider_limit = 0;
@@ -639,33 +793,38 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
639793
return provider_list;
640794
}
641795

796+
#if OPAL_OFI_PCI_DATA_AVAILABLE
797+
/* find all the nearest devices to this thread, then out of these
798+
* determine which device we should bind to.
799+
*/
800+
distances = get_nearest_nics(&num_distances);
801+
#endif
802+
642803
current_provider = provider;
643804

644805
/* Cycle through remaining fi_info objects, looking for alike providers */
645806
while (NULL != current_provider) {
646807
if (!check_provider_attr(provider, current_provider)) {
647-
cpusets_match = false;
808+
near = false;
648809
#if OPAL_OFI_PCI_DATA_AVAILABLE
649810
if (NULL != current_provider->nic
650811
&& NULL != current_provider->nic->bus_attr
651812
&& current_provider->nic->bus_attr->bus_type == FI_BUS_PCI) {
652813
pci = current_provider->nic->bus_attr->attr.pci;
653-
cpusets_match = compare_cpusets(opal_hwloc_topology, pci);
814+
near = is_near(distances, num_distances,
815+
opal_hwloc_topology, pci);
654816
}
655817
#endif
656-
657-
/* Reset the list if the cpusets match and no other provider was
658-
* found on the same cpuset as the process.
659-
*/
660-
if (cpusets_match && !provider_found) {
818+
/* We could have multiple near providers */
819+
if (near && !provider_found) {
661820
provider_found = true;
662821
num_provider = 0;
663822
}
664823

665824
/* Add the provider to the provider list if the cpusets match or if
666825
* no other provider was found on the same cpuset as the process.
667826
*/
668-
if (cpusets_match || !provider_found) {
827+
if (near || !provider_found) {
669828
provider_table[num_provider] = current_provider;
670829
num_provider++;
671830
}
@@ -687,17 +846,22 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
687846
&& NULL != provider->nic->bus_attr
688847
&& provider->nic->bus_attr->bus_type == FI_BUS_PCI) {
689848
pci = provider->nic->bus_attr->attr.pci;
690-
cpusets_match = compare_cpusets(opal_hwloc_topology, pci);
849+
near = is_near(distances, num_distances,
850+
opal_hwloc_topology, pci);
691851
}
692852
#endif
693853

694854
#if OPAL_ENABLE_DEBUG
695855
opal_output_verbose(1, opal_common_ofi.output,
696-
"package rank: %d device: %s cpusets match: %s\n", package_rank,
697-
provider->domain_attr->name, cpusets_match ? "true" : "false");
856+
"package rank: %d device: %s near: %s\n", package_rank,
857+
provider->domain_attr->name, near ? "true" : "false");
698858
#endif
699859

700860
free(provider_table);
861+
#if OPAL_OFI_PCI_DATA_AVAILABLE
862+
if (distances)
863+
free(distances);
864+
#endif
701865
return provider;
702866
}
703867

0 commit comments

Comments
 (0)