Skip to content

Commit df7cf53

Browse files
authored
Merge pull request #11562 from open-mpi/revert-11338-distances
Revert "mtl/ofi: NIC selection update"
2 parents ac2cfc1 + 40aeb05 commit df7cf53

File tree

1 file changed

+54
-218
lines changed

1 file changed

+54
-218
lines changed

opal/mca/common/ofi/common_ofi.c

Lines changed: 54 additions & 218 deletions
Original file line numberDiff line numberDiff line change
@@ -445,214 +445,63 @@ static int check_provider_attr(struct fi_info *provider_info, struct fi_info *pr
445445
}
446446
}
447447

448-
/**
449-
* Calculate device distances
450-
*
451-
* Calculate the distances between the current thread and all devices of
452-
* type OPENFABRICS or NETWORK.
453-
*
454-
* The shortest distances are the nearest and therefore most efficient
455-
* devices to use.
456-
*
457-
* Return an array of all the distances. Each entry is of type
458-
* pmix_device_distance_t
459-
*
460-
* This function is used if there is no PMIx server running.
461-
*
462-
* @param distances (OUT) distances array
463-
* @param ndist (OUT) number of entries in the distances array
464-
*
465-
* @return 0 on success. Error otherwise.
466-
*
467-
*/
468-
static int compute_dev_distances(pmix_device_distance_t **distances,
469-
size_t *ndist)
470-
{
471-
int ret = 0;
472-
size_t ninfo;
473-
pmix_info_t *info;
474-
pmix_cpuset_t cpuset;
475-
pmix_topology_t *pmix_topo;
476-
pmix_device_type_t type = PMIX_DEVTYPE_OPENFABRICS |
477-
PMIX_DEVTYPE_NETWORK;
478-
479-
PMIX_CPUSET_CONSTRUCT(&cpuset);
480-
ret = PMIx_Get_cpuset(&cpuset, PMIX_CPUBIND_THREAD);
481-
if (PMIX_SUCCESS != ret) {
482-
goto out;
483-
}
484-
485-
/* load the PMIX topology */
486-
PMIX_TOPOLOGY_CREATE(pmix_topo, 1);
487-
ret = PMIx_Load_topology(pmix_topo);
488-
if (PMIX_SUCCESS != ret) {
489-
goto out;
490-
}
491-
492-
ninfo = 1;
493-
PMIX_INFO_CREATE(info, ninfo);
494-
PMIX_INFO_LOAD(&info[0], PMIX_DEVICE_TYPE, &type, PMIX_DEVTYPE);
495-
ret = PMIx_Compute_distances(pmix_topo, &cpuset, info, ninfo, distances,
496-
ndist);
497-
PMIX_INFO_FREE(info, ninfo);
498-
499-
PMIX_TOPOLOGY_FREE(pmix_topo, 1);
500-
out:
501-
return ret;
502-
}
503-
504-
/**
505-
* Find the nearest devices to the current thread
506-
*
507-
* Use the PMIx server or calculate the device distances, then out of the set of
508-
* returned distances find the subset of the nearest devices. This can be
509-
* 1 or more.
510-
*
511-
* @param num_distances (OUT) number of entries in the returned array
512-
*
513-
* @return An array of device distances which are nearest this thread
514-
* or NULL if we fail to get the distances. In this case we will just
515-
* revert to round robin.
516-
*
517-
*/
518-
static pmix_device_distance_t *get_nearest_nics(int *num_distances)
519-
{
520-
size_t ndist;
521-
pmix_topology_t *topo;
522-
int ret, i, idx = 0;
523-
pmix_data_array_t *dptr;
524-
uint16_t near = USHRT_MAX;
525-
pmix_info_t directive;
526-
pmix_value_t *val = NULL;
527-
pmix_device_distance_t *distances, *nearest = NULL;
528-
529-
PMIX_INFO_LOAD(&directive, PMIX_OPTIONAL, NULL, PMIX_BOOL);
530-
ret = PMIx_Get(&opal_process_info.myprocid,
531-
PMIX_DEVICE_DISTANCES, &directive, 1, &val);
532-
PMIX_INFO_DESTRUCT(&directive);
533-
if (ret != PMIX_SUCCESS || !val) {
534-
ret = compute_dev_distances(&distances, &ndist);
535-
if (ret)
536-
goto out;
537-
goto find_nearest;
538-
}
539-
540-
if (PMIX_DATA_ARRAY != val->type) {
541-
goto out;
542-
}
543-
dptr = val->data.darray;
544-
if (NULL == dptr) {
545-
goto out;
546-
}
547-
if (PMIX_DEVICE_DIST != dptr->type) {
548-
goto out;
549-
}
550-
551-
distances = (pmix_device_distance_t*)dptr->array;
552-
ndist = dptr->size;
553-
554-
find_nearest:
555-
nearest = calloc(sizeof(*distances), ndist);
556-
if (!nearest)
557-
goto out;
558-
559-
for (i = 0; i < ndist; i++) {
560-
if (distances[i].mindist < near) {
561-
idx = 0;
562-
near = distances[i].mindist;
563-
nearest[idx] = distances[i];
564-
idx++;
565-
} else if (distances[i].mindist == near) {
566-
nearest[idx] = distances[i];
567-
idx++;
568-
}
569-
}
570-
571-
*num_distances = idx;
572-
573-
out:
574-
if (val)
575-
PMIX_VALUE_RELEASE(val);
576-
return nearest;
577-
}
578-
579448
#if OPAL_OFI_PCI_DATA_AVAILABLE
580-
/**
581-
* Determine if a device is nearest
582-
*
583-
* Given a device distances array of the nearest pci devices,
584-
* determine if one of these device distances refers to the pci
585-
* device passed in
449+
/* Check if a process and a pci device share the same cpuset
450+
* @param (IN) pci struct fi_pci_attr pci device attributes,
451+
* used to find hwloc object for device.
586452
*
587-
* @param distances (IN) distances array
588-
* @param num_distances (IN) number of entries in the distances array
589-
* @param topology (IN) topology of the node
590-
* @param pci (IN) PCI device being examined
453+
* @param (IN) topology hwloc_topology_t topology to get the cpusets
454+
* from
591455
*
592-
* @return true if the PCI device is in the distances array or if the
593-
* distances array is not provided. False otherwise.
456+
* @return true if cpusets match and false if
457+
* cpusets do not match or an error prevents comparison
594458
*
459+
* Uses a pci device to find an ancestor that contains a cpuset, and
460+
* determines if it intersects with the cpuset that the process is bound to.
461+
* If the process is not bound, or if a cpuset is unavailable for whatever
462+
* reason, returns false. Otherwise, returns the result of
463+
* hwloc_bitmap_intersects()
595464
*/
596-
static bool is_near(pmix_device_distance_t *distances,
597-
int num_distances,
598-
hwloc_topology_t topology,
599-
struct fi_pci_attr pci)
465+
static bool compare_cpusets(hwloc_topology_t topology, struct fi_pci_attr pci)
600466
{
601-
hwloc_obj_t pcidev, osdev;
602-
603-
/* if we failed to find any distances, then we consider all interfaces
604-
* to be of equal distances and let the caller decide how to handle
605-
* them
606-
*/
607-
if (!distances)
608-
return true;
467+
bool result = false;
468+
int ret;
469+
hwloc_bitmap_t proc_cpuset;
470+
hwloc_obj_t obj = NULL;
609471

610-
pcidev = hwloc_get_pcidev_by_busid(topology, pci.domain_id,
611-
pci.bus_id, pci.device_id,
612-
pci.function_id);
613-
if (!pcidev)
472+
/* Cannot find topology info if no topology is found */
473+
if (NULL == topology) {
614474
return false;
475+
}
615476

616-
for(osdev = pcidev->io_first_child; osdev != NULL; osdev = osdev->next_sibling) {
617-
int i;
618-
619-
if (osdev->attr->osdev.type == HWLOC_OBJ_OSDEV_OPENFABRICS) {
620-
const char *nguid = hwloc_obj_get_info_by_name(osdev,"NodeGUID");
621-
const char *sguid = hwloc_obj_get_info_by_name(osdev, "SysImageGUID");
477+
/* Allocate memory for proc_cpuset */
478+
proc_cpuset = hwloc_bitmap_alloc();
479+
if (NULL == proc_cpuset) {
480+
return false;
481+
}
622482

623-
if (!nguid && !sguid)
624-
continue;
483+
/* Fill cpuset with the collection of cpu cores that the process runs on */
484+
ret = hwloc_get_cpubind(topology, proc_cpuset, HWLOC_CPUBIND_PROCESS);
485+
if (0 > ret) {
486+
goto error;
487+
}
625488

626-
for (i = 0; i < num_distances; i++) {
627-
char lsguid[256], lnguid[256];
628-
int ret;
489+
/* Get the pci device from bdf */
490+
obj = hwloc_get_pcidev_by_busid(topology, pci.domain_id, pci.bus_id, pci.device_id,
491+
pci.function_id);
492+
if (NULL == obj) {
493+
goto error;
494+
}
629495

630-
ret = sscanf(distances[i].uuid, "fab://%256s::%256s", lnguid, lsguid);
631-
if (ret != 2)
632-
continue;
633-
if (0 == strcasecmp(lnguid, nguid)) {
634-
return true;
635-
} else if (0 == strcasecmp(lsguid, sguid)) {
636-
return true;
637-
}
638-
}
639-
} else if (osdev->attr->osdev.type == HWLOC_OBJ_OSDEV_NETWORK) {
640-
const char *address = hwloc_obj_get_info_by_name(osdev, "Address");
641-
if (!address)
642-
continue;
643-
for (i = 0; i < num_distances; i++) {
644-
char *addr = strstr(distances[i].uuid, "://");
645-
if (!addr || addr + 3 > distances[i].uuid
646-
+ strlen(distances[i].uuid))
647-
continue;
648-
if (!strcmp(addr+3, address)) {
649-
return true;
650-
}
651-
}
652-
}
496+
/* pcidev objects don't have cpusets so find the first non-io object above */
497+
obj = hwloc_get_non_io_ancestor_obj(topology, obj);
498+
if (NULL != obj) {
499+
result = hwloc_bitmap_intersects(proc_cpuset, obj->cpuset);
653500
}
654501

655-
return false;
502+
error:
503+
hwloc_bitmap_free(proc_cpuset);
504+
return result;
656505
}
657506
#endif
658507

@@ -765,10 +614,7 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
765614
struct fi_info *provider = provider_list, *current_provider = provider_list;
766615
struct fi_info **provider_table;
767616
#if OPAL_OFI_PCI_DATA_AVAILABLE
768-
pmix_device_distance_t *distances = NULL;
769617
struct fi_pci_attr pci;
770-
int num_distances = 0;
771-
bool near;
772618
#endif
773619
int ret;
774620
unsigned int num_provider = 0, provider_limit = 0;
@@ -793,38 +639,33 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
793639
return provider_list;
794640
}
795641

796-
#if OPAL_OFI_PCI_DATA_AVAILABLE
797-
/* find all the nearest devices to this thread, then out of these
798-
* determine which device we should bind to.
799-
*/
800-
distances = get_nearest_nics(&num_distances);
801-
#endif
802-
803642
current_provider = provider;
804643

805644
/* Cycle through remaining fi_info objects, looking for alike providers */
806645
while (NULL != current_provider) {
807646
if (!check_provider_attr(provider, current_provider)) {
808-
near = false;
647+
cpusets_match = false;
809648
#if OPAL_OFI_PCI_DATA_AVAILABLE
810649
if (NULL != current_provider->nic
811650
&& NULL != current_provider->nic->bus_attr
812651
&& current_provider->nic->bus_attr->bus_type == FI_BUS_PCI) {
813652
pci = current_provider->nic->bus_attr->attr.pci;
814-
near = is_near(distances, num_distances,
815-
opal_hwloc_topology, pci);
653+
cpusets_match = compare_cpusets(opal_hwloc_topology, pci);
816654
}
817655
#endif
818-
/* We could have multiple near providers */
819-
if (near && !provider_found) {
656+
657+
/* Reset the list if the cpusets match and no other provider was
658+
* found on the same cpuset as the process.
659+
*/
660+
if (cpusets_match && !provider_found) {
820661
provider_found = true;
821662
num_provider = 0;
822663
}
823664

824665
/* Add the provider to the provider list if the cpusets match or if
825666
* no other provider was found on the same cpuset as the process.
826667
*/
827-
if (near || !provider_found) {
668+
if (cpusets_match || !provider_found) {
828669
provider_table[num_provider] = current_provider;
829670
num_provider++;
830671
}
@@ -846,22 +687,17 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
846687
&& NULL != provider->nic->bus_attr
847688
&& provider->nic->bus_attr->bus_type == FI_BUS_PCI) {
848689
pci = provider->nic->bus_attr->attr.pci;
849-
near = is_near(distances, num_distances,
850-
opal_hwloc_topology, pci);
690+
cpusets_match = compare_cpusets(opal_hwloc_topology, pci);
851691
}
852692
#endif
853693

854694
#if OPAL_ENABLE_DEBUG
855695
opal_output_verbose(1, opal_common_ofi.output,
856-
"package rank: %d device: %s near: %s\n", package_rank,
857-
provider->domain_attr->name, near ? "true" : "false");
696+
"package rank: %d device: %s cpusets match: %s\n", package_rank,
697+
provider->domain_attr->name, cpusets_match ? "true" : "false");
858698
#endif
859699

860700
free(provider_table);
861-
#if OPAL_OFI_PCI_DATA_AVAILABLE
862-
if (distances)
863-
free(distances);
864-
#endif
865701
return provider;
866702
}
867703

0 commit comments

Comments
 (0)