Skip to content

Commit 66912b9

Browse files
committed
opal/mca/ofi: select NIC closest to accelerator if requested
This patch introduces the capability to select the NIC closest to the accelerator device. If the accelerator or NIC PCI information is not available, fall back to selecting the NIC on the closest package. To enable this feature, the application should set the MCA parameter OMPI_MCA_opal_common_ofi_accelerator_rank (default -1) to a non-negative integer, which represents the process rank (0-based) on the same accelerator. The distance between the accelerator device and the NIC is the number of hwloc objects in between, which includes the lowest common ancestor on the hwloc topology tree. Signed-off-by: Wenduo Wang <wenduwan@amazon.com>
1 parent c29f239 commit 66912b9

File tree

1 file changed

+219
-5
lines changed

1 file changed

+219
-5
lines changed

opal/mca/common/ofi/common_ofi.c

Lines changed: 219 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030

3131
#include "common_ofi.h"
3232
#include "opal/constants.h"
33+
#include "opal/mca/accelerator/accelerator.h"
3334
#include "opal/mca/base/mca_base_framework.h"
3435
#include "opal/mca/base/mca_base_var.h"
3536
#include "opal/mca/hwloc/base/base.h"
@@ -38,13 +39,15 @@
3839
#include "opal/util/argv.h"
3940
#include "opal/util/show_help.h"
4041

42+
/* Accelerator module whose get_device() and get_device_pci_attr() entry points are
 * called from this file when selecting a provider close to the accelerator. */
extern opal_accelerator_base_module_t opal_accelerator;
4143
opal_common_ofi_module_t opal_common_ofi = {.prov_include = NULL,
4244
.prov_exclude = NULL,
4345
.output = -1};
4446
static const char default_prov_exclude_list[] = "shm,sockets,tcp,udp,rstream,usnic,net";
4547
static opal_mutex_t opal_common_ofi_mutex = OPAL_MUTEX_STATIC_INIT;
4648
static int opal_common_ofi_verbose_level = 0;
4749
static int opal_common_ofi_init_ref_cnt = 0;
50+
/* Backing storage for the "accelerator_rank" MCA variable: the rank of this process on
 * its accelerator device. Accelerator-based NIC selection is attempted only when the
 * value is greater than -1, so the default of -1 leaves the feature disabled. */
static int opal_common_ofi_accelerator_rank = -1;
4851
#ifdef HAVE_STRUCT_FI_OPS_MEM_MONITOR
4952
static bool opal_common_ofi_installed_memory_monitor = false;
5053
#endif
@@ -324,6 +327,7 @@ int opal_common_ofi_mca_register(const mca_base_component_t *component)
324327
static int include_index = -1;
325328
static int exclude_index = -1;
326329
static int verbose_index = -1;
330+
static int accelerator_rank_index = -1;
327331
int ret;
328332

329333
if (fi_version() < FI_VERSION(1, 0)) {
@@ -389,6 +393,19 @@ int opal_common_ofi_mca_register(const mca_base_component_t *component)
389393
}
390394
}
391395

396+
if (0 > accelerator_rank_index) {
397+
accelerator_rank_index
398+
= mca_base_var_register("opal", "opal_common", "ofi", "accelerator_rank",
399+
"Process rank(non-negative) on the selected accelerator device",
400+
MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
401+
OPAL_INFO_LVL_1, MCA_BASE_VAR_SCOPE_LOCAL,
402+
&opal_common_ofi_accelerator_rank);
403+
if (0 > accelerator_rank_index) {
404+
ret = accelerator_rank_index;
405+
goto err;
406+
}
407+
}
408+
392409
if (component) {
393410
ret = mca_base_var_register_synonym(include_index,
394411
component->mca_project_name,
@@ -414,6 +431,15 @@ int opal_common_ofi_mca_register(const mca_base_component_t *component)
414431
if (0 > ret) {
415432
goto err;
416433
}
434+
435+
ret = mca_base_var_register_synonym(accelerator_rank_index,
436+
component->mca_project_name,
437+
component->mca_type_name,
438+
component->mca_component_name,
439+
"accelerator_rank_index", 0);
440+
if (0 > ret) {
441+
goto err;
442+
}
417443
}
418444

419445
/* The frameworks initialize their output streams during
@@ -915,18 +941,193 @@ static uint32_t get_package_rank(opal_process_info_t *process_info)
915941
return (uint32_t) process_info->myprocid.rank;
916942
}
917943

944+
/**
 * @brief Count the parent links between a hwloc object and one of its ancestors.
 *
 * Starting at @p child, follow the ->parent pointers until @p parent is reached.
 *
 * @param[in]  parent   Ancestor object at which the walk stops
 * @param[in]  child    Object at which the walk starts
 * @param[out] distance Number of parent links followed; written only on success
 * @return OPAL_SUCCESS if @p parent was reached
 *         OPAL_ERROR if the walk ran off the top of the tree before reaching @p parent
 */
static int get_parent_distance(hwloc_obj_t parent, hwloc_obj_t child, int *distance)
{
    hwloc_obj_t cursor;
    int hops = 0;

    for (cursor = child; cursor != parent; cursor = cursor->parent) {
        /* Ran out of ancestors: parent is not above child in the tree */
        if (NULL == cursor) {
            return OPAL_ERROR;
        }
        ++hops;
    }

    *distance = hops;
    return OPAL_SUCCESS;
}
959+
960+
#if OPAL_OFI_PCI_DATA_AVAILABLE
961+
/**
962+
* @brief Attempt to find a nearest provider from the accelerator.
963+
* Check if opal_accelerator is initialized with a valid PCI device, and find a provider from the
964+
* shortest distance.
965+
* Special cases:
966+
* 1. If not accelerator device is available, returns OPAL_ERR_NOT_AVAILABLE.
967+
* 2. If the provider does not have PCI attributers, we do not attempt to make a selection, and
968+
* return OPAL_ERR_NOT_AVAILABLE.
969+
* 3. If there are more than 1 providers with the same equal distance, break the tie using a modulo
970+
* i.e. (local rank on the same accelerator) % (number of nearest providers)
971+
* @param[in] provider_list List of providers
972+
* @param[in] num_providers Number of providers
973+
* @param[in] accl_id Accelerator id
974+
* @param[in] device_rank Local rank on the accelerator
975+
* @param[out] provider Pointer to the selected provider
976+
* @return OPAL_SUCCESS if a provider is successfully selected
977+
* OPAL_ERR_NOT_AVAILABLE if a provider cannot be decided deterministically
978+
* OPAL_ERROR if a fatal error happened
979+
*/
980+
static int find_nearest_provider_from_accelerator(struct fi_info *provider_list,
981+
size_t num_providers,
982+
int accl_id,
983+
uint32_t device_rank,
984+
struct fi_info **provider)
985+
{
986+
hwloc_obj_t accl_dev = NULL, prov_dev = NULL, common_ancestor = NULL;
987+
int ret = -1, accl_distance = -1, prov_distance = -1, min_distance = INT_MAX;
988+
opal_accelerator_pci_attr_t accl_pci_attr = {0};
989+
struct fi_info *current_provider = NULL;
990+
struct fi_pci_attr pci = {0};
991+
uint32_t distances[num_providers], *distance = distances;
992+
uint32_t near_provider_count = 0, provider_rank = 0;
993+
994+
memset(distances, 0, sizeof(distances));
995+
996+
ret = opal_accelerator.get_device_pci_attr(accl_id, &accl_pci_attr);
997+
if (OPAL_SUCCESS != ret) {
998+
opal_output_verbose(1, opal_common_ofi.output,
999+
"%s:%d:Accelerator PCI info is not available", __FILE__, __LINE__);
1000+
return OPAL_ERROR;
1001+
}
1002+
1003+
accl_dev = hwloc_get_pcidev_by_busid(opal_hwloc_topology, accl_pci_attr.domain_id,
1004+
accl_pci_attr.bus_id, accl_pci_attr.device_id,
1005+
accl_pci_attr.function_id);
1006+
if (NULL == accl_dev) {
1007+
opal_output_verbose(1, opal_common_ofi.output,
1008+
"%s:%d:Failed to find accelerator PCI device", __FILE__, __LINE__);
1009+
return OPAL_ERROR;
1010+
}
1011+
1012+
opal_output_verbose(1, opal_common_ofi.output,
1013+
"%s:%d:Found accelerator device %d: %04x:%02x:%02x.%x VID: %x DID: %x",
1014+
__FILE__, __LINE__, accl_id, accl_pci_attr.domain_id, accl_pci_attr.bus_id,
1015+
accl_pci_attr.device_id, accl_pci_attr.function_id,
1016+
accl_dev->attr->pcidev.vendor_id, accl_dev->attr->pcidev.device_id);
1017+
1018+
current_provider = provider_list;
1019+
while (NULL != current_provider) {
1020+
common_ancestor = NULL;
1021+
if (0 == check_provider_attr(provider_list, current_provider)
1022+
&& OPAL_SUCCESS == get_provider_nic_pci(current_provider, &pci)) {
1023+
prov_dev = hwloc_get_pcidev_by_busid(opal_hwloc_topology, pci.domain_id, pci.bus_id,
1024+
pci.device_id, pci.function_id);
1025+
if (NULL == prov_dev) {
1026+
opal_output_verbose(1, opal_common_ofi.output,
1027+
"%s:%d:Failed to find provider PCI device", __FILE__, __LINE__);
1028+
return OPAL_ERROR;
1029+
}
1030+
1031+
common_ancestor = hwloc_get_common_ancestor_obj(opal_hwloc_topology, accl_dev,
1032+
prov_dev);
1033+
if (!common_ancestor) {
1034+
opal_output_verbose(
1035+
1, opal_common_ofi.output,
1036+
"%s:%d:Failed to find common ancestor of accelerator and provider PCI device",
1037+
__FILE__, __LINE__);
1038+
/**
1039+
* Return error because any 2 PCI devices should share at least one common ancestor,
1040+
* i.e. root
1041+
*/
1042+
return OPAL_ERROR;
1043+
}
1044+
1045+
ret = get_parent_distance(common_ancestor, accl_dev, &accl_distance);
1046+
if (OPAL_SUCCESS != ret) {
1047+
opal_output_verbose(
1048+
1, opal_common_ofi.output,
1049+
"%s:%d:Failed to get distance between common ancestor and accelerator device",
1050+
__FILE__, __LINE__);
1051+
return OPAL_ERROR;
1052+
}
1053+
1054+
ret = get_parent_distance(common_ancestor, prov_dev, &prov_distance);
1055+
if (OPAL_SUCCESS != ret) {
1056+
opal_output_verbose(
1057+
1, opal_common_ofi.output,
1058+
"%s:%d:Failed to get distance between common ancestor and provider device",
1059+
__FILE__, __LINE__);
1060+
return OPAL_ERROR;
1061+
}
1062+
1063+
if (min_distance > accl_distance + prov_distance) {
1064+
min_distance = accl_distance + prov_distance;
1065+
near_provider_count = 1;
1066+
} else if (min_distance == accl_distance + prov_distance) {
1067+
++near_provider_count;
1068+
}
1069+
}
1070+
1071+
*(distance++) = !common_ancestor ? 0 : accl_distance + prov_distance;
1072+
current_provider = current_provider->next;
1073+
}
1074+
1075+
if (0 == near_provider_count) {
1076+
opal_output_verbose(1, opal_common_ofi.output, "%s:%d:Provider does not have PCI device",
1077+
__FILE__, __LINE__);
1078+
return OPAL_ERR_NOT_AVAILABLE;
1079+
}
1080+
1081+
provider_rank = device_rank % near_provider_count;
1082+
1083+
distance = distances;
1084+
current_provider = provider_list;
1085+
near_provider_count = 0;
1086+
while (NULL != current_provider) {
1087+
if ((uint32_t) min_distance == *(distance++)
1088+
&& provider_rank == near_provider_count++) {
1089+
*provider = current_provider;
1090+
return OPAL_SUCCESS;
1091+
}
1092+
1093+
current_provider = current_provider->next;
1094+
}
1095+
1096+
assert(0 == near_provider_count);
1097+
1098+
return OPAL_ERROR;
1099+
}
1100+
#endif /* OPAL_OFI_PCI_DATA_AVAILABLE */
1101+
1102+
9181103
struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
9191104
opal_process_info_t *process_info)
9201105
{
921-
int ret, num_providers = 0;
1106+
int ret, num_providers = 0, accel_id = -1;
9221107
struct fi_info *provider = NULL;
923-
uint32_t package_rank = process_info->my_local_rank;
1108+
uint32_t package_rank;
9241109

1110+
/* Current process' local rank on the same package(socket) */
1111+
package_rank = process_info->proc_is_bound ? get_package_rank(process_info)
1112+
: process_info->my_local_rank;
9251113
num_providers = count_providers(provider_list);
926-
if (!process_info->proc_is_bound || 2 > num_providers) {
1114+
1115+
#if OPAL_OFI_PCI_DATA_AVAILABLE
1116+
if (-1 < opal_common_ofi_accelerator_rank) {
1117+
ret = opal_accelerator.get_device(&accel_id);
1118+
if (OPAL_SUCCESS != ret) {
1119+
opal_output_verbose(1, opal_common_ofi.output, "%s:%d:Accelerator is not available",
1120+
__FILE__, __LINE__);
1121+
accel_id = -1;
1122+
}
1123+
}
1124+
#endif /* OPAL_OFI_PCI_DATA_AVAILABLE */
1125+
1126+
if ((!process_info->proc_is_bound && 0 > accel_id) || 2 > num_providers) {
9271127
goto round_robin;
9281128
}
9291129

1130+
#if OPAL_OFI_PCI_DATA_AVAILABLE
9301131
/* Initialize opal_hwloc_topology if it is not already */
9311132
ret = opal_hwloc_base_get_topology();
9321133
if (0 > ret) {
@@ -935,9 +1136,22 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
9351136
__FILE__, __LINE__);
9361137
}
9371138

938-
package_rank = get_package_rank(process_info);
1139+
if (0 <= accel_id) {
1140+
ret = find_nearest_provider_from_accelerator(provider_list, num_providers, accel_id,
1141+
opal_common_ofi_accelerator_rank, &provider);
1142+
if (OPAL_SUCCESS == ret) {
1143+
goto out;
1144+
}
1145+
1146+
opal_output_verbose(1, opal_common_ofi.output,
1147+
"%s:%d:Failed to find a provider close to the accelerator. Error: %d",
1148+
__FILE__, __LINE__, ret);
1149+
1150+
if (!process_info->proc_is_bound) {
1151+
goto round_robin;
1152+
}
1153+
}
9391154

940-
#if OPAL_OFI_PCI_DATA_AVAILABLE
9411155
/**
9421156
* If provider PCI BDF information is available, we calculate its physical distance
9431157
* to the current process, and select the provider with the shortest distance.

0 commit comments

Comments
 (0)