Skip to content

Commit 1a1b84a

Browse files
committed
opal/ofi: fallback to process id as package rank when unbound
The previous change caused inconsistent package rank when the process is unbound - two processes could return the same package rank. If the process is unbound, OPAL_MODEX_RECV_VALUE_OPTIONAL throws an error, in which case we should fall back to the process id instead of attempting to return a local package rank. Signed-off-by: Wenduo Wang <wenduwan@amazon.com>
1 parent 926fdfb commit 1a1b84a

File tree

1 file changed

+6
-5
lines changed

1 file changed

+6
-5
lines changed

opal/mca/common/ofi/common_ofi.c

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -752,10 +752,6 @@ static uint32_t get_package_rank(opal_process_info_t *process_info)
752752
for (i = 0; NULL != peers[i]; i++) {
753753
pname.vpid = strtoul(peers[i], NULL, 10);
754754

755-
if ((uint16_t) pname.vpid == process_info->my_local_rank) {
756-
return ranks_on_package;
757-
}
758-
759755
locality_string = NULL;
760756
// Get the LOCALITY_STRING for process[i]
761757
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY_STRING, &pname, &locality_string,
@@ -769,6 +765,10 @@ static uint32_t get_package_rank(opal_process_info_t *process_info)
769765
locality_string);
770766
free(locality_string);
771767

768+
if ((uint16_t) pname.vpid == process_info->my_local_rank) {
769+
return ranks_on_package;
770+
}
771+
772772
if (relative_locality & OPAL_PROC_ON_SOCKET) {
773773
ranks_on_package++;
774774
}
@@ -799,6 +799,7 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
799799
int ret;
800800
unsigned int num_provider = 0, provider_limit = 0;
801801
bool provider_found = false;
802+
uint32_t package_rank = 0;
802803

803804
/* Initialize opal_hwloc_topology if it is not already */
804805
ret = opal_hwloc_base_get_topology();
@@ -859,9 +860,9 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
859860
}
860861

861862
/* Select provider from local rank % number of providers */
862-
uint32_t package_rank = get_package_rank(process_info);
863863
if (num_provider >= 2) {
864864
// If there are multiple NICs "close" to the process, try to calculate package_rank
865+
package_rank = get_package_rank(process_info);
865866
provider = provider_table[package_rank % num_provider];
866867
} else if (num_provider == 1) {
867868
provider = provider_table[num_provider - 1];

0 commit comments

Comments
 (0)