Skip to content

Commit 9046276

Browse files
committed
opal/ofi: package rank calculation bugfix
The current implementation has an out-of-bounds bug: process_info->my_local_rank can exceed the length of the package_ranks array. This patch eliminates the package_ranks array and fixes the bug. Signed-off-by: Wenduo Wang <wenduwan@amazon.com>
1 parent 2a52280 commit 9046276

File tree

1 file changed

+12
-15
lines changed

1 file changed

+12
-15
lines changed

opal/mca/common/ofi/common_ofi.c

Lines changed: 12 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -715,8 +715,7 @@ static uint32_t get_package_rank(opal_process_info_t *process_info)
715715
{
716716
int i;
717717
uint16_t relative_locality, *package_rank_ptr;
718-
uint16_t current_package_rank = 0;
719-
uint16_t package_ranks[process_info->num_local_peers + 1];
718+
uint32_t ranks_on_package = 0;
720719
opal_process_name_t pname;
721720
pmix_status_t rc;
722721
char **peers = NULL;
@@ -745,26 +744,24 @@ static uint32_t get_package_rank(opal_process_info_t *process_info)
745744
// Get the local peers
746745
OPAL_MODEX_RECV_VALUE(rc, PMIX_LOCAL_PEERS, &pname, &local_peers, PMIX_STRING);
747746
if (PMIX_SUCCESS != rc || NULL == local_peers) {
748-
// We can't find package_rank, fall back to procid
749-
opal_show_help("help-common-ofi.txt", "package_rank failed", true);
750-
return (uint32_t) process_info->myprocid.rank;
747+
goto err;
751748
}
752749
peers = opal_argv_split(local_peers, ',');
753750
free(local_peers);
754751

755752
for (i = 0; NULL != peers[i]; i++) {
756753
pname.vpid = strtoul(peers[i], NULL, 10);
754+
755+
if ((uint16_t) pname.vpid == process_info->my_local_rank) {
756+
return ranks_on_package;
757+
}
758+
757759
locality_string = NULL;
758760
// Get the LOCALITY_STRING for process[i]
759761
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY_STRING, &pname, &locality_string,
760762
PMIX_STRING);
761763
if (PMIX_SUCCESS != rc || NULL == locality_string) {
762-
// If we don't have information about locality, fall back to procid
763-
int level = 10;
764-
if (opal_output_get_verbosity(opal_common_ofi.output) >= level) {
765-
opal_show_help("help-common-ofi.txt", "package_rank failed", true, level);
766-
}
767-
return (uint32_t) process_info->myprocid.rank;
764+
goto err;
768765
}
769766

770767
// compute relative locality
@@ -773,12 +770,12 @@ static uint32_t get_package_rank(opal_process_info_t *process_info)
773770
free(locality_string);
774771

775772
if (relative_locality & OPAL_PROC_ON_SOCKET) {
776-
package_ranks[i] = current_package_rank;
777-
current_package_rank++;
773+
ranks_on_package++;
778774
}
779775
}
780-
781-
return (uint32_t) package_ranks[process_info->my_local_rank];
776+
err:
777+
opal_show_help("help-common-ofi.txt", "package_rank failed", true);
778+
return (uint32_t) process_info->myprocid.rank;
782779
}
783780

784781
struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,

0 commit comments

Comments
 (0)