Skip to content

Commit a1dc2f5

Browse files
authored
Merge pull request #11515 from wzamazon/mtl_ofi_try_to_use_libfabric_1_18
mtl/ofi: use libfabric 1.18 API if available
2 parents 118b95d + 9a7c8b8 commit a1dc2f5

File tree

1 file changed

+20
-4
lines changed

1 file changed

+20
-4
lines changed

ompi/mca/mtl/ofi/mtl_ofi_component.c

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -560,7 +560,7 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
560560
bool enable_mpi_threads,
561561
bool *accelerator_support)
562562
{
563-
int ret, fi_version;
563+
int ret, fi_primary_version, fi_alternate_version;
564564
int num_local_ranks, sep_support_in_provider, max_ofi_ctxts;
565565
int ofi_tag_leading_zeros, ofi_tag_bits_for_cid;
566566
char **include_list = NULL;
@@ -595,8 +595,17 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
595595
* (FI_REMOTE_COMM), which is insufficient for MTL selection.
596596
*
597597
* Note: API version 1.9 is the first version that supports FI_HMEM
598+
*
599+
* Note: API version 1.18 is the first version that clearly defines
600+
* a provider's behavior in making CUDA API calls: every provider is,
601+
* by default, permitted to make CUDA calls if the application uses API >= 1.18.
602+
*
603+
* If the application uses an API version < 1.18, some providers will not claim
604+
* support for FI_HMEM (even if they are capable of it) because they do not know
605+
* whether the application permits them to make CUDA calls.
598606
*/
599-
fi_version = FI_VERSION(1, 9);
607+
fi_primary_version = FI_VERSION(1, 18);
608+
fi_alternate_version = FI_VERSION(1, 9);
600609

601610
/**
602611
* Hints to filter providers
@@ -695,7 +704,11 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
695704
hints_dup->caps &= ~(FI_LOCAL_COMM | FI_REMOTE_COMM);
696705
hints_dup->fabric_attr->prov_name = strdup("efa");
697706

698-
ret = fi_getinfo(fi_version, NULL, NULL, 0ULL, hints_dup, &providers);
707+
ret = fi_getinfo(fi_primary_version, NULL, NULL, 0ULL, hints_dup, &providers);
708+
if (FI_ENOSYS == -ret) {
709+
/* libfabric is not new enough; fall back to the older API version */
710+
ret = fi_getinfo(fi_alternate_version, NULL, NULL, 0ULL, hints_dup, &providers);
711+
}
699712

700713
opal_output_verbose(1, opal_common_ofi.output,
701714
"%s:%d: EFA specific fi_getinfo(): %s\n",
@@ -727,12 +740,15 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
727740
* remote node or service. This does not necessarily allocate resources.
728741
* Pass NULL for name/service because we want a list of providers supported.
729742
*/
730-
ret = fi_getinfo(fi_version, /* OFI version requested */
743+
ret = fi_getinfo(fi_primary_version, /* OFI version requested */
731744
NULL, /* Optional name or fabric to resolve */
732745
NULL, /* Optional service name or port to request */
733746
0ULL, /* Optional flag */
734747
hints, /* In: Hints to filter providers */
735748
&providers); /* Out: List of matching providers */
749+
if (FI_ENOSYS == -ret) {
750+
ret = fi_getinfo(fi_alternate_version, NULL, NULL, 0ULL, hints, &providers);
751+
}
736752

737753
opal_output_verbose(1, opal_common_ofi.output,
738754
"%s:%d: fi_getinfo(): %s\n",

0 commit comments

Comments
 (0)