Skip to content

Commit 9a7c8b8

Browse files
committed
mtl/ofi: use libfabric 1.18 API if available
This patch tries to use the libfabric 1.18 API when it is available. The 1.18 API clearly defines a provider's CUDA behavior: a provider may make CUDA API calls by default if the application uses the 1.18 API. With older API versions, some libfabric providers will not claim support for FI_HMEM even when they are capable of supporting it, because the provider does not know whether CUDA calls are permitted. Signed-off-by: Wei Zhang <wzam@amazon.com>
1 parent df5a394 commit 9a7c8b8

File tree

1 file changed

+20
-4
lines changed

1 file changed

+20
-4
lines changed

ompi/mca/mtl/ofi/mtl_ofi_component.c

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -560,7 +560,7 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
560560
bool enable_mpi_threads,
561561
bool *accelerator_support)
562562
{
563-
int ret, fi_version;
563+
int ret, fi_primary_version, fi_alternate_version;
564564
int num_local_ranks, sep_support_in_provider, max_ofi_ctxts;
565565
int ofi_tag_leading_zeros, ofi_tag_bits_for_cid;
566566
char **include_list = NULL;
@@ -595,8 +595,17 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
595595
* (FI_REMOTE_COMM), which is insufficient for MTL selection.
596596
*
597597
* Note: API version 1.9 is the first version that supports FI_HMEM
598+
*
599+
* Note: API version 1.18 is the first version that clearly defines a
600+
* provider's behavior regarding CUDA API calls: every provider is
601+
* permitted to make CUDA calls by default if the application uses API >= 1.18.
602+
*
603+
* If the application uses an API version < 1.18, some providers will not claim
604+
* support for FI_HMEM (even if they are capable of it) because they do not know
605+
* whether the application permits them to make CUDA calls.
598606
*/
599-
fi_version = FI_VERSION(1, 9);
607+
fi_primary_version = FI_VERSION(1, 18);
608+
fi_alternate_version = FI_VERSION(1, 9);
600609

601610
/**
602611
* Hints to filter providers
@@ -695,7 +704,11 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
695704
hints_dup->caps &= ~(FI_LOCAL_COMM | FI_REMOTE_COMM);
696705
hints_dup->fabric_attr->prov_name = strdup("efa");
697706

698-
ret = fi_getinfo(fi_version, NULL, NULL, 0ULL, hints_dup, &providers);
707+
ret = fi_getinfo(fi_primary_version, NULL, NULL, 0ULL, hints_dup, &providers);
708+
if (FI_ENOSYS == -ret) {
709+
/* libfabric is not new enough; fall back to an older version of the API */
710+
ret = fi_getinfo(fi_alternate_version, NULL, NULL, 0ULL, hints_dup, &providers);
711+
}
699712

700713
opal_output_verbose(1, opal_common_ofi.output,
701714
"%s:%d: EFA specific fi_getinfo(): %s\n",
@@ -727,12 +740,15 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
727740
* remote node or service. this does not necessarily allocate resources.
728741
* Pass NULL for name/service because we want a list of providers supported.
729742
*/
730-
ret = fi_getinfo(fi_version, /* OFI version requested */
743+
ret = fi_getinfo(fi_primary_version, /* OFI version requested */
731744
NULL, /* Optional name or fabric to resolve */
732745
NULL, /* Optional service name or port to request */
733746
0ULL, /* Optional flag */
734747
hints, /* In: Hints to filter providers */
735748
&providers); /* Out: List of matching providers */
749+
if (FI_ENOSYS == -ret) {
750+
ret = fi_getinfo(fi_alternate_version, NULL, NULL, 0ULL, hints, &providers);
751+
}
736752

737753
opal_output_verbose(1, opal_common_ofi.output,
738754
"%s:%d: fi_getinfo(): %s\n",

0 commit comments

Comments
 (0)