@@ -560,7 +560,7 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
560
560
bool enable_mpi_threads ,
561
561
bool * accelerator_support )
562
562
{
563
- int ret , fi_version ;
563
+ int ret , fi_primary_version , fi_alternate_version ;
564
564
int num_local_ranks , sep_support_in_provider , max_ofi_ctxts ;
565
565
int ofi_tag_leading_zeros , ofi_tag_bits_for_cid ;
566
566
char * * include_list = NULL ;
@@ -595,8 +595,17 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
595
595
* (FI_REMOTE_COMM), which is insufficient for MTL selection.
596
596
*
597
597
* Note: API version 1.9 is the first version that supports FI_HMEM
598
+ *
599
+ * Note: API version 1.18 is the first version that clearly defines
600
+ * providers' behavior regarding CUDA API calls: by default, every provider
601
+ * is permitted to make CUDA calls if the application uses API >= 1.18.
602
+ *
603
+ * If the application uses an API older than 1.18, some providers will not
604
+ * claim support for FI_HMEM (even if they are capable of it) because they
605
+ * do not know whether the application permits them to make CUDA calls.
598
606
*/
599
- fi_version = FI_VERSION (1 , 9 );
607
+ fi_primary_version = FI_VERSION (1 , 18 );
608
+ fi_alternate_version = FI_VERSION (1 , 9 );
600
609
601
610
/**
602
611
* Hints to filter providers
@@ -695,7 +704,11 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
695
704
hints_dup -> caps &= ~(FI_LOCAL_COMM | FI_REMOTE_COMM );
696
705
hints_dup -> fabric_attr -> prov_name = strdup ("efa" );
697
706
698
- ret = fi_getinfo (fi_version , NULL , NULL , 0ULL , hints_dup , & providers );
707
+ ret = fi_getinfo (fi_primary_version , NULL , NULL , 0ULL , hints_dup , & providers );
708
+ if (FI_ENOSYS == - ret ) {
709
+ /* libfabric is not new enough; fall back to the older API version */
710
+ ret = fi_getinfo (fi_alternate_version , NULL , NULL , 0ULL , hints_dup , & providers );
711
+ }
699
712
700
713
opal_output_verbose (1 , opal_common_ofi .output ,
701
714
"%s:%d: EFA specific fi_getinfo(): %s\n" ,
@@ -727,12 +740,15 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
727
740
* remote node or service. this does not necessarily allocate resources.
728
741
* Pass NULL for name/service because we want a list of providers supported.
729
742
*/
730
- ret = fi_getinfo (fi_version , /* OFI version requested */
743
+ ret = fi_getinfo (fi_primary_version , /* OFI version requested */
731
744
NULL , /* Optional name or fabric to resolve */
732
745
NULL , /* Optional service name or port to request */
733
746
0ULL , /* Optional flag */
734
747
hints , /* In: Hints to filter providers */
735
748
& providers ); /* Out: List of matching providers */
749
+ if (FI_ENOSYS == - ret ) {
750
+ ret = fi_getinfo (fi_alternate_version , NULL , NULL , 0ULL , hints , & providers );
751
+ }
736
752
737
753
opal_output_verbose (1 , opal_common_ofi .output ,
738
754
"%s:%d: fi_getinfo(): %s\n" ,
0 commit comments