mtl/ofi: Update initialization to add FI_HMEM, FI_MR_HMEM, and FI_MR_LOCAL support

wckzhang · wckzhang · commit f69f03a38e59 · 2021-03-04T00:47:37.000Z
Add a check to see if Libfabric has at least one provider with FI_HMEM
support, and use this to set whether or not Libfabric has CUDA support.

Add provider hints for FI_MR_LOCAL, and if Libfabric has CUDA support,
also add hints for FI_HMEM and FI_MR_HMEM.

In the case where Open MPI is built with CUDA support but Libfabric is
not, the MTL/OFI is not picked.

Signed-off-by: William Zhang <wilzhang@amazon.com>
diff --git a/ompi/mca/mtl/ofi/mtl_ofi_component.c b/ompi/mca/mtl/ofi/mtl_ofi_component.c
@@ -20,6 +20,9 @@
 #include "opal/util/argv.h"
 #include "opal/util/printf.h"
 #include "opal/mca/common/ofi/common_ofi.h"
+#if OPAL_CUDA_SUPPORT
+#include "opal/mca/common/cuda/common_cuda.h"
+#endif /* OPAL_CUDA_SUPPORT */
 
 static int ompi_mtl_ofi_component_open(void);
 static int ompi_mtl_ofi_component_query(mca_base_module_t **module, int *priority);
@@ -297,6 +300,9 @@ ompi_mtl_ofi_component_query(mca_base_module_t **module, int *priority)
 static int
 ompi_mtl_ofi_component_close(void)
 {
+#if OPAL_CUDA_SUPPORT
+    mca_common_cuda_fini();
+#endif /* OPAL_CUDA_SUPPORT */
     opal_common_ofi_mca_deregister();
     return OMPI_SUCCESS;
 }
@@ -591,6 +597,15 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
         exclude_list = opal_argv_split(*opal_common_ofi.prov_exclude, ',');
     }
 
+    /**
+     * Note: API version 1.5 is the first version that supports
+     * FI_LOCAL_COMM / FI_REMOTE_COMM checking (and we definitely need
+     * that checking -- e.g., the shared memory provider supports
+     * intranode communication (FI_LOCAL_COMM), but not internode
+     * (FI_REMOTE_COMM), which is insufficient for MTL selection).
+     */
+    fi_version = FI_VERSION(1, 5);
+
     /**
      * Hints to filter providers
      * See man fi_getinfo for a list of all filters
@@ -608,11 +623,23 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
                             __FILE__, __LINE__);
         goto error;
     }
+
+#if OPAL_CUDA_SUPPORT
+    /** If Open MPI is built with CUDA, request device transfer
+     *  capabilities */
+    hints->caps |= FI_HMEM;
+    hints->domain_attr->mr_mode |= FI_MR_HMEM;
+    /**
+     * Note: API version 1.9 is the first version that supports FI_HMEM
+     */
+    fi_version = FI_VERSION(1, 9);
+#endif /* OPAL_CUDA_SUPPORT */
+
     /* Make sure to get a RDM provider that can do the tagged matching
        interface and local communication and remote communication. */
     hints->mode               = FI_CONTEXT;
     hints->ep_attr->type      = FI_EP_RDM;
-    hints->caps               = FI_TAGGED | FI_LOCAL_COMM | FI_REMOTE_COMM | FI_DIRECTED_RECV;
+    hints->caps               |= FI_TAGGED | FI_LOCAL_COMM | FI_REMOTE_COMM | FI_DIRECTED_RECV;
     hints->tx_attr->msg_order = FI_ORDER_SAS;
     hints->rx_attr->msg_order = FI_ORDER_SAS;
     hints->rx_attr->op_flags = FI_COMPLETION;
@@ -660,14 +687,6 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
 
     hints->domain_attr->resource_mgmt    = FI_RM_ENABLED;
 
-    /**
-     * Note: API version 1.5 is the first version that supports
-     * FI_LOCAL_COMM / FI_REMOTE_COMM checking (and we definitely need
-     * that checking -- e.g., some providers are suitable for RXD or
-     * RXM, but can't provide local communication).
-     */
-    fi_version = FI_VERSION(1, 5);
-
     /**
      * The EFA provider in Libfabric versions prior to 1.10 contains a bug
      * where the FI_LOCAL_COMM and FI_REMOTE_COMM capabilities are not
@@ -758,6 +777,15 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
     opal_argv_free(exclude_list);
     exclude_list = NULL;
 
+#if OPAL_CUDA_SUPPORT
+    if (!(prov->caps & FI_HMEM)) {
+        opal_output_verbose(1, opal_common_ofi.output,
+                            "%s:%d: Libfabric provider does not support CUDA buffers\n",
+                            __FILE__, __LINE__);
+        goto error;
+    }
+#endif /* OPAL_CUDA_SUPPORT */
+
     /**
      * Select the format of the OFI tag
      */
@@ -1033,6 +1061,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
      */
     ompi_mtl_ofi.any_addr = FI_ADDR_UNSPEC;
 
+#if OPAL_CUDA_SUPPORT
+    mca_common_cuda_stage_one_init();
+#endif
+
     return &ompi_mtl_ofi.base;
 
 error: