open-mpi
diff --git a/config/opal_check_cuda.m4
Lines changed: 28 additions & 23 deletions b/config/opal_check_cuda.m4
Lines changed: 28 additions & 23 deletions
diff --git a/configure.ac
Lines changed: 8 additions & 3 deletions b/configure.ac
Lines changed: 8 additions & 3 deletions
diff --git a/opal/mca/accelerator/cuda/Makefile.am
Lines changed: 4 additions & 1 deletion b/opal/mca/accelerator/cuda/Makefile.am
Lines changed: 4 additions & 1 deletion
diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.c
Lines changed: 29 additions & 29 deletions b/opal/mca/accelerator/cuda/accelerator_cuda.c
Lines changed: 29 additions & 29 deletions
@@ -27,7 +27,22 @@ dnl
 dnl $HEADER$
 dnl
 
+
+# OPAL_CHECK_CUDA(prefix, [action-if-found], [action-if-not-found])
+# --------------------------------------------------------
+# Check whether CUDA support can be found.  Sets prefix_{CPPFLAGS,
+# LDFLAGS, LIBS} as needed and runs action-if-found if there is
+# support; otherwise runs action-if-not-found.
+
+#
+# Check for CUDA support
+#
 AC_DEFUN([OPAL_CHECK_CUDA],[
+OPAL_VAR_SCOPE_PUSH([cuda_save_CPPFLAGS cuda_save_LDFLAGS cuda_save_LIBS])
+
+cuda_save_CPPFLAGS="$CPPFLAGS"
+cuda_save_LDFLAGS="$LDFLAGS"
+cuda_save_LIBS="$LIBS"
 #
 # Check to see if user wants CUDA support
 #
@@ -72,12 +87,15 @@ AS_IF([test "$with_cuda" = "no" || test "x$with_cuda" = "x"],
                             opal_cuda_incdir="$with_cuda/include"
                             AC_MSG_RESULT([found ($opal_cuda_incdir/cuda.h)])])])])])
 
-dnl We cannot have CUDA support without dlopen support.  HOWEVER, at
-dnl this point in configure, we can't know whether the DL framework
-dnl has been configured or not yet (it likely hasn't, since CUDA is a
-dnl common framework, and likely configured first).  So we have to
-dnl defer this check until later (see the OPAL_CHECK_CUDA_AFTER_OPAL_DL m4
-dnl macro, below).  :-(
+AS_IF([test "$opal_check_cuda_happy" = "yes"],
+    [OAC_CHECK_PACKAGE([cuda],
+                       [$1],
+                       [cuda.h],
+                       [cuda],
+                       [cuMemFree],
+                       [opal_check_cuda_happy="yes"],
+                       [opal_check_cuda_happy="no"])],
+    [])
 
 # We require CUDA IPC support which started in CUDA 4.1. Error
 # out if the support is not there.
@@ -144,22 +162,9 @@ AM_CONDITIONAL([OPAL_cuda_gdr_support], [test "x$CUDA_VERSION_60_OR_GREATER" = "
 AC_DEFINE_UNQUOTED([OPAL_CUDA_GDR_SUPPORT],$CUDA_VERSION_60_OR_GREATER,
                    [Whether we have CUDA GDR support available])
 
+CPPFLAGS=${cuda_save_CPPFLAGS}
+LDFLAGS=${cuda_save_LDFLAGS}
+LIBS=${cuda_save_LIBS}
+OPAL_VAR_SCOPE_POP
 ])
 
-dnl
-dnl CUDA support requires DL support (it dynamically opens the CUDA
-dnl library at run time).  But we do not check for OPAL DL support
-dnl until lafter the initial OPAL_CHECK_CUDA is called.  So put the
-dnl CUDA+DL check in a separate macro that can be called after the DL MCA
-dnl framework checks in the top-level configure.ac.
-dnl
-AC_DEFUN([OPAL_CHECK_CUDA_AFTER_OPAL_DL],[
-
-    # We cannot have CUDA support without OPAL DL support.  Error out
-    # if the user wants CUDA but we do not have OPAL DL support.
-    AS_IF([test $OPAL_HAVE_DL_SUPPORT -eq 0 && \
-           test "$opal_check_cuda_happy" = "yes"],
-          [AC_MSG_WARN([--with-cuda was specified, but dlopen support is disabled.])
-           AC_MSG_WARN([You must reconfigure Open MPI with dlopen ("dl") support.])
-           AC_MSG_ERROR([Cannot continue.])])
-])
 
@@ -987,7 +987,14 @@ AC_CACHE_SAVE
 
 opal_show_title "System-specific tests"
 
-OPAL_CHECK_CUDA
+################
+# CUDA support #
+################
+# NOTE: this check should be removed once opal/cuda is removed.
+OPAL_CHECK_CUDA([opal_cuda],
+                [opal_cuda_happy="yes"],
+                [opal_cuda_happy="no"])
+
 ##################################
 OPAL_CHECK_OS_FLAVORS
 
@@ -1233,8 +1240,6 @@ AC_CACHE_SAVE
 # be done better by having some kind of "run this check at the end of
 # all other MCA checks" hook...?
 
-OPAL_CHECK_CUDA_AFTER_OPAL_DL
-
 OPAL_CHECK_ROCM_AFTER_OPAL_DL
 
 ##################################
 
@@ -32,10 +32,13 @@ endif
 
 mcacomponentdir = $(opallibdir)
 mcacomponent_LTLIBRARIES = $(component_install)
+
 mca_accelerator_cuda_la_SOURCES = $(sources)
 mca_accelerator_cuda_la_LDFLAGS = -module -avoid-version
-mca_accelerator_cuda_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_NAME@.la
+mca_accelerator_cuda_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_NAME@.la \
+        $(accelerator_cuda_LIBS)
 
 noinst_LTLIBRARIES = $(component_noinst)
 libmca_accelerator_cuda_la_SOURCES =$(sources)
 libmca_accelerator_cuda_la_LDFLAGS = -module -avoid-version
+libmca_accelerator_cuda_la_LIBADD = $(accelerator_cuda_LIBS)
@@ -97,7 +97,7 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
                                          CU_POINTER_ATTRIBUTE_IS_MANAGED};
     void *attrdata[] = {(void *) &mem_type, (void *) &mem_ctx, (void *) &is_managed};
 
-    result = opal_accelerator_cuda_func.cuPointerGetAttributes(3, attributes, attrdata, dbuf);
+    result = cuPointerGetAttributes(3, attributes, attrdata, dbuf);
     OPAL_OUTPUT_VERBOSE((101, opal_accelerator_base_framework.framework_output,
                          "dbuf=%p, mem_type=%d, mem_ctx=%p, is_managed=%d, result=%d", (void *) dbuf,
                          (int) mem_type, (void *) mem_ctx, is_managed, result));
@@ -121,7 +121,7 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
     /* Must be a device pointer */
     assert(CU_MEMORYTYPE_DEVICE == mem_type);
 #else /* OPAL_CUDA_GET_ATTRIBUTES */
-    result = opal_accelerator_cuda_func.cuPointerGetAttribute(&mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, dbuf);
+    result = cuPointerGetAttribute(&mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, dbuf);
     if (CUDA_SUCCESS != result) {
         /* If we cannot determine it is device pointer,
          * just assume it is not. */
@@ -142,11 +142,11 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
      * GPU memory, but no context, get the context from the GPU memory
      * and set the current context to that.  It is rare that we will not
      * have a context. */
-    result = opal_accelerator_cuda_func.cuCtxGetCurrent(&ctx);
+    result = cuCtxGetCurrent(&ctx);
     if (OPAL_UNLIKELY(NULL == ctx)) {
         if (CUDA_SUCCESS == result) {
 #if !OPAL_CUDA_GET_ATTRIBUTES
-            result = opal_accelerator_cuda_func.cuPointerGetAttribute(&mem_ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dbuf);
+            result = cuPointerGetAttribute(&mem_ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dbuf);
             if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
                 opal_output(0,
                             "CUDA: error calling cuPointerGetAttribute: "
@@ -155,7 +155,7 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
                 return OPAL_ERROR;
             }
 #endif /* OPAL_CUDA_GET_ATTRIBUTES */
-            result = opal_accelerator_cuda_func.cuCtxSetCurrent(mem_ctx);
+            result = cuCtxSetCurrent(mem_ctx);
             if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
                 opal_output(0,
                             "CUDA: error calling cuCtxSetCurrent: "
@@ -185,7 +185,7 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
     if (OPAL_LIKELY(((CUDA_VERSION > 7000) ? 0 : 1))) {
         CUdeviceptr pbase;
         size_t psize;
-        result = opal_accelerator_cuda_func.cuMemGetAddressRange(&pbase, &psize, dbuf);
+        result = cuMemGetAddressRange(&pbase, &psize, dbuf);
         if (CUDA_SUCCESS != result) {
             opal_output_verbose(5, opal_accelerator_base_framework.framework_output,
                                 "CUDA: cuMemGetAddressRange failed on this pointer: result=%d, buf=%p "
@@ -214,7 +214,7 @@ static int accelerator_cuda_create_stream(int dev_id, opal_accelerator_stream_t
         return OPAL_ERR_OUT_OF_RESOURCE;
     }
 
-    result = opal_accelerator_cuda_func.cuStreamCreate((*stream)->stream, 0);
+    result = cuStreamCreate((*stream)->stream, 0);
     if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) {
         opal_show_help("help-accelerator-cuda.txt", "cuStreamCreate failed", true,
                        OPAL_PROC_MY_HOSTNAME, result);
@@ -230,7 +230,7 @@ static void opal_accelerator_cuda_stream_destruct(opal_accelerator_cuda_stream_t
     CUresult result;
 
     if (NULL != stream->base.stream) {
-        result = opal_accelerator_cuda_func.cuStreamDestroy(*(CUstream *)stream->base.stream);
+        result = cuStreamDestroy(*(CUstream *)stream->base.stream);
         if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
             opal_show_help("help-accelerator-cuda.txt", "cuStreamDestroy failed", true,
                            result);
@@ -259,7 +259,7 @@ static int accelerator_cuda_create_event(int dev_id, opal_accelerator_event_t **
         OBJ_RELEASE(*event);
         return OPAL_ERR_OUT_OF_RESOURCE;
     }
-    result = opal_accelerator_cuda_func.cuEventCreate((*event)->event, CU_EVENT_DISABLE_TIMING);
+    result = cuEventCreate((*event)->event, CU_EVENT_DISABLE_TIMING);
     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
         opal_show_help("help-accelerator-cuda.txt", "cuEventCreate failed", true,
                        OPAL_PROC_MY_HOSTNAME, result);
@@ -274,7 +274,7 @@ static void opal_accelerator_cuda_event_destruct(opal_accelerator_cuda_event_t *
 {
     CUresult result;
     if (NULL != event->base.event) {
-        result = opal_accelerator_cuda_func.cuEventDestroy(*(CUevent *)event->base.event);
+        result = cuEventDestroy(*(CUevent *)event->base.event);
         if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
             opal_show_help("help-accelerator-cuda.txt", "cuEventDestroy failed", true,
                            result);
@@ -297,7 +297,7 @@ static int accelerator_cuda_record_event(int dev_id, opal_accelerator_event_t *e
         return OPAL_ERR_BAD_PARAM;
     }
 
-    result = opal_accelerator_cuda_func.cuEventRecord(*(CUevent *)event->event, *(CUstream *)stream->stream);
+    result = cuEventRecord(*(CUevent *)event->event, *(CUstream *)stream->stream);
     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
         opal_show_help("help-accelerator-cuda.txt", "cuEventRecord failed", true,
                        OPAL_PROC_MY_HOSTNAME, result);
@@ -314,7 +314,7 @@ static int accelerator_cuda_query_event(int dev_id, opal_accelerator_event_t *ev
         return OPAL_ERR_BAD_PARAM;
     }
 
-    result = opal_accelerator_cuda_func.cuEventQuery(*(CUevent *)event->event);
+    result = cuEventQuery(*(CUevent *)event->event);
     switch (result) {
         case CUDA_SUCCESS:
             {
@@ -344,7 +344,7 @@ static int accelerator_cuda_memcpy_async(int dest_dev_id, int src_dev_id, void *
         return OPAL_ERR_BAD_PARAM;
     }
 
-    result = opal_accelerator_cuda_func.cuMemcpyAsync((CUdeviceptr) dest, (CUdeviceptr) src, size, *(CUstream *)stream->stream);
+    result = cuMemcpyAsync((CUdeviceptr) dest, (CUdeviceptr) src, size, *(CUstream *)stream->stream);
     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
         opal_show_help("help-accelerator-cuda.txt", "cuMemcpyAsync failed", true, dest, src,
                        size, result);
@@ -370,13 +370,13 @@ static int accelerator_cuda_memcpy(int dest_dev_id, int src_dev_id, void *dest,
      * Additionally, cuMemcpy is not necessarily always synchronous. See:
      * https://docs.nvidia.com/cuda/cuda-driver-api/api-sync-behavior.html
      * TODO: Add optimizations for type field */
-    result = opal_accelerator_cuda_func.cuMemcpyAsync((CUdeviceptr) dest, (CUdeviceptr) src, size, opal_accelerator_cuda_memcpy_stream);
+    result = cuMemcpyAsync((CUdeviceptr) dest, (CUdeviceptr) src, size, opal_accelerator_cuda_memcpy_stream);
     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
         opal_show_help("help-accelerator-cuda.txt", "cuMemcpyAsync failed", true, dest, src,
                        size, result);
         return OPAL_ERROR;
     }
-    result = opal_accelerator_cuda_func.cuStreamSynchronize(opal_accelerator_cuda_memcpy_stream);
+    result = cuStreamSynchronize(opal_accelerator_cuda_memcpy_stream);
     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
         opal_show_help("help-accelerator-cuda.txt", "cuStreamSynchronize failed", true,
                        OPAL_PROC_MY_HOSTNAME, result);
@@ -395,29 +395,29 @@ static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest,
         return OPAL_ERR_BAD_PARAM;
     }
 
-    result = opal_accelerator_cuda_func.cuMemAlloc(&tmp, size);
+    result = cuMemAlloc(&tmp, size);
     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
         return OPAL_ERROR;
     }
-    result = opal_accelerator_cuda_func.cuMemcpyAsync(tmp, (CUdeviceptr) src, size, opal_accelerator_cuda_memcpy_stream);
+    result = cuMemcpyAsync(tmp, (CUdeviceptr) src, size, opal_accelerator_cuda_memcpy_stream);
     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
         opal_show_help("help-accelerator-cuda.txt", "cuMemcpyAsync failed", true, tmp, src, size,
                        result);
         return OPAL_ERROR;
     }
-    result = opal_accelerator_cuda_func.cuMemcpyAsync((CUdeviceptr) dest, tmp, size, opal_accelerator_cuda_memcpy_stream);
+    result = cuMemcpyAsync((CUdeviceptr) dest, tmp, size, opal_accelerator_cuda_memcpy_stream);
     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
         opal_show_help("help-accelerator-cuda.txt", "cuMemcpyAsync failed", true, dest, tmp,
                        size, result);
         return OPAL_ERROR;
     }
-    result = opal_accelerator_cuda_func.cuStreamSynchronize(opal_accelerator_cuda_memcpy_stream);
+    result = cuStreamSynchronize(opal_accelerator_cuda_memcpy_stream);
     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
         opal_show_help("help-accelerator-cuda.txt", "cuStreamSynchronize failed", true,
                        OPAL_PROC_MY_HOSTNAME, result);
         return OPAL_ERROR;
     }
-    opal_accelerator_cuda_func.cuMemFree(tmp);
+    cuMemFree(tmp);
     return OPAL_SUCCESS;
 }
 
@@ -430,7 +430,7 @@ static int accelerator_cuda_mem_alloc(int dev_id, void **ptr, size_t size)
     }
 
     if (size > 0) {
-        result = opal_accelerator_cuda_func.cuMemAlloc((CUdeviceptr *) ptr, size);
+        result = cuMemAlloc((CUdeviceptr *) ptr, size);
         if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
             opal_show_help("help-accelerator-cuda.txt", "cuMemAlloc failed", true,
                            OPAL_PROC_MY_HOSTNAME, result);
@@ -444,7 +444,7 @@ static int accelerator_cuda_mem_release(int dev_id, void *ptr)
 {
     CUresult result;
     if (NULL != ptr) {
-        result = opal_accelerator_cuda_func.cuMemFree((CUdeviceptr) ptr);
+        result = cuMemFree((CUdeviceptr) ptr);
         if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
             opal_show_help("help-accelerator-cuda.txt", "cuMemFree failed", true,
                            OPAL_PROC_MY_HOSTNAME, result);
@@ -463,7 +463,7 @@ static int accelerator_cuda_get_address_range(int dev_id, const void *ptr, void
         return OPAL_ERR_BAD_PARAM;
     }
 
-    result = opal_accelerator_cuda_func.cuMemGetAddressRange((CUdeviceptr *) base, size, (CUdeviceptr) ptr);
+    result = cuMemGetAddressRange((CUdeviceptr *) base, size, (CUdeviceptr) ptr);
     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
         opal_show_help("help-accelerator-cuda.txt", "cuMemGetAddressRange failed 2", true,
                        OPAL_PROC_MY_HOSTNAME, result, ptr);
@@ -483,7 +483,7 @@ static int accelerator_cuda_host_register(int dev_id, void *ptr, size_t size)
         return OPAL_ERR_BAD_PARAM;
     }
 
-    result = opal_accelerator_cuda_func.cuMemHostRegister(ptr, size, 0);
+    result = cuMemHostRegister(ptr, size, 0);
     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
         opal_show_help("help-accelerator-cuda.txt", "cuMemHostRegister failed", true,
                        ptr, size, OPAL_PROC_MY_HOSTNAME, result);
@@ -497,7 +497,7 @@ static int accelerator_cuda_host_unregister(int dev_id, void *ptr)
 {
     CUresult result;
     if (NULL != ptr) {
-        result = opal_accelerator_cuda_func.cuMemHostUnregister(ptr);
+        result = cuMemHostUnregister(ptr);
         if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
             opal_show_help("help-accelerator-cuda.txt", "cuMemHostUnregister failed", true,
                            ptr, OPAL_PROC_MY_HOSTNAME, result);
@@ -516,7 +516,7 @@ static int accelerator_cuda_get_device(int *dev_id)
         return OPAL_ERR_BAD_PARAM;
     }
 
-    result = opal_accelerator_cuda_func.cuCtxGetDevice(&cuDev);
+    result = cuCtxGetDevice(&cuDev);
     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
         opal_show_help("help-accelerator-cuda.txt", "cuCtxGetDevice failed", true,
                        result);
@@ -534,7 +534,7 @@ static int accelerator_cuda_device_can_access_peer(int *access, int dev1, int de
         return OPAL_ERR_BAD_PARAM;
     }
 
-    result = opal_accelerator_cuda_func.cuDeviceCanAccessPeer(access, (CUdevice) dev1, (CUdevice) dev2);
+    result = cuDeviceCanAccessPeer(access, (CUdevice) dev1, (CUdevice) dev2);
     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
         opal_show_help("help-accelerator-cuda.txt", "cuDeviceCanAccessPeer failed", true,
                        OPAL_PROC_MY_HOSTNAME, result);
@@ -554,13 +554,13 @@ static int accelerator_cuda_get_buffer_id(int dev_id, const void *addr, opal_acc
 {
     CUresult result;
     int enable = 1;
-    result = opal_accelerator_cuda_func.cuPointerGetAttribute((unsigned long long *)buf_id, CU_POINTER_ATTRIBUTE_BUFFER_ID, (CUdeviceptr) addr);
+    result = cuPointerGetAttribute((unsigned long long *)buf_id, CU_POINTER_ATTRIBUTE_BUFFER_ID, (CUdeviceptr) addr);
     if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) {
         opal_show_help("help-accelerator-cuda.txt", "bufferID failed", true, OPAL_PROC_MY_HOSTNAME,
                        result);
         return result;
     }
-    result = opal_accelerator_cuda_func.cuPointerSetAttribute(&enable, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
+    result = cuPointerSetAttribute(&enable, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
                                        (CUdeviceptr) addr);
     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
         opal_show_help("help-accelerator-cuda.txt", "cuPointerSetAttribute failed", true,