Merge pull request #851 from bratpiorka/rrudnick_cuda_win

lukaszstolarczuk · web-flow · commit 3ab25d05b3c5 · 2024-11-12T20:18:05.000+01:00
enable CUDA provider on windows
diff --git a/.github/workflows/reusable_gpu.yml b/.github/workflows/reusable_gpu.yml
@@ -19,7 +19,7 @@ jobs:
     name: Level-Zero
     env:
       VCPKG_PATH: "${{github.workspace}}/../../../../vcpkg/packages/hwloc_x64-windows;${{github.workspace}}/../../../../vcpkg/packages/tbb_x64-windows;${{github.workspace}}/../../../../vcpkg/packages/jemalloc_x64-windows"
-      COVERAGE_NAME : "exports-coverage-gpu"
+      COVERAGE_NAME : "exports-coverage-gpu-L0"
     # run only on upstream; forks will not have the HW
     if: github.repository == 'oneapi-src/unified-memory-framework'
     strategy:
@@ -130,18 +130,26 @@ jobs:
     name: CUDA
     env:
       COVERAGE_NAME : "exports-coverage-gpu-CUDA"
+      VCPKG_PATH: "${{github.workspace}}/build/vcpkg/packages/hwloc_x64-windows;${{github.workspace}}/build/vcpkg/packages/tbb_x64-windows;${{github.workspace}}/build/vcpkg/packages/jemalloc_x64-windows;"
+      CUDA_PATH: "c:/cuda"
+
     # run only on upstream; forks will not have the HW
     if: github.repository == 'oneapi-src/unified-memory-framework'
     strategy:
       matrix:
         shared_library: ['ON', 'OFF']
         build_type: ['Debug', 'Release']
-        # TODO add windows
-        os: ['Ubuntu']
+        os: ['Ubuntu', 'Windows']
         include:
+        - os: 'Windows'
+          compiler: {c: cl, cxx: cl}
+          number_of_processors: '$Env:NUMBER_OF_PROCESSORS'
         - os: 'Ubuntu'
           compiler: {c: gcc, cxx: g++}
           number_of_processors: '$(nproc)'
+        exclude:
+        - os: 'Windows'
+          build_type: 'Debug'
 
     runs-on: ["DSS-CUDA", "DSS-${{matrix.os}}"]
     steps:
@@ -154,10 +162,47 @@ jobs:
       if: matrix.os == 'Ubuntu'
       run: .github/scripts/get_system_info.sh
 
+    - name: Initialize vcpkg  # sets up vcpkg through the lukka/run-vcpkg action
+      if: matrix.os == 'Windows'  # skipped for the Ubuntu matrix entries
+      uses: lukka/run-vcpkg@5e0cab206a5ea620130caf672fce3e4a6b5666a1 # v11.5 (action pinned by commit SHA)
+      with:
+        vcpkgGitCommitId: 3dd44b931481d7a8e9ba412621fa810232b66289  # vcpkg revision pinned by commit hash
+        vcpkgDirectory: ${{env.BUILD_DIR}}/vcpkg  # NOTE(review): the job's VCPKG_PATH points at <workspace>/build/vcpkg/packages - assumes BUILD_DIR is <workspace>/build; confirm
+        vcpkgJsonGlob: '**/vcpkg.json'
+
+    - name: Install dependencies (windows-latest)  # NOTE(review): name says windows-latest, but this job's runs-on uses the DSS-CUDA / DSS-<os> labels; confirm and consider renaming
+      if: matrix.os == 'Windows'  # skipped for the Ubuntu matrix entries
+      run: vcpkg install  # no package arguments given; presumably driven by the repository's vcpkg.json manifest - confirm
+      shell: pwsh # Specifies PowerShell as the shell for running the script.
+
+    - name: Configure build for Win  # CMake configure for the Windows matrix entries: CUDA provider ON, Level Zero provider OFF
+      if: matrix.os == 'Windows'  # NOTE: CMAKE_PREFIX_PATH below joins VCPKG_PATH and CUDA_PATH with no separator; it relies on VCPKG_PATH ending in ';' as set in this job's env
+      run: >  # folded scalar: the following lines are joined with spaces into a single cmake command line
+        cmake
+        -DCMAKE_PREFIX_PATH="${{env.VCPKG_PATH}}${{env.CUDA_PATH}}"
+        -B ${{env.BUILD_DIR}}
+        -DCMAKE_INSTALL_PREFIX="${{env.INSTL_DIR}}"
+        -DCMAKE_BUILD_TYPE=${{matrix.build_type}}
+        -DCMAKE_C_COMPILER=${{matrix.compiler.c}}
+        -DCMAKE_CXX_COMPILER=${{matrix.compiler.cxx}}
+        -DUMF_BUILD_SHARED_LIBRARY=${{matrix.shared_library}}
+        -DUMF_BUILD_BENCHMARKS=ON
+        -DUMF_BUILD_TESTS=ON
+        -DUMF_BUILD_GPU_TESTS=ON
+        -DUMF_BUILD_GPU_EXAMPLES=ON
+        -DUMF_FORMAT_CODE_STYLE=OFF
+        -DUMF_DEVELOPER_MODE=ON
+        -DUMF_BUILD_LIBUMF_POOL_DISJOINT=ON
+        -DUMF_BUILD_LIBUMF_POOL_JEMALLOC=ON
+        -DUMF_BUILD_LEVEL_ZERO_PROVIDER=OFF
+        -DUMF_BUILD_CUDA_PROVIDER=ON
+        -DUMF_TESTS_FAIL_ON_SKIP=ON
+
     - name: Configure build for Ubuntu
       if: matrix.os == 'Ubuntu'
       run: >
-        cmake -B ${{env.BUILD_DIR}}
+        cmake 
+        -B ${{env.BUILD_DIR}}
         -DCMAKE_INSTALL_PREFIX="${{env.INSTL_DIR}}"
         -DCMAKE_BUILD_TYPE=${{matrix.build_type}}
         -DCMAKE_C_COMPILER=${{matrix.compiler.c}}
diff --git a/cmake/FindCUDA.cmake b/cmake/FindCUDA.cmake
@@ -11,7 +11,7 @@ get_filename_component(CUDA_LIB_DIR ${CUDA_LIBRARIES} DIRECTORY)
 set(CUDA_LIBRARY_DIRS ${CUDA_LIB_DIR})
 
 if(WINDOWS)
-    find_file(CUDA_DLL NAMES "bin/cuda.dll" "cuda.dll")
+    find_file(CUDA_DLL NAMES "nvcuda.dll")
     get_filename_component(CUDA_DLL_DIR ${CUDA_DLL} DIRECTORY)
     set(CUDA_DLL_DIRS ${CUDA_DLL_DIR})
 endif()
diff --git a/examples/cmake/FindCUDA.cmake b/examples/cmake/FindCUDA.cmake
@@ -11,7 +11,7 @@ get_filename_component(CUDA_LIB_DIR ${CUDA_LIBRARIES} DIRECTORY)
 set(CUDA_LIBRARY_DIRS ${CUDA_LIB_DIR})
 
 if(WINDOWS)
-    find_file(CUDA_DLL NAMES "bin/cuda.dll" "cuda.dll")
+    find_file(CUDA_DLL NAMES "nvcuda.dll")
     get_filename_component(CUDA_DLL_DIR ${CUDA_DLL} DIRECTORY)
     set(CUDA_DLL_DIRS ${CUDA_DLL_DIR})
 endif()
diff --git a/examples/cuda_shared_memory/cuda_shared_memory.c b/examples/cuda_shared_memory/cuda_shared_memory.c
@@ -14,8 +14,18 @@
 #include <umf/pools/pool_disjoint.h>
 #include <umf/providers/provider_cuda.h>
 
+// disable warning 4201: nonstandard extension used: nameless struct/union
+#if defined(_MSC_VER)
+#pragma warning(push)
+#pragma warning(disable : 4201)
+#endif // _MSC_VER
+
 #include <cuda.h>
 
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif // _MSC_VER
+
 int main(void) {
     // A result object for storing UMF API result status
     umf_result_t res;
diff --git a/src/provider/provider_cuda.c b/src/provider/provider_cuda.c
@@ -21,8 +21,18 @@ umf_memory_provider_ops_t *umfCUDAMemoryProviderOps(void) {
 
 #else // !defined(UMF_NO_CUDA_PROVIDER)
 
+// disable warning 4201: nonstandard extension used: nameless struct/union
+#if defined(_MSC_VER)
+#pragma warning(push)
+#pragma warning(disable : 4201)
+#endif // _MSC_VER
+
 #include "cuda.h"
 
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif // _MSC_VER
+
 #include "base_alloc_global.h"
 #include "utils_assert.h"
 #include "utils_common.h"
@@ -100,7 +110,7 @@ static umf_result_t cu2umf_result(CUresult result) {
 
 static void init_cu_global_state(void) {
 #ifdef _WIN32
-    const char *lib_name = "cudart.dll";
+    const char *lib_name = "nvcuda.dll";
 #else
     const char *lib_name = "libcuda.so";
 #endif
@@ -159,6 +169,7 @@ static umf_result_t cu_memory_provider_initialize(void *params,
 
     if (cu_params->memory_type == UMF_MEMORY_TYPE_UNKNOWN ||
         cu_params->memory_type > UMF_MEMORY_TYPE_SHARED) {
+        LOG_ERR("Invalid memory type value");
         return UMF_RESULT_ERROR_INVALID_ARGUMENT;
     }
 
diff --git a/src/utils/utils_load_library.c b/src/utils/utils_load_library.c
@@ -16,15 +16,18 @@
 #include <libloaderapi.h>
 // clang-format on
 
-#else
+#else // _WIN32
 
 #define _GNU_SOURCE 1
 
 #include <dlfcn.h> // forces linking with libdl on Linux
 
-#endif
+#endif // !_WIN32
+
+#include <stddef.h>
 
 #include "utils_load_library.h"
+#include "utils_log.h"
 
 #ifdef _WIN32
 
@@ -47,7 +50,13 @@ void *utils_get_symbol_addr(void *handle, const char *symbol,
         }
         handle = GetModuleHandle(libname);
     }
-    return (void *)GetProcAddress((HMODULE)handle, symbol);
+
+    void *addr = (void *)GetProcAddress((HMODULE)handle, symbol);
+    if (addr == NULL) {
+        LOG_ERR("Required symbol not found: %s", symbol);
+    }
+
+    return addr;
 }
 
 #else /* Linux */
@@ -68,7 +77,13 @@ void *utils_get_symbol_addr(void *handle, const char *symbol,
     if (!handle) {
         handle = RTLD_DEFAULT;
     }
-    return dlsym(handle, symbol);
+
+    void *addr = dlsym(handle, symbol);
+    if (addr == NULL) {
+        LOG_ERR("Required symbol not found: %s", symbol);
+    }
+
+    return addr;
 }
 
 #endif
diff --git a/test/providers/cuda_helpers.cpp b/test/providers/cuda_helpers.cpp
@@ -18,6 +18,7 @@ struct libcu_ops {
     CUresult (*cuCtxCreate)(CUcontext *pctx, unsigned int flags, CUdevice dev);
     CUresult (*cuCtxDestroy)(CUcontext ctx);
     CUresult (*cuCtxGetCurrent)(CUcontext *pctx);
+    CUresult (*cuCtxSetCurrent)(CUcontext ctx);
     CUresult (*cuDeviceGet)(CUdevice *device, int ordinal);
     CUresult (*cuMemAlloc)(CUdeviceptr *dptr, size_t size);
     CUresult (*cuMemFree)(CUdeviceptr dptr);
@@ -34,6 +35,7 @@ struct libcu_ops {
                                        CUpointer_attribute *attributes,
                                        void **data, CUdeviceptr ptr);
     CUresult (*cuStreamSynchronize)(CUstream hStream);
+    CUresult (*cuCtxSynchronize)(void);
 } libcu_ops;
 
 #if USE_DLOPEN
@@ -48,7 +50,7 @@ struct DlHandleCloser {
 std::unique_ptr<void, DlHandleCloser> cuDlHandle = nullptr;
 int InitCUDAOps() {
 #ifdef _WIN32
-    const char *lib_name = "cudart.dll";
+    const char *lib_name = "nvcuda.dll";
 #else
     const char *lib_name = "libcuda.so";
 #endif
@@ -84,6 +86,12 @@ int InitCUDAOps() {
         fprintf(stderr, "cuCtxGetCurrent symbol not found in %s\n", lib_name);
         return -1;
     }
+    *(void **)&libcu_ops.cuCtxSetCurrent =
+        utils_get_symbol_addr(cuDlHandle.get(), "cuCtxSetCurrent", lib_name);
+    if (libcu_ops.cuCtxSetCurrent == nullptr) {
+        fprintf(stderr, "cuCtxSetCurrent symbol not found in %s\n", lib_name);
+        return -1;
+    }
     *(void **)&libcu_ops.cuDeviceGet =
         utils_get_symbol_addr(cuDlHandle.get(), "cuDeviceGet", lib_name);
     if (libcu_ops.cuDeviceGet == nullptr) {
@@ -153,6 +161,12 @@ int InitCUDAOps() {
                 lib_name);
         return -1;
     }
+    *(void **)&libcu_ops.cuCtxSynchronize =
+        utils_get_symbol_addr(cuDlHandle.get(), "cuCtxSynchronize", lib_name);
+    if (libcu_ops.cuCtxSynchronize == nullptr) {
+        fprintf(stderr, "cuCtxSynchronize symbol not found in %s\n", lib_name);
+        return -1;
+    }
 
     return 0;
 }
@@ -165,6 +179,7 @@ int InitCUDAOps() {
     libcu_ops.cuCtxCreate = cuCtxCreate;
     libcu_ops.cuCtxDestroy = cuCtxDestroy;
     libcu_ops.cuCtxGetCurrent = cuCtxGetCurrent;
+    libcu_ops.cuCtxSetCurrent = cuCtxSetCurrent;
     libcu_ops.cuDeviceGet = cuDeviceGet;
     libcu_ops.cuMemAlloc = cuMemAlloc;
     libcu_ops.cuMemAllocHost = cuMemAllocHost;
@@ -176,11 +191,31 @@ int InitCUDAOps() {
     libcu_ops.cuPointerGetAttribute = cuPointerGetAttribute;
     libcu_ops.cuPointerGetAttributes = cuPointerGetAttributes;
     libcu_ops.cuStreamSynchronize = cuStreamSynchronize;
+    libcu_ops.cuCtxSynchronize = cuCtxSynchronize;
 
     return 0;
 }
 #endif // USE_DLOPEN
 
+static CUresult set_context(CUcontext required_ctx, CUcontext *restore_ctx) { // makes required_ctx current; the previously current context is reported via *restore_ctx
+    CUcontext current_ctx = NULL;
+    CUresult cu_result = libcu_ops.cuCtxGetCurrent(&current_ctx);
+    if (cu_result != CUDA_SUCCESS) {
+        fprintf(stderr, "cuCtxGetCurrent() failed.\n");
+        return cu_result; // *restore_ctx is left unmodified on this path
+    }
+
+    *restore_ctx = current_ctx; // written even when no switch turns out to be needed
+    if (current_ctx != required_ctx) { // cuCtxSetCurrent() is called only when the context actually differs
+        cu_result = libcu_ops.cuCtxSetCurrent(required_ctx);
+        if (cu_result != CUDA_SUCCESS) {
+            fprintf(stderr, "cuCtxSetCurrent() failed.\n"); // logged only; the error code is returned below
+        }
+    }
+
+    return cu_result; // CUDA_SUCCESS, or the error code from cuCtxSetCurrent()
+}
+
 static int init_cuda_lib(void) {
     CUresult result = libcu_ops.cuInit(0);
     if (result != CUDA_SUCCESS) {
@@ -191,8 +226,6 @@ static int init_cuda_lib(void) {
 
 int cuda_fill(CUcontext context, CUdevice device, void *ptr, size_t size,
               const void *pattern, size_t pattern_size) {
-
-    (void)context;
     (void)device;
     (void)pattern_size;
 
@@ -202,23 +235,40 @@ int cuda_fill(CUcontext context, CUdevice device, void *ptr, size_t size,
         return -1;
     }
 
+    // set required context
+    CUcontext curr_context = nullptr;
+    set_context(context, &curr_context);
+
     int ret = 0;
     CUresult res =
         libcu_ops.cuMemsetD32((CUdeviceptr)ptr, *(unsigned int *)pattern,
                               size / sizeof(unsigned int));
     if (res != CUDA_SUCCESS) {
-        fprintf(stderr, "cuMemsetD32() failed!\n");
+        fprintf(stderr, "cuMemsetD32(%llu, %u, %zu) failed!\n",
+                (CUdeviceptr)ptr, *(unsigned int *)pattern,
+                size / pattern_size);
+        return -1;
+    }
+
+    res = libcu_ops.cuCtxSynchronize();
+    if (res != CUDA_SUCCESS) {
+        fprintf(stderr, "cuCtxSynchronize() failed!\n");
         return -1;
     }
 
+    // restore context
+    set_context(curr_context, &curr_context);
     return ret;
 }
 
-int cuda_copy(CUcontext context, CUdevice device, void *dst_ptr, void *src_ptr,
-              size_t size) {
-    (void)context;
+int cuda_copy(CUcontext context, CUdevice device, void *dst_ptr,
+              const void *src_ptr, size_t size) {
     (void)device;
 
+    // set required context
+    CUcontext curr_context = nullptr;
+    set_context(context, &curr_context);
+
     int ret = 0;
     CUresult res =
         libcu_ops.cuMemcpy((CUdeviceptr)dst_ptr, (CUdeviceptr)src_ptr, size);
@@ -227,12 +277,14 @@ int cuda_copy(CUcontext context, CUdevice device, void *dst_ptr, void *src_ptr,
         return -1;
     }
 
-    res = libcu_ops.cuStreamSynchronize(0);
+    res = libcu_ops.cuCtxSynchronize();
     if (res != CUDA_SUCCESS) {
-        fprintf(stderr, "cuStreamSynchronize() failed!\n");
+        fprintf(stderr, "cuCtxSynchronize() failed!\n");
         return -1;
     }
 
+    // restore context
+    set_context(curr_context, &curr_context);
     return ret;
 }
 
diff --git a/test/providers/cuda_helpers.h b/test/providers/cuda_helpers.h
diff --git a/test/providers/provider_cuda.cpp b/test/providers/provider_cuda.cpp