From ce4df17e420b0957c743031706b7da7fa5403956 Mon Sep 17 00:00:00 2001 From: Oleksii Maryshchenko Date: Fri, 3 Nov 2023 13:40:28 +0100 Subject: [PATCH 1/8] Check CUDA memory support in device properties. --- ggml-cuda.cu | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index bdbcca0cabb88..4ea2c9450abf4 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -5844,7 +5844,19 @@ void ggml_init_cublas() { for (int id = 0; id < g_device_count; ++id) { cudaDeviceProp prop; CUDA_CHECK(cudaGetDeviceProperties(&prop, id)); - fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor); + fprintf(stderr, " Device %d: %s, compute capability %d.%d", id, prop.name, prop.major, prop.minor); + + // configure memory pool + if (prop.memoryPoolsSupported == 1) { + cudaError_t err = cudaDeviceGetMemPool(&g_cudaMemPools[id], id); + if (err == cudaSuccess) { + size_t treshold = UINT64_MAX; + CUDA_CHECK(cudaMemPoolSetAttribute(g_cudaMemPools[id], cudaMemPoolAttrReleaseThreshold, &treshold)); + fprintf(stderr, ", CUDA memory pool is supported\n"); + } + } else { + fprintf(stderr, ", CUDA memory pool is not supported\n"); + } g_tensor_split[id] = total_vram; total_vram += prop.totalGlobalMem; @@ -5869,13 +5881,6 @@ void ggml_init_cublas() { // create cublas handle CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id])); CUBLAS_CHECK(cublasSetMathMode(g_cublas_handles[id], CUBLAS_TF32_TENSOR_OP_MATH)); - - // configure memory pool - cudaError_t err = cudaDeviceGetMemPool(&g_cudaMemPools[id], id); - if (err == cudaSuccess) { - size_t treshold = UINT64_MAX; - CUDA_CHECK(cudaMemPoolSetAttribute(g_cudaMemPools[id], cudaMemPoolAttrReleaseThreshold, &treshold)); - } } // configure logging to stdout From bd56886fd64ef5b14c6e81d97c385e88855b5724 Mon Sep 17 00:00:00 2001 From: Oleksii Maryshchenko Date: Fri, 3 Nov 2023 13:46:14 +0100 Subject: [PATCH 2/8] set nullptr to memory pool element if it failed during initialization. --- ggml-cuda.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 4ea2c9450abf4..e65f7e95b85bc 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -5853,6 +5853,8 @@ void ggml_init_cublas() { size_t treshold = UINT64_MAX; CUDA_CHECK(cudaMemPoolSetAttribute(g_cudaMemPools[id], cudaMemPoolAttrReleaseThreshold, &treshold)); fprintf(stderr, ", CUDA memory pool is supported\n"); + } else { + g_cudaMemPools[id] = nullptr; } } else { fprintf(stderr, ", CUDA memory pool is not supported\n"); From c42ca8f1b7205b7568d1f223bae649e69e6d5c7a Mon Sep 17 00:00:00 2001 From: Oleksii Maryshchenko Date: Fri, 3 Nov 2023 15:06:40 +0100 Subject: [PATCH 3/8] GGML_CUDA_FORCE_CUSTOM_MEMORY_POOL was added to force use only custom memory pool --- ggml-cuda.cu | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index e65f7e95b85bc..1ff3278768c30 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -108,6 +108,10 @@ #define CUDA_USE_TENSOR_CORES #endif +#if !defined(GGML_CUDA_FORCE_CUSTOM_MEMORY_POOL) +#define CUDA_USE_MEMORY_POOL +#endif + // max batch size to use MMQ kernels when tensor cores are available #define MMQ_MAX_BATCH_SIZE 32 @@ -5845,7 +5849,7 @@ void ggml_init_cublas() { cudaDeviceProp prop; CUDA_CHECK(cudaGetDeviceProperties(&prop, id)); fprintf(stderr, " Device %d: %s, compute capability %d.%d", id, prop.name, prop.major, prop.minor); - +#if defined(CUDA_USE_MEMORY_POOL) // configure memory pool if (prop.memoryPoolsSupported == 1) { cudaError_t err = cudaDeviceGetMemPool(&g_cudaMemPools[id], id); @@ -5859,7 +5863,7 @@ void ggml_init_cublas() { } else { fprintf(stderr, ", CUDA memory pool is not supported\n"); } - +#endif g_tensor_split[id] = total_vram; total_vram += prop.totalGlobalMem; #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) From 815bf1a2f6c81c2e884ead0e02f99e2944b37229 Mon Sep 17 00:00:00 2001 From: Oleksii Maryshchenko Date: Fri, 3 Nov 2023 15:51:53 +0100 Subject: [PATCH 4/8] prop.memoryPoolsSupported cant be found in cuda 17. Revert back to basic error check. --- ggml-cuda.cu | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 1ff3278768c30..42765e22450a4 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -5851,16 +5851,13 @@ void ggml_init_cublas() { fprintf(stderr, " Device %d: %s, compute capability %d.%d", id, prop.name, prop.major, prop.minor); #if defined(CUDA_USE_MEMORY_POOL) // configure memory pool - if (prop.memoryPoolsSupported == 1) { - cudaError_t err = cudaDeviceGetMemPool(&g_cudaMemPools[id], id); - if (err == cudaSuccess) { - size_t treshold = UINT64_MAX; - CUDA_CHECK(cudaMemPoolSetAttribute(g_cudaMemPools[id], cudaMemPoolAttrReleaseThreshold, &treshold)); - fprintf(stderr, ", CUDA memory pool is supported\n"); - } else { - g_cudaMemPools[id] = nullptr; - } + cudaError_t err = cudaDeviceGetMemPool(&g_cudaMemPools[id], id); + if (err == cudaSuccess) { + size_t treshold = UINT64_MAX; + CUDA_CHECK(cudaMemPoolSetAttribute(g_cudaMemPools[id], cudaMemPoolAttrReleaseThreshold, &treshold)); + fprintf(stderr, ", CUDA memory pool is supported\n"); } else { + g_cudaMemPools[id] = nullptr; fprintf(stderr, ", CUDA memory pool is not supported\n"); } #endif From 56e516240a923224613b132ec7062834c6485334 Mon Sep 17 00:00:00 2001 From: Oleksii Maryshchenko Date: Sat, 4 Nov 2023 10:25:51 +0100 Subject: [PATCH 5/8] All memory pool operation are checked during init phase. For CUDA 12+ device properties checked. --- ggml-cuda.cu | 45 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 42765e22450a4..0b9bc0bcac92a 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -5849,16 +5849,43 @@ void ggml_init_cublas() { cudaDeviceProp prop; CUDA_CHECK(cudaGetDeviceProperties(&prop, id)); fprintf(stderr, " Device %d: %s, compute capability %d.%d", id, prop.name, prop.major, prop.minor); + #if defined(CUDA_USE_MEMORY_POOL) - // configure memory pool - cudaError_t err = cudaDeviceGetMemPool(&g_cudaMemPools[id], id); - if (err == cudaSuccess) { - size_t treshold = UINT64_MAX; - CUDA_CHECK(cudaMemPoolSetAttribute(g_cudaMemPools[id], cudaMemPoolAttrReleaseThreshold, &treshold)); - fprintf(stderr, ", CUDA memory pool is supported\n"); - } else { - g_cudaMemPools[id] = nullptr; - fprintf(stderr, ", CUDA memory pool is not supported\n"); + bool support_mem_pool = true; +#if CUDART_VERSION >= 12000 + support_mem_pool = (prop.memoryPoolsSupported == 1); +#endif + if (support_mem_pool) { + cudaError_t err = cudaDeviceGetMemPool(&g_cudaMemPools[id], id); + if (err == cudaSuccess) { + size_t treshold = UINT64_MAX; + err = (cudaMemPoolSetAttribute(g_cudaMemPools[id], cudaMemPoolAttrReleaseThreshold, &treshold)); + if (err == cudaSuccess) { + fprintf(stderr, ", CUDA memory pool is supported\n"); + } else { + g_cudaMemPools[id] = nullptr; + fprintf(stderr, ", CUDA memory pool is not supported (release threshold error)\n"); + } + } else { + g_cudaMemPools[id] = nullptr; + fprintf(stderr, ", CUDA memory pool is not supported (cant load default pool)\n"); + } + // test alloc/dealoc + if (err == cudaSuccess) { + void *testPtr; + size_t testSize = 1024; + err = cudaMallocFromPoolAsync(&testPtr, testSize, g_cudaMemPools[id], g_cudaStreams[id][0]); + if (err == cudaSuccess) { + err = cudaFreeAsync(testPtr, g_cudaStreams[id][0]); + if (err != cudaSuccess) { + g_cudaMemPools[id] = nullptr; + fprintf(stderr, ", CUDA memory pool is not supported (deallocation failed)\n"); + } + } else { + g_cudaMemPools[id] = nullptr; + fprintf(stderr, ", CUDA memory pool is not supported (allocation failed)\n"); + } + } } #endif g_tensor_split[id] = total_vram; From 81931b2ea7729e812b3fd8cc5ff60932b0fabdb1 Mon Sep 17 00:00:00 2001 From: Oleksii Maryshchenko Date: Sat, 4 Nov 2023 17:29:08 +0100 Subject: [PATCH 6/8] Multi GPU memory pool access + Check memory pool support of multiple GPUs and main GPU. --- ggml-cuda.cu | 68 ++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 56 insertions(+), 12 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 0b9bc0bcac92a..0f2fb1921b94f 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -503,6 +503,7 @@ static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0}; static void * g_scratch_buffer = nullptr; static size_t g_scratch_size = 0; // disabled by default static size_t g_scratch_offset = 0; +static bool g_cudaMutliGpuMemPoolSupported = true; static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr}; @@ -5813,7 +5814,7 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) { static void ggml_cuda_pool_free_async(void * ptr, size_t actual_size, int id, cudaStream_t stream) { - if (g_cudaMemPools[id] == nullptr) { + if (g_cudaMemPools[id] == nullptr || !g_cudaMutliGpuMemPoolSupported) { return ggml_cuda_pool_free(ptr, actual_size); } CUDA_CHECK(cudaFreeAsync(ptr, stream)); @@ -5896,6 +5897,49 @@ void ggml_init_cublas() { g_compute_capabilities[id] = 100*prop.major + 10*prop.minor; #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) } + +#if defined(CUDA_USE_MEMORY_POOL) + if (g_device_count > 1) { + // give access to devices memory pools + if (g_cudaMemPools[g_main_device] != nullptr) { + cudaMemPool_t main_device_pool; + cudaMemAccessDesc desc_main_device = {}; + desc_main_device.location.type = cudaMemLocationTypeDevice; + desc_main_device.location.id = g_main_device; + desc_main_device.flags = cudaMemAccessFlagsProtReadWrite; + CUDA_CHECK(cudaDeviceGetDefaultMemPool(&main_device_pool, g_main_device)); + for (int id = 0; id < g_device_count; ++id) { + if (id == g_main_device) continue; + + if (g_cudaMemPools[id] == nullptr) { + fprintf(stderr, + "Warning: Device %d doesnt support CUDA memory pool, skipping pool access config\n", + id); + } + + cudaMemAccessDesc desc_device = {}; + desc_device.location.type = cudaMemLocationTypeDevice; + desc_device.location.id = id; + desc_device.flags = cudaMemAccessFlagsProtReadWrite; + cudaError_t err = cudaMemPoolSetAccess(main_device_pool, &desc_device, 1 /* numDescs */); + if (err != cudaSuccess) { + fprintf(stderr, "Cant give access for main device memory pool to device %d\n", id); + } + cudaMemPool_t mempool; + CUDA_CHECK(cudaDeviceGetDefaultMemPool(&mempool, id)); + err = cudaMemPoolSetAccess(mempool, &desc_main_device, 1 /* numDescs */); + if (err != cudaSuccess) { + fprintf(stderr, "Cant give access for device %d memory pool to main device \n", id); + } + } + } else { + fprintf(stderr, + "WARNING: Your main GPU device doesnt support CUDA memory pools. Using custom memory pool implementation.\n"); + g_cudaMutliGpuMemPoolSupported = false; + } + } +#endif + for (int id = 0; id < g_device_count; ++id) { g_tensor_split[id] /= total_vram; } @@ -6410,7 +6454,7 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec( src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16; if (src1_convert_f16) { - src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash); + src1_dfloat = (half *) ggml_cuda_pool_malloc_async(ne00*sizeof(half), &ash, g_main_device, stream); ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00, ne00, 1, sizeof(float), 0, 0, ne00, 1, sizeof(half), 0, 0, stream); @@ -6811,7 +6855,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s if (src0_on_device) { src0_ddf = (float *) src0_extra->data_device[g_main_device]; } else { - src0_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_asf); + src0_ddf = (float *) ggml_cuda_pool_malloc_async(ggml_nbytes(src0), &src0_asf, g_main_device, main_stream); CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream)); } @@ -6819,14 +6863,14 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s if (src1_on_device) { src1_ddf = (float *) src1_extra->data_device[g_main_device]; } else { - src1_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf); + src1_ddf = (float *) ggml_cuda_pool_malloc_async(ggml_nbytes(src1), &src1_asf, g_main_device, main_stream); CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream)); } } if (dst_on_device) { dst_ddf = (float *) dst_extra->data_device[g_main_device]; } else { - dst_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(dst), &dst_asf); + dst_ddf = (float *) ggml_cuda_pool_malloc_async(ggml_nbytes(dst), &dst_asf, g_main_device, main_stream); } // do the computation @@ -6838,18 +6882,18 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream)); } + if (dst->backend == GGML_BACKEND_CPU) { + CUDA_CHECK(cudaDeviceSynchronize()); + } + if (src0_asf > 0) { - ggml_cuda_pool_free(src0_ddf, src0_asf); + ggml_cuda_pool_free_async(src0_ddf, src0_asf, g_main_device, main_stream); } if (src1_asf > 0) { - ggml_cuda_pool_free(src1_ddf, src1_asf); + ggml_cuda_pool_free_async(src1_ddf, src1_asf, g_main_device, main_stream); } if (dst_asf > 0) { - ggml_cuda_pool_free(dst_ddf, dst_asf); - } - - if (dst->backend == GGML_BACKEND_CPU) { - CUDA_CHECK(cudaDeviceSynchronize()); + ggml_cuda_pool_free_async(dst_ddf, dst_asf, g_main_device, main_stream); } } From 863166b4c39662b9e86406c7ee23014fb4e2d6bf Mon Sep 17 00:00:00 2001 From: Oleksii Maryshchenko Date: Sat, 4 Nov 2023 17:50:59 +0100 Subject: [PATCH 7/8] Skip GPUs without mem pool support. --- ggml-cuda.cu | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 0f2fb1921b94f..bed0203943c1e 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -5869,7 +5869,7 @@ void ggml_init_cublas() { } } else { g_cudaMemPools[id] = nullptr; - fprintf(stderr, ", CUDA memory pool is not supported (cant load default pool)\n"); + fprintf(stderr, ", CUDA memory pool is not supported (can't load default pool)\n"); } // test alloc/dealoc if (err == cudaSuccess) { @@ -5887,6 +5887,8 @@ void ggml_init_cublas() { fprintf(stderr, ", CUDA memory pool is not supported (allocation failed)\n"); } } + } else { + fprintf(stderr, ", CUDA memory pool is not supported\n"); } #endif g_tensor_split[id] = total_vram; @@ -5915,6 +5917,7 @@ void ggml_init_cublas() { fprintf(stderr, "Warning: Device %d doesnt support CUDA memory pool, skipping pool access config\n", id); + continue; } cudaMemAccessDesc desc_device = {}; @@ -5923,13 +5926,13 @@ void ggml_init_cublas() { desc_device.flags = cudaMemAccessFlagsProtReadWrite; cudaError_t err = cudaMemPoolSetAccess(main_device_pool, &desc_device, 1 /* numDescs */); if (err != cudaSuccess) { - fprintf(stderr, "Cant give access for main device memory pool to device %d\n", id); + fprintf(stderr, "Can't give access for main device memory pool to device %d\n", id); } cudaMemPool_t mempool; CUDA_CHECK(cudaDeviceGetDefaultMemPool(&mempool, id)); err = cudaMemPoolSetAccess(mempool, &desc_main_device, 1 /* numDescs */); if (err != cudaSuccess) { - fprintf(stderr, "Cant give access for device %d memory pool to main device \n", id); + fprintf(stderr, "Can't give access for device %d memory pool to main device \n", id); } } } else { From 2b0303add69b3dc8e8c873ba4540d7d0b5bd3a7c Mon Sep 17 00:00:00 2001 From: Oleksii Maryshchenko Date: Sat, 4 Nov 2023 18:41:11 +0100 Subject: [PATCH 8/8] CUDA pool is optional now. --- CMakeLists.txt | 10 ++++++++++ ggml-cuda.cu | 9 +++++---- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3c49d645c3196..ed92a6c68e287 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -83,6 +83,7 @@ option(LLAMA_CUBLAS "llama: use CUDA" #option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF) option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF) option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF) +option(LLAMA_CUDA_USE_CUDA_POOL "llama: use CUDA memory instead of custom pool" OFF) set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels") set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels") option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF) @@ -270,6 +271,11 @@ if (LLAMA_CUBLAS) if (LLAMA_CUDA_FORCE_MMQ) add_compile_definitions(GGML_CUDA_FORCE_MMQ) endif() + + if (LLAMA_CUDA_USE_CUDA_POOL) + add_compile_definitions(GGML_USE_CUDA_MEMORY_POOL) + endif() + add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) if (DEFINED LLAMA_CUDA_DMMV_Y) @@ -373,6 +379,10 @@ if (LLAMA_HIPBLAS) if (LLAMA_CUDA_FORCE_MMQ) target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_MMQ) endif() + if (LLAMA_CUDA_USE_CUDA_POOL) + target_compile_definitions(ggml-rocm PRIVATE GGML_USE_CUDA_MEMORY_POOL) + endif() + target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index bed0203943c1e..83da27c7f3c6a 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -108,7 +108,7 @@ #define CUDA_USE_TENSOR_CORES #endif -#if !defined(GGML_CUDA_FORCE_CUSTOM_MEMORY_POOL) +#if defined(GGML_USE_CUDA_MEMORY_POOL) #define CUDA_USE_MEMORY_POOL #endif @@ -503,7 +503,6 @@ static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0}; static void * g_scratch_buffer = nullptr; static size_t g_scratch_size = 0; // disabled by default static size_t g_scratch_offset = 0; -static bool g_cudaMutliGpuMemPoolSupported = true; static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr}; @@ -5814,7 +5813,7 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) { static void ggml_cuda_pool_free_async(void * ptr, size_t actual_size, int id, cudaStream_t stream) { - if (g_cudaMemPools[id] == nullptr || !g_cudaMutliGpuMemPoolSupported) { + if (g_cudaMemPools[id] == nullptr) { return ggml_cuda_pool_free(ptr, actual_size); } CUDA_CHECK(cudaFreeAsync(ptr, stream)); @@ -5938,7 +5937,9 @@ void ggml_init_cublas() { } else { fprintf(stderr, "WARNING: Your main GPU device doesnt support CUDA memory pools. Using custom memory pool implementation.\n"); - g_cudaMutliGpuMemPoolSupported = false; + for (int id = 0; id < g_device_count; ++id) { + g_cudaMemPools[id] = nullptr; + } } } #endif