diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3c49d645c3196..ed92a6c68e287 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -83,6 +83,7 @@ option(LLAMA_CUBLAS "llama: use CUDA"
 #option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF)
 option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
 option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF)
+option(LLAMA_CUDA_USE_CUDA_POOL "llama: use CUDA memory pool instead of custom pool" OFF)
 set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
 option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)
@@ -270,6 +271,11 @@ if (LLAMA_CUBLAS)
         if (LLAMA_CUDA_FORCE_MMQ)
             add_compile_definitions(GGML_CUDA_FORCE_MMQ)
         endif()
+
+        if (LLAMA_CUDA_USE_CUDA_POOL)
+            add_compile_definitions(GGML_USE_CUDA_MEMORY_POOL)
+        endif()
+
         add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
         add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
         if (DEFINED LLAMA_CUDA_DMMV_Y)
@@ -373,6 +379,10 @@ if (LLAMA_HIPBLAS)
         if (LLAMA_CUDA_FORCE_MMQ)
             target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_MMQ)
         endif()
+        if (LLAMA_CUDA_USE_CUDA_POOL)
+            target_compile_definitions(ggml-rocm PRIVATE GGML_USE_CUDA_MEMORY_POOL)
+        endif()
+
         target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
         target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
         target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index bdbcca0cabb88..83da27c7f3c6a 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -108,6 +108,10 @@
 #define CUDA_USE_TENSOR_CORES
 #endif
 
+#if defined(GGML_USE_CUDA_MEMORY_POOL)
+#define CUDA_USE_MEMORY_POOL
+#endif
+
 // max batch size to use MMQ kernels when tensor cores are available
 #define MMQ_MAX_BATCH_SIZE 32
 
@@ -5844,8 +5848,48 @@ void ggml_init_cublas() {
     for (int id = 0; id < g_device_count; ++id) {
         cudaDeviceProp prop;
         CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-        fprintf(stderr, "  Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
+        fprintf(stderr, "  Device %d: %s, compute capability %d.%d", id, prop.name, prop.major, prop.minor);
+#if defined(CUDA_USE_MEMORY_POOL)
+        bool support_mem_pool = true;
+#if CUDART_VERSION >= 12000
+        support_mem_pool = (prop.memoryPoolsSupported == 1);
+#endif
+        if (support_mem_pool) {
+            cudaError_t err = cudaDeviceGetMemPool(&g_cudaMemPools[id], id);
+            if (err == cudaSuccess) {
+                size_t treshold = UINT64_MAX;
+                err = (cudaMemPoolSetAttribute(g_cudaMemPools[id], cudaMemPoolAttrReleaseThreshold, &treshold));
+                if (err == cudaSuccess) {
+                    fprintf(stderr, ", CUDA memory pool is supported\n");
+                } else {
+                    g_cudaMemPools[id] = nullptr;
+                    fprintf(stderr, ", CUDA memory pool is not supported (release threshold error)\n");
+                }
+            } else {
+                g_cudaMemPools[id] = nullptr;
+                fprintf(stderr, ", CUDA memory pool is not supported (can't load default pool)\n");
+            }
+            // test alloc/dealloc
+            if (err == cudaSuccess) {
+                void * testPtr;
+                size_t testSize = 1024;
+                err = cudaMallocFromPoolAsync(&testPtr, testSize, g_cudaMemPools[id], g_cudaStreams[id][0]);
+                if (err == cudaSuccess) {
+                    err = cudaFreeAsync(testPtr, g_cudaStreams[id][0]);
+                    if (err != cudaSuccess) {
+                        g_cudaMemPools[id] = nullptr;
+                        fprintf(stderr, ", CUDA memory pool is not supported (deallocation failed)\n");
+                    }
+                } else {
+                    g_cudaMemPools[id] = nullptr;
+                    fprintf(stderr, ", CUDA memory pool is not supported (allocation failed)\n");
+                }
+            }
+        } else {
+            fprintf(stderr, ", CUDA memory pool is not supported\n");
+        }
+#endif
 
         g_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
@@ -5854,6 +5898,52 @@ void ggml_init_cublas() {
         g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
     }
+
+#if defined(CUDA_USE_MEMORY_POOL)
+    if (g_device_count > 1) {
+        // give access to devices memory pools
+        if (g_cudaMemPools[g_main_device] != nullptr) {
+            cudaMemPool_t main_device_pool;
+            cudaMemAccessDesc desc_main_device = {};
+            desc_main_device.location.type = cudaMemLocationTypeDevice;
+            desc_main_device.location.id = g_main_device;
+            desc_main_device.flags = cudaMemAccessFlagsProtReadWrite;
+            CUDA_CHECK(cudaDeviceGetDefaultMemPool(&main_device_pool, g_main_device));
+            for (int id = 0; id < g_device_count; ++id) {
+                if (id == g_main_device) continue;
+
+                if (g_cudaMemPools[id] == nullptr) {
+                    fprintf(stderr,
+                            "Warning: Device %d doesn't support CUDA memory pool, skipping pool access config\n",
+                            id);
+                    continue;
+                }
+
+                cudaMemAccessDesc desc_device = {};
+                desc_device.location.type = cudaMemLocationTypeDevice;
+                desc_device.location.id = id;
+                desc_device.flags = cudaMemAccessFlagsProtReadWrite;
+                cudaError_t err = cudaMemPoolSetAccess(main_device_pool, &desc_device, 1 /* numDescs */);
+                if (err != cudaSuccess) {
+                    fprintf(stderr, "Can't give access for main device memory pool to device %d\n", id);
+                }
+                cudaMemPool_t mempool;
+                CUDA_CHECK(cudaDeviceGetDefaultMemPool(&mempool, id));
+                err = cudaMemPoolSetAccess(mempool, &desc_main_device, 1 /* numDescs */);
+                if (err != cudaSuccess) {
+                    fprintf(stderr, "Can't give access for device %d memory pool to main device\n", id);
+                }
+            }
+        } else {
+            fprintf(stderr,
+                    "WARNING: Your main GPU device doesn't support CUDA memory pools. Using custom memory pool implementation.\n");
+            for (int id = 0; id < g_device_count; ++id) {
+                g_cudaMemPools[id] = nullptr;
+            }
+        }
+    }
+#endif
+
     for (int id = 0; id < g_device_count; ++id) {
         g_tensor_split[id] /= total_vram;
     }
@@ -5869,13 +5959,6 @@ void ggml_init_cublas() {
         // create cublas handle
         CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
         CUBLAS_CHECK(cublasSetMathMode(g_cublas_handles[id], CUBLAS_TF32_TENSOR_OP_MATH));
-
-        // configure memory pool
-        cudaError_t err = cudaDeviceGetMemPool(&g_cudaMemPools[id], id);
-        if (err == cudaSuccess) {
-            size_t treshold = UINT64_MAX;
-            CUDA_CHECK(cudaMemPoolSetAttribute(g_cudaMemPools[id], cudaMemPoolAttrReleaseThreshold, &treshold));
-        }
     }
 
     // configure logging to stdout
@@ -6375,7 +6458,7 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
         src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
 
     if (src1_convert_f16) {
-        src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
+        src1_dfloat = (half *) ggml_cuda_pool_malloc_async(ne00*sizeof(half), &ash, g_main_device, stream);
         ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat,
                                 ne00, ne00, 1, sizeof(float), 0, 0,
                                 ne00, 1, sizeof(half), 0, 0, stream);
@@ -6776,7 +6859,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
     if (src0_on_device) {
         src0_ddf = (float *) src0_extra->data_device[g_main_device];
     } else {
-        src0_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_asf);
+        src0_ddf = (float *) ggml_cuda_pool_malloc_async(ggml_nbytes(src0), &src0_asf, g_main_device, main_stream);
         CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
     }
 
@@ -6784,14 +6867,14 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
         if (src1_on_device) {
             src1_ddf = (float *) src1_extra->data_device[g_main_device];
         } else {
-            src1_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf);
+            src1_ddf = (float *) ggml_cuda_pool_malloc_async(ggml_nbytes(src1), &src1_asf, g_main_device, main_stream);
             CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
         }
     }
     if (dst_on_device) {
         dst_ddf = (float *) dst_extra->data_device[g_main_device];
     } else {
-        dst_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(dst), &dst_asf);
+        dst_ddf = (float *) ggml_cuda_pool_malloc_async(ggml_nbytes(dst), &dst_asf, g_main_device, main_stream);
     }
 
     // do the computation
@@ -6803,18 +6886,18 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
         CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
     }
 
+    if (dst->backend == GGML_BACKEND_CPU) {
+        CUDA_CHECK(cudaDeviceSynchronize());
+    }
+
     if (src0_asf > 0) {
-        ggml_cuda_pool_free(src0_ddf, src0_asf);
+        ggml_cuda_pool_free_async(src0_ddf, src0_asf, g_main_device, main_stream);
     }
     if (src1_asf > 0) {
-        ggml_cuda_pool_free(src1_ddf, src1_asf);
+        ggml_cuda_pool_free_async(src1_ddf, src1_asf, g_main_device, main_stream);
     }
     if (dst_asf > 0) {
-        ggml_cuda_pool_free(dst_ddf, dst_asf);
-    }
-
-    if (dst->backend == GGML_BACKEND_CPU) {
-        CUDA_CHECK(cudaDeviceSynchronize());
+        ggml_cuda_pool_free_async(dst_ddf, dst_asf, g_main_device, main_stream);
     }
 }
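
Note (not part of the patch): the hunks above switch call sites from ggml_cuda_pool_malloc/ggml_cuda_pool_free to stream-ordered helpers, ggml_cuda_pool_malloc_async and ggml_cuda_pool_free_async, whose definitions lie outside this diff. The sketch below is a minimal, standalone illustration of the dispatch pattern such helpers presumably follow when GGML_USE_CUDA_MEMORY_POOL is enabled: allocate from the device's default memory pool with cudaMallocFromPoolAsync when a pool is available, otherwise fall back to a plain allocation. The helper names (pool_alloc_async, pool_free_async) and the cudaMalloc fallback are illustrative assumptions, not the actual llama.cpp implementation.

// Hypothetical sketch, not part of the patch. Shows stream-ordered allocation
// from a CUDA memory pool with a plain cudaMalloc fallback standing in for
// the custom pool used by ggml-cuda.cu.
#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

static cudaMemPool_t device_pool = nullptr;  // set once at init, see main()

// Allocate `size` bytes for use on `stream`; prefer the memory pool when available.
static void * pool_alloc_async(size_t size, size_t * actual_size, cudaStream_t stream) {
    void * ptr = nullptr;
    if (device_pool != nullptr &&
        cudaMallocFromPoolAsync(&ptr, size, device_pool, stream) == cudaSuccess) {
        *actual_size = size;   // stream-ordered allocation, freed with cudaFreeAsync
        return ptr;
    }
    if (cudaMalloc(&ptr, size) != cudaSuccess) {  // fallback path (assumption)
        return nullptr;
    }
    *actual_size = size;
    return ptr;
}

// Free a pointer obtained from pool_alloc_async on the same stream.
static void pool_free_async(void * ptr, cudaStream_t stream) {
    if (device_pool != nullptr) {
        cudaFreeAsync(ptr, stream);  // deferred until prior work on `stream` completes
    } else {
        cudaFree(ptr);
    }
}

int main() {
    cudaSetDevice(0);
    // grab the default pool and raise its release threshold, as the patch does
    if (cudaDeviceGetMemPool(&device_pool, 0) == cudaSuccess) {
        size_t threshold = UINT64_MAX;
        cudaMemPoolSetAttribute(device_pool, cudaMemPoolAttrReleaseThreshold, &threshold);
    }
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    size_t actual = 0;
    void * buf = pool_alloc_async(1024, &actual, stream);
    printf("allocated %zu bytes at %p\n", actual, buf);
    pool_free_async(buf, stream);
    cudaStreamSynchronize(stream);
    cudaStreamDestroy(stream);
    return 0;
}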