Check CUDA memory pool support #3931

Closed
119 changes: 99 additions & 20 deletions ggml-cuda.cu
@@ -108,6 +108,10 @@
#define CUDA_USE_TENSOR_CORES
#endif

#if !defined(GGML_CUDA_FORCE_CUSTOM_MEMORY_POOL)
#define CUDA_USE_MEMORY_POOL
#endif

// max batch size to use MMQ kernels when tensor cores are available
#define MMQ_MAX_BATCH_SIZE 32

@@ -499,6 +503,7 @@ static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
static void * g_scratch_buffer = nullptr;
static size_t g_scratch_size = 0; // disabled by default
static size_t g_scratch_offset = 0;
static bool g_cudaMultiGpuMemPoolSupported = true;

static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

@@ -5809,7 +5814,7 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {


static void ggml_cuda_pool_free_async(void * ptr, size_t actual_size, int id, cudaStream_t stream) {
if (g_cudaMemPools[id] == nullptr) {
if (g_cudaMemPools[id] == nullptr || !g_cudaMultiGpuMemPoolSupported) {
return ggml_cuda_pool_free(ptr, actual_size);
}
CUDA_CHECK(cudaFreeAsync(ptr, stream));
@@ -5844,8 +5849,46 @@ void ggml_init_cublas() {
for (int id = 0; id < g_device_count; ++id) {
cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
fprintf(stderr, " Device %d: %s, compute capability %d.%d", id, prop.name, prop.major, prop.minor);

#if defined(CUDA_USE_MEMORY_POOL)
bool support_mem_pool = true;
#if CUDART_VERSION >= 12000
support_mem_pool = (prop.memoryPoolsSupported == 1);
#endif
if (support_mem_pool) {
cudaError_t err = cudaDeviceGetMemPool(&g_cudaMemPools[id], id);
if (err == cudaSuccess) {
size_t threshold = UINT64_MAX;
err = cudaMemPoolSetAttribute(g_cudaMemPools[id], cudaMemPoolAttrReleaseThreshold, &threshold);
if (err != cudaSuccess) {
g_cudaMemPools[id] = nullptr;
fprintf(stderr, ", CUDA memory pool is not supported (release threshold error)\n");
}
} else {
g_cudaMemPools[id] = nullptr;
fprintf(stderr, ", CUDA memory pool is not supported (cant load default pool)\n");
}
// test alloc/dealloc
if (err == cudaSuccess) {
void *testPtr;
size_t testSize = 1024;
err = cudaMallocFromPoolAsync(&testPtr, testSize, g_cudaMemPools[id], g_cudaStreams[id][0]);
if (err == cudaSuccess) {
err = cudaFreeAsync(testPtr, g_cudaStreams[id][0]);
if (err == cudaSuccess) {
fprintf(stderr, ", CUDA memory pool is supported\n");
} else {
g_cudaMemPools[id] = nullptr;
fprintf(stderr, ", CUDA memory pool is not supported (deallocation failed)\n");
}
} else {
g_cudaMemPools[id] = nullptr;
fprintf(stderr, ", CUDA memory pool is not supported (allocation failed)\n");
}
}
} else {
fprintf(stderr, ", CUDA memory pool is not supported\n");
}
#else
fprintf(stderr, "\n");
#endif
g_tensor_split[id] = total_vram;
total_vram += prop.totalGlobalMem;
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
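Note (not part of the diff): the per-device probe added above (get the device's pool, raise the release threshold, then do a trial allocation and free) can be reduced to the standalone sketch below. The helper name device_supports_mem_pool and the use of cudaDevAttrMemoryPoolsSupported instead of the CUDART_VERSION-gated prop.memoryPoolsSupported field are this note's choices, not the patch's.

#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>

static bool device_supports_mem_pool(int device) {
    int supported = 0;
    // cudaDevAttrMemoryPoolsSupported is 1 if the device supports stream-ordered allocations
    if (cudaDeviceGetAttribute(&supported, cudaDevAttrMemoryPoolsSupported, device) != cudaSuccess || !supported) {
        return false;
    }
    if (cudaSetDevice(device) != cudaSuccess) {
        return false;
    }

    cudaMemPool_t pool = nullptr;
    if (cudaDeviceGetMemPool(&pool, device) != cudaSuccess) {
        return false;
    }

    // keep freed blocks cached in the pool instead of returning them to the OS
    uint64_t threshold = UINT64_MAX;
    if (cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, &threshold) != cudaSuccess) {
        return false;
    }

    // trial allocation and free on the default stream, as the patch does per device
    void * p = nullptr;
    if (cudaMallocFromPoolAsync(&p, 1024, pool, /*stream=*/0) != cudaSuccess) {
        return false;
    }
    if (cudaFreeAsync(p, /*stream=*/0) != cudaSuccess) {
        return false;
    }
    return cudaStreamSynchronize(0) == cudaSuccess;
}

int main() {
    int n_devices = 0;
    cudaGetDeviceCount(&n_devices);
    for (int id = 0; id < n_devices; ++id) {
        printf("device %d: memory pools %s\n", id, device_supports_mem_pool(id) ? "supported" : "not supported");
    }
    return 0;
}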
@@ -5854,6 +5897,49 @@
g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
}

#if defined(CUDA_USE_MEMORY_POOL)
if (g_device_count > 1) {
// grant mutual access between the main device's memory pool and the other devices' pools
if (g_cudaMemPools[g_main_device] != nullptr) {
cudaMemPool_t main_device_pool;
cudaMemAccessDesc desc_main_device = {};
desc_main_device.location.type = cudaMemLocationTypeDevice;
desc_main_device.location.id = g_main_device;
desc_main_device.flags = cudaMemAccessFlagsProtReadWrite;
CUDA_CHECK(cudaDeviceGetDefaultMemPool(&main_device_pool, g_main_device));
for (int id = 0; id < g_device_count; ++id) {
if (id == g_main_device) continue;

if (g_cudaMemPools[id] == nullptr) {
fprintf(stderr,
"Warning: Device %d doesn't support CUDA memory pool, skipping pool access config\n",
id);
continue;
}

cudaMemAccessDesc desc_device = {};
desc_device.location.type = cudaMemLocationTypeDevice;
desc_device.location.id = id;
desc_device.flags = cudaMemAccessFlagsProtReadWrite;
cudaError_t err = cudaMemPoolSetAccess(main_device_pool, &desc_device, 1 /* numDescs */);
if (err != cudaSuccess) {
fprintf(stderr, "Cant give access for main device memory pool to device %d\n", id);
}
cudaMemPool_t mempool;
CUDA_CHECK(cudaDeviceGetDefaultMemPool(&mempool, id));
err = cudaMemPoolSetAccess(mempool, &desc_main_device, 1 /* numDescs */);
if (err != cudaSuccess) {
fprintf(stderr, "Cant give access for device %d memory pool to main device \n", id);
}
}
} else {
fprintf(stderr,
"WARNING: Your main GPU device doesn't support CUDA memory pools. Using custom memory pool implementation.\n");
g_cudaMultiGpuMemPoolSupported = false;
}
}
#endif

for (int id = 0; id < g_device_count; ++id) {
g_tensor_split[id] /= total_vram;
}
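Note (not part of the diff): the cross-device section above boils down to granting each peer read/write access to the other's pool via cudaMemPoolSetAccess. A minimal sketch follows; grant_pool_access is a made-up helper, and the cudaDeviceCanAccessPeer check is an extra precaution this note adds, not something the patch does.

#include <cuda_runtime.h>

// Grant `peer` read/write access to `owner`'s default memory pool, so buffers
// allocated from that pool can be dereferenced by kernels running on `peer`.
static bool grant_pool_access(int owner, int peer) {
    int can_access = 0;
    // peer access has to be possible at the driver level in the first place
    if (cudaDeviceCanAccessPeer(&can_access, peer, owner) != cudaSuccess || !can_access) {
        return false;
    }

    cudaMemPool_t pool = nullptr;
    if (cudaDeviceGetDefaultMemPool(&pool, owner) != cudaSuccess) {
        return false;
    }

    cudaMemAccessDesc desc = {};
    desc.location.type = cudaMemLocationTypeDevice;
    desc.location.id   = peer;                           // device that receives access
    desc.flags         = cudaMemAccessFlagsProtReadWrite;

    return cudaMemPoolSetAccess(pool, &desc, /*count=*/1) == cudaSuccess;
}

// The patch applies this in both directions between the main device and every other
// device, i.e. grant_pool_access(main_device, id) and grant_pool_access(id, main_device).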
@@ -5869,13 +5955,6 @@ void ggml_init_cublas() {
// create cublas handle
CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
CUBLAS_CHECK(cublasSetMathMode(g_cublas_handles[id], CUBLAS_TF32_TENSOR_OP_MATH));

// configure memory pool
cudaError_t err = cudaDeviceGetMemPool(&g_cudaMemPools[id], id);
if (err == cudaSuccess) {
size_t treshold = UINT64_MAX;
CUDA_CHECK(cudaMemPoolSetAttribute(g_cudaMemPools[id], cudaMemPoolAttrReleaseThreshold, &treshold));
}
}

// configure logging to stdout
@@ -6375,7 +6454,7 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;

if (src1_convert_f16) {
src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
src1_dfloat = (half *) ggml_cuda_pool_malloc_async(ne00*sizeof(half), &ash, g_main_device, stream);
ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00,
ne00, 1, sizeof(float), 0, 0,
ne00, 1, sizeof(half), 0, 0, stream);
@@ -6776,22 +6855,22 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
if (src0_on_device) {
src0_ddf = (float *) src0_extra->data_device[g_main_device];
} else {
src0_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_asf);
src0_ddf = (float *) ggml_cuda_pool_malloc_async(ggml_nbytes(src0), &src0_asf, g_main_device, main_stream);
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
}

if (use_src1 && !src1_stays_on_host) {
if (src1_on_device) {
src1_ddf = (float *) src1_extra->data_device[g_main_device];
} else {
src1_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf);
src1_ddf = (float *) ggml_cuda_pool_malloc_async(ggml_nbytes(src1), &src1_asf, g_main_device, main_stream);
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
}
}
if (dst_on_device) {
dst_ddf = (float *) dst_extra->data_device[g_main_device];
} else {
dst_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(dst), &dst_asf);
dst_ddf = (float *) ggml_cuda_pool_malloc_async(ggml_nbytes(dst), &dst_asf, g_main_device, main_stream);
}

// do the computation
@@ -6803,18 +6882,18 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
}

if (dst->backend == GGML_BACKEND_CPU) {
CUDA_CHECK(cudaDeviceSynchronize());
}

if (src0_asf > 0) {
ggml_cuda_pool_free(src0_ddf, src0_asf);
ggml_cuda_pool_free_async(src0_ddf, src0_asf, g_main_device, main_stream);
}
if (src1_asf > 0) {
ggml_cuda_pool_free(src1_ddf, src1_asf);
ggml_cuda_pool_free_async(src1_ddf, src1_asf, g_main_device, main_stream);
}
if (dst_asf > 0) {
ggml_cuda_pool_free(dst_ddf, dst_asf);
}

if (dst->backend == GGML_BACKEND_CPU) {
CUDA_CHECK(cudaDeviceSynchronize());
ggml_cuda_pool_free_async(dst_ddf, dst_asf, g_main_device, main_stream);
}
}
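Note (not part of the diff): the *_async pool helpers that replace the synchronous calls above follow the usual stream-ordered lifecycle: allocate from the pool on a stream, consume the buffer on that same stream, then return it to the pool on the stream. A minimal sketch, with scale_kernel as a made-up kernel and error checking omitted for brevity:

#include <cuda_runtime.h>

// made-up kernel, used only to give the pooled buffer a consumer on the stream
__global__ void scale_kernel(float * x, int n, float s) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        x[i] *= s;
    }
}

int main() {
    const int n = 1 << 20;

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    cudaMemPool_t pool;
    cudaDeviceGetMemPool(&pool, /*device=*/0);

    float * buf = nullptr;
    // the allocation is ordered after all prior work submitted to `stream`
    cudaMallocFromPoolAsync((void **) &buf, n * sizeof(float), pool, stream);

    scale_kernel<<<(n + 255) / 256, 256, 0, stream>>>(buf, n, 2.0f);

    // the block goes back to the pool once the kernel has completed on `stream`
    cudaFreeAsync(buf, stream);

    cudaStreamSynchronize(stream);
    cudaStreamDestroy(stream);
    return 0;
}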
