From e96b20b29d7f40ea4fb7de2f2abc81ecbfa0d070 Mon Sep 17 00:00:00 2001
From: Andreas Kieslinger
Date: Thu, 13 Feb 2025 13:19:50 +0000
Subject: [PATCH 1/4] Feature: replaces the singular cuda_graph with a vector
 of cuda_graphs, adds a for-loop to cycle through them, and passes specific
 graphs to functions instead of the whole context
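The shape of the change, as a minimal standalone sketch (ggml_cuda_graph_stub is a
hypothetical stand-in for the real ggml_cuda_graph; the vector-of-unique_ptr layout and
the lazy allocation mirror the diff below, while the real loop also captures and
launches each graph):

    #include <memory>
    #include <vector>

    struct ggml_cuda_graph_stub { };  // hypothetical stand-in for ggml_cuda_graph

    int main() {
        const int NUM_CUDA_GRAPHS = 1;
        std::vector<std::unique_ptr<ggml_cuda_graph_stub>> cuda_graphs;
        cuda_graphs.resize(NUM_CUDA_GRAPHS);     // slots start out empty (nullptr)
        for (auto & cuda_graph : cuda_graphs) {
            if (cuda_graph == nullptr) {         // allocated lazily, as in the patch
                cuda_graph.reset(new ggml_cuda_graph_stub());
            }
            // per-graph capture/launch would go here
        }
        return 0;
    }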
---
 ggml/src/ggml-cuda/common.cuh   |   3 +-
 ggml/src/ggml-cuda/ggml-cuda.cu | 177 ++++++++++++++++----------------
 2 files changed, 93 insertions(+), 87 deletions(-)

diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index fd4dcfa941d4b..518af27a4c408 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -703,7 +703,8 @@ struct ggml_backend_cuda_context {
     cudaStream_t streams[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { { nullptr } };
     cublasHandle_t cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

-    std::unique_ptr<ggml_cuda_graph> cuda_graph;
+    const int NUM_CUDA_GRAPHS = 1;
+    std::vector<std::unique_ptr<ggml_cuda_graph>> cuda_graphs;

     explicit ggml_backend_cuda_context(int device) :
         device(device),
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 6d5d9aa54703b..365e8d96654d4 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2414,11 +2414,11 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
 }

 #ifdef USE_CUDA_GRAPH
-static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
+static bool check_node_graph_compatibility_and_refresh_copy_ops(std::unique_ptr<ggml_cuda_graph> & cuda_graph, ggml_cgraph * cgraph,
     std::vector<void *> & ggml_cuda_cpy_fn_ptrs, bool use_cuda_graph) {

     // Loop over nodes in GGML graph to obtain info needed for CUDA graph
-    cuda_ctx->cuda_graph->updated_kernel_arg.clear();
+    cuda_graph->updated_kernel_arg.clear();
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];

@@ -2451,7 +2451,7 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud

         if (node->op == GGML_OP_CPY) {
             // store the copy op parameter which changes with each token.
-            cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
+            cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
             // store a pointer to each copy op CUDA kernel to identify it later
             void * ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
             if (!ptr) {
@@ -2525,26 +2525,28 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra
     return true;
 }

-static void maintain_cuda_graph(ggml_backend_cuda_context * cuda_ctx, std::vector<void *> & ggml_cuda_cpy_fn_ptrs, bool cuda_graph_update_required) {
+static void maintain_cuda_graph(std::unique_ptr<ggml_cuda_graph> & cuda_graph, std::vector<void *> & ggml_cuda_cpy_fn_ptrs,
+    bool cuda_graph_update_required) {

     if (cuda_graph_update_required) {
         // Extract nodes from graph
         // First call with null argument gets number of nodes in graph
-        CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes));
+        CUDA_CHECK(cudaGraphGetNodes(cuda_graph->graph, nullptr, &cuda_graph->num_nodes));
         // Subsequent call with non-null argument gets nodes
-        cuda_ctx->cuda_graph->nodes.clear();
-        cuda_ctx->cuda_graph->nodes.resize(cuda_ctx->cuda_graph->num_nodes);
-        cuda_ctx->cuda_graph->params.clear();
-        cuda_ctx->cuda_graph->params.resize(cuda_ctx->cuda_graph->num_nodes);
-        if (cuda_ctx->cuda_graph->num_nodes > 0) {
-            CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, cuda_ctx->cuda_graph->nodes.data(), &cuda_ctx->cuda_graph->num_nodes));
+        cuda_graph->nodes.clear();
+        cuda_graph->nodes.resize(cuda_graph->num_nodes);
+        cuda_graph->params.clear();
+        cuda_graph->params.resize(cuda_graph->num_nodes);
+        if (cuda_graph->num_nodes > 0) {
+            CUDA_CHECK(cudaGraphGetNodes(cuda_graph->graph, cuda_graph->nodes.data(), &cuda_graph->num_nodes));

             // Loop over nodes, and extract kernel parameters from each node
-            for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
+            for (size_t i = 0; i < cuda_graph->num_nodes; i++) {
                 cudaGraphNodeType node_type;
-                CUDA_CHECK(cudaGraphNodeGetType(cuda_ctx->cuda_graph->nodes[i], &node_type));
+                CUDA_CHECK(cudaGraphNodeGetType(cuda_graph->nodes[i], &node_type));
                 if (node_type == cudaGraphNodeTypeKernel) {
-                    cudaError_t stat = cudaGraphKernelNodeGetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]); // Get params using runtime
+                    // Get params using runtime
+                    cudaError_t stat = cudaGraphKernelNodeGetParams(cuda_graph->nodes[i], &cuda_graph->params[i]);
                     if (stat == cudaErrorInvalidDeviceFunction) {
                         // Fails due to incorrect handling by CUDA runtime of CUDA BLAS node.
                         // We don't need to update blas nodes, so clear error and move on.
@@ -2560,28 +2562,28 @@ static void maintain_cuda_graph(ggml_backend_cuda_context * cuda_ctx, std::vecto
         // replace that argument with the updated value in the CUDA graph
         // on update steps, the live parameters will already be captured
         int k = 0;
-        for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
-            if(count(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), cuda_ctx->cuda_graph->params[i].func) > 0) {
-                char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++);
-                cuda_ctx->cuda_graph->params[i].kernelParams[1] = updated_kernel_arg_ptr;
-                CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]));
+        for (size_t i = 0; i < cuda_graph->num_nodes; i++) {
+            if(count(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), cuda_graph->params[i].func) > 0) {
+                char ** updated_kernel_arg_ptr = cuda_graph->updated_kernel_arg.at(k++);
+                cuda_graph->params[i].kernelParams[1] = updated_kernel_arg_ptr;
+                CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_graph->nodes[i], &cuda_graph->params[i]));
             }
         }
     }
 }

-static bool is_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph) {
+static bool is_cuda_graph_update_required(std::unique_ptr<ggml_cuda_graph> & cuda_graph, ggml_cgraph * cgraph) {

     bool cuda_graph_update_required = false;

-    if (cuda_ctx->cuda_graph->instance == nullptr) {
+    if (cuda_graph->instance == nullptr) {
         cuda_graph_update_required = true;
     }

     // Check if the graph size has changed
-    if (cuda_ctx->cuda_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) {
+    if (cuda_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) {
         cuda_graph_update_required = true;
-        cuda_ctx->cuda_graph->ggml_graph_properties.resize(cgraph->n_nodes);
+        cuda_graph->ggml_graph_properties.resize(cgraph->n_nodes);
     }

     // Loop over nodes in GGML graph to determine if CUDA graph update is required
@@ -2589,25 +2591,25 @@ static bool is_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx,
     for (int i = 0; i < cgraph->n_nodes; i++) {
         bool has_matching_properties = true;
         if (!cuda_graph_update_required) {
-            has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
+            has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_graph->ggml_graph_properties[i]);
         }
         if (!has_matching_properties) {
             cuda_graph_update_required = true;
         }
-        set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
+        set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_graph->ggml_graph_properties[i]);
     }

     return cuda_graph_update_required;
 }

-static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
+static void update_cuda_graph_executable(std::unique_ptr<ggml_cuda_graph> & cuda_graph) {

     cudaGraphExecUpdateResultInfo result_info;
 #ifdef __HIP_PLATFORM_AMD__
     hipGraphNode_t errorNode;
-    hipError_t stat = hipGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &errorNode, &result_info);
+    hipError_t stat = hipGraphExecUpdate(cuda_graph->instance, cuda_graph->graph, &errorNode, &result_info);
 #else
-    cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
+    cudaError_t stat = cudaGraphExecUpdate(cuda_graph->instance, cuda_graph->graph, &result_info);
 #endif
     if (stat == cudaErrorGraphExecUpdateFailure) {
 #ifndef NDEBUG
@@ -2617,18 +2619,18 @@ static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
         // The pre-existing graph exec cannot be updated due to violated constraints
         // so instead clear error and re-instantiate
         (void)cudaGetLastError();
-        CUDA_CHECK(cudaGraphExecDestroy(cuda_ctx->cuda_graph->instance));
-        cuda_ctx->cuda_graph->instance = nullptr;
-        CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
+        CUDA_CHECK(cudaGraphExecDestroy(cuda_graph->instance));
+        cuda_graph->instance = nullptr;
+        CUDA_CHECK(cudaGraphInstantiate(&cuda_graph->instance, cuda_graph->graph, NULL, NULL, 0));
     } else {
         GGML_ASSERT(stat == cudaSuccess);
     }
 }
 #endif

-static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
-    [[maybe_unused]] std::vector<void *> & ggml_cuda_cpy_fn_ptrs, bool & graph_evaluated_or_captured, bool & use_cuda_graph,
-    bool & cuda_graph_update_required) {
+static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, std::unique_ptr<ggml_cuda_graph> & cuda_graph,
+    ggml_cgraph * cgraph, [[maybe_unused]] std::vector<void *> & ggml_cuda_cpy_fn_ptrs,
+    bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {

     while (!graph_evaluated_or_captured) {
         // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
@@ -2662,12 +2664,12 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
 #ifdef USE_CUDA_GRAPH
         if (use_cuda_graph && cuda_graph_update_required) { // End CUDA graph capture
-            if (cuda_ctx->cuda_graph->graph != nullptr) {
-                CUDA_CHECK(cudaGraphDestroy(cuda_ctx->cuda_graph->graph));
-                cuda_ctx->cuda_graph->graph = nullptr;
+            if (cuda_graph->graph != nullptr) {
+                CUDA_CHECK(cudaGraphDestroy(cuda_graph->graph));
+                cuda_graph->graph = nullptr;
             }

-            CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_ctx->cuda_graph->graph));
+            CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_graph->graph));
             graph_evaluated_or_captured = true; // CUDA graph has been captured
         } else {
             graph_evaluated_or_captured = true; // ggml graph has been directly evaluated
@@ -2675,18 +2677,18 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
     }

     if (use_cuda_graph) {
-        if (cuda_ctx->cuda_graph->instance == nullptr) { // Create executable graph from captured graph.
-            CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
+        if (cuda_graph->instance == nullptr) { // Create executable graph from captured graph.
+            CUDA_CHECK(cudaGraphInstantiate(&cuda_graph->instance, cuda_graph->graph, NULL, NULL, 0));
         }

         // Perform update to graph (if required for this token), and change copy parameter (required for every token)
-        maintain_cuda_graph(cuda_ctx, ggml_cuda_cpy_fn_ptrs, cuda_graph_update_required);
+        maintain_cuda_graph(cuda_graph, ggml_cuda_cpy_fn_ptrs, cuda_graph_update_required);

         // Update graph executable
-        update_cuda_graph_executable(cuda_ctx);
+        update_cuda_graph_executable(cuda_graph);

         // Launch graph
-        CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream()));
+        CUDA_CHECK(cudaGraphLaunch(cuda_graph->instance, cuda_ctx->stream()));
 #else
         graph_evaluated_or_captured = true;
 #endif // USE_CUDA_GRAPH
@@ -2701,71 +2703,74 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
     // vector of pointers to CUDA cpy kernels, which are required to identify
     // kernel parameters which need updated in the graph for each token
     std::vector<void *> ggml_cuda_cpy_fn_ptrs;
+    cuda_ctx->cuda_graphs.resize(cuda_ctx->NUM_CUDA_GRAPHS);

+    for (auto & cuda_graph : cuda_ctx->cuda_graphs) {
 #ifdef USE_CUDA_GRAPH
-    static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
+        static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);

-    // Objects required for CUDA Graph
-    if (cuda_ctx->cuda_graph == nullptr) {
-        cuda_ctx->cuda_graph.reset(new ggml_cuda_graph());
-    }
+        // Objects required for CUDA Graph
+        if (cuda_graph == nullptr) {
+            cuda_graph.reset(new ggml_cuda_graph());
+        }

-    bool use_cuda_graph = true;
-    bool cuda_graph_update_required = false;
+        bool use_cuda_graph = true;
+        bool cuda_graph_update_required = false;

-    if (cuda_ctx->cuda_graph->graph == nullptr) {
-        if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_AMPERE) {
-            cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
+        if (cuda_graph->graph == nullptr) {
+            if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_AMPERE) {
+                cuda_graph->disable_due_to_gpu_arch = true;
 #ifndef NDEBUG
-            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
+                GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
 #endif
+            }
         }
-    }

-    // Disable CUDA graphs in presence of env var, old GPU, use-case which is changing too rapidly,
-    // or previous graph capture failure.
-    // Also disable for multi-gpu for now. TO DO investigate
-    if (disable_cuda_graphs_due_to_env
-        || cuda_ctx->cuda_graph->disable_due_to_gpu_arch
-        || cuda_ctx->cuda_graph->disable_due_to_too_many_updates
-        || cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture) {
-        use_cuda_graph = false;
-    }
+        // Disable CUDA graphs in presence of env var, old GPU, use-case which is changing too rapidly,
+        // or previous graph capture failure.
+        // Also disable for multi-gpu for now. TO DO investigate
+        if (disable_cuda_graphs_due_to_env
+            || cuda_graph->disable_due_to_gpu_arch
+            || cuda_graph->disable_due_to_too_many_updates
+            || cuda_graph->disable_due_to_failed_graph_capture) {
+            use_cuda_graph = false;
+        }

-    if (use_cuda_graph) {
-        cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph);
+        if (use_cuda_graph) {
+            cuda_graph_update_required = is_cuda_graph_update_required(cuda_graph, cgraph);

-        use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph,
-            ggml_cuda_cpy_fn_ptrs, use_cuda_graph);
+            use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_graph, cgraph,
+                ggml_cuda_cpy_fn_ptrs, use_cuda_graph);

-        // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
-        if (use_cuda_graph && cuda_graph_update_required) {
-            cuda_ctx->cuda_graph->number_consecutive_updates++;
-        } else {
-            cuda_ctx->cuda_graph->number_consecutive_updates = 0;
-        }
+            // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
+            if (use_cuda_graph && cuda_graph_update_required) {
+                cuda_graph->number_consecutive_updates++;
+            } else {
+                cuda_graph->number_consecutive_updates = 0;
+            }

-        if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
-            cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
+            if (cuda_graph->number_consecutive_updates >= 4) {
+                cuda_graph->disable_due_to_too_many_updates = true;
 #ifndef NDEBUG
-            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
+                GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
 #endif
+            }
         }
-    }

-    if (use_cuda_graph && cuda_graph_update_required) { // Start CUDA graph capture
-        CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
-    }
+        if (use_cuda_graph && cuda_graph_update_required) { // Start CUDA graph capture
+            CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
+        }
 #else
-    bool use_cuda_graph = false;
-    bool cuda_graph_update_required = false;
+        bool use_cuda_graph = false;
+        bool cuda_graph_update_required = false;
 #endif // USE_CUDA_GRAPH

-    bool graph_evaluated_or_captured = false;
-
-    evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, ggml_cuda_cpy_fn_ptrs, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);
+        bool graph_evaluated_or_captured = false;
+        evaluate_and_capture_cuda_graph(cuda_ctx, cuda_graph, cgraph, ggml_cuda_cpy_fn_ptrs,
+            graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);
+    }

     return GGML_STATUS_SUCCESS;
 }

From d03956af7c384cdf39a2d4c565b23491360e2187 Mon Sep 17 00:00:00 2001
From: Andreas Kieslinger
Date: Thu, 13 Feb 2025 14:36:21 +0000
Subject: [PATCH 2/4] Feature: Adds offset structure for each cuda graph.
 Tested by hard-coding 2 cuda graphs and setting custom offsets.
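The new struct is just a half-open [begin, end) window into cgraph->nodes. A minimal
standalone sketch of the idea (compilable on its own; everything except the
cgraph_offset shape is hypothetical scaffolding, not code from this patch):

    #include <cstdio>
    #include <vector>

    // same shape as the struct added below: a half-open range of node indices
    struct cgraph_offset {
        int begin;
        int end;
    };

    int main() {
        const int n_nodes = 10;  // hypothetical stand-in for cgraph->n_nodes
        // the hard-coded two-graph test from this patch, in miniature
        std::vector<cgraph_offset> offsets = { {0, 4}, {4, n_nodes} };
        for (const auto & o : offsets) {
            for (int i = o.begin; i < o.end; i++) {
                printf("node %d belongs to graph [%d, %d)\n", i, o.begin, o.end);
            }
        }
        return 0;
    }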
---
 ggml/src/ggml-cuda/common.cuh   |  1 -
 ggml/src/ggml-cuda/ggml-cuda.cu | 46 ++++++++++++++++++++++-----------
 2 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 518af27a4c408..0abda85331dee 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -703,7 +703,6 @@ struct ggml_backend_cuda_context {
     cudaStream_t streams[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { { nullptr } };
     cublasHandle_t cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

-    const int NUM_CUDA_GRAPHS = 1;
     std::vector<std::unique_ptr<ggml_cuda_graph>> cuda_graphs;

     explicit ggml_backend_cuda_context(int device) :
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 365e8d96654d4..c8513a6551cb2 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2413,13 +2413,19 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
     GGML_UNUSED(backend);
 }

+// groups cgraph->nodes offsets per cuda_graph
+struct cgraph_offset {
+    int begin;
+    int end;
+};
+
 #ifdef USE_CUDA_GRAPH
 static bool check_node_graph_compatibility_and_refresh_copy_ops(std::unique_ptr<ggml_cuda_graph> & cuda_graph, ggml_cgraph * cgraph,
-    std::vector<void *> & ggml_cuda_cpy_fn_ptrs, bool use_cuda_graph) {
+    std::vector<void *> & ggml_cuda_cpy_fn_ptrs, bool use_cuda_graph, cgraph_offset & offset) {

     // Loop over nodes in GGML graph to obtain info needed for CUDA graph
     cuda_graph->updated_kernel_arg.clear();
-    for (int i = 0; i < cgraph->n_nodes; i++) {
+    for (int i = offset.begin; i < offset.end; i++) {
         ggml_tensor * node = cgraph->nodes[i];

         if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
@@ -2572,7 +2578,8 @@ static void maintain_cuda_graph(std::unique_ptr<ggml_cuda_graph> & cuda_graph, s
         }
     }
 }

-static bool is_cuda_graph_update_required(std::unique_ptr<ggml_cuda_graph> & cuda_graph, ggml_cgraph * cgraph) {
+static bool is_cuda_graph_update_required(std::unique_ptr<ggml_cuda_graph> & cuda_graph, ggml_cgraph * cgraph,
+    cgraph_offset & offset) {

     bool cuda_graph_update_required = false;

@@ -2581,22 +2588,22 @@ static bool is_cuda_graph_update_required(std::unique_ptr<ggml_cuda_graph> & cud
     }

     // Check if the graph size has changed
-    if (cuda_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) {
+    if (cuda_graph->ggml_graph_properties.size() != (size_t)(offset.end - offset.begin)) {
         cuda_graph_update_required = true;
-        cuda_graph->ggml_graph_properties.resize(cgraph->n_nodes);
+        cuda_graph->ggml_graph_properties.resize((offset.end - offset.begin));
     }

     // Loop over nodes in GGML graph to determine if CUDA graph update is required
     // and store properties to allow this comparison for the next token
-    for (int i = 0; i < cgraph->n_nodes; i++) {
+    for (int i = offset.begin; i < offset.end; i++) {
         bool has_matching_properties = true;
         if (!cuda_graph_update_required) {
-            has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_graph->ggml_graph_properties[i]);
+            has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_graph->ggml_graph_properties[i - offset.begin]);
         }
         if (!has_matching_properties) {
             cuda_graph_update_required = true;
         }
-        set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_graph->ggml_graph_properties[i]);
+        set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_graph->ggml_graph_properties[i - offset.begin]);
     }

     return cuda_graph_update_required;
@@ -2628,15 +2635,15 @@ static void update_cuda_graph_executable(std::unique_ptr<ggml_cuda_graph> & cuda
 }
 #endif

-static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, std::unique_ptr<ggml_cuda_graph> & cuda_graph,
+static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, [[maybe_unused]] std::unique_ptr<ggml_cuda_graph> & cuda_graph,
     ggml_cgraph * cgraph, [[maybe_unused]] std::vector<void *> & ggml_cuda_cpy_fn_ptrs,
-    bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
+    bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required, cgraph_offset & offset) {

     while (!graph_evaluated_or_captured) {
         // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
         // With the use of CUDA graphs, the execution will be performed by the graph launch.
         if (!use_cuda_graph || cuda_graph_update_required) {
-            for (int i = 0; i < cgraph->n_nodes; i++) {
+            for (int i = offset.begin; i < offset.end; i++) {
                 ggml_tensor * node = cgraph->nodes[i];

                 if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
@@ -2703,9 +2710,18 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
     // vector of pointers to CUDA cpy kernels, which are required to identify
     // kernel parameters which need updated in the graph for each token
     std::vector<void *> ggml_cuda_cpy_fn_ptrs;
-    cuda_ctx->cuda_graphs.resize(cuda_ctx->NUM_CUDA_GRAPHS);
+    cuda_ctx->cuda_graphs.resize(2);
+    cgraph_offset offset {1,0};

     for (auto & cuda_graph : cuda_ctx->cuda_graphs) {
+        // hard-coded test for 2 graphs
+        if (offset.begin == 1) { // first subset
+            offset.begin = 0;
+            offset.end = 400;
+        } else { // second subset
+            offset.begin = offset.end;
+            offset.end = cgraph->n_nodes;
+        }
 #ifdef USE_CUDA_GRAPH
         static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);

@@ -2737,10 +2753,10 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
         }

         if (use_cuda_graph) {
-            cuda_graph_update_required = is_cuda_graph_update_required(cuda_graph, cgraph);
+            cuda_graph_update_required = is_cuda_graph_update_required(cuda_graph, cgraph, offset);

             use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_graph, cgraph,
-                ggml_cuda_cpy_fn_ptrs, use_cuda_graph);
+                ggml_cuda_cpy_fn_ptrs, use_cuda_graph, offset);

             // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
             if (use_cuda_graph && cuda_graph_update_required) {
@@ -2769,7 +2785,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,

         bool graph_evaluated_or_captured = false;
         evaluate_and_capture_cuda_graph(cuda_ctx, cuda_graph, cgraph, ggml_cuda_cpy_fn_ptrs,
-            graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);
+            graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required, offset);
     }
     return GGML_STATUS_SUCCESS;
 }

From 1d2f25ccde89486ecb84aaee8aec5814572f69b9 Mon Sep 17 00:00:00 2001
From: Andreas Kieslinger
Date: Thu, 13 Feb 2025 15:12:12 +0000
Subject: [PATCH 3/4] Feature: Implements mechanism, adapted from the vulkan
 backend, to distribute work over multiple CUDA graphs. The first two graphs
 are small to minimize idle time, and later graphs have uniform size.
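The node counts below (20/50/100) are the values introduced by this patch. As a rough
standalone sketch of the splitting schedule (split_graph is a hypothetical helper name;
the real code computes the offsets inline in ggml_backend_cuda_graph_compute):

    #include <cstdio>
    #include <vector>

    struct cgraph_offset { int begin; int end; };

    // Schedule from this patch: a 20-node graph, a 50-node graph, then
    // 100-node chunks, with the last graph absorbing the remainder.
    static std::vector<cgraph_offset> split_graph(int n_nodes) {
        const int first = 20, second = 50, rest = 100;
        std::vector<cgraph_offset> out;
        if (n_nodes <= first + second) {
            out.push_back({0, n_nodes});  // too small for the ramp-up: one graph
            return out;
        }
        const int num_graphs = 2 + (n_nodes - first - second) / rest;
        int begin = 0;
        for (int i = 0; i < num_graphs; i++) {
            int end = begin + (i == 0 ? first : (i == 1 ? second : rest));
            if (i == num_graphs - 1) {
                end = n_nodes;            // last graph does the rest
            }
            out.push_back({begin, end});
            begin = end;
        }
        return out;
    }

    int main() {
        // e.g. 420 nodes -> [0,20) [20,70) [70,170) [170,270) [270,420)
        for (const auto & o : split_graph(420)) {
            printf("[%d, %d)\n", o.begin, o.end);
        }
        return 0;
    }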
---
 ggml/src/ggml-cuda/ggml-cuda.cu | 42 +++++++++++++++++++++----------
 1 file changed, 30 insertions(+), 12 deletions(-)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index c8513a6551cb2..323f22b28f909 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2710,18 +2710,36 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
     // vector of pointers to CUDA cpy kernels, which are required to identify
     // kernel parameters which need updated in the graph for each token
     std::vector<void *> ggml_cuda_cpy_fn_ptrs;
-    cuda_ctx->cuda_graphs.resize(2);
-    cgraph_offset offset {1,0};
-
-    for (auto & cuda_graph : cuda_ctx->cuda_graphs) {
-        // hard-coded test for 2 graphs
-        if (offset.begin == 1) { // first subset
-            offset.begin = 0;
-            offset.end = 400;
-        } else { // second subset
-            offset.begin = offset.end;
-            offset.end = cgraph->n_nodes;
-        }
+
+    // Heuristic to minimize GPU idle time. Work is split over several CUDA graphs,
+    // to overlap graph building (CPU) and graph execution (GPU).
+    // The first graphs are small to minimize the time in which the CPU prepares work and the GPU is idle.
+    // After that, graph building (CPU) is done in parallel to the execution of another previously built graph (GPU).
+    int first_graph_subset = 20;
+    int second_graph_subset = 50;
+    int remaining_graph_subset = 100;
+    int remaining_nodes = (cgraph->n_nodes - first_graph_subset) - second_graph_subset;
+    int num_cuda_graphs_required = 2 + (remaining_nodes / remaining_graph_subset);
+    cuda_ctx->cuda_graphs.resize(num_cuda_graphs_required);
+    cgraph_offset offset {0,0};
+
+    for (size_t i = 0; i < cuda_ctx->cuda_graphs.size(); i++) {
+        auto & cuda_graph = cuda_ctx->cuda_graphs[i];
+
+        offset.begin = offset.end;
+        if (i == 0) offset.end += first_graph_subset;
+        if (i == 1) offset.end += second_graph_subset;
+        if (i >= 2) offset.end += remaining_graph_subset;
+
+        // last graph does the rest
+        if ((i + 1) == cuda_ctx->cuda_graphs.size()) offset.end = cgraph->n_nodes;
+
+        // special case for graphs smaller than the ramp-up heuristic
+        if (cgraph->n_nodes <= first_graph_subset + second_graph_subset) {
+            offset.end = cgraph->n_nodes;
+            if (i > 0) break;
+        }
+
 #ifdef USE_CUDA_GRAPH
         static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);

From 1591f9b8c4cc254fac51fb9f54ff28b649cd2170 Mon Sep 17 00:00:00 2001
From: Andreas Kieslinger
Date: Fri, 14 Feb 2025 10:08:38 +0000
Subject: [PATCH 4/4] FIX: Use make_unique() instead of new

---
 ggml/src/ggml-cuda/ggml-cuda.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 323f22b28f909..cd4e8ccfe1198 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2745,7 +2745,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,

         // Objects required for CUDA Graph
         if (cuda_graph == nullptr) {
-            cuda_graph.reset(new ggml_cuda_graph());
+            cuda_graph = std::make_unique<ggml_cuda_graph>();
        }

         bool use_cuda_graph = true;
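For reference, the idiom adopted by this final patch, as a tiny self-contained sketch
(ggml_cuda_graph_stub is a hypothetical stand-in type):

    #include <memory>

    struct ggml_cuda_graph_stub { };  // hypothetical stand-in type

    int main() {
        std::unique_ptr<ggml_cuda_graph_stub> g;
        g.reset(new ggml_cuda_graph_stub());           // before: explicit new
        g = std::make_unique<ggml_cuda_graph_stub>();  // after: no raw new (C++14)
        return 0;
    }

std::make_unique avoids spelling out new and keeps the allocation exception-safe.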