From 0fb5ca66fa2cb4dc3756b10b2892b2f8c7c9a167 Mon Sep 17 00:00:00 2001
From: "Guo, Xiang1"
Date: Mon, 19 May 2025 23:16:44 -0700
Subject: [PATCH] src: llama-graph: MLA KV cache: fix split graph backend assignment when the KV cache is stored on the CPU

---
 src/llama-graph.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 410d2608798b8..47c382f3d87f2 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -1156,6 +1156,10 @@ ggml_tensor * llm_graph_context::build_attn_mha(
         // for MLA with the absorption optimization, we need to "decompress" from MQA back to MHA
         if (v_mla) {
             kqv = ggml_mul_mat(ctx0, v_mla, kqv);
+            // all nodes between the KV store and the attention output are run on the CPU
+            if (!cparams.offload_kqv) {
+                ggml_backend_sched_set_tensor_backend(sched, kqv, backend_cpu);
+            }
         }
 
         cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
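
Note (not part of the patch): a minimal sketch of the pinning technique the hunk relies on, using only the public ggml scheduler call `ggml_backend_sched_set_tensor_backend()` from `ggml-backend.h`. The helper name and its parameter list are hypothetical illustrations, not llama.cpp code:

```cpp
#include "ggml.h"
#include "ggml-backend.h"

// Hypothetical helper (illustration only): pin one graph node to the CPU backend
// when KV offload is disabled, so the scheduler keeps the span between the KV
// store and the attention output on the CPU instead of splitting the graph
// across backends at that point.
static void pin_node_to_cpu_if_no_offload(
        ggml_backend_sched_t  sched,        // scheduler that assigns graph nodes to backends
        ggml_backend_t        backend_cpu,  // CPU backend where the KV cache buffers live
        struct ggml_tensor  * node,         // node to pin, e.g. the MLA "decompress" mul_mat output
        bool                  offload_kqv) {
    if (!offload_kqv) {
        // force this node onto the CPU backend; during graph splitting the
        // scheduler expands the assignment to neighbouring nodes where possible
        ggml_backend_sched_set_tensor_backend(sched, node, backend_cpu);
    }
}
```

In the patch this corresponds to pinning `kqv` right after the extra `ggml_mul_mat(ctx0, v_mla, kqv)` that the MLA absorption optimization introduces, mirroring how the other non-offloaded attention nodes are already kept on the CPU when `cparams.offload_kqv` is false.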