From e9bcf66a5cb611bc7a722edb35a5b38fed070f53 Mon Sep 17 00:00:00 2001
From: slaren
Date: Tue, 3 Oct 2023 17:49:36 +0200
Subject: [PATCH 1/3] per-layer KV

---
 llama.cpp | 109 +++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 71 insertions(+), 38 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 4a61eecdd328b..acc5ec7f7cf04 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1035,6 +1035,9 @@ struct llama_kv_cache {
     struct ggml_tensor * k = NULL;
     struct ggml_tensor * v = NULL;
 
+    std::vector<struct ggml_tensor *> k_l; // per layer
+
+    std::vector<struct ggml_tensor *> v_l;
 
     struct ggml_context * ctx = NULL;
 
@@ -1239,6 +1242,7 @@ static bool llama_kv_cache_init(
 
     cache.cells.clear();
     cache.cells.resize(n_ctx);
+    cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
 
     struct ggml_init_params params;
@@ -1248,34 +1252,48 @@ static bool llama_kv_cache_init(
 
     cache.ctx = ggml_init(params);
 
+    size_t vram_kv_cache = 0;
+
     if (!cache.ctx) {
         LLAMA_LOG_ERROR("%s: failed to allocate memory for kv cache\n", __func__);
         return false;
     }
 
-    cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
-    cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
-    ggml_set_name(cache.k, "cache_k");
-    ggml_set_name(cache.v, "cache_v");
+    // cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
+    // cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
+    // ggml_set_name(cache.k, "cache_k");
+    // ggml_set_name(cache.v, "cache_v");
 
-    (void) n_gpu_layers;
+    cache.k_l.reserve(n_layer);
+    cache.v_l.reserve(n_layer);
+
+    const int i_gpu_start = n_layer - n_gpu_layers;
+
+    for (uint32_t i = 0; i < n_layer; i++) {
+        ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, wtype, n_embd*n_ctx);
+        ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, wtype, n_embd*n_ctx);
+        ggml_format_name(k, "cache_k_l%d", i);
+        ggml_format_name(v, "cache_v_l%d", i);
+        cache.k_l.push_back(k);
+        cache.v_l.push_back(v);
 #ifdef GGML_USE_CUBLAS
-    size_t vram_kv_cache = 0;
+        if ((int)i >= i_gpu_start) {
+            ggml_cuda_assign_buffers_no_scratch(k);
+            LLAMA_LOG_INFO("%s: offloading k[%d] cache to GPU\n", __func__, i);
+            vram_kv_cache += ggml_nbytes(k);
 
-    if (n_gpu_layers > (int)n_layer + 1) {
-        ggml_cuda_assign_buffers_no_scratch(cache.v);
-        LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
-        vram_kv_cache += ggml_nbytes(cache.v);
+            ggml_cuda_assign_buffers_no_scratch(v);
+            LLAMA_LOG_INFO("%s: offloading v[%d] cache to GPU\n", __func__, i);
+            vram_kv_cache += ggml_nbytes(v);
         }
-    if (n_gpu_layers > (int)n_layer + 2) {
-        ggml_cuda_assign_buffers_no_scratch(cache.k);
-        LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
-        vram_kv_cache += ggml_nbytes(cache.k);
+#endif // GGML_USE_CUBLAS
     }
+
     if (vram_kv_cache > 0) {
         LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
     }
-#endif // GGML_USE_CUBLAS
+
+    (void) n_gpu_layers;
 
     return true;
 }
@@ -2634,17 +2652,17 @@ static struct ggml_cgraph * llm_build_llama(
     // offload functions set the tensor output backend to GPU
     // tensors are GPU-accelerated if any input or the output has been offloaded
     offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
-    offload_func_t offload_func_kq = llama_nop;
     offload_func_t offload_func_v  = llama_nop;
+    offload_func_t offload_func_kq = llama_nop;
 
 #ifdef GGML_USE_CUBLAS
     if (n_gpu_layers > n_layer) {
         offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
     }
-    if (n_gpu_layers > n_layer + 1) {
+    if (n_gpu_layers > 0) {
         offload_func_v  = ggml_cuda_assign_buffers_no_alloc;
     }
-    if (n_gpu_layers > n_layer + 2) {
+    if (n_gpu_layers > 0) {
         offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
     }
 #endif // GGML_USE_CUBLAS
@@ -2708,11 +2726,11 @@ static struct ggml_cgraph * llm_build_llama(
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * tmp = ggml_rope_custom_inplace(ctx0,
-                        ggml_view_3d(ctx0, kv_self.k,
+                        ggml_view_3d(ctx0, kv_self.k_l[il],
                             n_embd_head, n_head_kv, n_ctx,
-                            ggml_element_size(kv_self.k)*n_embd_head,
-                            ggml_element_size(kv_self.k)*n_embd_gqa,
-                            ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
+                            ggml_element_size(kv_self.k_l[il])*n_embd_head,
+                            ggml_element_size(kv_self.k_l[il])*n_embd_gqa,
+                            0),
                         K_shift, n_embd_head, 0, 0, freq_base, freq_scale);
             offload_func_kq(tmp);
             ggml_build_forward_expand(gf, tmp);
         }
     }
@@ -2723,10 +2741,14 @@ static struct ggml_cgraph * llm_build_llama(
         ggml_format_name(inpL, "layer_inp_%d", il);
 
         offload_func_t offload_func = llama_nop;
+        offload_func_v  = llama_nop;
+        offload_func_kq = llama_nop;
 
 #ifdef GGML_USE_CUBLAS
         if (il >= i_gpu_start) {
-            offload_func = ggml_cuda_assign_buffers_no_alloc;
+            offload_func    = ggml_cuda_assign_buffers_no_alloc;
+            offload_func_v  = ggml_cuda_assign_buffers_no_alloc;
+            offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
         }
 #endif // GGML_USE_CUBLAS
@@ -2775,13 +2797,13 @@ static struct ggml_cgraph * llm_build_llama(
                 offload_func_v(Vcur);
                 ggml_set_name(Vcur, "Vcur");
 
-                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
+                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k_l[il], n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k_l[il])*n_embd_gqa)*(kv_head));
                 offload_func_kq(k);
                 ggml_set_name(k, "k");
 
-                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
-                        (   n_ctx)*ggml_element_size(kv_self.v),
-                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v_l[il], n_tokens, n_embd_gqa,
+                        (   n_ctx)*ggml_element_size(kv_self.v_l[il]),
+                        kv_head*ggml_element_size(kv_self.v_l[il]));
                 offload_func_v(v);
                 ggml_set_name(v, "v");
@@ -2795,11 +2817,11 @@ static struct ggml_cgraph * llm_build_llama(
             ggml_set_name(Q, "Q");
 
             struct ggml_tensor * K =
-                    ggml_view_3d(ctx0, kv_self.k,
+                    ggml_view_3d(ctx0, kv_self.k_l[il],
                             n_embd_head, n_kv, n_head_kv,
-                            ggml_element_size(kv_self.k)*n_embd_gqa,
-                            ggml_element_size(kv_self.k)*n_embd_head,
-                            ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+                            ggml_element_size(kv_self.k_l[il])*n_embd_gqa,
+                            ggml_element_size(kv_self.k_l[il])*n_embd_head,
+                            0);
             offload_func_kq(K);
             ggml_set_name(K, "K");
@@ -2826,11 +2848,11 @@ static struct ggml_cgraph * llm_build_llama(
             // split cached V into n_head heads
             struct ggml_tensor * V =
-                    ggml_view_3d(ctx0, kv_self.v,
+                    ggml_view_3d(ctx0, kv_self.v_l[il],
                             n_kv, n_embd_head, n_head_kv,
-                            ggml_element_size(kv_self.v)*n_ctx,
-                            ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
-                            ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
+                            ggml_element_size(kv_self.v_l[il])*n_ctx,
+                            ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head,
+                            0);
             offload_func_v(V);
             ggml_set_name(V, "V");
@@ -6872,7 +6894,14 @@ struct llama_context * llama_new_context_with_model(
         }
 
         {
-            const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
+            // const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
+            size_t memory_size = 0;
+            for (auto & k : ctx->kv_self.k_l) {
+                memory_size += ggml_nbytes(k);
+            }
+            for (auto & v : ctx->kv_self.v_l) {
+                memory_size += ggml_nbytes(v);
+            }
             LLAMA_LOG_INFO("%s: kv self size  = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
         }
@@ -6946,8 +6975,12 @@ struct llama_context * llama_new_context_with_model(
             }
 
             size_t kv_vram_size = 0;
-            add_tensor(ctx->kv_self.k, kv_vram_size);
-            add_tensor(ctx->kv_self.v, kv_vram_size);
+            for (auto & k : ctx->kv_self.k_l) {
+                add_tensor(k, kv_vram_size);
+            }
+            for (auto & v : ctx->kv_self.v_l) {
+                add_tensor(v, kv_vram_size);
+            }
 
             size_t ctx_vram_size = alloc_size + kv_vram_size;
             size_t total_vram_size = model_vram_size + ctx_vram_size;
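
Aside (not part of the patch): the core of PATCH 1/3 is an indexing change. The single flat K/V cache addressed with an `il*n_ctx` term becomes one tensor per layer (`kv_self.k_l[il]` / `kv_self.v_l[il]`), so every per-layer view starts at offset 0. A minimal standalone C++ sketch of that offset arithmetic, with made-up sizes and plain vectors standing in for ggml tensors:

// Illustrative sketch only -- plain C++, no ggml. Dimensions are hypothetical;
// the point is the offset change introduced in PATCH 1/3.
#include <cassert>
#include <cstddef>
#include <vector>

int main() {
    const std::size_t n_layer    = 4;   // hypothetical model dimensions
    const std::size_t n_ctx      = 8;
    const std::size_t n_embd_gqa = 16;

    // Before: one flat K buffer shared by all layers; a cell is addressed as
    //   n_embd_gqa*(il*n_ctx + kv_head)
    std::vector<float> k_flat(n_layer*n_ctx*n_embd_gqa);

    // After: one buffer per layer; the layer index moves from the offset into
    // the container lookup, leaving only n_embd_gqa*kv_head.
    std::vector<std::vector<float>> k_l(n_layer, std::vector<float>(n_ctx*n_embd_gqa));

    const std::size_t il = 2, kv_head = 3;
    const std::size_t flat_offset      = n_embd_gqa*(il*n_ctx + kv_head); // old formula
    const std::size_t per_layer_offset = n_embd_gqa*kv_head;              // new formula

    // The old offset is the new offset plus the start of layer il's slab.
    assert(flat_offset == il*n_ctx*n_embd_gqa + per_layer_offset);

    k_flat[flat_offset]       = 1.0f; // same logical cell, two addressing schemes
    k_l[il][per_layer_offset] = 1.0f;
    return 0;
}

Splitting the cache per layer is what makes it possible to offload only the K/V tensors of layers that actually run on the GPU, as the loop in llama_kv_cache_init above does.
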
From 55f2f2fb43baf966ac37326f5fb9abe2112d38a1 Mon Sep 17 00:00:00 2001
From: slaren
Date: Wed, 4 Oct 2023 01:53:21 +0200
Subject: [PATCH 2/3] remove unnecessary copies

---
 llama.cpp | 68 +++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 53 insertions(+), 15 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index acc5ec7f7cf04..53793eeeb578c 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2676,13 +2676,34 @@ static struct ggml_cgraph * llm_build_llama(
     }
 
     // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
-    offload_func_kq(KQ_mask);
-    ggml_set_name(KQ_mask, "KQ_mask");
-    ggml_allocr_alloc(lctx.alloc, KQ_mask);
+    struct ggml_tensor * KQ_mask_gpu = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+    offload_func_kq(KQ_mask_gpu);
+    ggml_set_name(KQ_mask_gpu, "KQ_mask_gpu");
+    ggml_allocr_alloc(lctx.alloc, KQ_mask_gpu);
     if (!ggml_allocr_is_measure(lctx.alloc)) {
-        float * data = (float *) KQ_mask->data;
-        memset(data, 0, ggml_nbytes(KQ_mask));
+        float * data = (float *) KQ_mask_gpu->data;
+        memset(data, 0, ggml_nbytes(KQ_mask_gpu));
+
+        for (int h = 0; h < 1; ++h) {
+            for (int j = 0; j < n_tokens; ++j) {
+                const llama_pos    pos    = batch.pos[j];
+                const llama_seq_id seq_id = batch.seq_id[j];
+
+                for (int i = 0; i < n_kv; ++i) {
+                    if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
+                        data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
+                    }
+                }
+            }
+        }
+    }
+
+    struct ggml_tensor * KQ_mask_cpu = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+    ggml_set_name(KQ_mask_cpu, "KQ_mask_cpu");
+    ggml_allocr_alloc(lctx.alloc, KQ_mask_cpu);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        float * data = (float *) KQ_mask_cpu->data;
+        memset(data, 0, ggml_nbytes(KQ_mask_cpu));
 
         for (int h = 0; h < 1; ++h) {
             for (int j = 0; j < n_tokens; ++j) {
@@ -2699,12 +2720,21 @@ static struct ggml_cgraph * llm_build_llama(
         }
     }
 
     // KQ_pos - contains the positions
-    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
-    offload_func_kq(KQ_pos);
-    ggml_set_name(KQ_pos, "KQ_pos");
-    ggml_allocr_alloc(lctx.alloc, KQ_pos);
+    struct ggml_tensor * KQ_pos_gpu = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+    offload_func_kq(KQ_pos_gpu);
+    ggml_set_name(KQ_pos_gpu, "KQ_pos_gpu");
+    ggml_allocr_alloc(lctx.alloc, KQ_pos_gpu);
     if (!ggml_allocr_is_measure(lctx.alloc)) {
-        int * data = (int *) KQ_pos->data;
+        int * data = (int *) KQ_pos_gpu->data;
+        for (int i = 0; i < n_tokens; ++i) {
+            data[i] = batch.pos[i];
+        }
+    }
+    struct ggml_tensor * KQ_pos_cpu = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+    ggml_set_name(KQ_pos_cpu, "KQ_pos_cpu");
+    ggml_allocr_alloc(lctx.alloc, KQ_pos_cpu);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        int * data = (int *) KQ_pos_cpu->data;
         for (int i = 0; i < n_tokens; ++i) {
             data[i] = batch.pos[i];
         }
     }
@@ -2732,7 +2762,9 @@ static struct ggml_cgraph * llm_build_llama(
                             ggml_element_size(kv_self.k_l[il])*n_embd_gqa,
                             0),
                         K_shift, n_embd_head, 0, 0, freq_base, freq_scale);
-            offload_func_kq(tmp);
+            if (il >= i_gpu_start) {
+                offload_func_kq(tmp);
+            }
             ggml_build_forward_expand(gf, tmp);
         }
     }
@@ -2744,8 +2776,14 @@ static struct ggml_cgraph * llm_build_llama(
         offload_func_v  = llama_nop;
         offload_func_kq = llama_nop;
 
+        struct ggml_tensor * KQ_mask = KQ_mask_cpu;
+        struct ggml_tensor * KQ_pos  = KQ_pos_cpu;
+
+
 #ifdef GGML_USE_CUBLAS
         if (il >= i_gpu_start) {
+            KQ_mask = KQ_mask_gpu;
+            KQ_pos  = KQ_pos_gpu;
             offload_func    = ggml_cuda_assign_buffers_no_alloc;
             offload_func_v  = ggml_cuda_assign_buffers_no_alloc;
             offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
@@ -2779,11 +2817,11 @@ static struct ggml_cgraph * llm_build_llama(
 
                 struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
                 offload_func_kq(Kcur);
-                ggml_set_name(Kcur, "Kcur");
+                ggml_format_name(Kcur, "Kcur%d", il);
 
                 struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
                 offload_func_kq(Qcur);
-                ggml_set_name(Qcur, "Qcur");
+                ggml_format_name(Qcur, "Qcur%d", il);
 
                 // store key and value to memory
                 {
@@ -2839,7 +2877,7 @@ static struct ggml_cgraph * llm_build_llama(
             // KQ_masked = mask_past(KQ_scaled)
             struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
             offload_func_kq(KQ_masked);
-            ggml_set_name(KQ_masked, "KQ_masked");
+            ggml_format_name(KQ_masked, "KQ_masked%d", il);
 
             // KQ = soft_max(KQ_masked)
             struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
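
Aside (not part of the patch): PATCH 2/3 avoids the unnecessary copies by keeping a host copy and an offloaded copy of KQ_mask/KQ_pos and letting each layer pick the one that matches where it runs, so CPU layers never read back from VRAM. A standalone C++ sketch of that per-layer selection, with hypothetical names and a made-up layer split:

// Illustrative only: models the "pick the GPU or CPU copy per layer" pattern
// from PATCH 2/3 with plain pointers instead of ggml tensors.
#include <cstdio>

int main() {
    const int n_layer      = 8;
    const int n_gpu_layers = 3;                      // hypothetical split
    const int i_gpu_start  = n_layer - n_gpu_layers; // same formula as the patch

    int mask_cpu = 0, mask_gpu = 1;                  // stand-ins for KQ_mask_cpu / KQ_mask_gpu

    for (int il = 0; il < n_layer; ++il) {
        // Default to the host copy; switch to the offloaded copy only for
        // layers that actually run on the GPU.
        int * mask = &mask_cpu;
        if (il >= i_gpu_start) {
            mask = &mask_gpu;
        }
        std::printf("layer %d uses the %s mask\n", il, mask == &mask_gpu ? "GPU" : "CPU");
    }
    return 0;
}

PATCH 3/3 below then collapses the duplicated fill code by building only one tensor and taking a ggml_view_tensor of it for the offloaded copy.
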
From f4f9367faa1d7bf1f77933fdce5fc4a7ad670207 Mon Sep 17 00:00:00 2001
From: slaren
Date: Fri, 6 Oct 2023 15:44:06 +0200
Subject: [PATCH 3/3] less code duplication, offload k and v separately

---
 llama.cpp | 113 ++++++++++++++++++++++--------------------------------
 1 file changed, 46 insertions(+), 67 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 53793eeeb578c..dac32e60989cb 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1,3 +1,7 @@
+// TODO: move to context params
+bool offload_k = true;
+bool offload_v = true;
+
 #define LLAMA_API_INTERNAL
 #include "llama.h"
 
@@ -1035,9 +1039,9 @@ struct llama_kv_cache {
     struct ggml_tensor * k = NULL;
     struct ggml_tensor * v = NULL;
 
-    std::vector<struct ggml_tensor *> k_l; // per layer
-
-    std::vector<struct ggml_tensor *> v_l;
+    std::vector<struct ggml_tensor *> k_l; // per layer
+    std::vector<struct ggml_tensor *> v_l;
 
     struct ggml_context * ctx = NULL;
 
@@ -1259,11 +1263,6 @@ static bool llama_kv_cache_init(
         return false;
     }
 
-    // cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
-    // cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
-    // ggml_set_name(cache.k, "cache_k");
-    // ggml_set_name(cache.v, "cache_v");
-
     cache.k_l.reserve(n_layer);
     cache.v_l.reserve(n_layer);
 
@@ -1278,13 +1277,14 @@ static bool llama_kv_cache_init(
         cache.v_l.push_back(v);
 #ifdef GGML_USE_CUBLAS
         if ((int)i >= i_gpu_start) {
-            ggml_cuda_assign_buffers_no_scratch(k);
-            LLAMA_LOG_INFO("%s: offloading k[%d] cache to GPU\n", __func__, i);
-            vram_kv_cache += ggml_nbytes(k);
-
-            ggml_cuda_assign_buffers_no_scratch(v);
-            LLAMA_LOG_INFO("%s: offloading v[%d] cache to GPU\n", __func__, i);
-            vram_kv_cache += ggml_nbytes(v);
+            if (offload_k) {
+                ggml_cuda_assign_buffers_no_scratch(k);
+                vram_kv_cache += ggml_nbytes(k);
+            }
+            if (offload_v) {
+                ggml_cuda_assign_buffers_no_scratch(v);
+                vram_kv_cache += ggml_nbytes(v);
+            }
         }
 #endif // GGML_USE_CUBLAS
     }
@@ -2659,10 +2659,10 @@ static struct ggml_cgraph * llm_build_llama(
     if (n_gpu_layers > n_layer) {
         offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
     }
-    if (n_gpu_layers > 0) {
+    if (n_gpu_layers > 0 && offload_v) {
         offload_func_v  = ggml_cuda_assign_buffers_no_alloc;
     }
-    if (n_gpu_layers > 0) {
+    if (n_gpu_layers > 0 && offload_k) {
         offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
     }
 #endif // GGML_USE_CUBLAS
@@ -2676,13 +2676,12 @@ static struct ggml_cgraph * llm_build_llama(
     }
 
     // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-    struct ggml_tensor * KQ_mask_gpu = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
-    offload_func_kq(KQ_mask_gpu);
-    ggml_set_name(KQ_mask_gpu, "KQ_mask_gpu");
-    ggml_allocr_alloc(lctx.alloc, KQ_mask_gpu);
+    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+    ggml_set_name(KQ_mask, "KQ_mask");
+    ggml_allocr_alloc(lctx.alloc, KQ_mask);
     if (!ggml_allocr_is_measure(lctx.alloc)) {
-        float * data = (float *) KQ_mask_gpu->data;
-        memset(data, 0, ggml_nbytes(KQ_mask_gpu));
+        float * data = (float *) KQ_mask->data;
+        memset(data, 0, ggml_nbytes(KQ_mask));
 
         for (int h = 0; h < 1; ++h) {
             for (int j = 0; j < n_tokens; ++j) {
@@ -2698,48 +2697,25 @@ static struct ggml_cgraph * llm_build_llama(
             }
         }
     }
-
-    struct ggml_tensor * KQ_mask_cpu = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
-    ggml_set_name(KQ_mask_cpu, "KQ_mask_cpu");
-    ggml_allocr_alloc(lctx.alloc, KQ_mask_cpu);
-    if (!ggml_allocr_is_measure(lctx.alloc)) {
-        float * data = (float *) KQ_mask_cpu->data;
-        memset(data, 0, ggml_nbytes(KQ_mask_cpu));
-
-        for (int h = 0; h < 1; ++h) {
-            for (int j = 0; j < n_tokens; ++j) {
-                const llama_pos    pos    = batch.pos[j];
-                const llama_seq_id seq_id = batch.seq_id[j];
-
-                for (int i = 0; i < n_kv; ++i) {
-                    if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
-                        data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
-                    }
-                }
-            }
-        }
-    }
+    struct ggml_tensor * KQ_mask_gpu = ggml_view_tensor(ctx0, KQ_mask);
+    offload_func_kq(KQ_mask_gpu);
+    ggml_set_name(KQ_mask_gpu, "KQ_mask_gpu");
 
     // KQ_pos - contains the positions
-    struct ggml_tensor * KQ_pos_gpu = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
-    offload_func_kq(KQ_pos_gpu);
-    ggml_set_name(KQ_pos_gpu, "KQ_pos_gpu");
-    ggml_allocr_alloc(lctx.alloc, KQ_pos_gpu);
-    if (!ggml_allocr_is_measure(lctx.alloc)) {
-        int * data = (int *) KQ_pos_gpu->data;
-        for (int i = 0; i < n_tokens; ++i) {
-            data[i] = batch.pos[i];
-        }
-    }
-    struct ggml_tensor * KQ_pos_cpu = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
-    ggml_set_name(KQ_pos_cpu, "KQ_pos_cpu");
-    ggml_allocr_alloc(lctx.alloc, KQ_pos_cpu);
+    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+    ggml_set_name(KQ_pos, "KQ_pos");
+    ggml_allocr_alloc(lctx.alloc, KQ_pos);
     if (!ggml_allocr_is_measure(lctx.alloc)) {
-        int * data = (int *) KQ_pos_cpu->data;
+        int * data = (int *) KQ_pos->data;
         for (int i = 0; i < n_tokens; ++i) {
             data[i] = batch.pos[i];
         }
     }
+    struct ggml_tensor * KQ_pos_gpu = ggml_view_tensor(ctx0, KQ_pos);
+    offload_func_kq(KQ_pos_gpu);
+    ggml_set_name(KQ_pos_gpu, "KQ_pos_gpu");
+
     // shift the entire K-cache if needed
     if (do_rope_shift) {
         struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
@@ -2776,17 +2752,20 @@ static struct ggml_cgraph * llm_build_llama(
         offload_func_v  = llama_nop;
         offload_func_kq = llama_nop;
 
-        struct ggml_tensor * KQ_mask = KQ_mask_cpu;
-        struct ggml_tensor * KQ_pos  = KQ_pos_cpu;
-
+        struct ggml_tensor * KQ_mask_l = KQ_mask;
+        struct ggml_tensor * KQ_pos_l  = KQ_pos;
 
 #ifdef GGML_USE_CUBLAS
         if (il >= i_gpu_start) {
-            KQ_mask = KQ_mask_gpu;
-            KQ_pos  = KQ_pos_gpu;
-            offload_func    = ggml_cuda_assign_buffers_no_alloc;
-            offload_func_v  = ggml_cuda_assign_buffers_no_alloc;
-            offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
+            offload_func = ggml_cuda_assign_buffers_no_alloc;
+            if (offload_k) {
+                KQ_mask_l = KQ_mask_gpu;
+                KQ_pos_l  = KQ_pos_gpu;
+                offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
+            }
+            if (offload_v) {
+                offload_func_v = ggml_cuda_assign_buffers_no_alloc;
+            }
         }
 #endif // GGML_USE_CUBLAS
@@ -2815,11 +2794,11 @@ static struct ggml_cgraph * llm_build_llama(
                 offload_func_kq(tmpq);
                 ggml_set_name(tmpq, "tmpq");
 
-                struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
+                struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos_l, n_embd_head, 0, 0, freq_base, freq_scale);
                 offload_func_kq(Kcur);
                 ggml_format_name(Kcur, "Kcur%d", il);
 
-                struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
+                struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos_l, n_embd_head, 0, 0, freq_base, freq_scale);
                 offload_func_kq(Qcur);
                 ggml_format_name(Qcur, "Qcur%d", il);
@@ -2875,7 +2854,7 @@ static struct ggml_cgraph * llm_build_llama(
             ggml_set_name(KQ_scaled, "KQ_scaled");
 
             // KQ_masked = mask_past(KQ_scaled)
-            struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
+            struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask_l);
             offload_func_kq(KQ_masked);
             ggml_format_name(KQ_masked, "KQ_masked%d", il);
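
Aside (not part of the patch series): the file-scope `offload_k` / `offload_v` globals added in PATCH 3/3 carry a `// TODO: move to context params`. A hypothetical C++ sketch of how such flags could live in a params struct instead; the struct name, fields, and helper below are illustrative assumptions, not an existing llama.cpp API:

// Hypothetical sketch only: one way the TODO could be resolved by carrying the
// flags in a params struct rather than file-scope globals. Names are made up.
#include <cstdio>

struct my_context_params {
    int  n_gpu_layers = 0;
    bool offload_k    = true;  // offload the K cache for GPU layers
    bool offload_v    = true;  // offload the V cache for GPU layers
};

static void plan_kv_offload(const my_context_params & params, int n_layer) {
    const int i_gpu_start = n_layer - params.n_gpu_layers; // same split as the patch
    for (int il = 0; il < n_layer; ++il) {
        const bool gpu_layer = il >= i_gpu_start;
        const bool k_on_gpu  = gpu_layer && params.offload_k;
        const bool v_on_gpu  = gpu_layer && params.offload_v;
        std::printf("layer %2d: k=%s v=%s\n", il, k_on_gpu ? "GPU" : "CPU", v_on_gpu ? "GPU" : "CPU");
    }
}

int main() {
    my_context_params params;
    params.n_gpu_layers = 2;
    params.offload_v    = false; // e.g. keep V on the host to save VRAM
    plan_kv_offload(params, 4);
    return 0;
}

Keeping the flags in per-context parameters would let K and V offloading be chosen per context, matching how PATCH 3/3 already gates them separately with offload_k and offload_v.
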