
Commit 1df57db

Added back K-cache view for v instead of nullptr

1 parent 7d13fc2

1 file changed: +7 -1 lines changed


src/llama-graph.cpp: 7 additions & 1 deletion
@@ -1474,7 +1474,6 @@ ggml_tensor * llm_graph_context::build_attn(
                 0);
         //cb(k, "k", il);
 
-        // note: MLA with flash attention now uses the last 512 elements of K-cache in place of a V-cache
         ggml_tensor * v = nullptr;
 
         if (v_trans) {
@@ -1489,6 +1488,13 @@
                     ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa),
                     ggml_row_size(kv_self->v_l[il]->type, n_embd_head_v),
                     0);
+        } else {
+            // note: MLA with flash attention now uses the last 512 elements of K-cache in place of a V-cache
+            v = ggml_view_3d(ctx0, kv_self->k_l[il],
+                    n_embd_head_v, n_kv, n_head_kv,
+                    ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
+                    ggml_row_size(kv_self->k_l[il]->type, n_embd_head_k),
+                    n_embd_head_k-n_embd_head_v); // offset by n_rot elements
         }
 
         ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, v_trans, kq_scale);
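For context, the restored else branch builds v as a strided view into the existing K-cache tensor rather than allocating a separate V-cache. Below is a minimal standalone sketch (not part of the commit) of the same ggml_view_3d mechanics, using assumed MLA-style sizes: 576-element K rows whose trailing 512 elements are reused as V, with n_head_kv and n_kv chosen arbitrarily. Note that ggml_view_3d's final argument is a byte offset, so this sketch converts the element offset (n_embd_head_k - n_embd_head_v) to bytes with ggml_row_size.

    // Standalone sketch, assumed sizes: each K-cache row holds
    // n_embd_head_k = 576 elements per head, of which the last
    // n_embd_head_v = 512 are reused as V (no separate V-cache).
    #include "ggml.h"

    #include <cstdint>
    #include <cstdio>

    int main() {
        ggml_init_params params = {
            /*.mem_size   =*/ 16u*1024*1024,
            /*.mem_buffer =*/ nullptr,
            /*.no_alloc   =*/ false,
        };
        ggml_context * ctx = ggml_init(params);

        const int64_t n_embd_head_k = 576; // per-head K width
        const int64_t n_embd_head_v = 512; // per-head V width
        const int64_t n_head_kv     = 1;   // hypothetical head count
        const int64_t n_kv          = 8;   // hypothetical number of cached tokens
        const int64_t n_embd_k_gqa  = n_embd_head_k*n_head_kv;

        // Flat K-cache: one row of n_embd_k_gqa elements per cached token.
        ggml_tensor * k_cache = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd_k_gqa, n_kv);

        // View the trailing n_embd_head_v elements of each head's K row as V.
        // nb1 strides between tokens, nb2 strides between heads; the offset
        // is given in bytes, hence the ggml_row_size conversion.
        ggml_tensor * v = ggml_view_3d(ctx, k_cache,
                n_embd_head_v, n_kv, n_head_kv,
                ggml_row_size(k_cache->type, n_embd_k_gqa),
                ggml_row_size(k_cache->type, n_embd_head_k),
                ggml_row_size(k_cache->type, n_embd_head_k - n_embd_head_v));

        // The view shares storage with k_cache, starting 64 elements in.
        printf("v view: %lld x %lld x %lld, offset %zu bytes\n",
                (long long) v->ne[0], (long long) v->ne[1], (long long) v->ne[2],
                (size_t) ((char *) v->data - (char *) k_cache->data));

        ggml_free(ctx);
        return 0;
    }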
