
Commit 2e9e969

fix: Fix logic for initializing inputs and attn layers for hybrid caches
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
1 parent 31be8ae commit 2e9e969

File tree

2 files changed (+25, -66 lines)

    src/llama-graph.cpp
    src/llama-graph.h


src/llama-graph.cpp

Lines changed: 17 additions & 36 deletions
@@ -413,13 +413,6 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
     }
 }

-llm_graph_input_attn_kv_hybrid_recurrent::llm_graph_input_attn_kv_hybrid_recurrent(
-        const llama_hparams & hparams,
-        const llama_cparams & cparams,
-        const llama_kv_cache_hybrid_recurrent_state * kv_state) :
-    llm_graph_input_attn_kv_unified(hparams, cparams, kv_state->get_state_attn()) {
-}
-
 //
 // llm_graph_context
 //
@@ -1294,7 +1287,9 @@ ggml_tensor * llm_graph_context::build_attn(
     ggml_build_forward_expand(gf, k_cur);
     ggml_build_forward_expand(gf, v_cur);

-    const auto * kv_state = static_cast<const llama_kv_cache_unified_state *>(mstate);
+    // NOTE: For hybrid caches, this may be a child of mstate, so we use the one
+    //       encapsulated in inp
+    const auto * kv_state = inp->kv_state;

     // store to KV cache
     {
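The NOTE above is the heart of the fix: for a hybrid cache, mstate is the parent state rather than an attention state, so the blanket static_cast was wrong; the input object already carries the exact child state it was built from. A minimal standalone sketch of that pattern, using simplified stand-in types (kv_state_attn, kv_state_hybrid, attn_input are illustrative, not the llama.cpp classes):

    #include <cassert>

    struct kv_state_attn { int n_kv = 32; };                 // child state used by the attention layers

    struct kv_state_hybrid {                                 // parent state for a hybrid cache
        kv_state_attn attn;                                  // (the recurrent child is omitted here)
        const kv_state_attn * get_state_attn() const { return &attn; }
    };

    struct attn_input {                                      // analogue of llm_graph_input_attn_kv_unified
        const kv_state_attn * kv_state;                      // the exact state this input was built from
    };

    // Analogue of build_attn(): for hybrid caches the top-level memory state is
    // not an attention state, so we read the state from the input object instead
    // of casting mstate.
    int attend(const attn_input * inp, const void * /*mstate*/) {
        const auto * kv_state = inp->kv_state;               // was: static_cast from mstate
        return kv_state->n_kv;
    }

    int main() {
        kv_state_hybrid mstate;                              // hybrid parent owned by the context
        attn_input inp { mstate.get_state_attn() };          // input built from the attention child
        assert(attend(&inp, &mstate) == 32);
        return 0;
    }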
@@ -1326,10 +1321,10 @@ ggml_tensor * llm_graph_context::build_attn(
     return cur;
 }

-llm_graph_input_attn_kv_hybrid_recurrent * llm_graph_context::build_attn_inp_kv_hybrid_recurrent() const {
+llm_graph_input_attn_kv_unified * llm_graph_context::build_attn_inp_kv_hybrid_recurrent() const {
     const auto * kv_state = static_cast<const llama_kv_cache_hybrid_recurrent_state *>(mstate);

-    auto inp = std::make_unique<llm_graph_input_attn_kv_hybrid_recurrent>(hparams, cparams, kv_state);
+    auto inp = std::make_unique<llm_graph_input_attn_kv_unified>(hparams, cparams, kv_state->get_state_attn());

     {
         GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "Hybrid recurrent is not supported with SWA attention layers");
@@ -1343,25 +1338,7 @@ llm_graph_input_attn_kv_hybrid_recurrent * llm_graph_context::build_attn_inp_kv_
         inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
     }

-    return (llm_graph_input_attn_kv_hybrid_recurrent *) res->add_input(std::move(inp));
-}
-
-ggml_tensor * llm_graph_context::build_attn(
-        llm_graph_input_attn_kv_hybrid_recurrent * inp,
-        ggml_cgraph * gf,
-        ggml_tensor * wo,
-        ggml_tensor * wo_b,
-        ggml_tensor * q_cur,
-        ggml_tensor * k_cur,
-        ggml_tensor * v_cur,
-        ggml_tensor * kq_b,
-        ggml_tensor * v_mla,
-        float kq_scale,
-        int il) const {
-    return build_attn(
-        static_cast<llm_graph_input_attn_kv_unified *>(inp),
-        gf, wo, wo_b, q_cur, k_cur, v_cur, kq_b, v_mla, kq_scale, il
-    );
+    return (llm_graph_input_attn_kv_unified *) res->add_input(std::move(inp));
 }

 llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unified_iswa() const {
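With the builder returning the base llm_graph_input_attn_kv_unified type, the ordinary build_attn overload for that type applies directly, which is why the forwarding overload above could be deleted. A small standalone sketch of that idea, with hypothetical stand-in types in place of the real graph classes:

    #include <iostream>

    struct input_kv_unified { };                       // stands in for llm_graph_input_attn_kv_unified

    struct graph_context {
        // The builder now returns the base (unified) input type directly ...
        input_kv_unified * build_attn_inp_kv_hybrid_recurrent() const {
            static input_kv_unified inp;
            return &inp;
        }

        // ... so the one existing overload taking that type is selected, and the
        // hybrid-specific forwarding overload has nothing left to do.
        void build_attn(input_kv_unified * /*inp*/, int il) const {
            std::cout << "building attention for layer " << il << "\n";
        }
    };

    int main() {
        graph_context ctx;
        auto * inp = ctx.build_attn_inp_kv_hybrid_recurrent();
        for (int il = 0; il < 2; ++il) {
            ctx.build_attn(inp, il);                   // no static_cast, no wrapper
        }
        return 0;
    }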
@@ -1504,13 +1481,17 @@ ggml_tensor * llm_graph_context::build_attn(
 }

 ggml_tensor * llm_graph_context::build_copy_mask_state(
-        ggml_cgraph * gf,
-        ggml_tensor * s,
-        ggml_tensor * state_copy,
-        ggml_tensor * state_mask,
-        int32_t n_state,
-        int32_t n_seqs) const {
-    const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);
+        ggml_cgraph * gf,
+        ggml_tensor * s,
+        ggml_tensor * state_copy,
+        ggml_tensor * state_mask,
+        int32_t n_state,
+        int32_t n_seqs,
+        const llama_kv_cache_recurrent_state * kv_state) const {
+
+    if (kv_state == nullptr) {
+        kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);
+    }

     const auto n_kv = kv_state->get_n_kv();
     const auto kv_head = kv_state->get_head();
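The added trailing parameter keeps existing recurrent-only callers unchanged (they pass nothing and fall back to mstate), while hybrid callers can hand in the recurrent child state explicitly, presumably obtained from the hybrid state the same way get_state_attn() is used above (the recurrent-side accessor is not shown in this commit). A minimal standalone sketch of the default-argument pattern, with simplified stand-in types:

    #include <cassert>

    // Simplified stand-in for llama_kv_cache_recurrent_state.
    struct recurrent_state { int n_kv; };

    struct graph_context {
        const void * mstate;   // the context's generic memory state, as in llm_graph_context

        // Mirrors the new signature: the trailing kv_state defaults to nullptr.
        int build_copy_mask_state(const recurrent_state * kv_state = nullptr) const {
            if (kv_state == nullptr) {
                // unchanged default path: treat mstate itself as the recurrent state
                kv_state = static_cast<const recurrent_state *>(mstate);
            }
            return kv_state->n_kv;   // placeholder for building the real copy/mask ops
        }
    };

    int main() {
        recurrent_state top   {8};   // what mstate points at for a plain recurrent cache
        recurrent_state child {4};   // the recurrent child pulled out of a hybrid cache state

        graph_context ctx {&top};
        assert(ctx.build_copy_mask_state()       == 8);   // existing callers: no extra argument
        assert(ctx.build_copy_mask_state(&child) == 4);   // hybrid callers: pass the child explicitly
        return 0;
    }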

src/llama-graph.h

Lines changed: 8 additions & 30 deletions
@@ -298,16 +298,6 @@ class llm_graph_input_attn_kv_unified_iswa : public llm_graph_input_i {
     const llama_kv_cache_unified_iswa_state * kv_state;
 };

-class llm_graph_input_attn_kv_hybrid_recurrent : public llm_graph_input_attn_kv_unified {
-public:
-    llm_graph_input_attn_kv_hybrid_recurrent(
-            const llama_hparams & hparams,
-            const llama_cparams & cparams,
-            const llama_kv_cache_hybrid_recurrent_state * kv_state);
-
-    virtual ~llm_graph_input_attn_kv_hybrid_recurrent() = default;
-};
-
 class llm_graph_input_attn_cross : public llm_graph_input_i {
 public:
     llm_graph_input_attn_cross(const llama_cross * cross) : cross(cross) {}
@@ -598,20 +588,7 @@ struct llm_graph_context {
             float kq_scale,
             int il) const;

-    llm_graph_input_attn_kv_hybrid_recurrent * build_attn_inp_kv_hybrid_recurrent() const;
-
-    ggml_tensor * build_attn(
-            llm_graph_input_attn_kv_hybrid_recurrent * inp,
-            ggml_cgraph * gf,
-            ggml_tensor * wo,
-            ggml_tensor * wo_b,
-            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
-            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
-            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
-            ggml_tensor * kq_b,
-            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
-            float kq_scale,
-            int il) const;
+    llm_graph_input_attn_kv_unified * build_attn_inp_kv_hybrid_recurrent() const;

     llm_graph_input_attn_cross * build_attn_inp_cross() const;

@@ -633,12 +610,13 @@ struct llm_graph_context {
     //

     ggml_tensor * build_copy_mask_state(
-            ggml_cgraph * gf,
-            ggml_tensor * s,
-            ggml_tensor * state_copy,
-            ggml_tensor * state_mask,
-            int32_t n_state,
-            int32_t n_seqs) const;
+            ggml_cgraph * gf,
+            ggml_tensor * s,
+            ggml_tensor * state_copy,
+            ggml_tensor * state_mask,
+            int32_t n_state,
+            int32_t n_seqs,
+            const llama_kv_cache_recurrent_state * kv_state = nullptr) const;

     ggml_tensor * build_rwkv_token_shift_load(
             ggml_cgraph * gf,

0 commit comments
