Commit bbf6c35 (parent: a11e742)

feat: Use common methods for accessing recurrent and unified caches in llama-graph

Branch: HybridRecurrentCache
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

File tree: 2 files changed (+37, -8 lines)

src/llama-graph.cpp (32 additions, 8 deletions)

@@ -954,7 +954,7 @@ ggml_tensor * llm_graph_context::build_inp_cls() const {
 }
 
 ggml_tensor * llm_graph_context::build_inp_s_copy() const {
-    const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
+    const llama_kv_cache_recurrent * kv_self = get_recurrent_cache();
 
     auto inp = std::make_unique<llm_graph_input_s_copy>(kv_self);
 
@@ -971,7 +971,7 @@ ggml_tensor * llm_graph_context::build_inp_s_copy() const {
 }
 
 ggml_tensor * llm_graph_context::build_inp_s_mask() const {
-    const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
+    const llama_kv_cache_recurrent * kv_self = get_recurrent_cache();
 
     auto inp = std::make_unique<llm_graph_input_s_mask>(kv_self);
 
@@ -1025,7 +1025,7 @@ ggml_tensor * llm_graph_context::build_inp_pos_bucket_enc() const {
 }
 
 ggml_tensor * llm_graph_context::build_inp_pos_bucket_dec() const {
-    const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+    const llama_kv_cache_unified * kv_self = get_unified_cache();
 
     auto inp = std::make_unique<llm_graph_input_pos_bucket_kv>(hparams, kv_self);
 
@@ -1231,7 +1231,7 @@ ggml_tensor * llm_graph_context::build_attn(
 }
 
 llm_graph_input_attn_kv_unified * llm_graph_context::build_attn_inp_kv_unified() const {
-    const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+    const llama_kv_cache_unified * kv_self = get_unified_cache();
 
     auto inp = std::make_unique<llm_graph_input_attn_kv_unified>(hparams, cparams, kv_self);
 
@@ -1268,7 +1268,7 @@ ggml_tensor * llm_graph_context::build_attn(
     ggml_build_forward_expand(gf, k_cur);
     ggml_build_forward_expand(gf, v_cur);
 
-    const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+    const llama_kv_cache_unified * kv_self = get_unified_cache();
 
     // store to KV cache
     {
@@ -1439,14 +1439,38 @@ ggml_tensor * llm_graph_context::build_attn(
     return cur;
 }
 
+const llama_kv_cache_recurrent * llm_graph_context::get_recurrent_cache() const {
+    const llama_kv_cache_recurrent * kv_self = dynamic_cast<const llama_kv_cache_recurrent *>(memory);
+    if (!kv_self) {
+        const llama_kv_cache_hybrid_recurrent * kv_hybrid = dynamic_cast<const llama_kv_cache_hybrid_recurrent *>(memory);
+        if (kv_hybrid) {
+            kv_self = kv_hybrid->get_kv_recurrent();
+        }
+    }
+    GGML_ASSERT(kv_self);
+    return kv_self;
+}
+
+const llama_kv_cache_unified * llm_graph_context::get_unified_cache() const {
+    const llama_kv_cache_unified * kv_self = dynamic_cast<const llama_kv_cache_unified *>(memory);
+    if (!kv_self) {
+        const llama_kv_cache_hybrid_recurrent * kv_hybrid = dynamic_cast<const llama_kv_cache_hybrid_recurrent *>(memory);
+        if (kv_hybrid) {
+            kv_self = kv_hybrid->get_kv_attn();
+        }
+    }
+    GGML_ASSERT(kv_self);
+    return kv_self;
+}
+
 ggml_tensor * llm_graph_context::build_copy_mask_state(
          ggml_cgraph * gf,
          ggml_tensor * s,
          ggml_tensor * state_copy,
          ggml_tensor * state_mask,
              int32_t   n_state,
              int32_t   n_seqs) const {
-    const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
+    const llama_kv_cache_recurrent * kv_self = get_recurrent_cache();
 
     const auto n_kv    = kv_self->n;
     const auto kv_head = kv_self->head;
@@ -1478,7 +1502,7 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_load(
          ggml_tensor * state_mask,
   const llama_ubatch & ubatch,
                   int   il) const {
-    const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
+    const llama_kv_cache_recurrent * kv_self = get_recurrent_cache();
 
     const auto token_shift_count = hparams.token_shift_count;
 
@@ -1499,7 +1523,7 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_store(
          ggml_tensor * token_shift,
   const llama_ubatch & ubatch,
                   int   il) const {
-    const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
+    const llama_kv_cache_recurrent * kv_self = get_recurrent_cache();
 
     const auto token_shift_count = hparams.token_shift_count;
     const auto n_embd            = hparams.n_embd;
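
Both new getters follow the same shape: try to interpret `memory` directly as the target cache type, and if that fails, check whether it is a `llama_kv_cache_hybrid_recurrent` and pull the matching child cache out of it. Because the lookup now goes through checked `dynamic_cast`s ending in a `GGML_ASSERT`, a graph builder invoked against the wrong cache type fails loudly instead of reading through a mis-typed pointer as the old `static_cast` would. A minimal standalone sketch of the dispatch pattern, using simplified stand-in types rather than the real llama.cpp classes:

#include <cassert>

// Simplified stand-ins for the llama.cpp cache hierarchy (assumed shapes,
// not the real classes). All caches derive from one polymorphic memory
// interface so the graph context can hold a single `memory` pointer.
struct llama_memory_i {
    virtual ~llama_memory_i() = default;
};
struct kv_cache_unified   : llama_memory_i {};
struct kv_cache_recurrent : llama_memory_i {};

// A hybrid cache owns one child of each kind and exposes them via getters,
// mirroring get_kv_attn()/get_kv_recurrent() in the diff above.
struct kv_cache_hybrid : llama_memory_i {
    kv_cache_unified   attn;
    kv_cache_recurrent recurrent;

    const kv_cache_unified   * get_kv_attn()      const { return &attn; }
    const kv_cache_recurrent * get_kv_recurrent() const { return &recurrent; }
};

// Same dispatch shape as get_recurrent_cache(): try the direct cast first,
// then fall back through the hybrid wrapper, and assert that one worked.
const kv_cache_recurrent * get_recurrent_cache(const llama_memory_i * memory) {
    const auto * kv = dynamic_cast<const kv_cache_recurrent *>(memory);
    if (!kv) {
        if (const auto * hybrid = dynamic_cast<const kv_cache_hybrid *>(memory)) {
            kv = hybrid->get_kv_recurrent();
        }
    }
    assert(kv); // callers require a recurrent cache; a wrong cache type is a bug
    return kv;
}

int main() {
    kv_cache_recurrent plain;
    kv_cache_hybrid    hybrid;

    // Both a plain recurrent cache and a hybrid cache resolve correctly.
    assert(get_recurrent_cache(&plain)  == &plain);
    assert(get_recurrent_cache(&hybrid) == hybrid.get_kv_recurrent());
    return 0;
}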

src/llama-graph.h (5 additions, 0 deletions)

@@ -604,6 +604,11 @@ struct llm_graph_context {
     // recurrent
     //
 
+    // Getters to support hybrid cache
+    // TODO: Should these be protected to the derived class hierarchy?
+    const llama_kv_cache_recurrent * get_recurrent_cache() const;
+    const llama_kv_cache_unified   * get_unified_cache() const;
+
     ggml_tensor * build_copy_mask_state(
             ggml_cgraph * gf,
             ggml_tensor * s,
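
Note that `llama_kv_cache_hybrid_recurrent`, which both getters probe for, is not defined anywhere in this diff; it comes from elsewhere on the HybridRecurrentCache branch. Inferred only from the two calls made here, its interface would look roughly like the sketch below; the member names, ownership model, and base class are guesses, not the real header:

#include <memory>

// Stand-in bases so the sketch compiles on its own; the real definitions
// live elsewhere in the tree.
struct llama_memory_i           { virtual ~llama_memory_i() = default; };
struct llama_kv_cache_unified   : llama_memory_i {};
struct llama_kv_cache_recurrent : llama_memory_i {};

// Inferred shape of the hybrid cache: one attention (unified) child and one
// recurrent child, each reachable through a const accessor. Only these two
// accessors are exercised by the new getters in llama-graph.cpp.
class llama_kv_cache_hybrid_recurrent : public llama_memory_i {
public:
    const llama_kv_cache_unified   * get_kv_attn()      const { return kv_attn.get(); }
    const llama_kv_cache_recurrent * get_kv_recurrent() const { return kv_recurrent.get(); }

private:
    std::unique_ptr<llama_kv_cache_unified>   kv_attn      = std::make_unique<llama_kv_cache_unified>();
    std::unique_ptr<llama_kv_cache_recurrent> kv_recurrent = std::make_unique<llama_kv_cache_recurrent>();
};

int main() {
    llama_kv_cache_hybrid_recurrent kv;
    return (kv.get_kv_attn() && kv.get_kv_recurrent()) ? 0 : 1;
}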
