Skip to content

Commit b8567ce

Browse files
committed
feat: First pass at llama_kv_cache_hybrid
This implementation covers both `llama_memory_i` and `llama_kv_cache` interfaces, but they could very well not be correct. Branch: HybridCache Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
1 parent 5364ae4 commit b8567ce

File tree

2 files changed

+299
-0
lines changed

2 files changed

+299
-0
lines changed

src/llama-kv-cache.cpp

Lines changed: 225 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2392,6 +2392,231 @@ bool llama_kv_cache_recurrent::state_read_data(llama_io_read_i & io, uint32_t ce
23922392
return true;
23932393
}
23942394

2395+
//
2396+
// llama_kv_cache_hybrid
2397+
//
2398+
llama_kv_cache_hybrid::llama_kv_cache_hybrid(
2399+
const llama_hparams & hparams,
2400+
const std::vector<child_cache> & children) :
2401+
m_hparams(hparams),
2402+
m_layer_cache_map(
2403+
[](const std::vector<child_cache>& caches) -> std::unordered_map<size_t, llama_kv_cache*> {
2404+
std::unordered_map<size_t, llama_kv_cache*> map;
2405+
for (const auto & cache : caches) {
2406+
for (size_t layer_id : cache.layer_ids) {
2407+
map[layer_id] = cache.child;
2408+
}
2409+
}
2410+
2411+
return map;
2412+
}(children)
2413+
),
2414+
m_children(
2415+
[](std::vector<child_cache> caches) -> std::set<llama_kv_cache*> {
2416+
// Sort the caches by the lowest layer ID so the order is repeatable
2417+
for (auto & cache : caches) {
2418+
GGML_ASSERT(cache.layer_ids.size() > 0);
2419+
std::sort(cache.layer_ids.begin(), cache.layer_ids.end());
2420+
}
2421+
std::sort(caches.begin(), caches.end(), [](const child_cache & a, const child_cache & b) {
2422+
return a.layer_ids[0] < b.layer_ids[0];
2423+
});
2424+
std::set<llama_kv_cache*> unique_caches;
2425+
for (const auto & cache : caches) {
2426+
unique_caches.insert(cache.child);
2427+
}
2428+
return unique_caches;
2429+
}(children)
2430+
),
2431+
m_has_recurrent(
2432+
[](const std::vector<child_cache>& caches) -> bool {
2433+
for (const auto & cache : caches) {
2434+
if (dynamic_cast<llama_kv_cache_recurrent *>(cache.child)) {
2435+
return true;
2436+
}
2437+
}
2438+
return false;
2439+
}(children)
2440+
)
2441+
{
2442+
// Ensure at least one child
2443+
GGML_ASSERT(m_children.size() > 0);
2444+
2445+
// Ensure layers are not overlapping and are concurrent
2446+
std::set<size_t> seen_layers;
2447+
size_t max_layer = 0;
2448+
for (const auto & cache : children) {
2449+
for (const auto & layer_id : cache.layer_ids) {
2450+
GGML_ASSERT(seen_layers.find(layer_id) == seen_layers.end());
2451+
seen_layers.insert(layer_id);
2452+
if (layer_id > max_layer) {
2453+
max_layer = layer_id;
2454+
}
2455+
}
2456+
}
2457+
GGML_ASSERT(max_layer == seen_layers.size());
2458+
}
2459+
2460+
void llama_kv_cache_hybrid::clear() {
2461+
for (const auto & cache : m_children) {
2462+
cache->clear();
2463+
}
2464+
}
2465+
2466+
// Remove a sequence range from every child. Returns true only if all
// children removed it; every child is attempted even after a failure.
// TODO: Will it cause problems if some caches are able to remove the seq
//       but others aren't?
bool llama_kv_cache_hybrid::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
    bool all_removed = true;
    for (llama_kv_cache * child : m_children) {
        if (!child->seq_rm(seq_id, p0, p1)) {
            all_removed = false;
        }
    }
    return all_removed;
}
2475+
2476+
// Copy sequence seq_id_src into seq_id_dst over the given position range in
// every child.
void llama_kv_cache_hybrid::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
    for (llama_kv_cache * child : m_children) {
        child->seq_cp(seq_id_src, seq_id_dst, p0, p1);
    }
}
2481+
2482+
// Keep only the given sequence in every child.
void llama_kv_cache_hybrid::seq_keep(llama_seq_id seq_id) {
    for (llama_kv_cache * child : m_children) {
        child->seq_keep(seq_id);
    }
}
2487+
2488+
// Shift the positions of a sequence range by delta in every child.
void llama_kv_cache_hybrid::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
    for (llama_kv_cache * child : m_children) {
        child->seq_add(seq_id, p0, p1, delta);
    }
}
2493+
2494+
// Divide the positions of a sequence range by d in every child.
void llama_kv_cache_hybrid::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
    for (llama_kv_cache * child : m_children) {
        child->seq_div(seq_id, p0, p1, d);
    }
}
2499+
2500+
// Maximum position of the sequence across all children (0 if no child
// reports a larger value, matching the original accumulator seed).
llama_pos llama_kv_cache_hybrid::seq_pos_max(llama_seq_id seq_id) const {
    llama_pos result = 0;
    for (llama_kv_cache * child : m_children) {
        const llama_pos child_max = child->seq_pos_max(seq_id);
        if (child_max > result) {
            result = child_max;
        }
    }
    return result;
}
2507+
2508+
void llama_kv_cache_hybrid::restore() {
2509+
for (const auto & cache : m_children) {
2510+
cache->restore();
2511+
}
2512+
}
2513+
2514+
void llama_kv_cache_hybrid::commit() {
2515+
for (const auto & cache : m_children) {
2516+
cache->commit();
2517+
}
2518+
}
2519+
2520+
// Update every child unconditionally (no short-circuiting) and report
// whether any of them made changes.
bool llama_kv_cache_hybrid::update(llama_context & ctx) {
    bool any_updated = false;
    for (llama_kv_cache * child : m_children) {
        if (child->update(ctx)) {
            any_updated = true;
        }
    }
    return any_updated;
}
2527+
2528+
void llama_kv_cache_hybrid::defrag_sched(float thold) {
2529+
for (const auto & cache : m_children) {
2530+
cache->defrag_sched(thold);
2531+
}
2532+
}
2533+
2534+
void llama_kv_cache_hybrid::set_full() {
2535+
for (const auto & cache : m_children) {
2536+
cache->set_full();
2537+
}
2538+
}
2539+
2540+
// Initialize a sequence batch for this cache.
llama_sbatch llama_kv_cache_hybrid::sbatch_init(const llama_batch & batch, bool logits_all) {
    // A recurrent child forces the simple split for the whole hybrid cache
    const bool simple_split = m_has_recurrent;
    return llama_sbatch(batch, m_hparams.n_embd, simple_split, logits_all);
}
2544+
2545+
// Pull the next micro-batch from the sequence batch, choosing the split
// strategy based on the children and pooling mode.
llama_ubatch llama_kv_cache_hybrid::ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const {
    // With a recurrent child in the mix, the whole batch uses the simple split.
    // NOTE(review): confirm this is consistent with
    // llama_kv_cache_recurrent::ubatch_next, which selects seq/equal splits
    // for its own batches — the precedence here may be inverted.
    if (m_has_recurrent) {
        return sbatch.split_simple(n_ubatch);
    }
    // Pooled embeddings cannot be split across ubatches (yet), so they use
    // the per-sequence split; everything else uses the equal split.
    return embd_pooled ? sbatch.split_seq(n_ubatch) : sbatch.split_equal(n_ubatch);
}
2555+
2556+
bool llama_kv_cache_hybrid::find_slot(const llama_ubatch & batch) {
2557+
bool found = true;
2558+
for (const auto & cache : m_children) {
2559+
found = cache->find_slot(batch) && found;
2560+
}
2561+
return found;
2562+
}
2563+
2564+
int32_t llama_kv_cache_hybrid::get_n_tokens() const {
2565+
// The number of tokens should be the same across all child caches
2566+
int32_t n_tokens = -1;
2567+
for (const auto & cache : m_children) {
2568+
const auto cache_n_tokens = cache->get_n_tokens();
2569+
GGML_ASSERT(n_tokens == -1 || cache_n_tokens == n_tokens);
2570+
n_tokens = cache_n_tokens;
2571+
}
2572+
return n_tokens;
2573+
}
2574+
2575+
int32_t llama_kv_cache_hybrid::get_used_cells() const {
2576+
// TODO: Is this correct?
2577+
// Return the largetst number of used cells
2578+
int32_t used_cells = -1;
2579+
for (const auto & cache : m_children) {
2580+
used_cells = std::max(used_cells, cache->get_used_cells());
2581+
}
2582+
return used_cells;
2583+
}
2584+
2585+
// Largest position stored in any child (-1 when all children are empty,
// matching the original accumulator seed).
llama_pos llama_kv_cache_hybrid::get_pos_max() const {
    llama_pos result = -1;
    for (llama_kv_cache * child : m_children) {
        const llama_pos child_pos_max = child->get_pos_max();
        if (child_pos_max > result) {
            result = child_pos_max;
        }
    }
    return result;
}
2592+
2593+
bool llama_kv_cache_hybrid::get_can_shift() const {
2594+
// TODO: Is this correct?
2595+
// If any children can shift, return true
2596+
for (const auto & cache : m_children) {
2597+
if (cache->get_can_shift()) {
2598+
return true;
2599+
}
2600+
}
2601+
return false;
2602+
}
2603+
2604+
void llama_kv_cache_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
2605+
// Write each cache state in order. Note that order is guaranteed at
2606+
// initialization by using an ordered set sorted by lowest layer ID
2607+
for (const auto & cache : m_children) {
2608+
cache->state_write(io, seq_id);
2609+
}
2610+
}
2611+
2612+
void llama_kv_cache_hybrid::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
2613+
// Read each cache state in order. Note that order is guaranteed at
2614+
// initialization by using an ordered set sorted by lowest layer ID
2615+
for (const auto & cache : m_children) {
2616+
cache->state_read(io, seq_id);
2617+
}
2618+
}
2619+
23952620
//
23962621
// kv cache view
23972622
//

src/llama-kv-cache.h

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
#include <set>
1111
#include <vector>
12+
#include <unordered_map>
1213

1314
struct llama_cparams;
1415
struct llama_hparams;
@@ -389,6 +390,79 @@ class llama_kv_cache_recurrent : public llama_kv_cache {
389390
bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
390391
};
391392

393+
//
394+
// llama_kv_cache_hybrid
395+
//
396+
397+
// A composite KV cache that splits the model's layers across multiple child
// caches (e.g. attention layers in one cache, recurrent layers in another).
// Each llama_memory_i / llama_kv_cache operation fans out to every child;
// the per-operation aggregation rules live in llama-kv-cache.cpp.
class llama_kv_cache_hybrid : public llama_kv_cache {
public:

    // Binds one child cache to the model layers it is responsible for.
    struct child_cache {
        llama_kv_cache * child;         // raw pointer, not owned here; must outlive the hybrid cache
        std::vector<size_t> layer_ids;  // indices of the layers served by `child`
    };

    // The same child may appear in several entries, but each layer must be
    // claimed by exactly one child (asserted in the constructor). `hparams`
    // is held by reference and must outlive this object.
    llama_kv_cache_hybrid(
        const llama_hparams & hparams,
        const std::vector<child_cache> & children);

    //
    // llama_memory_i
    //
    // All sequence operations are forwarded to every child cache.

    void clear() override;

    bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
    void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
    void seq_keep(llama_seq_id seq_id) override;
    void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) override;
    void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override;

    llama_pos seq_pos_max(llama_seq_id seq_id) const override;

    //
    // llama_kv_cache
    //

    void restore() override;
    void commit() override;

    bool update(llama_context & ctx) override;

    void defrag_sched(float thold) override;

    void set_full() override;

    llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) override;

    llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override;

    // updates the cache head
    // Note: On success, it's important that cache.head points
    // to the first cell of the slot.
    bool find_slot(const llama_ubatch & batch) override;

    int32_t get_n_tokens() const override;
    int32_t get_used_cells() const override;

    // TODO: better data structures to reduce the cost of this operation
    llama_pos get_pos_max() const override;

    bool get_can_shift() const override;

    // state write/load

    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
    void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;

private:

    // borrowed reference; the owner must outlive this cache
    const llama_hparams & m_hparams;
    // layer index -> the child cache serving that layer
    const std::unordered_map<size_t, llama_kv_cache *> m_layer_cache_map;
    // the unique child caches
    // NOTE(review): a std::set of raw pointers iterates in pointer order,
    // not by layer ID, so this is NOT a repeatable ordering for state IO
    // across processes — confirm intent before relying on it
    const std::set<llama_kv_cache *> m_children;
    // true if any child is a llama_kv_cache_recurrent (affects batch splits)
    const bool m_has_recurrent;
};
465+
392466

393467
//
394468
// kv cache view

0 commit comments

Comments
 (0)