@@ -2418,6 +2418,35 @@ int llama_get_kv_cache_token_count(struct llama_context * ctx) {
    return ctx->model.kv_self.n;
}

+// Assumes contiguous data
+void llama_shift_kv_cache(struct llama_context * ctx, int n) {
+    auto & model   = ctx->model;
+    auto & kv_self = model.kv_self;
+    auto & hparams = model.hparams;
+    auto n_layer = hparams.n_layer;
+    auto n_embd  = hparams.n_embd;
+    auto n_ctx   = hparams.n_ctx;
+    for (int il = 0; il < n_layer; il++) {
+        // K: embeddings are stored in regular order, so a whole layer can be shifted with a single overlapping move
+        {
+            const size_t elem_byte_size = ggml_element_size(kv_self.k);
+            uint8_t * dst_ptr = ((uint8_t *)kv_self.k->data) + (elem_byte_size * n_embd * (il * n_ctx));
+            uint8_t * src_ptr = ((uint8_t *)kv_self.k->data) + (elem_byte_size * n_embd * (il * n_ctx + n));
+            memmove(dst_ptr, src_ptr, elem_byte_size * n_embd * (n_ctx - n));
+        }
+
+        // V: embeddings are stored transposed, so each embedding row must be shifted separately
+        {
+            const size_t elem_byte_size = ggml_element_size(kv_self.v);
+            for (int i = 0; i < n_embd; i++) {
+                uint8_t * dst_ptr = ((uint8_t *)kv_self.v->data) + (elem_byte_size * ((il * n_embd + i) * n_ctx));
+                uint8_t * src_ptr = ((uint8_t *)kv_self.v->data) + (elem_byte_size * ((il * n_embd + i) * n_ctx + n));
+                memmove(dst_ptr, src_ptr, elem_byte_size * (n_ctx - n));
+            }
+        }
+    }
+}
+
#define LLAMA_MAX_RNG_STATE 64*1024

void llama_set_rng_seed(struct llama_context * ctx, int seed) {
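
To make the two copy patterns concrete, here is a minimal, self-contained sketch of the same offset arithmetic over plain arrays (toy sizes and names; only the index math mirrors the function above). Note that the source and destination ranges overlap whenever fewer than half the positions are discarded, which is why `memmove` rather than `memcpy` is the safe choice here:

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

int main() {
    // Toy dimensions, stand-ins for hparams.{n_layer, n_embd, n_ctx} and the shift amount
    const int n_layer = 2, n_embd = 3, n_ctx = 5, n = 2;
    const size_t esz = sizeof(float); // stand-in for ggml_element_size()

    // K layout: [layer][position][embedding] -- one layer's positions are contiguous
    std::vector<float> k(n_layer * n_ctx * n_embd);
    // V layout: [layer][embedding][position] -- transposed: one row of n_ctx values per embedding dim
    std::vector<float> v(n_layer * n_embd * n_ctx);
    for (size_t i = 0; i < k.size(); i++) { k[i] = (float) i; v[i] = (float) i; }

    for (int il = 0; il < n_layer; il++) {
        // K: drop the first n positions of this layer with a single overlapping move
        uint8_t * kbase = (uint8_t *) k.data();
        memmove(kbase + esz * n_embd * (il * n_ctx),
                kbase + esz * n_embd * (il * n_ctx + n),
                esz * n_embd * (n_ctx - n));

        // V: each embedding row holds n_ctx positions, so shift row by row
        uint8_t * vbase = (uint8_t *) v.data();
        for (int i = 0; i < n_embd; i++) {
            memmove(vbase + esz * ((il * n_embd + i) * n_ctx),
                    vbase + esz * ((il * n_embd + i) * n_ctx + n),
                    esz * (n_ctx - n));
        }
    }

    // Position 0 of layer 0 now holds what was position n before the shift
    printf("k[0] = %.0f (expected %d)\n", k[0], n_embd * n); // 6
    printf("v[0] = %.0f (expected %d)\n", v[0], n);          // 2
    return 0;
}
```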
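And a sketch of how a caller might use the new function to keep generating once the context fills up. Everything here except `llama_shift_kv_cache` and `llama_eval` is hypothetical (the `n_discard` policy and the `n_past` bookkeeping), and whether any internal counters such as `kv_self.n` also need adjusting is outside this hunk:

```cpp
#include "llama.h"

// Hypothetical helper: evaluate one more token, shifting the cache when full.
// n_past tracks how many cached positions precede the new token.
void eval_with_shift(struct llama_context * ctx, const llama_token * tok,
                     int & n_past, int n_ctx, int n_discard, int n_threads) {
    if (n_past + 1 > n_ctx) {
        // Make room by discarding the oldest n_discard positions...
        llama_shift_kv_cache(ctx, n_discard);
        // ...and shrink the position counter to match the shifted cache
        n_past -= n_discard;
    }
    llama_eval(ctx, tok, 1, n_past, n_threads);
    n_past += 1;
}
```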