mamba : make the server and parallel examples work with whole sequences

compilade · compilade · commit 12de5c7ffd77 · 2024-02-25T19:55:46.000-05:00
A seq_id is dedicated to the system prompt in both cases (the server and the parallel examples).

* llama : make llama_kv_cache_seq_rm return whether it succeeded or not
diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp
@@ -107,6 +107,9 @@ int main(int argc, char ** argv) {
     // number of simultaneous "clients" to simulate
     const int32_t n_clients = params.n_parallel;
 
+    // dedicate one sequence to the system prompt
+    params.n_parallel += 1;
+
     // requests to simulate
     const int32_t n_seq = params.n_sequences;
 
@@ -196,8 +199,8 @@ int main(int argc, char ** argv) {
         }
 
         // assign the system KV cache to all parallel sequences
-        for (int32_t i = 1; i < n_clients; ++i) {
-            llama_kv_cache_seq_cp(ctx, 0, i, 0, n_tokens_system);
+        for (int32_t i = 1; i <= n_clients; ++i) {
+            llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
         }
 
         LOG_TEE("\n");
@@ -221,15 +224,17 @@ int main(int argc, char ** argv) {
 
             client.i_batch = batch.n_tokens;
 
-            llama_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id }, true);
+            llama_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id + 1 }, true);
 
             client.n_decoded += 1;
         }
 
         if (batch.n_tokens == 0) {
             // all sequences have ended - clear the entire KV cache
-            for (int i = 0; i < n_clients; ++i) {
-                llama_kv_cache_seq_rm(ctx, i, n_tokens_system, -1);
+            for (int i = 1; i <= n_clients; ++i) {
+                llama_kv_cache_seq_rm(ctx, i, -1, -1);
+                // but keep the system prompt
+                llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
             }
 
             LOG_TEE("%s: clearing the KV cache\n", __func__);
@@ -255,7 +260,7 @@ int main(int argc, char ** argv) {
                     tokens_prompt = ::llama_tokenize(ctx, client.prompt, false);
 
                     for (size_t i = 0; i < tokens_prompt.size(); ++i) {
-                        llama_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id }, false);
+                        llama_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false);
                     }
 
                     // extract the logits only for the last token
@@ -366,7 +371,8 @@ int main(int argc, char ** argv) {
                     }
 
                     // delete only the generated part of the sequence, i.e. keep the system prompt in the cache
-                    llama_kv_cache_seq_rm(ctx, client.id, n_tokens_system, -1);
+                    llama_kv_cache_seq_rm(ctx, client.id + 1, -1, -1);
+                    llama_kv_cache_seq_cp(ctx, 0, client.id + 1, -1, -1);
 
                     const auto t_main_end = ggml_time_us();
 
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
@@ -438,12 +438,16 @@ struct llama_server_context
                 return false;
             }
 
-            if (params.n_ctx < 2048) { // request larger context for the image embedding
+            if (params.n_ctx != 0 && params.n_ctx < 2048) { // request larger context for the image embedding
                 params.n_ctx = 2048;
             }
         }
 
+        // dedicate one sequence to the system prompt
+        params.n_parallel += 1;
+
         std::tie(model, ctx) = llama_init_from_gpt_params(params);
+        params.n_parallel -= 1; // but be sneaky about it
         if (model == nullptr)
         {
             LOG_ERROR("unable to load model", {{"model", params.model}});
@@ -923,9 +927,9 @@ struct llama_server_context
             }
 
             // assign the system KV cache to all parallel sequences
-            for (int32_t i = 1; i < params.n_parallel; ++i)
+            for (int32_t i = 1; i <= params.n_parallel; ++i)
             {
-                llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size());
+                llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
             }
         }
 
@@ -1400,7 +1404,7 @@ struct llama_server_context
             std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image
             for (int i = 0; i < (int) append_tokens.size(); ++i)
             {
-                llama_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
+                llama_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id + 1 }, true);
                 slot.n_past += 1;
             }
         }
@@ -1636,8 +1640,8 @@ struct llama_server_context
                         {"n_system_tokens", system_tokens.size()},
                         {"n_cache_tokens",  slot.cache_tokens.size()}
                     });
-                    llama_kv_cache_seq_rm (ctx, slot.id, n_keep            , n_keep + n_discard);
-                    llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
+                    llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep            , n_keep + n_discard);
+                    llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
 
                     for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++)
                     {
@@ -1689,7 +1693,7 @@ struct llama_server_context
 
             // TODO: we always have to take into account the "system_tokens"
             //       this is not great and needs to be improved somehow
-            llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true);
+            llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id + 1 }, true);
             slot.n_past += 1;
         }
 
@@ -1852,13 +1856,28 @@ struct llama_server_context
                         }
                     }
 
+                    // keep only the common part
                     int p0 = (int) system_tokens.size() + slot.n_past;
                     LOG_INFO("kv cache rm [p0, end)", {
                         { "slot_id", slot.id },
                         { "task_id", slot.task_id },
                         { "p0",      p0 }
                     });
-                    llama_kv_cache_seq_rm(ctx, slot.id, p0, -1);
+                    if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, p0, -1)) {
+                        // could not partially delete (likely using a non-Transformer model)
+                        // TODO: logging
+                        llama_kv_cache_seq_rm(ctx,    slot.id + 1, -1, -1);
+                        llama_kv_cache_seq_cp(ctx, 0, slot.id + 1, -1, -1);
+
+                        // there is no common part left (except for the system prompt)
+                        // TODO: maybe find a way to refactor this to reuse the !cache_prompt case above
+                        slot.n_past = 0;
+                        slot.n_past_se = 0;
+                        slot.ga_i = 0;
+                        slot.num_prompt_tokens_processed = slot.num_prompt_tokens;
+                        // TODO: is the system prompt ever in the sampling context?
+                        llama_sampling_reset(slot.ctx_sampling);
+                    }
 
                     LOG_VERBOSE("prompt ingested", {
                                                     {"n_past",  slot.n_past},
@@ -1887,7 +1906,7 @@ struct llama_server_context
                                 ga_i += ga_w/ga_n;
                             }
                         }
-                        llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false);
+                        llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id + 1 }, false);
                         slot_npast++;
                     }
 
@@ -1941,9 +1960,9 @@ struct llama_server_context
                         LOG_TEE("div:   [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
                         LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
 
-                        llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd);
-                        llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n);
-                        llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd);
+                        llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i, slot.n_past_se, ib * bd);
+                        llama_kv_cache_seq_div(ctx, slot.id + 1, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n);
+                        llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd);
 
                         slot.n_past_se -= bd;
 
diff --git a/llama.cpp b/llama.cpp
@@ -2231,7 +2231,7 @@ static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
     cache.used = 0;
 }
 
-static void llama_kv_cache_seq_rm(
+static bool llama_kv_cache_seq_rm(
         struct llama_kv_cache & cache,
                  llama_seq_id   seq_id,
                     llama_pos   p0,
@@ -2241,11 +2241,23 @@ static void llama_kv_cache_seq_rm(
     if (p0 < 0) p0 = 0;
     if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
 
+    // models like Mamba can't have a state partially erased
     if (cache.unlimited) {
-        // can only remove whole sequences for models like Mamba
-        GGML_ASSERT(p0 == 0);
-        GGML_ASSERT((uint32_t)seq_id < cache.size);
-        GGML_ASSERT(cache.cells[seq_id].pos < p1);
+        if (seq_id >= (int64_t) cache.size) {
+            // could be fatal
+            return false;
+        }
+        if (0 <= seq_id) {
+            // partial intersection is invalid
+            if ((0 < p0 && p0 <= cache.cells[seq_id].pos) || (0 < p1 && p1 <= cache.cells[seq_id].pos)) {
+                return false;
+            }
+        } else {
+            // seq_id is negative, then the range should include everything or nothing
+            if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits<llama_pos>::max())) {
+                return false;
+            }
+        }
     }
 
     for (uint32_t i = 0; i < cache.size; ++i) {
@@ -2269,6 +2281,8 @@ static void llama_kv_cache_seq_rm(
 
     // If we freed up a slot, set head to it so searching can start there.
     if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
+
+    return true;
 }
 
 static void llama_kv_cache_seq_cp(
@@ -12283,13 +12297,11 @@ struct llama_context * llama_new_context_with_model(
 
     // Mamba only needs a constant number of KV cache cells per sequence
     if (model->arch == LLM_ARCH_MAMBA) {
-        // Mamba needs as many KV cells as there are sequences kept at any time
-        // The extra cell allows dedicating a sequence id to the system prompt
-        // TODO: find a better way to get the max number of parallel sequences
-        kv_size = params.n_parallel + 1;
+        // Mamba needs at least as many KV cells as there are sequences kept at any time
+        kv_size = std::max((uint32_t) 1, params.n_parallel);
         // it's probably best to keep as much precision as possible for the states
-        type_k = GGML_TYPE_F32; // required by ggml_set for Mamba's conv_state
-        type_v = GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_state
+        type_k = GGML_TYPE_F32; // required by ggml_ssm_conv for Mamba's conv_states
+        type_v = GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_states
     }
 
     GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0);
@@ -12799,8 +12811,8 @@ void llama_kv_cache_clear(struct llama_context * ctx) {
     llama_kv_cache_clear(ctx->kv_self);
 }
 
-void llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
-    llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1);
+bool llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+    return llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1);
 }
 
 void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
diff --git a/llama.h b/llama.h
@@ -502,7 +502,7 @@ extern "C" {
     // seq_id < 0 : match any sequence
     // p0 < 0     : [0,  p1]
     // p1 < 0     : [p0, inf)
-    LLAMA_API void llama_kv_cache_seq_rm(
+    LLAMA_API bool llama_kv_cache_seq_rm(
             struct llama_context * ctx,
                     llama_seq_id   seq_id,
                        llama_pos   p0,

Original file line number	Diff line number	Diff line change
`@@ -438,12 +438,16 @@ struct llama_server_context`
`438`	`438`	`return false;`
`439`	`439`	`}`
`440`	`440`
`441`		`- if (params.n_ctx < 2048) { // request larger context for the image embedding`
	`441`	`+ if (params.n_ctx != 0 && params.n_ctx < 2048) { // request larger context for the image embedding`
`442`	`442`	`params.n_ctx = 2048;`
`443`	`443`	`}`
`444`	`444`	`}`
`445`	`445`
	`446`	`+ // dedicate one sequence to the system prompt`
	`447`	`+ params.n_parallel += 1;`
	`448`	`+`
`446`	`449`	`std::tie(model, ctx) = llama_init_from_gpt_params(params);`
	`450`	`+ params.n_parallel -= 1; // but be sneaky about it`
`447`	`451`	`if (model == nullptr)`
`448`	`452`	`{`
`449`	`453`	`LOG_ERROR("unable to load model", {{"model", params.model}});`
`@@ -923,9 +927,9 @@ struct llama_server_context`
`923`	`927`	`}`
`924`	`928`
`925`	`929`	`// assign the system KV cache to all parallel sequences`
`926`		`- for (int32_t i = 1; i < params.n_parallel; ++i)`
	`930`	`+ for (int32_t i = 1; i <= params.n_parallel; ++i)`
`927`	`931`	`{`
`928`		`- llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size());`
	`932`	`+ llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);`
`929`	`933`	`}`
`930`	`934`	`}`
`931`	`935`
`@@ -1400,7 +1404,7 @@ struct llama_server_context`
`1400`	`1404`	`std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image`
`1401`	`1405`	`for (int i = 0; i < (int) append_tokens.size(); ++i)`
`1402`	`1406`	`{`
`1403`		`- llama_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);`
	`1407`	`+ llama_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id + 1 }, true);`
`1404`	`1408`	`slot.n_past += 1;`
`1405`	`1409`	`}`
`1406`	`1410`	`}`
`@@ -1636,8 +1640,8 @@ struct llama_server_context`
`1636`	`1640`	`{"n_system_tokens", system_tokens.size()},`
`1637`	`1641`	`{"n_cache_tokens", slot.cache_tokens.size()}`
`1638`	`1642`	`});`
`1639`		`- llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);`
`1640`		`- llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);`
	`1643`	`+ llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep , n_keep + n_discard);`
	`1644`	`+ llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);`
`1641`	`1645`
`1642`	`1646`	`for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++)`
`1643`	`1647`	`{`
`@@ -1689,7 +1693,7 @@ struct llama_server_context`
`1689`	`1693`
`1690`	`1694`	`// TODO: we always have to take into account the "system_tokens"`
`1691`	`1695`	`// this is not great and needs to be improved somehow`
`1692`		`- llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true);`
	`1696`	`+ llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id + 1 }, true);`
`1693`	`1697`	`slot.n_past += 1;`
`1694`	`1698`	`}`
`1695`	`1699`
`@@ -1852,13 +1856,28 @@ struct llama_server_context`
`1852`	`1856`	`}`
`1853`	`1857`	`}`
`1854`	`1858`
	`1859`	`+ // keep only the common part`
`1855`	`1860`	`int p0 = (int) system_tokens.size() + slot.n_past;`
`1856`	`1861`	`LOG_INFO("kv cache rm [p0, end)", {`
`1857`	`1862`	`{ "slot_id", slot.id },`
`1858`	`1863`	`{ "task_id", slot.task_id },`
`1859`	`1864`	`{ "p0", p0 }`
`1860`	`1865`	`});`
`1861`		`- llama_kv_cache_seq_rm(ctx, slot.id, p0, -1);`
	`1866`	`+ if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, p0, -1)) {`
	`1867`	`+ // could not partially delete (likely using a non-Transformer model)`
	`1868`	`+ // TODO: logging`
	`1869`	`+ llama_kv_cache_seq_rm(ctx, slot.id + 1, -1, -1);`
	`1870`	`+ llama_kv_cache_seq_cp(ctx, 0, slot.id + 1, -1, -1);`
	`1871`	`+`
	`1872`	`+ // there is no common part left (except for the system prompt)`
	`1873`	`+ // TODO: maybe find a way to refactor this to reuse the !cache_prompt case above`
	`1874`	`+ slot.n_past = 0;`
	`1875`	`+ slot.n_past_se = 0;`
	`1876`	`+ slot.ga_i = 0;`
	`1877`	`+ slot.num_prompt_tokens_processed = slot.num_prompt_tokens;`
	`1878`	`+ // TODO: is the system prompt ever in the sampling context?`
	`1879`	`+ llama_sampling_reset(slot.ctx_sampling);`
	`1880`	`+ }`
`1862`	`1881`
`1863`	`1882`	`LOG_VERBOSE("prompt ingested", {`
`1864`	`1883`	`{"n_past", slot.n_past},`
`@@ -1887,7 +1906,7 @@ struct llama_server_context`
`1887`	`1906`	`ga_i += ga_w/ga_n;`
`1888`	`1907`	`}`
`1889`	`1908`	`}`
`1890`		`- llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false);`
	`1909`	`+ llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id + 1 }, false);`
`1891`	`1910`	`slot_npast++;`
`1892`	`1911`	`}`
`1893`	`1912`
`@@ -1941,9 +1960,9 @@ struct llama_server_context`
`1941`	`1960`	`LOG_TEE("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);`
`1942`	`1961`	`LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);`
`1943`	`1962`
`1944`		`- llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd);`
`1945`		`- llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n);`
`1946`		`- llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd);`
	`1963`	`+ llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i, slot.n_past_se, ib * bd);`
	`1964`	`+ llama_kv_cache_seq_div(ctx, slot.id + 1, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n);`
	`1965`	`+ llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd);`
`1947`	`1966`
`1948`	`1967`	`slot.n_past_se -= bd;`
`1949`	`1968`