
Commit 65f31a0

ggerganov authored and Minh141120 committed
kv-cells : fix tracking of seq_pos (ggml-org#14339)
* kv-cells : fix tracking of seq_pos during cache reuse

ggml-ci

* cont : improve error message

ggml-ci

* cont : add more comments
1 parent 1f044a1 commit 65f31a0

File tree

3 files changed: +21 -6 lines changed

* src/llama-batch.cpp
* src/llama-context.cpp
* src/llama-kv-cells.h

src/llama-batch.cpp

Lines changed: 15 additions & 4 deletions

```diff
@@ -251,21 +251,32 @@ bool llama_batch_allocr::init(
         }

         if (memory) {
+            bool ok = true;
+
             if (batch.token) {
                 if (seq_pos_min(s) != memory->seq_pos_max(s) + 1) {
-                    LLAMA_LOG_ERROR("%s: sequence %d does not start from the last position stored in the memory\n", __func__, s);
-                    return false;
+                    ok = false;
                 }
             } else {
                 assert(batch.embd);

                 // for embeddings (typically used as vision input), we allow them to have repeating positions
                 // ref: https://github.com/ggml-org/llama.cpp/issues/13694#issuecomment-2983871762
                 if (seq_pos_min(s) != memory->seq_pos_max(s) && seq_pos_min(s) != memory->seq_pos_max(s) + 1) {
-                    LLAMA_LOG_ERROR("%s: sequence %d does not start from the last position stored in the memory\n", __func__, s);
-                    return false;
+                    ok = false;
                 }
             }
+
+            if (!ok) {
+                LLAMA_LOG_ERROR(
+                        "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
+                        " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
+                        " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
+                        " it is required that the sequence positions remain consecutive: Y = X + 1\n",
+                        __func__, s, s, memory->seq_pos_max(s), s, seq_pos_min(s));
+
+                return false;
+            }
         }

         if (seq_pos_max(s) - seq_pos_min(s) + 1 > (int) seq_pos[s].size()) {
```
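For context, the check above enforces that a token batch continues directly from what is already stored for that sequence in the KV cache, which is exactly what the new error message spells out (Y = X + 1). The following is a minimal standalone sketch of that rule; `positions_are_consecutive` is a hypothetical helper for illustration only, not part of the llama.cpp API:

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

// Hypothetical illustration of the rule enforced above: for token batches,
// the smallest new position Y must equal X + 1, where X is the largest
// position already stored for that sequence in the KV cache.
static bool positions_are_consecutive(const std::vector<int> & batch_pos, int cache_pos_max) {
    if (batch_pos.empty()) {
        return true; // nothing to validate
    }
    const int y = *std::min_element(batch_pos.begin(), batch_pos.end());
    return y == cache_pos_max + 1;
}

int main() {
    // cache already holds positions 0..4 for this sequence -> X = 4
    printf("%d\n", positions_are_consecutive({5, 6, 7}, 4)); // 1: Y = 5 = X + 1, accepted
    printf("%d\n", positions_are_consecutive({7, 8},    4)); // 0: gap, would trigger the error above
}
```

As the diff shows, embedding batches relax this slightly and also accept Y = X, since some vision inputs legitimately repeat positions.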

src/llama-context.cpp

Lines changed: 0 additions & 1 deletion

```diff
@@ -1019,7 +1019,6 @@ int llama_context::decode(const llama_batch & batch_inp) {
                 pos_min[s] = std::numeric_limits<llama_pos>::max();
             }

-            // TODO: fix sequence indexing
             for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
                 const auto & seq_id = ubatch.seq_id[i][0];
```

src/llama-kv-cells.h

Lines changed: 6 additions & 1 deletion

```diff
@@ -398,7 +398,12 @@ class llama_kv_cells_unified {
     // the set seq_pos[s][p] tells us how many times the position p is currently present for sequence s
     // if the position p is not present, seq_pos[s][p] is not set
     // this way seq_pos[s].begin() and seq_pos[s].rbegin() give us the min/max positions currently in the cache
-    std::set<llama_pos> seq_pos[LLAMA_MAX_SEQ];
+    //
+    // note that we cannot a use an std::set because in some cases a position can occur more than once for the same seq:
+    //   - during performing a cache reuse via (rm + add)
+    //   - some vision models have input embeddings with repeating positions
+    //
+    std::map<llama_pos, int> seq_pos[LLAMA_MAX_SEQ];

     // helper functions for updating `seq_pos`, once cell at a time:
```
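The switch from std::set to std::map<llama_pos, int> effectively turns the structure into a reference-counted set of positions: the same position can be tracked more than once, yet begin()/rbegin() still yield the min/max. Below is a simplified, self-contained sketch of that behavior; pos_counts, pos_add and pos_rm are illustrative names and not the actual llama_kv_cells_unified code:

```cpp
#include <cassert>
#include <map>

// Simplified sketch: positions are reference-counted, so the same position can
// be present more than once (e.g. during cache reuse via rm + add, or for
// vision embeddings with repeating positions) without losing track of it.
using pos_counts = std::map<int, int>;

static void pos_add(pos_counts & sp, int p) {
    ++sp[p]; // one more occurrence of position p
}

static void pos_rm(pos_counts & sp, int p) {
    auto it = sp.find(p);
    assert(it != sp.end());
    if (--it->second == 0) {
        sp.erase(it); // only forget the position when its last occurrence is gone
    }
}

int main() {
    pos_counts sp;
    pos_add(sp, 10);
    pos_add(sp, 10);                  // same position twice - a std::set would collapse this
    pos_add(sp, 11);
    pos_rm(sp, 10);                   // one occurrence removed, position 10 must stay tracked
    assert(sp.begin()->first  == 10); // min position still correct
    assert(sp.rbegin()->first == 11); // max position still correct
}
```

With a plain std::set, the rm + add pattern used during cache reuse would erase a position that is still needed, which is the tracking bug this commit fixes.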
