
Commit 65f31a0

ggerganov authored and Minh141120 committed
kv-cells : fix tracking of seq_pos (ggml-org#14339)
* kv-cells : fix tracking of seq_pos during cache reuse

ggml-ci

* cont : improve error message

ggml-ci

* cont : add more comments
1 parent 1f044a1 commit 65f31a0

File tree

3 files changed: +21 -6 lines changed

* src/llama-batch.cpp
* src/llama-context.cpp
* src/llama-kv-cells.h

src/llama-batch.cpp

Lines changed: 15 additions & 4 deletions

```diff
@@ -251,21 +251,32 @@ bool llama_batch_allocr::init(
         }

         if (memory) {
+            bool ok = true;
+
             if (batch.token) {
                 if (seq_pos_min(s) != memory->seq_pos_max(s) + 1) {
-                    LLAMA_LOG_ERROR("%s: sequence %d does not start from the last position stored in the memory\n", __func__, s);
-                    return false;
+                    ok = false;
                 }
             } else {
                 assert(batch.embd);

                 // for embeddings (typically used as vision input), we allow them to have repeating positions
                 // ref: https://github.com/ggml-org/llama.cpp/issues/13694#issuecomment-2983871762
                 if (seq_pos_min(s) != memory->seq_pos_max(s) && seq_pos_min(s) != memory->seq_pos_max(s) + 1) {
-                    LLAMA_LOG_ERROR("%s: sequence %d does not start from the last position stored in the memory\n", __func__, s);
-                    return false;
+                    ok = false;
                 }
             }
+
+            if (!ok) {
+                LLAMA_LOG_ERROR(
+                        "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
+                        " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
+                        " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
+                        " it is required that the sequence positions remain consecutive: Y = X + 1\n",
+                        __func__, s, s, memory->seq_pos_max(s), s, seq_pos_min(s));
+
+                return false;
+            }
         }

         if (seq_pos_max(s) - seq_pos_min(s) + 1 > (int) seq_pos[s].size()) {
```
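For context, the check above enforces that a token batch continues directly from what is already stored for that sequence in the KV cache, which is exactly what the new error message spells out (Y = X + 1). The following is a minimal standalone sketch of that rule; `positions_are_consecutive` is a hypothetical helper for illustration only, not part of the llama.cpp API:

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

// Hypothetical illustration of the rule enforced above: for token batches,
// the smallest new position Y must equal X + 1, where X is the largest
// position already stored for that sequence in the KV cache.
static bool positions_are_consecutive(const std::vector<int> & batch_pos, int cache_pos_max) {
    if (batch_pos.empty()) {
        return true; // nothing to validate
    }
    const int y = *std::min_element(batch_pos.begin(), batch_pos.end());
    return y == cache_pos_max + 1;
}

int main() {
    // cache already holds positions 0..4 for this sequence -> X = 4
    printf("%d\n", positions_are_consecutive({5, 6, 7}, 4)); // 1: Y = 5 = X + 1, accepted
    printf("%d\n", positions_are_consecutive({7, 8},    4)); // 0: gap, would trigger the error above
}
```

As the diff shows, embedding batches relax this slightly and also accept Y = X, since some vision inputs legitimately repeat positions.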

src/llama-context.cpp

Lines changed: 0 additions & 1 deletion

```diff
@@ -1019,7 +1019,6 @@ int llama_context::decode(const llama_batch & batch_inp) {
                 pos_min[s] = std::numeric_limits<llama_pos>::max();
             }

-            // TODO: fix sequence indexing
             for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
                 const auto & seq_id = ubatch.seq_id[i][0];
```

src/llama-kv-cells.h

Lines changed: 6 additions & 1 deletion

```diff
@@ -398,7 +398,12 @@ class llama_kv_cells_unified {
     // the set seq_pos[s][p] tells us how many times the position p is currently present for sequence s
     // if the position p is not present, seq_pos[s][p] is not set
     // this way seq_pos[s].begin() and seq_pos[s].rbegin() give us the min/max positions currently in the cache
-    std::set<llama_pos> seq_pos[LLAMA_MAX_SEQ];
+    //
+    // note that we cannot a use an std::set because in some cases a position can occur more than once for the same seq:
+    //   - during performing a cache reuse via (rm + add)
+    //   - some vision models have input embeddings with repeating positions
+    //
+    std::map<llama_pos, int> seq_pos[LLAMA_MAX_SEQ];

     // helper functions for updating `seq_pos`, once cell at a time:
```
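The switch from std::set to std::map<llama_pos, int> effectively turns the structure into a reference-counted set of positions: the same position can be tracked more than once, yet begin()/rbegin() still yield the min/max. Below is a simplified, self-contained sketch of that behavior; pos_counts, pos_add and pos_rm are illustrative names and not the actual llama_kv_cells_unified code:

```cpp
#include <cassert>
#include <map>

// Simplified sketch: positions are reference-counted, so the same position can
// be present more than once (e.g. during cache reuse via rm + add, or for
// vision embeddings with repeating positions) without losing track of it.
using pos_counts = std::map<int, int>;

static void pos_add(pos_counts & sp, int p) {
    ++sp[p]; // one more occurrence of position p
}

static void pos_rm(pos_counts & sp, int p) {
    auto it = sp.find(p);
    assert(it != sp.end());
    if (--it->second == 0) {
        sp.erase(it); // only forget the position when its last occurrence is gone
    }
}

int main() {
    pos_counts sp;
    pos_add(sp, 10);
    pos_add(sp, 10);                  // same position twice - a std::set would collapse this
    pos_add(sp, 11);
    pos_rm(sp, 10);                   // one occurrence removed, position 10 must stay tracked
    assert(sp.begin()->first  == 10); // min position still correct
    assert(sp.rbegin()->first == 11); // max position still correct
}
```

With a plain std::set, the rm + add pattern used during cache reuse would erase a position that is still needed, which is the tracking bug this commit fixes.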
