File tree Expand file tree Collapse file tree 2 files changed +9
-18
lines changed Expand file tree Collapse file tree 2 files changed +9
-18
lines changed Original file line number Diff line number Diff line change @@ -1254,6 +1254,9 @@ int llama_context::decode(llama_batch & inp_batch) {
1254
1254
return -2 ;
1255
1255
};
1256
1256
1257
+ // handle any pending defrags/shifts
1258
+ kv_self_update ();
1259
+
1257
1260
int64_t n_outputs_prev = 0 ;
1258
1261
1259
1262
while (sbatch.n_tokens > 0 ) {
@@ -1293,14 +1296,6 @@ int llama_context::decode(llama_batch & inp_batch) {
1293
1296
1294
1297
// find KV slot
1295
1298
{
1296
- kv_self_update ();
1297
-
1298
- // if we have enough unused cells before the current head ->
1299
- // better to start searching from the beginning of the cache, hoping to fill it
1300
- if (kv_self->head > kv_self->used + 2 *ubatch.n_tokens ) {
1301
- kv_self->head = 0 ;
1302
- }
1303
-
1304
1299
if (!kv_self->find_slot (ubatch)) {
1305
1300
LLAMA_LOG_ERROR (" %s: failed to prepare ubatch\n " , __func__);
1306
1301
return -3 ;
@@ -1342,16 +1337,6 @@ int llama_context::decode(llama_batch & inp_batch) {
1342
1337
}
1343
1338
}
1344
1339
1345
- // update the kv ring buffer
1346
- {
1347
- kv_self->head += ubatch.n_tokens ;
1348
-
1349
- // Ensure kv cache head points to a valid index.
1350
- if (kv_self->head >= kv_self->size ) {
1351
- kv_self->head = 0 ;
1352
- }
1353
- }
1354
-
1355
1340
// plot the computation graph in dot format (for debugging purposes)
1356
1341
// if (n_past%100 == 0) {
1357
1342
// ggml_graph_dump_dot(gf, NULL, "llama.dot");
Original file line number Diff line number Diff line change @@ -492,6 +492,12 @@ bool llama_kv_cache_unified::find_slot(
492
492
const uint32_t n_seqs = ubatch.n_seqs ;
493
493
const uint32_t n_seq_tokens = ubatch.n_seq_tokens ;
494
494
495
+ // if we have enough unused cells before the current head ->
496
+ // better to start searching from the beginning of the cache, hoping to fill it
497
+ if (head > used + 2 *ubatch.n_tokens ) {
498
+ head = 0 ;
499
+ }
500
+
495
501
if (recurrent) {
496
502
// For recurrent state architectures (like Mamba or RWKV),
497
503
// each cache cell can store the state for a whole sequence.
You can’t perform that action at this time.
0 commit comments