kv-cache : restore find_slot impl

ggerganov · ggerganov · commit cbe971ae669e · 2025-07-12T16:33:52.000+03:00
ggml-ci
diff --git a/src/llama-kv-cache-unified.cpp b/src/llama-kv-cache-unified.cpp
@@ -789,7 +789,7 @@ llama_kv_cache_unified::slot_info llama_kv_cache_unified::find_slot(const llama_
         res.s1 = std::max<llama_seq_id>(res.s1, seq_to_stream[seq_id]);
 
         res.strm[s] = seq_to_stream[seq_id];
-        res.idxs[s].resize(n_tokens);
+        res.idxs[s].reserve(n_tokens);
 
         const auto & cells = v_cells[seq_to_stream[seq_id]];
 
@@ -806,7 +806,6 @@ llama_kv_cache_unified::slot_info llama_kv_cache_unified::find_slot(const llama_
             return { };
         }
 
-        uint32_t n_found  = 0;
         uint32_t n_tested = 0;
 
         // for continuous slots, we test that all tokens in the ubatch fit, starting from the current head
@@ -857,22 +856,20 @@ llama_kv_cache_unified::slot_info llama_kv_cache_unified::find_slot(const llama_
                 }
 
                 if (can_use) {
-                    res.idxs[s][n_found] = idx;
-
-                    n_found++;
+                    res.idxs[s].push_back(idx);
                 } else {
                     if (cont) {
                         break;
                     }
                 }
             }
 
-            if (n_found == n_tokens) {
+            if (res.idxs[s].size() == n_tokens) {
                 break;
             }
 
             if (cont) {
-                n_found = 0;
+                res.idxs[s].clear();
             }
 
             if (n_tested >= cells.size()) {
@@ -882,7 +879,7 @@ llama_kv_cache_unified::slot_info llama_kv_cache_unified::find_slot(const llama_
         }
 
         // we didn't find a suitable slot - return empty result
-        if (n_found < n_tokens) {
+        if (res.idxs[s].size() < n_tokens) {
             return { };
         }
     }

Original file line number	Diff line number	Diff line change
`@@ -789,7 +789,7 @@ llama_kv_cache_unified::slot_info llama_kv_cache_unified::find_slot(const llama_`
`789`	`789`	`res.s1 = std::max<llama_seq_id>(res.s1, seq_to_stream[seq_id]);`
`790`	`790`
`791`	`791`	`res.strm[s] = seq_to_stream[seq_id];`
`792`		`- res.idxs[s].resize(n_tokens);`
	`792`	`+ res.idxs[s].reserve(n_tokens);`
`793`	`793`
`794`	`794`	`const auto & cells = v_cells[seq_to_stream[seq_id]];`
`795`	`795`
`@@ -806,7 +806,6 @@ llama_kv_cache_unified::slot_info llama_kv_cache_unified::find_slot(const llama_`
`806`	`806`	`return { };`
`807`	`807`	`}`
`808`	`808`
`809`		`- uint32_t n_found = 0;`
`810`	`809`	`uint32_t n_tested = 0;`
`811`	`810`
`812`	`811`	`// for continuous slots, we test that all tokens in the ubatch fit, starting from the current head`
`@@ -857,22 +856,20 @@ llama_kv_cache_unified::slot_info llama_kv_cache_unified::find_slot(const llama_`
`857`	`856`	`}`
`858`	`857`
`859`	`858`	`if (can_use) {`
`860`		`- res.idxs[s][n_found] = idx;`
`861`		`-`
`862`		`- n_found++;`
	`859`	`+ res.idxs[s].push_back(idx);`
`863`	`860`	`} else {`
`864`	`861`	`if (cont) {`
`865`	`862`	`break;`
`866`	`863`	`}`
`867`	`864`	`}`
`868`	`865`	`}`
`869`	`866`
`870`		`- if (n_found == n_tokens) {`
	`867`	`+ if (res.idxs[s].size() == n_tokens) {`
`871`	`868`	`break;`
`872`	`869`	`}`
`873`	`870`
`874`	`871`	`if (cont) {`
`875`		`- n_found = 0;`
	`872`	`+ res.idxs[s].clear();`
`876`	`873`	`}`
`877`	`874`
`878`	`875`	`if (n_tested >= cells.size()) {`
`@@ -882,7 +879,7 @@ llama_kv_cache_unified::slot_info llama_kv_cache_unified::find_slot(const llama_`
`882`	`879`	`}`
`883`	`880`
`884`	`881`	`// we didn't find a suitable slot - return empty result`
`885`		`- if (n_found < n_tokens) {`
	`882`	`+ if (res.idxs[s].size() < n_tokens) {`
`886`	`883`	`return { };`
`887`	`884`	`}`
`888`	`885`	`}`