@@ -1802,7 +1802,7 @@ struct llama_kv_cell {
1802
1802
struct llama_kv_cache {
1803
1803
bool has_shift = false;
1804
1804
bool do_defrag = false;
1805
- // with Mamba, a slot can hold the state for more than one past token
1805
+ // with Mamba, a cell can hold the state for more than one past token
1806
1806
bool unlimited = false;
1807
1807
1808
1808
// Note: The value of head isn't only used to optimize searching
@@ -2066,7 +2066,7 @@ static bool llama_kv_cache_init(
2066
2066
2067
2067
cache.has_shift = false;
2068
2068
2069
- // for now, only Mamba can hold state for more than one past token per slot
2069
+ // for now, only Mamba can hold state for more than one past token per cell
2070
2070
cache.unlimited = model.arch == LLM_ARCH_MAMBA;
2071
2071
2072
2072
cache.head = 0;
@@ -2325,7 +2325,7 @@ static void llama_kv_cache_seq_cp(
2325
2325
cache.cells[seq_id_dst].delta = seq_id_src;
2326
2326
// NOTE: a sequence can't have multiple sources, but can have multiple destinations.
2327
2327
// For compatibility with the other KV cache API functions,
2328
- // the seq_id(s) of a slot suggests an intent to "copy to" those id(s),
2328
+ // the seq_id(s) of a cell suggest an intent to "copy to" those id(s),
2329
2329
// so that when a sequence is copied, it can initially be found from the source cell.
2330
2330
cache.cells[seq_id_src].seq_id.insert(seq_id_dst);
2331
2331
// prevent the destination from getting cleared
@@ -12481,10 +12481,10 @@ struct llama_context * llama_new_context_with_model(
12481
12481
ggml_type type_k = params.type_k;
12482
12482
ggml_type type_v = params.type_v;
12483
12483
12484
- // Mamba only needs a constant number of KV cache slots per sequence
12484
+ // Mamba only needs a constant number of KV cache cells per sequence
12485
12485
if (model->arch == LLM_ARCH_MAMBA) {
12486
- // Mamba needs as many slots as there are distinct sequences processed at the same time
12487
- // The extra slot allows dedicating a sequence id to the system prompt
12486
+ // Mamba needs as many KV cells as there are sequences kept at any time
12487
+ // The extra cell allows dedicating a sequence id to the system prompt
12488
12488
// TODO: find a better way to get the max number of parallel sequences
12489
12489
kv_size = params.n_parallel + 1;
12490
12490
// it's probably best to keep as much precision as possible for the states
0 commit comments