@@ -205,8 +205,8 @@ llama_kv_cache_unified::llama_kv_cache_unified(
             continue;
         }

-        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
-        const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
+        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(il);
+        const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(il);

         const char * dev_name = "CPU";
@@ -1447,7 +1447,7 @@ void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const std::
     for (const auto & layer : layers) {
         const uint32_t il = layer.il;

-        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
+        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(il);

         // Write key type
         const int32_t k_type_i = (int32_t)layer.k->type;
@@ -1469,7 +1469,7 @@ void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const std::
     for (const auto & layer : layers) {
         const uint32_t il = layer.il;

-        const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
+        const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(il);

         // Write value type
         const int32_t v_type_i = (int32_t)layer.v->type;
@@ -1493,7 +1493,7 @@ void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const std::
     for (const auto & layer : layers) {
         const uint32_t il = layer.il;

-        const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
+        const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(il);

         // Write value type
         const int32_t v_type_i = (int32_t)layer.v->type;
@@ -1636,7 +1636,7 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell
     for (const auto & layer : layers) {
         const uint32_t il = layer.il;

-        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
+        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(il);

         // Read type of key
         int32_t k_type_i_ref;
@@ -1666,7 +1666,7 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell
     for (const auto & layer : layers) {
         const uint32_t il = layer.il;

-        const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
+        const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(il);

         // Read type of value
         int32_t v_type_i_ref;
@@ -1696,7 +1696,7 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell
     for (const auto & layer : layers) {
         const uint32_t il = layer.il;

-        const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
+        const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(il);

         // Read type of value
         int32_t v_type_i_ref;
@@ -2206,8 +2206,8 @@ llama_kv_cache_recurrent::llama_kv_cache_recurrent(
             continue;
         }

-        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
-        const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
+        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s(i);
+        const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s(i);

         const char * dev_name = "CPU";
@@ -2909,7 +2909,7 @@ void llama_kv_cache_recurrent::state_write_data(llama_io_write_i & io, const std
     // Iterate and write all the keys first, each row is a cell
     // Get whole range at a time
     for (uint32_t il = 0; il < n_layer; ++il) {
-        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
+        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(il);

         // Write key type
         const int32_t k_type_i = (int32_t)k_l[il]->type;
@@ -2929,7 +2929,7 @@ void llama_kv_cache_recurrent::state_write_data(llama_io_write_i & io, const std

     if (!v_trans) {
         for (uint32_t il = 0; il < n_layer; ++il) {
-            const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
+            const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(il);

             // Write value type
             const int32_t v_type_i = (int32_t)v_l[il]->type;
@@ -2950,7 +2950,7 @@ void llama_kv_cache_recurrent::state_write_data(llama_io_write_i & io, const std
         // When v is transposed, we also need the element size and get the element ranges from each row
         const uint32_t kv_size = size;
         for (uint32_t il = 0; il < n_layer; ++il) {
-            const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
+            const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(il);

             // Write value type
             const int32_t v_type_i = (int32_t)v_l[il]->type;
@@ -3097,7 +3097,7 @@ bool llama_kv_cache_recurrent::state_read_data(llama_io_read_i & io, uint32_t ce

     // For each layer, read the keys for each cell, one row is one cell, read as one contiguous block
     for (uint32_t il = 0; il < n_layer; ++il) {
-        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
+        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(il);

         // Read type of key
         int32_t k_type_i_ref;
@@ -3125,7 +3125,7 @@ bool llama_kv_cache_recurrent::state_read_data(llama_io_read_i & io, uint32_t ce

     if (!v_trans) {
         for (uint32_t il = 0; il < n_layer; ++il) {
-            const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
+            const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(il);

             // Read type of value
             int32_t v_type_i_ref;
@@ -3153,7 +3153,7 @@ bool llama_kv_cache_recurrent::state_read_data(llama_io_read_i & io, uint32_t ce
     } else {
         // For each layer, read the values for each cell (transposed)
         for (uint32_t il = 0; il < n_layer; ++il) {
-            const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
+            const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(il);

             // Read type of value
             int32_t v_type_i_ref;
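
Note: every hunk above makes the same change -- hparams.n_embd_k_s() and hparams.n_embd_v_s() now take the layer index, so the extra recurrent-state size can differ per layer instead of being one global value. Below is a minimal sketch of what such a per-layer accessor could look like, assuming hparams knows which layers carry recurrent (SSM) state and the SSM dimensions; the struct and member names here are illustrative only and are not taken from this diff.

// Illustrative sketch, not the actual llama.cpp implementation.
#include <cstdint>

struct hparams_sketch {
    // assumed per-model SSM dimensions
    uint32_t ssm_d_conv  = 4;
    uint32_t ssm_d_inner = 1024;
    uint32_t ssm_d_state = 16;

    // assumed predicate: does layer il hold recurrent state?
    bool is_recurrent(uint32_t il) const { return il % 2 == 0; }

    // extra K entries needed for layer il (0 for non-recurrent layers)
    uint32_t n_embd_k_s(uint32_t il) const {
        if (!is_recurrent(il)) {
            return 0;
        }
        return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
    }

    // extra V entries needed for layer il (0 for non-recurrent layers)
    uint32_t n_embd_v_s(uint32_t il) const {
        return is_recurrent(il) ? ssm_d_state * ssm_d_inner : 0;
    }
};

With a per-layer accessor like this, the cache constructors and the state read/write loops above can size each layer's K/V buffers individually, which is what passing (il) or (i) in the hunks enables.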