@@ -4445,6 +4445,19 @@ const ggml_tensor * llama_model::get_tensor(const char * name) const {
     return it->second;
 }
 
+ggml_tensor * llama_model::get_rope_factors(uint32_t n_ctx_per_seq, int il) const {
+    // choose long/short freq factors based on the context size
+    if (layers[il].rope_freqs != nullptr) {
+        return layers[il].rope_freqs;
+    }
+
+    if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) {
+        return layers[il].rope_long;
+    }
+
+    return layers[il].rope_short;
+}
+
 struct llm_build_llama : public llm_graph_context {
     llm_build_llama(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -4485,7 +4498,7 @@ struct llm_build_llama : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
 
                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -4710,7 +4723,7 @@ struct llm_build_deci : public llm_graph_context {
             } else if (n_head > 0) {
                 // self-attention
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
 
                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -7192,7 +7205,7 @@ struct llm_build_phi3 : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for 128k context
-                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
 
                 ggml_tensor* attn_norm_output = build_norm(inpL,
                         model.layers[il].attn_norm,
@@ -7944,7 +7957,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
 
-            ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+            ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
 
             // norm
             cur = build_norm(inpL,
@@ -9012,7 +9025,7 @@ struct llm_build_cohere2 : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for 128k context
-                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
 
                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -9950,7 +9963,7 @@ struct llm_build_deepseek : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
 
                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -11314,7 +11327,7 @@ struct llm_build_exaone : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
 
                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -12695,7 +12708,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
             // self-attention
            {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
 
                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -12818,28 +12831,6 @@ struct llm_build_bailingmoe : public llm_graph_context {
 llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
     llama_memory_i * res;
 
-    const bool offload = cparams.offload_kqv;
-
-    auto get_buft = [this, offload](int il) {
-        const char * dev_name = "CPU";
-
-        ggml_backend_buffer_type_t buft;
-        if (offload) {
-            auto * dev = dev_layer(il);
-            buft = ggml_backend_dev_buffer_type(dev);
-
-            dev_name = ggml_backend_dev_name(dev);
-        } else {
-            buft = ggml_backend_cpu_buffer_type();
-        }
-
-        LLAMA_LOG_DEBUG("layer %3d: dev = %s\n", il, dev_name);
-
-        return buft;
-    };
-
-    LLAMA_LOG_DEBUG("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
-
     switch (arch) {
         case LLM_ARCH_MAMBA:
         case LLM_ARCH_RWKV6:
@@ -12848,13 +12839,10 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
         case LLM_ARCH_ARWKV7:
             {
                 res = new llama_kv_cache_recurrent(
-                        hparams,
-                        {
-                            /*.get_rope_factors =*/ nullptr,
-                            /*.get_buft =*/ get_buft,
-                        },
+                        *this,
                         GGML_TYPE_F32,
                         GGML_TYPE_F32,
+                        cparams.offload_kqv,
                         std::max((uint32_t) 1, cparams.n_seq_max));
             } break;
         default:
@@ -12866,25 +12854,11 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                 LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
 
                 res = new llama_kv_cache_unified(
-                        hparams,
-                        {
-                            /*.get_rope_factors =*/ [this](uint32_t n_ctx_per_seq, int il) {
-                                // choose long/short freq factors based on the context size
-                                if (layers[il].rope_freqs != nullptr) {
-                                    return layers[il].rope_freqs;
-                                }
-
-                                if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) {
-                                    return layers[il].rope_long;
-                                }
-
-                                return layers[il].rope_short;
-                            },
-                            /*.get_buft =*/ get_buft,
-                        },
+                        *this,
                         params.type_k,
                         params.type_v,
                         !cparams.flash_attn,
+                        cparams.offload_kqv,
                         cparams.n_ctx,
                         padding);
             }
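
Note: the rule that the new llama_model::get_rope_factors() centralizes is the same long/short YaRN frequency-factor selection that the deleted lambda in create_memory carried. The standalone sketch below illustrates only that selection rule; rope_layer and pick_rope_factors are simplified, hypothetical stand-ins (the real method works on ggml_tensor pointers stored in llama_layer and is not reproduced here).

// Standalone illustration only -- not part of the patch.
#include <cstdint>
#include <cstdio>

struct rope_layer {
    const char * rope_freqs = nullptr; // explicit per-layer factors (Llama 3 style)
    const char * rope_long  = "long";  // factors tuned for the extended context
    const char * rope_short = "short"; // factors tuned for the original context
};

static const char * pick_rope_factors(const rope_layer & layer,
                                      uint32_t n_ctx_per_seq,
                                      uint32_t n_ctx_orig_yarn) {
    // explicit per-layer frequency factors always win
    if (layer.rope_freqs != nullptr) {
        return layer.rope_freqs;
    }
    // otherwise choose long/short factors depending on whether the runtime
    // per-sequence context exceeds the original training context
    if (n_ctx_per_seq > n_ctx_orig_yarn) {
        return layer.rope_long;
    }
    return layer.rope_short;
}

int main() {
    rope_layer layer;
    std::printf("ctx   4096, trained 4096 -> %s\n", pick_rope_factors(layer,   4096, 4096));
    std::printf("ctx 131072, trained 4096 -> %s\n", pick_rope_factors(layer, 131072, 4096));
    return 0;
}

With the rule living on llama_model, each graph builder can call model.get_rope_factors(n_ctx_per_seq, il) directly instead of downcasting the memory pointer to llama_kv_cache_unified to reach its cbs callbacks, which is the change repeated across the hunks above.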