@@ -5670,12 +5670,10 @@ struct llm_build_falcon : public llm_graph_context {
5670
5670
cur = build_lora_mm(model.layers[il].wqkv, cur);
5671
5671
cb(cur, "wqkv", il);
5672
5672
5673
- ggml_tensor * Qcur = ggml_cont (ctx0, ggml_view_2d(ctx0, cur, n_embd , n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd) ));
5674
- ggml_tensor * Kcur = ggml_cont (ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa , n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd) ));
5673
+ ggml_tensor * Qcur = ggml_view_3d (ctx0, cur, n_embd_head, n_head , n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
5674
+ ggml_tensor * Kcur = ggml_view_3d (ctx0, cur, n_embd_head, n_head_kv , n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
5675
5675
ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
5676
5676
5677
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
5678
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
5679
5677
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
5680
5678
5681
5679
// using mode = 2 for neox mode
@@ -5952,12 +5950,10 @@ struct llm_build_dbrx : public llm_graph_context {
5952
5950
cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
5953
5951
cb(cur, "wqkv_clamped", il);
5954
5952
5955
- Qcur = ggml_cont (ctx0, ggml_view_2d(ctx0, cur, n_embd , n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd) ));
5956
- Kcur = ggml_cont (ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa , n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd) ));
5953
+ Qcur = ggml_view_3d (ctx0, cur, n_embd_head, n_head , n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
5954
+ Kcur = ggml_view_3d (ctx0, cur, n_embd_head, n_head_kv , n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
5957
5955
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
5958
5956
5959
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
5960
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
5961
5957
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
5962
5958
5963
5959
Qcur = ggml_rope_ext(
@@ -6468,12 +6464,10 @@ struct llm_build_neo_bert : public llm_graph_context {
6468
6464
cur = build_lora_mm(model.layers[il].wqkv, cur);
6469
6465
cb(cur, "wqkv", il);
6470
6466
6471
- Qcur = ggml_cont (ctx0, ggml_view_2d(ctx0, cur, n_embd , n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd) ));
6472
- Kcur = ggml_cont (ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa , n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd) ));
6467
+ Qcur = ggml_view_3d (ctx0, cur, n_embd_head, n_head , n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
6468
+ Kcur = ggml_view_3d (ctx0, cur, n_embd_head, n_head_kv , n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
6473
6469
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
6474
6470
6475
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
6476
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
6477
6471
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
6478
6472
6479
6473
// RoPE
@@ -6703,8 +6697,8 @@ struct llm_build_mpt : public llm_graph_context {
6703
6697
cb(cur, "wqkv_clamped", il);
6704
6698
}
6705
6699
6706
- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd) ));
6707
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd) ));
6700
+ ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd));
6701
+ ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd));
6708
6702
ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
6709
6703
6710
6704
cb(Qcur, "Qcur", il);
@@ -6724,6 +6718,12 @@ struct llm_build_mpt : public llm_graph_context {
6724
6718
model.layers[il].attn_k_norm_b,
6725
6719
LLM_NORM, il);
6726
6720
cb(Kcur, "Kcur", il);
6721
+ } else {
6722
+ Qcur = ggml_cont(ctx0, Qcur);
6723
+ cb(Qcur, "Qcur", il);
6724
+
6725
+ Kcur = ggml_cont(ctx0, Kcur);
6726
+ cb(Kcur, "Kcur", il);
6727
6727
}
6728
6728
6729
6729
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
@@ -6978,12 +6978,10 @@ struct llm_build_qwen : public llm_graph_context {
6978
6978
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
6979
6979
cb(cur, "bqkv", il);
6980
6980
6981
- ggml_tensor * Qcur = ggml_cont (ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd) ));
6982
- ggml_tensor * Kcur = ggml_cont (ctx0, ggml_view_2d(ctx0, cur, n_embd , n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd) ));
6981
+ ggml_tensor * Qcur = ggml_view_3d (ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
6982
+ ggml_tensor * Kcur = ggml_view_3d (ctx0, cur, n_embd_head, n_head_kv , n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
6983
6983
ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd)));
6984
6984
6985
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
6986
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
6987
6985
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
6988
6986
6989
6987
// using mode = 2 for neox mode
@@ -7748,21 +7746,21 @@ struct llm_build_phi2 : public llm_graph_context {
7748
7746
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
7749
7747
cb(cur, "bqkv", il);
7750
7748
7751
- Qcur = ggml_cont (ctx0, ggml_view_2d(ctx0, cur, n_embd , n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd) ));
7752
- Kcur = ggml_cont (ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa , n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd) ));
7749
+ Qcur = ggml_view_3d (ctx0, cur, n_embd_head, n_head , n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
7750
+ Kcur = ggml_view_3d (ctx0, cur, n_embd_head, n_head_kv , n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
7753
7751
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
7754
7752
} else {
7755
7753
Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
7756
7754
Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
7757
7755
Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
7756
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7757
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
7758
7758
}
7759
7759
7760
7760
cb(Qcur, "Qcur", il);
7761
7761
cb(Kcur, "Kcur", il);
7762
7762
cb(Vcur, "Vcur", il);
7763
7763
7764
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7765
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
7766
7764
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
7767
7765
7768
7766
Qcur = ggml_rope_ext(
@@ -7886,21 +7884,21 @@ struct llm_build_phi3 : public llm_graph_context {
7886
7884
cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output);
7887
7885
cb(cur, "wqkv", il);
7888
7886
7889
- Qcur = ggml_cont (ctx0, ggml_view_2d(ctx0, cur, n_embd , n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd) ));
7890
- Kcur = ggml_cont (ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa , n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd) ));
7887
+ Qcur = ggml_view_3d (ctx0, cur, n_embd_head, n_head , n_tokens, n_embd_head * sizeof(float), cur->nb[1], 0 * sizeof(float) * (n_embd));
7888
+ Kcur = ggml_view_3d (ctx0, cur, n_embd_head, n_head_kv , n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd));
7891
7889
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)));
7892
7890
} else {
7893
7891
Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
7894
7892
Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
7895
7893
Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
7894
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7895
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
7896
7896
}
7897
7897
7898
7898
cb(Qcur, "Qcur", il);
7899
7899
cb(Kcur, "Kcur", il);
7900
7900
cb(Vcur, "Vcur", il);
7901
7901
7902
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
7903
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
7904
7902
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
7905
7903
7906
7904
Qcur = ggml_rope_ext(
@@ -8256,12 +8254,10 @@ struct llm_build_codeshell : public llm_graph_context {
8256
8254
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
8257
8255
cb(cur, "bqkv", il);
8258
8256
8259
- ggml_tensor * Qcur = ggml_cont (ctx0, ggml_view_2d(ctx0, cur, n_embd , n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd) ));
8260
- ggml_tensor * Kcur = ggml_cont (ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa , n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd) ));
8257
+ ggml_tensor * Qcur = ggml_view_3d (ctx0, cur, n_embd_head, n_head , n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
8258
+ ggml_tensor * Kcur = ggml_view_3d (ctx0, cur, n_embd_head, n_head_kv , n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
8261
8259
ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
8262
8260
8263
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
8264
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
8265
8261
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
8266
8262
8267
8263
Qcur = ggml_rope_ext(
@@ -8677,8 +8673,6 @@ struct llm_build_minicpm3 : public llm_graph_context {
8677
8673
ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
8678
8674
cb(k_pe, "k_pe", il);
8679
8675
8680
- // TODO: the CUDA backend used to not support non-cont. (RMS) norm, investigate removing ggml_cont
8681
- kv_compressed = ggml_cont(ctx0, kv_compressed);
8682
8676
kv_compressed = build_norm(kv_compressed,
8683
8677
model.layers[il].attn_kv_a_norm, NULL,
8684
8678
LLM_NORM_RMS, il);
@@ -8710,7 +8704,6 @@ struct llm_build_minicpm3 : public llm_graph_context {
8710
8704
0);
8711
8705
cb(v_states, "v_states", il);
8712
8706
8713
- q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
8714
8707
q_pe = ggml_rope_ext(
8715
8708
ctx0, q_pe, inp_pos, rope_factors,
8716
8709
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -8719,7 +8712,6 @@ struct llm_build_minicpm3 : public llm_graph_context {
8719
8712
cb(q_pe, "q_pe", il);
8720
8713
8721
8714
// shared RoPE key
8722
- k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
8723
8715
k_pe = ggml_rope_ext(
8724
8716
ctx0, k_pe, inp_pos, rope_factors,
8725
8717
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -10784,10 +10776,10 @@ struct llm_build_openelm : public llm_graph_context {
10784
10776
10785
10777
cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens);
10786
10778
10787
- ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, cur->nb[1], cur->nb[2], 0) );
10779
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, cur->nb[1], cur->nb[2], 0);
10788
10780
cb(Qcur, "Qcur", il);
10789
10781
10790
- ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head) );
10782
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head);
10791
10783
cb(Kcur, "Kcur", il);
10792
10784
10793
10785
ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv)));
@@ -10909,12 +10901,10 @@ struct llm_build_gptneox : public llm_graph_context {
10909
10901
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
10910
10902
cb(cur, "bqkv", il);
10911
10903
10912
- ggml_tensor * Qcur = ggml_cont (ctx0, ggml_view_2d(ctx0, cur, n_embd , n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd) ));
10913
- ggml_tensor * Kcur = ggml_cont (ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa , n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd) ));
10904
+ ggml_tensor * Qcur = ggml_view_3d (ctx0, cur, n_embd_head, n_head , n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
10905
+ ggml_tensor * Kcur = ggml_view_3d (ctx0, cur, n_embd_head, n_head_kv , n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
10914
10906
ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
10915
10907
10916
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
10917
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
10918
10908
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
10919
10909
10920
10910
Qcur = ggml_rope_ext(
@@ -12159,20 +12149,20 @@ struct llm_build_chatglm : public llm_graph_context {
12159
12149
if (model.layers[il].bv) {
12160
12150
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
12161
12151
}
12152
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
12153
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
12162
12154
} else {
12163
12155
cur = build_lora_mm(model.layers[il].wqkv, cur);
12164
12156
cb(cur, "wqkv", il);
12165
12157
if (model.layers[il].bqkv) {
12166
12158
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
12167
12159
cb(cur, "bqkv", il);
12168
12160
}
12169
- Qcur = ggml_cont (ctx0, ggml_view_2d(ctx0, cur, n_embd , n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd) ));
12170
- Kcur = ggml_cont (ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa , n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd) ));
12161
+ Qcur = ggml_view_3d (ctx0, cur, n_embd_head, n_head , n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
12162
+ Kcur = ggml_view_3d (ctx0, cur, n_embd_head, n_head_kv , n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
12171
12163
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
12172
12164
}
12173
12165
12174
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
12175
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
12176
12166
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
12177
12167
12178
12168
//printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor);
@@ -12293,20 +12283,20 @@ struct llm_build_glm4 : public llm_graph_context {
12293
12283
if (model.layers[il].bv) {
12294
12284
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
12295
12285
}
12286
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
12287
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
12296
12288
} else {
12297
12289
cur = build_lora_mm(model.layers[il].wqkv, cur);
12298
12290
cb(cur, "wqkv", il);
12299
12291
if (model.layers[il].bqkv) {
12300
12292
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
12301
12293
cb(cur, "bqkv", il);
12302
12294
}
12303
- Qcur = ggml_cont (ctx0, ggml_view_2d(ctx0, cur, n_embd , n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd) ));
12304
- Kcur = ggml_cont (ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa , n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd) ));
12295
+ Qcur = ggml_view_3d (ctx0, cur, n_embd_head, n_head , n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
12296
+ Kcur = ggml_view_3d (ctx0, cur, n_embd_head, n_head_kv , n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
12305
12297
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
12306
12298
}
12307
12299
12308
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
12309
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
12310
12300
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
12311
12301
12312
12302
Qcur = ggml_rope_ext(
0 commit comments