@@ -6319,56 +6319,57 @@ struct llm_build_neo_bert : public llm_graph_context {
 
         auto * inp_attn = build_attn_inp_no_cache();
 
-        // iterate layers
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * cur = inpL;
 
-            ggml_tensor * Qcur;
-            ggml_tensor * Kcur;
-            ggml_tensor * Vcur;
-
             // pre-norm
             cur = build_norm(inpL,
                     model.layers[il].attn_norm, NULL,
                     LLM_NORM_RMS, il);
 
-            // self-attention
-            cur = build_lora_mm(model.layers[il].wqkv, cur);
-            cb(cur, "wqkv", il);
-
-            Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
-            Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
-            Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
-
-            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
-            // RoPE
-            Qcur = ggml_rope_ext(
-                ctx0, Qcur, inp_pos, nullptr,
-                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                ext_factor, attn_factor, beta_fast, beta_slow
-            );
+            {
+                ggml_tensor * Qcur;
+                ggml_tensor * Kcur;
+                ggml_tensor * Vcur;
 
-            Kcur = ggml_rope_ext(
-                ctx0, Kcur, inp_pos, nullptr,
-                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                ext_factor, attn_factor, beta_fast, beta_slow
-            );
+                // self-attention
+                cur = build_lora_mm(model.layers[il].wqkv, cur);
+                cb(cur, "wqkv", il);
+
+                Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+                Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+                Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                // RoPE
+                Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
 
-            cb(Qcur, "Qcur", il);
-            cb(Kcur, "Kcur", il);
-            cb(Vcur, "Vcur", il);
+                Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
 
-            cur = build_attn(inp_attn, gf,
-                    model.layers[il].wo, nullptr,
-                    Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
-            cb(cur, "kqv_out", il);
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
 
-            if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur = build_attn(inp_attn, gf,
+                        model.layers[il].wo, nullptr,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                cb(cur, "kqv_out", il);
+            }
+
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
             }
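
Note: the remaining hunks repeat the first half of this refactor for llm_build_plamo, llm_build_mamba, llm_build_dots1, and llm_build_arcee: build_inp_out_ids() is hoisted out of the per-layer loop into a single call before it, and the last-layer ggml_get_rows gather is gated on the returned tensor being non-null instead of on an architecture-specific condition (pooling_type here, a bare il == n_layer - 1 in the others). Below is a minimal standalone C++ sketch of that control-flow pattern; the ids_t type and build_ids() helper are hypothetical stand-ins for the real graph API, not the llama.cpp functions.

    #include <cstdio>
    #include <vector>

    // Hypothetical stand-in for build_inp_out_ids(); not the llama.cpp API.
    // Returns nullptr when every token's output is needed, i.e. nothing to skip.
    struct ids_t { std::vector<int> rows; };

    static ids_t * build_ids(bool need_subset, ids_t & storage) {
        if (!need_subset) {
            return nullptr;
        }
        storage.rows = {3, 7}; // indices of the tokens whose outputs are consumed
        return &storage;
    }

    int main() {
        const int n_layer = 4;
        ids_t storage;

        // before: the ids input was built inside the loop, only at il == n_layer - 1;
        // after:  it is built once, up front, and merely tested inside the loop
        ids_t * inp_out_ids = build_ids(/*need_subset=*/true, storage);

        for (int il = 0; il < n_layer; ++il) {
            // ... per-layer attention / FFN work would go here ...

            if (il == n_layer - 1 && inp_out_ids) {
                // analogue of ggml_get_rows: keep only the rows that feed the output
                std::printf("layer %d: gathering %zu output rows\n",
                            il, inp_out_ids->rows.size());
            }
        }
        return 0;
    }

With this shape, whether outputs can be skipped is decided once by the single builder call, and every architecture shares the same non-null test rather than re-deriving its own condition.
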
@@ -7929,13 +7930,8 @@ struct llm_build_plamo : public llm_graph_context {
                         model.layers[il].wo, NULL,
                         Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
-            ggml_tensor * sa_out = cur;
-
-            cur = attention_norm;
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 sa_inp = ggml_get_rows(ctx0, sa_inp, inp_out_ids);
                 inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
@@ -9638,6 +9634,8 @@ struct llm_build_mamba : public llm_graph_context {
 
         auto * rs_inp = build_rs_inp();
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             // norm
             cur = build_norm(inpL,
@@ -13995,6 +13993,8 @@ struct llm_build_dots1 : public llm_graph_context {
 
         auto * inp_attn = build_attn_inp_kv_unified();
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
 
@@ -14047,9 +14047,7 @@ struct llm_build_dots1 : public llm_graph_context {
                         Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
@@ -14147,6 +14145,8 @@ struct llm_build_arcee : public llm_graph_context {
 
         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
 
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
 
@@ -14209,9 +14209,7 @@ struct llm_build_arcee : public llm_graph_context {
                 cb(cur, "attn_out", il);
             }
 
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                ggml_tensor * inp_out_ids = build_inp_out_ids();
+            if (il == n_layer - 1 && inp_out_ids) {
                 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }