@@ -10189,7 +10189,7 @@ struct llm_build_starcoder2 : public llm_graph_context {
     }
 };
 
-struct llm_graph_context_mamba : public virtual llm_graph_context {
+struct llm_graph_context_mamba : public llm_graph_context {
     llm_graph_context_mamba(const llm_graph_params & params) : llm_graph_context(params) {}
 
     ggml_tensor * build_mamba_layer(
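The `virtual` keyword removed here is what forced the boilerplate that the rest of this diff deletes: a virtual base is constructed by the most-derived class, so every builder had to initialize `llm_graph_context` itself in addition to its direct base. A minimal standalone sketch of the two rules, using stand-in names (`params`, `context`, `mamba_*`, `build_*` are illustrative, not the real llama.cpp types):

```cpp
#include <iostream>

struct params { int n; };

struct context {                       // stands in for llm_graph_context
    context(const params & p) : n(p.n) {}
    int n;
};

// Virtual base: the most-derived class constructs `context`; the intermediate
// class's initializer for it is ignored. This is why the builders previously
// spelled out ": llm_graph_context(params), llm_graph_context_mamba(params)".
struct mamba_virtual : public virtual context {
    mamba_virtual(const params & p) : context(p) {}
};
struct build_virtual : public mamba_virtual {
    build_virtual(const params & p) : context(p), mamba_virtual(p) {}
};

// Plain (non-virtual) base: the intermediate class forwards the params and
// the builder only names its direct base, as in the new code.
struct mamba_plain : public context {
    mamba_plain(const params & p) : context(p) {}
};
struct build_plain : public mamba_plain {
    build_plain(const params & p) : mamba_plain(p) {}
};

int main() {
    const params p{42};
    std::cout << build_virtual(p).n << ' ' << build_plain(p).n << '\n';  // prints: 42 42
}
```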
@@ -10466,8 +10466,7 @@ struct llm_graph_context_mamba : public virtual llm_graph_context {
 };
 
 struct llm_build_mamba : public llm_graph_context_mamba {
-    llm_build_mamba(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf)
-        : llm_graph_context(params), llm_graph_context_mamba(params) {
+    llm_build_mamba(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context_mamba(params) {
         ggml_tensor * cur;
         ggml_tensor * inpL;
 
@@ -10524,8 +10523,7 @@ struct llm_build_mamba : public llm_graph_context_mamba {
 };
 
 struct llm_build_jamba : public llm_graph_context_mamba {
-    llm_build_jamba(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf)
-        : llm_graph_context(params), llm_graph_context_mamba(params) {
+    llm_build_jamba(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context_mamba(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
 
         ggml_tensor * cur;
@@ -13958,8 +13956,78 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
     }
 };
 
-struct llm_graph_context_granite : public virtual llm_graph_context {
-    llm_graph_context_granite(const llm_graph_params & params) : llm_graph_context(params) {}
+struct llm_build_granite : public llm_graph_context {
+    llm_build_granite(
+        const llama_model & model,
+        const llm_graph_params & params,
+        ggml_cgraph * gf)
+        : llm_graph_context(params) {
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - built only if rope enabled
+        ggml_tensor * inp_pos = nullptr;
+        if (hparams.rope_finetuned) {
+            inp_pos = build_inp_pos();
+        }
+
+        auto * inp_attn = build_attn_inp_kv_unified();
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            cur = build_attention_layer(
+                    gf, cur, inp_pos, inp_attn,
+                    model, n_embd_head, il);
+
+            if (il == n_layer - 1 && inp_out_ids) {
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            // ffn
+            cur = build_layer_ffn(cur, inpSA, model, il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        // For Granite architectures - scale logits
+        cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
 
     ggml_tensor * build_attention_layer(
             ggml_cgraph * gf,
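The merged `llm_build_granite` keeps the Granite-specific scaling in one place: an attention scale that falls back to 1/sqrt(n_embd_head) when `f_attention_scale` is unset, and logits divided by `f_logit_scale`. A small self-contained sketch of just that arithmetic (the hparam names mirror the diff; the `granite_hparams` struct and the numeric values are illustrative):

```cpp
#include <cmath>
#include <cstdio>

// Illustrative container for the scaling knobs used above; field names mirror
// hparams in the diff, the values are made up.
struct granite_hparams {
    float f_attention_scale = 0.0f;   // 0.0f means "fall back to 1/sqrt(n_embd_head)"
    float f_logit_scale     = 8.0f;
    int   n_embd_head       = 128;
};

int main() {
    granite_hparams hp;

    // same selection as the kq_scale line in build_attention_layer
    const float kq_scale = hp.f_attention_scale == 0.0f
        ? 1.0f / std::sqrt((float) hp.n_embd_head)
        : hp.f_attention_scale;

    // same transform as the ggml_scale applied to the lm_head output
    const float raw_logit    = 3.2f;  // pretend lm_head value
    const float scaled_logit = raw_logit * (1.0f / hp.f_logit_scale);

    std::printf("kq_scale = %.5f, logit %.2f -> %.3f\n", kq_scale, raw_logit, scaled_logit);
    return 0;
}
```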
@@ -14104,89 +14172,13 @@ struct llm_graph_context_granite : public virtual llm_graph_context {
     }
 };
 
-struct llm_build_granite : public llm_graph_context_granite {
-    llm_build_granite(
-        const llama_model & model,
-        const llm_graph_params & params,
-        ggml_cgraph * gf)
-        : llm_graph_context(params), llm_graph_context_granite(params) {
-
-        const int64_t n_embd_head = hparams.n_embd_head_v;
-
-        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_head == hparams.n_rot);
-
-        ggml_tensor * cur;
-        ggml_tensor * inpL;
-
-        inpL = build_inp_embd(model.tok_embd);
-
-        // inp_pos - built only if rope enabled
-        ggml_tensor * inp_pos = nullptr;
-        if (hparams.rope_finetuned) {
-            inp_pos = build_inp_pos();
-        }
-
-        auto * inp_attn = build_attn_inp_kv_unified();
-
-        ggml_tensor * inp_out_ids = build_inp_out_ids();
-
-        for (int il = 0; il < n_layer; ++il) {
-            ggml_tensor * inpSA = inpL;
-
-            // norm
-            cur = build_norm(inpL,
-                    model.layers[il].attn_norm, NULL,
-                    LLM_NORM_RMS, il);
-            cb(cur, "attn_norm", il);
-
-            // self-attention
-            cur = build_attention_layer(
-                    gf, cur, inp_pos, inp_attn,
-                    model, n_embd_head, il);
-
-            if (il == n_layer - 1 && inp_out_ids) {
-                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
-                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
-            }
-
-            // ffn
-            cur = build_layer_ffn(cur, inpSA, model, il);
-
-            // input for next layer
-            inpL = cur;
-        }
-
-        cur = inpL;
-
-        cur = build_norm(cur,
-                model.output_norm, NULL,
-                LLM_NORM_RMS, -1);
-
-        cb(cur, "result_norm", -1);
-        res->t_embd = cur;
-
-        // lm_head
-        cur = build_lora_mm(model.output, cur);
-
-        // For Granite architectures - scale logits
-        cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
-        cb(cur, "result_output", -1);
-        res->t_logits = cur;
-
-        ggml_build_forward_expand(gf, cur);
-    }
-};
-
-struct llm_build_granite_hybrid : public llm_graph_context_mamba, public llm_graph_context_granite {
+struct llm_build_granite_hybrid : public llm_graph_context_mamba {
 
     llm_build_granite_hybrid(
         const llama_model & model,
         const llm_graph_params & params,
         ggml_cgraph * gf) :
-        llm_graph_context(params),
-        llm_graph_context_mamba(params),
-        llm_graph_context_granite(params) {
+        llm_graph_context_mamba(params) {
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
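With this hunk the diamond is gone: `llm_build_granite_hybrid` no longer inherits from both a mamba context and a granite context that shared a virtual `llm_graph_context`; it sits on a single chain and, as the next hunk shows, carries its own copies of the granite attention/FFN helpers. A toy sketch of the shape change (shortened names, empty bodies, not the real API):

```cpp
// Before: a diamond -- two mix-in contexts sharing one virtual graph context.
struct graph_ctx_old {};
struct ctx_mamba_old   : public virtual graph_ctx_old { /* build_mamba_layer(...)     */ };
struct ctx_granite_old : public virtual graph_ctx_old { /* build_attention_layer(...) */ };
struct build_hybrid_old : public ctx_mamba_old, public ctx_granite_old {};

// After: a single chain -- the hybrid builder derives from the mamba context
// only and defines the granite-style helpers itself (see the next hunk).
struct graph_ctx_new {};
struct ctx_mamba_new    : public graph_ctx_new {};
struct build_hybrid_new : public ctx_mamba_new {
    // build_attention_layer(...) / build_layer_ffn(...) are duplicated here
    // from llm_build_granite instead of being inherited from a shared mix-in.
};

int main() {
    build_hybrid_old a;
    build_hybrid_new b;
    (void) a;
    (void) b;
    return 0;
}
```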
@@ -14258,6 +14250,148 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba, public llm_graph_context_granite {
 
         ggml_build_forward_expand(gf, cur);
     }
+
+    ggml_tensor * build_attention_layer(
+            ggml_cgraph * gf,
+            ggml_tensor * cur,
+            ggml_tensor * inp_pos,
+            llm_graph_input_attn_kv_unified * inp_attn,
+            const llama_model & model,
+            const int64_t n_embd_head,
+            const int il) {
+
+        // compute Q and K and (optionally) RoPE them
+        ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+        cb(Qcur, "Qcur", il);
+        if (model.layers[il].bq) {
+            Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+            cb(Qcur, "Qcur", il);
+        }
+
+        ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+        cb(Kcur, "Kcur", il);
+        if (model.layers[il].bk) {
+            Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+            cb(Kcur, "Kcur", il);
+        }
+
+        ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+        cb(Vcur, "Vcur", il);
+        if (model.layers[il].bv) {
+            Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+            cb(Vcur, "Vcur", il);
+        }
+
+        Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, hparams.n_head(il), n_tokens);
+        Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
+        Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
+
+        const bool use_rope = hparams.rope_finetuned;
+        if (use_rope) {
+            ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+            Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+
+            Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+        }
+
+        cb(Qcur, "Qcur", il);
+        cb(Kcur, "Kcur", il);
+        cb(Vcur, "Vcur", il);
+
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+        cur = build_attn(inp_attn, gf,
+                model.layers[il].wo, model.layers[il].bo,
+                Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+        cb(cur, "attn_out", il);
+        return cur;
+    }
+
+    ggml_tensor * build_layer_ffn(
+            ggml_tensor * cur,
+            ggml_tensor * inpSA,
+            const llama_model & model,
+            const int il) {
+
+        // For Granite architectures - scale residual
+        if (hparams.f_residual_scale) {
+            cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+        }
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // feed-forward network (non-MoE)
+        if (model.layers[il].ffn_gate_inp == nullptr) {
+
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = build_ffn(cur,
+                    model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+                    model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
+
+        } else {
+            // MoE branch
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            ggml_tensor * moe_out = build_moe_ffn(cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    nullptr,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, true,
+                    false, 0.0,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                    il);
+            cb(moe_out, "ffn_moe_out", il);
+
+            // For Granite MoE Shared
+            if (hparams.n_ff_shexp > 0) {
+                ggml_tensor * ffn_shexp = build_ffn(cur,
+                        model.layers[il].ffn_up_shexp, NULL, NULL,
+                        model.layers[il].ffn_gate_shexp, NULL, NULL,
+                        model.layers[il].ffn_down_shexp, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(ffn_shexp, "ffn_shexp", il);
+
+                cur = ggml_add(ctx0, moe_out, ffn_shexp);
+                cb(cur, "ffn_out", il);
+            } else {
+                cur = moe_out;
+            }
+        }
+
+        // For Granite architectures - scale residual
+        if (hparams.f_residual_scale) {
+            cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+        }
+        cur = ggml_add(ctx0, cur, ffn_inp);
+        cb(cur, "ffn_out", il);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        return cur;
+    }
 };
 
 // ref: https://github.com/facebookresearch/chameleon
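The `build_layer_ffn` helper added above routes through the MoE experts and, for Granite MoE Shared (`n_ff_shexp > 0`), adds a shared-expert FFN before the scaled residual add. A scalar sketch of that dataflow (the stand-in functions and values are illustrative; normalization is omitted):

```cpp
#include <cstdio>

// Stand-ins for the routed-expert MoE and the shared-expert FFN; the real
// code builds ggml tensors, this only mirrors the scalar dataflow.
static float moe_ffn(float x)   { return 0.50f * x; }   // build_moe_ffn(...)
static float shexp_ffn(float x) { return 0.10f * x; }   // build_ffn(..._shexp...)

int main() {
    const float f_residual_scale = 0.5f;   // illustrative Granite value
    const int   n_ff_shexp       = 1024;   // > 0 selects the "MoE Shared" path

    const float ffn_inp = 2.0f;            // residual stream entering the FFN block
    float cur = moe_ffn(ffn_inp);          // (normalization of ffn_inp omitted)
    if (n_ff_shexp > 0) {
        cur += shexp_ffn(ffn_inp);         // moe_out + ffn_shexp
    }
    const float out = ffn_inp + f_residual_scale * cur;  // scale, then residual add
    std::printf("out = %.3f\n", out);
    return 0;
}
```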
@@ -15192,8 +15326,7 @@ struct llm_build_ernie4_5 : public llm_graph_context {
 };
 
 struct llm_build_falcon_h1 : public llm_graph_context_mamba {
-    llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf)
-        : llm_graph_context(params), llm_graph_context_mamba(params) {
+    llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context_mamba(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
 
         ggml_tensor * cur;