Commit 8dd7f97

refactor: Remove use of diamond inheritance
Per PR discussion, it's simpler to keep this with basic inheritance and not
introduce the complexity of virtual inheritance and multiple inheritance.

ggml-org#13550 (comment)

Branch: GraniteFour

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
1 parent 6875697 commit 8dd7f97
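
For context, here is a minimal, self-contained C++ sketch of the shape being removed versus the shape kept. It is illustrative only: the short class names and the int parameter stand in for the real llm_graph_context* classes and llm_graph_params; the actual change is in the diff below.

    // Before: diamond via virtual inheritance. C++ requires the most-derived
    // class to initialize the virtual base itself, so every leaf constructor
    // must repeat graph_context(p).
    struct graph_context {
        explicit graph_context(int p) : params(p) {}
        int params;
    };
    struct context_mamba : public virtual graph_context {
        explicit context_mamba(int p) : graph_context(p) {}
    };
    struct context_granite : public virtual graph_context {
        explicit context_granite(int p) : graph_context(p) {}
    };
    struct build_hybrid_old : public context_mamba, public context_granite {
        explicit build_hybrid_old(int p)
            : graph_context(p),   // mandatory: the leaf initializes the virtual base
              context_mamba(p),
              context_granite(p) {}
    };

    // After: plain single inheritance. Only the direct base is initialized; the
    // Granite-specific helpers move into the leaf classes instead of living in
    // a second mix-in base.
    struct context_mamba_basic : public graph_context {
        explicit context_mamba_basic(int p) : graph_context(p) {}
    };
    struct build_hybrid_new : public context_mamba_basic {
        explicit build_hybrid_new(int p) : context_mamba_basic(p) {}
    };

    int main() {
        build_hybrid_old before(1);
        build_hybrid_new after(1);
        return (before.params == after.params) ? 0 : 1;
    }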

File tree

1 file changed: +220, -87 lines


src/llama-model.cpp

Lines changed: 220 additions & 87 deletions
@@ -10189,7 +10189,7 @@ struct llm_build_starcoder2 : public llm_graph_context {
     }
 };
 
-struct llm_graph_context_mamba : public virtual llm_graph_context {
+struct llm_graph_context_mamba : public llm_graph_context {
     llm_graph_context_mamba(const llm_graph_params & params) : llm_graph_context(params) {}
 
     ggml_tensor * build_mamba_layer(
@@ -10466,8 +10466,7 @@ struct llm_graph_context_mamba : public virtual llm_graph_context {
 };
 
 struct llm_build_mamba : public llm_graph_context_mamba {
-    llm_build_mamba(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf)
-        : llm_graph_context(params), llm_graph_context_mamba(params) {
+    llm_build_mamba(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context_mamba(params) {
         ggml_tensor * cur;
         ggml_tensor * inpL;
 
@@ -10524,8 +10523,7 @@ struct llm_build_mamba : public llm_graph_context_mamba {
 };
 
 struct llm_build_jamba : public llm_graph_context_mamba {
-    llm_build_jamba(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf)
-        : llm_graph_context(params), llm_graph_context_mamba(params) {
+    llm_build_jamba(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context_mamba(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
 
         ggml_tensor * cur;
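
The shortened constructors above follow directly from dropping the virtual base: with ordinary (non-virtual) inheritance a derived class may only initialize its direct base, so the extra llm_graph_context(params) entry is no longer needed and would not even compile. A standalone sketch with illustrative names (not the llama.cpp types):

    // With a non-virtual base chain, only the direct base may appear in the
    // constructor's initializer list.
    struct base {
        explicit base(int p) : p(p) {}
        int p;
    };
    struct middle : public base {
        explicit middle(int p) : base(p) {}
    };
    struct leaf : public middle {
        // OK: initialize the direct base only.
        explicit leaf(int p) : middle(p) {}

        // Would not compile with non-virtual inheritance:
        //   explicit leaf(int p) : base(p), middle(p) {}
        //   error: 'base' is not a direct or virtual base of 'leaf'
    };

    int main() {
        return leaf(7).p == 7 ? 0 : 1;
    }
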
@@ -13958,8 +13956,78 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
     }
 };
 
-struct llm_graph_context_granite : public virtual llm_graph_context {
-    llm_graph_context_granite(const llm_graph_params & params) : llm_graph_context(params) {}
+struct llm_build_granite : public llm_graph_context {
+    llm_build_granite(
+        const llama_model & model,
+        const llm_graph_params & params,
+        ggml_cgraph * gf)
+        : llm_graph_context(params) {
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - built only if rope enabled
+        ggml_tensor * inp_pos = nullptr;
+        if (hparams.rope_finetuned) {
+            inp_pos = build_inp_pos();
+        }
+
+        auto * inp_attn = build_attn_inp_kv_unified();
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            cur = build_attention_layer(
+                    gf, cur, inp_pos, inp_attn,
+                    model, n_embd_head, il);
+
+            if (il == n_layer - 1 && inp_out_ids) {
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            // ffn
+            cur = build_layer_ffn(cur, inpSA, model, il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        // For Granite architectures - scale logits
+        cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
 
     ggml_tensor * build_attention_layer(
             ggml_cgraph * gf,
@@ -14104,89 +14172,13 @@ struct llm_graph_context_granite : public virtual llm_graph_context {
     }
 };
 
-struct llm_build_granite : public llm_graph_context_granite {
-    llm_build_granite(
-        const llama_model & model,
-        const llm_graph_params & params,
-        ggml_cgraph * gf)
-        : llm_graph_context(params), llm_graph_context_granite(params) {
-
-        const int64_t n_embd_head = hparams.n_embd_head_v;
-
-        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_head == hparams.n_rot);
-
-        ggml_tensor * cur;
-        ggml_tensor * inpL;
-
-        inpL = build_inp_embd(model.tok_embd);
-
-        // inp_pos - built only if rope enabled
-        ggml_tensor * inp_pos = nullptr;
-        if (hparams.rope_finetuned) {
-            inp_pos = build_inp_pos();
-        }
-
-        auto * inp_attn = build_attn_inp_kv_unified();
-
-        ggml_tensor * inp_out_ids = build_inp_out_ids();
-
-        for (int il = 0; il < n_layer; ++il) {
-            ggml_tensor * inpSA = inpL;
-
-            // norm
-            cur = build_norm(inpL,
-                    model.layers[il].attn_norm, NULL,
-                    LLM_NORM_RMS, il);
-            cb(cur, "attn_norm", il);
-
-            // self-attention
-            cur = build_attention_layer(
-                    gf, cur, inp_pos, inp_attn,
-                    model, n_embd_head, il);
-
-            if (il == n_layer - 1 && inp_out_ids) {
-                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
-                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
-            }
-
-            // ffn
-            cur = build_layer_ffn(cur, inpSA, model, il);
-
-            // input for next layer
-            inpL = cur;
-        }
-
-        cur = inpL;
-
-        cur = build_norm(cur,
-                model.output_norm, NULL,
-                LLM_NORM_RMS, -1);
-
-        cb(cur, "result_norm", -1);
-        res->t_embd = cur;
-
-        // lm_head
-        cur = build_lora_mm(model.output, cur);
-
-        // For Granite architectures - scale logits
-        cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
-        cb(cur, "result_output", -1);
-        res->t_logits = cur;
-
-        ggml_build_forward_expand(gf, cur);
-    }
-};
-
-struct llm_build_granite_hybrid : public llm_graph_context_mamba, public llm_graph_context_granite {
+struct llm_build_granite_hybrid : public llm_graph_context_mamba {
 
     llm_build_granite_hybrid(
         const llama_model & model,
         const llm_graph_params & params,
         ggml_cgraph * gf) :
-        llm_graph_context(params),
-        llm_graph_context_mamba(params),
-        llm_graph_context_granite(params) {
+        llm_graph_context_mamba(params) {
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -14258,6 +14250,148 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba, public llm_gra
 
         ggml_build_forward_expand(gf, cur);
     }
+
+    ggml_tensor * build_attention_layer(
+            ggml_cgraph * gf,
+            ggml_tensor * cur,
+            ggml_tensor * inp_pos,
+            llm_graph_input_attn_kv_unified * inp_attn,
+            const llama_model & model,
+            const int64_t n_embd_head,
+            const int il) {
+
+        // compute Q and K and (optionally) RoPE them
+        ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+        cb(Qcur, "Qcur", il);
+        if (model.layers[il].bq) {
+            Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+            cb(Qcur, "Qcur", il);
+        }
+
+        ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+        cb(Kcur, "Kcur", il);
+        if (model.layers[il].bk) {
+            Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+            cb(Kcur, "Kcur", il);
+        }
+
+        ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+        cb(Vcur, "Vcur", il);
+        if (model.layers[il].bv) {
+            Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+            cb(Vcur, "Vcur", il);
+        }
+
+        Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, hparams.n_head(il), n_tokens);
+        Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
+        Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
+
+        const bool use_rope = hparams.rope_finetuned;
+        if (use_rope) {
+            ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+            Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+
+            Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+        }
+
+        cb(Qcur, "Qcur", il);
+        cb(Kcur, "Kcur", il);
+        cb(Vcur, "Vcur", il);
+
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+        cur = build_attn(inp_attn, gf,
+                model.layers[il].wo, model.layers[il].bo,
+                Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+        cb(cur, "attn_out", il);
+        return cur;
+    }
+
+    ggml_tensor * build_layer_ffn(
+            ggml_tensor * cur,
+            ggml_tensor * inpSA,
+            const llama_model & model,
+            const int il) {
+
+        // For Granite architectures - scale residual
+        if (hparams.f_residual_scale) {
+            cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+        }
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // feed-forward network (non-MoE)
+        if (model.layers[il].ffn_gate_inp == nullptr) {
+
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = build_ffn(cur,
+                    model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+                    model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
+
+        } else {
+            // MoE branch
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            ggml_tensor * moe_out = build_moe_ffn(cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    nullptr,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, true,
+                    false, 0.0,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                    il);
+            cb(moe_out, "ffn_moe_out", il);
+
+            // For Granite MoE Shared
+            if (hparams.n_ff_shexp > 0) {
+                ggml_tensor * ffn_shexp = build_ffn(cur,
+                        model.layers[il].ffn_up_shexp, NULL, NULL,
+                        model.layers[il].ffn_gate_shexp, NULL, NULL,
+                        model.layers[il].ffn_down_shexp, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(ffn_shexp, "ffn_shexp", il);
+
+                cur = ggml_add(ctx0, moe_out, ffn_shexp);
+                cb(cur, "ffn_out", il);
+            } else {
+                cur = moe_out;
+            }
+        }
+
+        // For Granite architectures - scale residual
+        if (hparams.f_residual_scale) {
+            cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+        }
+        cur = ggml_add(ctx0, cur, ffn_inp);
+        cb(cur, "ffn_out", il);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        return cur;
+    }
 };
 
 // ref: https://github.com/facebookresearch/chameleon
@@ -15192,8 +15326,7 @@ struct llm_build_ernie4_5 : public llm_graph_context {
 };
 
 struct llm_build_falcon_h1 : public llm_graph_context_mamba {
-    llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf)
-        : llm_graph_context(params), llm_graph_context_mamba(params) {
+    llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context_mamba(params) {
        const int64_t n_embd_head = hparams.n_embd_head_v;
 
        ggml_tensor * cur;
