Commit b61cf7d

ikawrakow and Iwan Kawrakow authored
Add support for Granite and GraniteMoE models (#102)
* Add Granite and GraniteMoE models
* Granite: avoid NaNs on CUDA by scaling Q before K*Q multiplication

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
1 parent 462c6cd commit b61cf7d
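The CUDA NaN fix described above relies on the attention scale commuting with the K*Q matmul: softmax(s * (K*Q)) and softmax(K * (s*Q)) produce the same probabilities, but folding s into Q shrinks every intermediate dot product before it is accumulated instead of rescaling a result that may already have overflowed in fp16. A minimal standalone sketch of that equivalence (plain C++, toy data, no ggml dependency; the scale value is made up):

    // Toy check: applying the attention scale to Q before the K*Q product
    // gives the same softmax as scaling the K*Q result afterwards.
    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    static std::vector<float> softmax(std::vector<float> v) {
        const float mx = *std::max_element(v.begin(), v.end());
        float sum = 0.0f;
        for (float & x : v) { x = std::exp(x - mx); sum += x; }
        for (float & x : v) x /= sum;
        return v;
    }

    int main() {
        const float s = 0.015625f;  // stand-in for hparams.f_attention_scale (made up)
        const std::vector<float> q = {120.f, -80.f, 95.f, 60.f};   // one query row
        const std::vector<std::vector<float>> K = {                // three key rows
            {100.f, 10.f, -40.f, 70.f}, {-30.f, 90.f, 80.f, -20.f}, {55.f, -65.f, 25.f, 110.f}};

        std::vector<float> kq_post(K.size()), kq_pre(K.size());
        for (size_t i = 0; i < K.size(); ++i) {
            float dot = 0.0f, dot_pre = 0.0f;
            for (size_t j = 0; j < q.size(); ++j) {
                dot     += K[i][j] * q[j];        // scale applied after the matmul
                dot_pre += K[i][j] * (s * q[j]);  // scale folded into Q (smaller intermediates)
            }
            kq_post[i] = s * dot;
            kq_pre[i]  = dot_pre;
        }
        const std::vector<float> p_post = softmax(kq_post), p_pre = softmax(kq_pre);
        for (size_t i = 0; i < K.size(); ++i) {
            std::printf("%zu: %.6f vs %.6f\n", i, p_post[i], p_pre[i]);  // agree up to fp rounding
        }
        return 0;
    }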

1 file changed: +117 −1


src/llama.cpp

Lines changed: 117 additions & 1 deletion
@@ -212,6 +212,8 @@ enum llm_arch {
     LLM_ARCH_T5,
     LLM_ARCH_T5ENCODER,
     LLM_ARCH_JAIS,
+    LLM_ARCH_GRANITE = 46,
+    LLM_ARCH_GRANITE_MOE,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -257,6 +259,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_T5, "t5" },
     { LLM_ARCH_T5ENCODER, "t5encoder" },
     { LLM_ARCH_JAIS, "jais" },
+    { LLM_ARCH_GRANITE, "granite" },
+    { LLM_ARCH_GRANITE_MOE, "granitemoe" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
@@ -293,6 +297,12 @@ enum llm_kv {
     LLM_KV_DECODER_START_TOKEN_ID,
     LLM_KV_ATTN_LOGIT_SOFTCAPPING,
     LLM_KV_FINAL_LOGIT_SOFTCAPPING,
+    LLM_KV_SWIN_NORM,
+    LLM_KV_RESCALE_EVERY_N_LAYERS,
+    LLM_KV_TIME_MIX_EXTRA_DIM,
+    LLM_KV_TIME_DECAY_EXTRA_DIM,
+    LLM_KV_RESIDUAL_SCALE,
+    LLM_KV_EMBEDDING_SCALE,
 
     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -307,6 +317,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_KV_LORA_RANK,
     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
     LLM_KV_ATTENTION_SLIDING_WINDOW,
+    LLM_KV_ATTENTION_SCALE,
 
     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_FREQ_BASE,
@@ -391,6 +402,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
     { LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" },
     { LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" },
+    { LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
+    { LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },
 
     { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -405,6 +418,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
     { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
     { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
+    { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
 
     { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
     { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
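Both new keys are formatted with the architecture name as the %s prefix, so a converted Granite GGUF is expected to expose granite.residual_scale, granite.embedding_scale and granite.attention.scale (plus the pre-existing granite.logit_scale), and granitemoe.* for the MoE variant. A rough sketch of peeking at these values with the GGUF C API shipped with ggml (hypothetical file name; missing keys are simply reported):

    // Sketch: dump the Granite scale factors stored in a GGUF file's metadata.
    // Assumes the gguf_* helpers declared in ggml.h in this tree.
    #include <cstdio>
    #include "ggml.h"

    int main() {
        struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ NULL };
        struct gguf_context * g = gguf_init_from_file("granite-3b.gguf", params);  // placeholder path
        if (!g) { std::fprintf(stderr, "failed to open gguf file\n"); return 1; }

        const char * keys[] = {
            "granite.residual_scale",
            "granite.embedding_scale",
            "granite.attention.scale",
            "granite.logit_scale",
        };
        for (const char * key : keys) {
            const int id = gguf_find_key(g, key);
            if (id < 0) { std::printf("%-26s : (not present)\n", key); continue; }
            std::printf("%-26s : %f\n", key, gguf_get_val_f32(g, id));
        }
        gguf_free(g);
        return 0;
    }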
@@ -1298,6 +1312,42 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
         },
     },
+    {
+        LLM_ARCH_GRANITE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_GRANITE_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        },
+    },
+
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -2203,6 +2253,11 @@ struct llama_hparams {
     float f_max_alibi_bias = 0.0f;
     float f_logit_scale = 0.0f;
 
+    // Additional scale factors (Granite/Granite MoE)
+    float f_residual_scale = 0.0f;
+    float f_embedding_scale = 0.0f;
+    float f_attention_scale = 0.0f;
+
     bool causal_attn = true;
     bool use_alibi = false;
     bool attn_soft_cap = false;
@@ -2259,6 +2314,9 @@ struct llama_hparams {
         if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
         if (!is_float_close(this->expert_weights_scale, other.expert_weights_scale, EPSILON)) return true;
         if (!is_float_close(this->rope_yarn_log_mul, other.rope_yarn_log_mul, EPSILON)) return true;
+        if (!is_float_close(this->f_residual_scale, other.f_residual_scale, EPSILON)) return true;
+        if (!is_float_close(this->f_embedding_scale, other.f_embedding_scale, EPSILON)) return true;
+        if (!is_float_close(this->f_attention_scale, other.f_attention_scale, EPSILON)) return true;
 
         return false;
     }
@@ -5283,6 +5341,22 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
+                ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
+                ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
+                ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
+
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_3B; break;
+                    case 40: model.type = e_model::MODEL_3B; break;
+                    // Add additional layer/vocab/etc checks here for other model sizes
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     }
 
@@ -5970,6 +6044,13 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
         LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
         LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
     }
+
+    if (model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
+        LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
+        LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
+        LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
+    }
+
 }
 
 // Returns false if cancelled by progress_callback
@@ -6138,6 +6219,8 @@ static bool llm_load_tensors(
         case LLM_ARCH_LLAMA:
         case LLM_ARCH_REFACT:
         case LLM_ARCH_MINICPM:
+        case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
             {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
 
@@ -7927,6 +8010,11 @@ static struct ggml_tensor * llm_build_inp_embd(
         ggml_set_input(lctx.inp_embd);
     }
 
+    // For Granite architecture
+    if (hparams.f_embedding_scale != 0.0f) {
+        inpL = ggml_scale(ctx, inpL, hparams.f_embedding_scale);
+    }
+
     cb(inpL, "inp_embd", -1);
 
     return inpL;
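ggml_scale() above only records a multiply-by-constant node in the compute graph, so the embedding scale costs nothing when the guard skips it for non-Granite models and one elementwise multiply when it does not. A self-contained sketch of the same pattern outside llama.cpp (CPU only; the scale value is illustrative, not taken from a real Granite config):

    // Sketch: build a one-node ggml graph that scales a tensor, mirroring the
    // f_embedding_scale hook above. Nothing runs until the graph is computed.
    #include <cstdio>
    #include "ggml.h"

    int main() {
        struct ggml_init_params params = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * inpL = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
        ggml_set_f32(inpL, 1.0f);                 // pretend token embeddings

        const float f_embedding_scale = 12.0f;    // illustrative value
        struct ggml_tensor * scaled = ggml_scale(ctx, inpL, f_embedding_scale);

        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, scaled);
        ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 1);

        for (int i = 0; i < 4; ++i) {
            std::printf("%g ", ggml_get_f32_1d(scaled, i));  // prints: 12 12 12 12
        }
        std::printf("\n");
        ggml_free(ctx);
        return 0;
    }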
@@ -8358,12 +8446,15 @@ static struct ggml_tensor * llm_build_kqv(
         if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
             ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
         }
+        //ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
 
         cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens);
     } else {
         struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
         cb(kq, "kq", il);
 
+        //ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
+
         if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_QWEN2) {
             // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
             // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
@@ -8917,6 +9008,8 @@ struct llm_build_context {
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
+        //const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : 1.f;
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
 
@@ -8933,6 +9026,9 @@
 
             // compute Q and K and RoPE them
             struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+            if (hparams.f_attention_scale != 0) {
+                Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
+            }
             cb(Qcur, "Qcur", il);
             if (model.layers[il].bq) {
                 Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
@@ -8969,7 +9065,7 @@
 
             cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                     model.layers[il].wo, model.layers[il].bo,
-                    Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                    Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
         }
 
         if (il == n_layer - 1) {
@@ -8980,6 +9076,11 @@
             inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
         }
 
+        // For Granite architecture
+        if (hparams.f_residual_scale) {
+            cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+        }
+
         struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
         cb(ffn_inp, "ffn_inp", il);
 
@@ -9016,6 +9117,11 @@
             cb(cur, "ffn_moe_out", il);
         }
 
+        // For Granite architecture
+        if (hparams.f_residual_scale) {
+            cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+        }
+
         cur = ggml_add(ctx0, cur, ffn_inp);
         cb(cur, "ffn_out", il);
 
@@ -9035,6 +9141,12 @@
 
         // lm_head
         cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+        // For Granite architecture
+        if (hparams.f_logit_scale) {
+            cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
+        }
+
         cb(cur, "result_output", -1);
 
         ggml_build_forward_expand(gf, cur);
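Taken together, the hooks above change the stock build_llama() graph only through four scale factors (a summary of this patch, not new behavior; with all of them left at 0 each hook is a no-op and the plain LLaMA graph is built):

    h_0     = f_embedding_scale * tok_embd[token]
    attn_l  = Attention(norm(h_l))  with softmax taken over f_attention_scale * (K*Q)
                                    (Q pre-scaled, kq_scale = 1) instead of (K*Q) / sqrt(n_embd_head)
    h'_l    = h_l + f_residual_scale * attn_l
    h_{l+1} = h'_l + f_residual_scale * FFN(norm(h'_l))
    logits  = output(norm(h_L)) / f_logit_scale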
@@ -14032,6 +14144,8 @@ static struct ggml_cgraph * llama_build_graph(
 
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
+        case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
             {
                 result = llm.build_llama();
             } break;
@@ -17470,6 +17584,8 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_ARCTIC:
         case LLM_ARCH_DEEPSEEK2:
         case LLM_ARCH_CHATGLM:
+        case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
             return LLAMA_ROPE_TYPE_NORM;
 
         // the pairs of head values are offset by n_rot/2
