@@ -212,6 +212,8 @@ enum llm_arch {
     LLM_ARCH_T5,
     LLM_ARCH_T5ENCODER,
     LLM_ARCH_JAIS,
+    LLM_ARCH_GRANITE = 46,
+    LLM_ARCH_GRANITE_MOE,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -257,6 +259,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_T5,          "t5"         },
     { LLM_ARCH_T5ENCODER,   "t5encoder"  },
     { LLM_ARCH_JAIS,        "jais"       },
+    { LLM_ARCH_GRANITE,     "granite"    },
+    { LLM_ARCH_GRANITE_MOE, "granitemoe" },
     { LLM_ARCH_UNKNOWN,     "(unknown)"  },
 };
 
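The two hunks above register the new architectures; the rest of the patch is the standard wiring for a new llama.cpp model family: per-architecture tensor-name tables, new hyperparameter keys and their loading in llm_load_hparams, tensor creation in llm_load_tensors, graph construction (Granite reuses build_llama with a few scale hooks), and the RoPE type mapping.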
@@ -293,6 +297,12 @@ enum llm_kv {
     LLM_KV_DECODER_START_TOKEN_ID,
     LLM_KV_ATTN_LOGIT_SOFTCAPPING,
     LLM_KV_FINAL_LOGIT_SOFTCAPPING,
+    LLM_KV_SWIN_NORM,
+    LLM_KV_RESCALE_EVERY_N_LAYERS,
+    LLM_KV_TIME_MIX_EXTRA_DIM,
+    LLM_KV_TIME_DECAY_EXTRA_DIM,
+    LLM_KV_RESIDUAL_SCALE,
+    LLM_KV_EMBEDDING_SCALE,
 
     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -307,6 +317,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_KV_LORA_RANK,
     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
     LLM_KV_ATTENTION_SLIDING_WINDOW,
+    LLM_KV_ATTENTION_SCALE,
 
     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_FREQ_BASE,
@@ -391,6 +402,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_DECODER_START_TOKEN_ID,  "%s.decoder_start_token_id"  },
     { LLM_KV_ATTN_LOGIT_SOFTCAPPING,  "%s.attn_logit_softcapping"  },
     { LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" },
+    { LLM_KV_RESIDUAL_SCALE,          "%s.residual_scale"          },
+    { LLM_KV_EMBEDDING_SCALE,         "%s.embedding_scale"         },
 
     { LLM_KV_ATTENTION_HEAD_COUNT,    "%s.attention.head_count"    },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -405,6 +418,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_KV_LORA_RANK,           "%s.attention.kv_lora_rank"           },
     { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
     { LLM_KV_ATTENTION_SLIDING_WINDOW,         "%s.attention.sliding_window"         },
+    { LLM_KV_ATTENTION_SCALE,                  "%s.attention.scale"                  },
 
     { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
     { LLM_KV_ROPE_FREQ_BASE,       "%s.rope.freq_base"       },
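The "%s" in these key formats is filled with the architecture's name string, so for LLM_ARCH_GRANITE the new metadata keys resolve to granite.residual_scale, granite.embedding_scale and granite.attention.scale (granitemoe.* for the MoE variant). A standalone sketch of that expansion, using only the strings from the tables in this patch (illustrative, not part of the change):

// sketch: expand the new LLM_KV_NAMES entries for the "granite" architecture
#include <cstdio>

int main() {
    const char * arch = "granite"; // LLM_ARCH_NAMES[LLM_ARCH_GRANITE]; "granitemoe" for the MoE variant
    const char * fmts[] = {
        "%s.residual_scale",  // LLM_KV_RESIDUAL_SCALE
        "%s.embedding_scale", // LLM_KV_EMBEDDING_SCALE
        "%s.attention.scale", // LLM_KV_ATTENTION_SCALE
    };
    for (const char * fmt : fmts) {
        char key[64];
        std::snprintf(key, sizeof(key), fmt, arch);
        std::printf("%s\n", key); // e.g. granite.residual_scale
    }
    return 0;
}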
@@ -1298,6 +1312,42 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_FFN_DOWN,      "blk.%d.ffn_down" },
         },
     },
+    {
+        LLM_ARCH_GRANITE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,    "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,   "output_norm" },
+            { LLM_TENSOR_OUTPUT,        "output" },
+            { LLM_TENSOR_ATTN_NORM,     "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,        "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,        "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,        "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,      "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,      "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,      "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,      "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,        "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_GRANITE_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,    "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,   "output_norm" },
+            { LLM_TENSOR_OUTPUT,        "output" },
+            { LLM_TENSOR_ATTN_NORM,     "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,        "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,        "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,        "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,      "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,      "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP,  "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,   "blk.%d.ffn_up_exps" },
+        },
+    },
+
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -2203,6 +2253,11 @@ struct llama_hparams {
     float f_max_alibi_bias = 0.0f;
     float f_logit_scale    = 0.0f;
 
+    // Additional scale factors (Granite/Granite MoE)
+    float f_residual_scale  = 0.0f;
+    float f_embedding_scale = 0.0f;
+    float f_attention_scale = 0.0f;
+
     bool causal_attn   = true;
     bool use_alibi     = false;
     bool attn_soft_cap = false;
@@ -2259,6 +2314,9 @@ struct llama_hparams {
         if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
         if (!is_float_close(this->expert_weights_scale,  other.expert_weights_scale,  EPSILON)) return true;
         if (!is_float_close(this->rope_yarn_log_mul,     other.rope_yarn_log_mul,     EPSILON)) return true;
+        if (!is_float_close(this->f_residual_scale,      other.f_residual_scale,      EPSILON)) return true;
+        if (!is_float_close(this->f_embedding_scale,     other.f_embedding_scale,     EPSILON)) return true;
+        if (!is_float_close(this->f_attention_scale,     other.f_attention_scale,     EPSILON)) return true;
 
         return false;
     }
@@ -5283,6 +5341,22 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_LOGIT_SCALE,     hparams.f_logit_scale);
+                ml.get_key(LLM_KV_RESIDUAL_SCALE,  hparams.f_residual_scale);
+                ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
+                ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
+
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_3B; break;
+                    case 40: model.type = e_model::MODEL_3B; break;
+                    // Add additional layer/vocab/etc checks here for other model sizes
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     }
 
@@ -5970,6 +6044,13 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
         LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
         LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
     }
+
+    if (model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
+        LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
+        LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
+        LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
+    }
+
 }
 
 // Returns false if cancelled by progress_callback
@@ -6138,6 +6219,8 @@ static bool llm_load_tensors(
         case LLM_ARCH_LLAMA:
         case LLM_ARCH_REFACT:
        case LLM_ARCH_MINICPM:
+        case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
            {
                model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
 
@@ -7927,6 +8010,11 @@ static struct ggml_tensor * llm_build_inp_embd(
         ggml_set_input(lctx.inp_embd);
     }
 
+    // For Granite architecture
+    if (hparams.f_embedding_scale != 0.0f) {
+        inpL = ggml_scale(ctx, inpL, hparams.f_embedding_scale);
+    }
+
     cb(inpL, "inp_embd", -1);
 
     return inpL;
@@ -8358,12 +8446,15 @@ static struct ggml_tensor * llm_build_kqv(
         if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
             ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
         }
+        //ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
 
         cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens);
     } else {
         struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
         cb(kq, "kq", il);
 
+        //ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
+
         if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_QWEN2) {
             // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
             // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
@@ -8917,6 +9008,8 @@ struct llm_build_context {
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
+        //const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : 1.f;
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
 
@@ -8933,6 +9026,9 @@ struct llm_build_context {
 
                 // compute Q and K and RoPE them
                 struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                if (hparams.f_attention_scale != 0) {
+                    Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
+                }
                 cb(Qcur, "Qcur", il);
                 if (model.layers[il].bq) {
                     Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
@@ -8969,7 +9065,7 @@
 
                 cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
             }
 
             if (il == n_layer - 1) {
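Design note: Qcur is pre-multiplied by f_attention_scale a few hunks above, so passing kq_scale = 1.0f here yields the same scaled K^T·Q product as passing f_attention_scale to llm_build_kv would (RoPE is a rotation, so the scalar carries through; the scale is applied before the optional Q bias, so the two forms are only strictly equivalent for bias-free attention). When f_attention_scale is zero, kq_scale falls back to the usual 1.0f/sqrtf(float(n_embd_head)), leaving other architectures that share build_llama unaffected.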
@@ -8980,6 +9076,11 @@
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
 
+            // For Granite architecture
+            if (hparams.f_residual_scale) {
+                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+            }
+
             struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);
 
@@ -9016,6 +9117,11 @@
                 cb(cur, "ffn_moe_out", il);
             }
 
+            // For Granite architecture
+            if (hparams.f_residual_scale) {
+                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+            }
+
             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);
 
@@ -9035,6 +9141,12 @@
 
         // lm_head
         cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+        // For Granite architecture
+        if (hparams.f_logit_scale) {
+            cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
+        }
+
         cb(cur, "result_output", -1);
 
         ggml_build_forward_expand(gf, cur);
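Taken together, the build_llama() changes hook the four Granite scale factors into fixed points of the forward pass, each guarded so that the 0.0f defaults in llama_hparams leave other architectures untouched. A rough standalone sketch of that data flow, with made-up scale values (illustrative only, not part of the patch):

// sketch: where the Granite scale factors act along the forward pass
#include <cmath>
#include <cstdio>

int main() {
    // hypothetical values, as they might be read from GGUF metadata
    const float f_embedding_scale = 12.0f;      // multiplies token embeddings
    const float f_attention_scale = 0.015625f;  // replaces 1/sqrt(n_embd_head) for K^T*Q
    const float f_residual_scale  = 0.22f;      // scales each sub-layer output before the residual add
    const float f_logit_scale     = 8.0f;       // divides the final logits
    const int   n_embd_head       = 128;

    float x = 1.0f;                                          // stand-in for one activation value
    if (f_embedding_scale != 0.0f) x *= f_embedding_scale;   // llm_build_inp_embd

    // attention: Q is pre-scaled, so llm_build_kv receives kq_scale = 1.0f
    const float kq_scale = f_attention_scale == 0.0f ? 1.0f/std::sqrt((float) n_embd_head) : 1.0f;
    std::printf("kq_scale    = %f\n", kq_scale);

    float attn_out = x;                                      // stand-in for the attention output
    if (f_residual_scale != 0.0f) attn_out *= f_residual_scale;
    x += attn_out;                                           // residual add (same pattern for the FFN branch)

    float logit = x;                                         // after lm_head
    if (f_logit_scale != 0.0f) logit /= f_logit_scale;
    std::printf("logit (toy) = %f\n", logit);
    return 0;
}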
@@ -14032,6 +14144,8 @@ static struct ggml_cgraph * llama_build_graph(
 
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
+        case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
             {
                 result = llm.build_llama();
             } break;
@@ -17470,6 +17584,8 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_ARCTIC:
         case LLM_ARCH_DEEPSEEK2:
         case LLM_ARCH_CHATGLM:
+        case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
             return LLAMA_ROPE_TYPE_NORM;
 
         // the pairs of head values are offset by n_rot/2