@@ -212,8 +212,6 @@ enum llm_arch {
LLM_ARCH_T5,
LLM_ARCH_T5ENCODER,
LLM_ARCH_JAIS,
- LLM_ARCH_GRANITE = 46,
- LLM_ARCH_GRANITE_MOE,
LLM_ARCH_UNKNOWN,
};

@@ -259,8 +257,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_T5, "t5" },
{ LLM_ARCH_T5ENCODER, "t5encoder" },
{ LLM_ARCH_JAIS, "jais" },
- { LLM_ARCH_GRANITE, "granite" },
- { LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};

@@ -297,12 +293,6 @@ enum llm_kv {
LLM_KV_DECODER_START_TOKEN_ID,
LLM_KV_ATTN_LOGIT_SOFTCAPPING,
LLM_KV_FINAL_LOGIT_SOFTCAPPING,
- LLM_KV_SWIN_NORM,
- LLM_KV_RESCALE_EVERY_N_LAYERS,
- LLM_KV_TIME_MIX_EXTRA_DIM,
- LLM_KV_TIME_DECAY_EXTRA_DIM,
- LLM_KV_RESIDUAL_SCALE,
- LLM_KV_EMBEDDING_SCALE,

LLM_KV_ATTENTION_HEAD_COUNT,
LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -317,7 +307,6 @@ enum llm_kv {
LLM_KV_ATTENTION_KV_LORA_RANK,
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW,
- LLM_KV_ATTENTION_SCALE,

LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_FREQ_BASE,
@@ -402,8 +391,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
{ LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" },
{ LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" },
- { LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
- { LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },

{ LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -418,7 +405,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
{ LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
- { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },

{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
@@ -1312,42 +1298,6 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
},
},
- {
- LLM_ARCH_GRANITE,
- {
- { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
- { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
- { LLM_TENSOR_OUTPUT, "output" },
- { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
- { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
- { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
- { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
- { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
- { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
- { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
- { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
- { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
- },
- },
- {
- LLM_ARCH_GRANITE_MOE,
- {
- { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
- { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
- { LLM_TENSOR_OUTPUT, "output" },
- { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
- { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
- { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
- { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
- { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
- { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
- { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
- { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
- { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
- { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
- },
- },
-
{
LLM_ARCH_UNKNOWN,
{
@@ -2253,11 +2203,6 @@ struct llama_hparams {
float f_max_alibi_bias = 0.0f;
float f_logit_scale = 0.0f;

- // Additional scale factors (Granite/Granite MoE)
- float f_residual_scale = 0.0f;
- float f_embedding_scale = 0.0f;
- float f_attention_scale = 0.0f;
-
bool causal_attn = true;
bool use_alibi = false;
bool attn_soft_cap = false;
@@ -2314,9 +2259,6 @@ struct llama_hparams {
if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
if (!is_float_close(this->expert_weights_scale, other.expert_weights_scale, EPSILON)) return true;
if (!is_float_close(this->rope_yarn_log_mul, other.rope_yarn_log_mul, EPSILON)) return true;
- if (!is_float_close(this->f_residual_scale, other.f_residual_scale, EPSILON)) return true;
- if (!is_float_close(this->f_embedding_scale, other.f_embedding_scale, EPSILON)) return true;
- if (!is_float_close(this->f_attention_scale, other.f_attention_scale, EPSILON)) return true;

return false;
}
@@ -5341,22 +5283,6 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
- case LLM_ARCH_GRANITE:
- case LLM_ARCH_GRANITE_MOE:
- {
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
- ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
- ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
- ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
-
- switch (hparams.n_layer) {
- case 32: model.type = e_model::MODEL_3B; break;
- case 40: model.type = e_model::MODEL_3B; break;
- // Add additional layer/vocab/etc checks here for other model sizes
- default: model.type = e_model::MODEL_UNKNOWN;
- }
- } break;
default: (void)0;
}

@@ -6044,13 +5970,6 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
}
-
- if (model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
- LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
- LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
- LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
- }
-
}

// Returns false if cancelled by progress_callback
@@ -6219,8 +6138,6 @@ static bool llm_load_tensors(
case LLM_ARCH_LLAMA:
case LLM_ARCH_REFACT:
case LLM_ARCH_MINICPM:
- case LLM_ARCH_GRANITE:
- case LLM_ARCH_GRANITE_MOE:
{
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

@@ -8010,11 +7927,6 @@ static struct ggml_tensor * llm_build_inp_embd(
ggml_set_input(lctx.inp_embd);
}

- // For Granite architecture
- if (hparams.f_embedding_scale != 0.0f) {
- inpL = ggml_scale(ctx, inpL, hparams.f_embedding_scale);
- }
-
cb(inpL, "inp_embd", -1);

return inpL;
@@ -8446,15 +8358,12 @@ static struct ggml_tensor * llm_build_kqv(
if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
}
- //ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);

cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens);
} else {
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
cb(kq, "kq", il);

- //ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
-
if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_QWEN2) {
// for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
// ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
@@ -9008,8 +8917,6 @@ struct llm_build_context {
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

- //const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
- const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : 1.f;
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;

@@ -9026,9 +8933,6 @@ struct llm_build_context {

// compute Q and K and RoPE them
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
- if (hparams.f_attention_scale != 0) {
- Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
- }
cb(Qcur, "Qcur", il);
if (model.layers[il].bq) {
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
@@ -9065,7 +8969,7 @@ struct llm_build_context {

cur = llm_build_kv(ctx0, lctx, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
- Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
}

if (il == n_layer - 1) {
@@ -9076,11 +8980,6 @@ struct llm_build_context {
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}

- // For Granite architecture
- if (hparams.f_residual_scale) {
- cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
- }
-
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);

@@ -9117,11 +9016,6 @@ struct llm_build_context {
cb(cur, "ffn_moe_out", il);
}

- // For Granite architecture
- if (hparams.f_residual_scale) {
- cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
- }
-
cur = ggml_add(ctx0, cur, ffn_inp);
cb(cur, "ffn_out", il);

@@ -9141,12 +9035,6 @@ struct llm_build_context {

// lm_head
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
-
- // For Granite architecture
- if (hparams.f_logit_scale) {
- cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
- }
-
cb(cur, "result_output", -1);

ggml_build_forward_expand(gf, cur);
@@ -14144,8 +14032,6 @@ static struct ggml_cgraph * llama_build_graph(

switch (model.arch) {
case LLM_ARCH_LLAMA:
- case LLM_ARCH_GRANITE:
- case LLM_ARCH_GRANITE_MOE:
{
result = llm.build_llama();
} break;
@@ -17584,8 +17470,6 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_ARCTIC:
case LLM_ARCH_DEEPSEEK2:
case LLM_ARCH_CHATGLM:
- case LLM_ARCH_GRANITE:
- case LLM_ARCH_GRANITE_MOE:
return LLAMA_ROPE_TYPE_NORM;

// the pairs of head values are offset by n_rot/2