
Commit 93f5895

Revert "Add support for Granite and GraniteMoE models (#102)"
This reverts commit b61cf7d.
1 parent: b61cf7d

File tree: 1 file changed (+1, -117 lines)


src/llama.cpp

Lines changed: 1 addition & 117 deletions
@@ -212,8 +212,6 @@ enum llm_arch {
     LLM_ARCH_T5,
     LLM_ARCH_T5ENCODER,
     LLM_ARCH_JAIS,
-    LLM_ARCH_GRANITE = 46,
-    LLM_ARCH_GRANITE_MOE,
     LLM_ARCH_UNKNOWN,
 };

@@ -259,8 +257,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_T5, "t5" },
     { LLM_ARCH_T5ENCODER, "t5encoder" },
     { LLM_ARCH_JAIS, "jais" },
-    { LLM_ARCH_GRANITE, "granite" },
-    { LLM_ARCH_GRANITE_MOE, "granitemoe" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };

@@ -297,12 +293,6 @@ enum llm_kv {
     LLM_KV_DECODER_START_TOKEN_ID,
     LLM_KV_ATTN_LOGIT_SOFTCAPPING,
     LLM_KV_FINAL_LOGIT_SOFTCAPPING,
-    LLM_KV_SWIN_NORM,
-    LLM_KV_RESCALE_EVERY_N_LAYERS,
-    LLM_KV_TIME_MIX_EXTRA_DIM,
-    LLM_KV_TIME_DECAY_EXTRA_DIM,
-    LLM_KV_RESIDUAL_SCALE,
-    LLM_KV_EMBEDDING_SCALE,

     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -317,7 +307,6 @@ enum llm_kv {
     LLM_KV_ATTENTION_KV_LORA_RANK,
     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
     LLM_KV_ATTENTION_SLIDING_WINDOW,
-    LLM_KV_ATTENTION_SCALE,

     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_FREQ_BASE,
@@ -402,8 +391,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
     { LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" },
     { LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" },
-    { LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
-    { LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },

     { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -418,7 +405,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
     { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
     { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
-    { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },

     { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
     { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
@@ -1312,42 +1298,6 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
         },
     },
-    {
-        LLM_ARCH_GRANITE,
-        {
-            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
-            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
-            { LLM_TENSOR_OUTPUT, "output" },
-            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
-            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
-            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
-            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
-            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
-            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
-            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
-            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
-            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
-        },
-    },
-    {
-        LLM_ARCH_GRANITE_MOE,
-        {
-            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
-            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
-            { LLM_TENSOR_OUTPUT, "output" },
-            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
-            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
-            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
-            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
-            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
-            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
-            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
-            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
-            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
-            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
-        },
-    },
-
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -2253,11 +2203,6 @@ struct llama_hparams {
     float f_max_alibi_bias = 0.0f;
     float f_logit_scale = 0.0f;

-    // Additional scale factors (Granite/Granite MoE)
-    float f_residual_scale = 0.0f;
-    float f_embedding_scale = 0.0f;
-    float f_attention_scale = 0.0f;
-
     bool causal_attn = true;
     bool use_alibi = false;
     bool attn_soft_cap = false;
@@ -2314,9 +2259,6 @@ struct llama_hparams {
         if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
         if (!is_float_close(this->expert_weights_scale, other.expert_weights_scale, EPSILON)) return true;
         if (!is_float_close(this->rope_yarn_log_mul, other.rope_yarn_log_mul, EPSILON)) return true;
-        if (!is_float_close(this->f_residual_scale, other.f_residual_scale, EPSILON)) return true;
-        if (!is_float_close(this->f_embedding_scale, other.f_embedding_scale, EPSILON)) return true;
-        if (!is_float_close(this->f_attention_scale, other.f_attention_scale, EPSILON)) return true;

         return false;
     }
@@ -5341,22 +5283,6 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
-        case LLM_ARCH_GRANITE:
-        case LLM_ARCH_GRANITE_MOE:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
-                ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
-                ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
-                ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
-
-                switch (hparams.n_layer) {
-                    case 32: model.type = e_model::MODEL_3B; break;
-                    case 40: model.type = e_model::MODEL_3B; break;
-                    // Add additional layer/vocab/etc checks here for other model sizes
-                    default: model.type = e_model::MODEL_UNKNOWN;
-                }
-            } break;
         default: (void)0;
     }

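For context on the hparams hunk above: the reverted loader read the Granite scale factors from arch-prefixed GGUF keys whose printf-style templates appear in the LLM_KV_NAMES hunks earlier in this diff ("%s.residual_scale", "%s.embedding_scale", "%s.attention.scale"). A minimal standalone sketch of how such a template expands, assuming "%s" is filled with the architecture name string (e.g. "granite" from LLM_ARCH_NAMES); this is an illustration, not the upstream lookup code:

    #include <cstdio>

    int main() {
        // Substitute the architecture name into the key template, as the GGUF
        // key lookup ultimately does for "%s.attention.scale".
        char key[128];
        std::snprintf(key, sizeof(key), "%s.attention.scale", "granite");
        std::printf("%s\n", key); // prints: granite.attention.scale
        return 0;
    }
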
@@ -6044,13 +5970,6 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
         LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
         LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
     }
-
-    if (model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
-        LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
-        LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
-        LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
-    }
-
 }

 // Returns false if cancelled by progress_callback
@@ -6219,8 +6138,6 @@ static bool llm_load_tensors(
         case LLM_ARCH_LLAMA:
         case LLM_ARCH_REFACT:
         case LLM_ARCH_MINICPM:
-        case LLM_ARCH_GRANITE:
-        case LLM_ARCH_GRANITE_MOE:
            {
                model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

@@ -8010,11 +7927,6 @@ static struct ggml_tensor * llm_build_inp_embd(
         ggml_set_input(lctx.inp_embd);
     }

-    // For Granite architecture
-    if (hparams.f_embedding_scale != 0.0f) {
-        inpL = ggml_scale(ctx, inpL, hparams.f_embedding_scale);
-    }
-
     cb(inpL, "inp_embd", -1);

     return inpL;
@@ -8446,15 +8358,12 @@ static struct ggml_tensor * llm_build_kqv(
         if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
             ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
         }
-        //ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);

         cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens);
     } else {
         struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
         cb(kq, "kq", il);

-        //ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
-
         if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_QWEN2) {
             // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
             // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
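
The two commented-out set_prec lines deleted in the hunk above would have forced F32 precision unconditionally; what remains is the architecture-gated override whose rationale the kept comment states (F16 KQ products overflow to NaNs for some models). A rough sketch of that gated pattern, assuming it sits inside llm_build_kqv where model and the kq tensor are in scope; this is an approximation, not the exact surviving upstream code:

    // Force the KQ matmul to accumulate in F32 only for architectures known to
    // overflow in F16 (see the reference PR comment kept in the diff context).
    if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 ||
        model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_QWEN2) {
        ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
    }
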
@@ -9008,8 +8917,6 @@ struct llm_build_context {
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

-        //const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
-        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : 1.f;
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;

@@ -9026,9 +8933,6 @@

                 // compute Q and K and RoPE them
                 struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
-                if (hparams.f_attention_scale != 0) {
-                    Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
-                }
                 cb(Qcur, "Qcur", il);
                 if (model.layers[il].bq) {
                     Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
@@ -9065,7 +8969,7 @@

                 cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             }

             if (il == n_layer - 1) {
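
The single "+" line in this revert is the call-site change above: instead of the kq_scale variable removed two hunks earlier (which the Granite path could override via f_attention_scale), llm_build_kv is again handed the standard scaled-dot-product-attention factor. A tiny sketch of that constant, assuming n_embd_head is the per-head embedding size; the helper name is made up for illustration:

    #include <cmath>

    // Standard attention scaling restored by the revert: 1 / sqrt(head dimension).
    static inline float default_kq_scale(int n_embd_head) {
        return 1.0f / sqrtf((float) n_embd_head);
    }
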
@@ -9076,11 +8980,6 @@
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }

-            // For Granite architecture
-            if (hparams.f_residual_scale) {
-                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
-            }
-
             struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);

@@ -9117,11 +9016,6 @@
                 cb(cur, "ffn_moe_out", il);
             }

-            // For Granite architecture
-            if (hparams.f_residual_scale) {
-                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
-            }
-
             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);

@@ -9141,12 +9035,6 @@

         // lm_head
         cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
-
-        // For Granite architecture
-        if (hparams.f_logit_scale) {
-            cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
-        }
-
         cb(cur, "result_output", -1);

         ggml_build_forward_expand(gf, cur);
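
Taken together, the graph-building hunks remove four scaling points the Granite path had introduced: token embeddings multiplied by f_embedding_scale, the query tensor by f_attention_scale, each residual branch by f_residual_scale, and the final logits divided by f_logit_scale. A standalone sketch of that data flow on plain floats rather than ggml tensors; the struct and all numeric values below are hypothetical stand-ins for the removed llama_hparams fields, purely for illustration:

    #include <cstdio>

    struct granite_scales {                 // stand-in for the removed hparams fields
        float f_embedding_scale = 12.0f;    // hypothetical values, illustration only
        float f_attention_scale = 0.0078125f;
        float f_residual_scale  = 0.22f;
        float f_logit_scale     = 8.0f;
    };

    int main() {
        granite_scales s;
        float x = 1.0f;                         // pretend token embedding activation

        x *= s.f_embedding_scale;               // llm_build_inp_embd: scale embeddings
        float q = x * s.f_attention_scale;      // build graph: scale Q before attention
        float attn_out = q;                     // (attention itself elided)
        x = x + attn_out * s.f_residual_scale;  // branch output scaled before residual add
        float logit = x / s.f_logit_scale;      // lm_head output divided by logit scale

        std::printf("logit = %f\n", logit);
        return 0;
    }
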
@@ -14144,8 +14032,6 @@ static struct ggml_cgraph * llama_build_graph(

     switch (model.arch) {
         case LLM_ARCH_LLAMA:
-        case LLM_ARCH_GRANITE:
-        case LLM_ARCH_GRANITE_MOE:
             {
                 result = llm.build_llama();
             } break;
@@ -17584,8 +17470,6 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_ARCTIC:
         case LLM_ARCH_DEEPSEEK2:
         case LLM_ARCH_CHATGLM:
-        case LLM_ARCH_GRANITE:
-        case LLM_ARCH_GRANITE_MOE:
             return LLAMA_ROPE_TYPE_NORM;

         // the pairs of head values are offset by n_rot/2
