@@ -1561,6 +1561,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_SMOLLM3:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                hparams.n_no_rope_layer_step = 4;
+
+                switch (hparams.n_layer) {
+                    case 36: type = LLM_TYPE_3B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         default: throw std::runtime_error("unsupported model architecture");
     }
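[Note, not part of the patch] The hparams.n_no_rope_layer_step = 4 set above is consumed by the use_rope check in the graph builder further down: every 4th layer skips RoPE, which is how SmolLM3's NoPE layers are expressed. A minimal standalone sketch of that layer selection, assuming the 36-layer 3B configuration handled above:

#include <cstdio>

int main() {
    const int n_layer = 36;               // 3B configuration from the switch above
    const int n_no_rope_layer_step = 4;   // value hard-coded in the hunk above

    for (int il = 0; il < n_layer; ++il) {
        const bool use_rope = (il + 1) % n_no_rope_layer_step != 0;
        if (!use_rope) {
            std::printf("layer %d skips RoPE (NoPE layer)\n", il); // il = 3, 7, 11, ..., 35
        }
    }
    return 0;
}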
@@ -4524,6 +4534,35 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
                 }
             } break;
+        case LLM_ARCH_SMOLLM3:
+            {
+                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                // output
+                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+                // if output is NULL, init from the input tok embed
+                if (output == NULL) {
+                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                }
+
+                for (int i = 0; i < n_layer; ++i) {
+                    auto & layer = layers[i];
+
+                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                    layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                    layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                    layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+                    layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+                    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                }
+            } break;
         default:
             throw std::runtime_error("unknown architecture");
     }
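[Note, not part of the patch] In the tensor map above, wq and wo use the full attention width n_embd_head_k * n_head, while wk and wv use the grouped-query widths n_embd_k_gqa and n_embd_v_gqa (head size times n_head_kv); the TENSOR_NOT_REQUIRED / TENSOR_DUPLICATED pair lets a model without a separate output head fall back to the tied token embedding. A hedged sketch of how those widths relate, using hypothetical values (head size 128, 16 query heads, 4 KV heads) purely for illustration:

#include <cstdint>
#include <cstdio>

int main() {
    // Hypothetical dimensions, not taken from the patch or any GGUF metadata.
    const int64_t n_embd_head_k = 128; // per-head size
    const int64_t n_head        = 16;  // query heads
    const int64_t n_head_kv     = 4;   // key/value heads (GQA groups)

    const int64_t q_width  = n_embd_head_k * n_head;    // wq output / wo input: 2048
    const int64_t kv_width = n_embd_head_k * n_head_kv; // wk and wv output:      512

    std::printf("q/o width = %lld, k/v width = %lld\n",
                (long long) q_width, (long long) kv_width);
    return 0;
}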
@@ -14846,6 +14885,142 @@ struct llm_build_hunyuan_moe : public llm_graph_context {
         cb(cur, "result_norm", -1);
         res->t_embd = cur;
 
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
+struct llm_build_smollm3 : public llm_graph_context {
+    llm_build_smollm3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_kv_unified();
+
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            const bool use_rope = (il + 1) % hparams.n_no_rope_layer_step != 0;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                if (use_rope) {
+                    Qcur = ggml_rope_ext(
+                            ctx0, Qcur, inp_pos, nullptr,
+                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow
+                            );
+
+                    Kcur = ggml_rope_ext(
+                            ctx0, Kcur, inp_pos, nullptr,
+                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow
+                            );
+                }
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                cb(cur, "attn_out", il);
+            }
+
+            if (il == n_layer - 1 && inp_out_ids) {
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            {
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = build_ffn(cur,
+                        model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+                        model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
         // lm_head
         cur = build_lora_mm(model.output, cur);
         cb(cur, "result_output", -1);
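[Note, not part of the patch] In the builder above, kq_scale falls back to the usual scaled-dot-product factor 1/sqrt(head size) whenever hparams.f_attention_scale is left at 0.0f. A small sketch of that fallback, assuming a head size of 128 as an illustrative value:

#include <cmath>
#include <cstdio>

int main() {
    const float f_attention_scale = 0.0f; // not set in the model metadata
    const int   n_embd_head       = 128;  // assumed head size, illustration only

    const float kq_scale = f_attention_scale == 0.0f
        ? 1.0f / sqrtf((float) n_embd_head) // ~0.0884
        : f_attention_scale;

    std::printf("kq_scale = %f\n", kq_scale);
    return 0;
}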
@@ -15240,6 +15415,10 @@ llm_graph_result_ptr llama_model::build_graph(
             {
                 llm = std::make_unique<llm_build_hunyuan_moe>(*this, params, gf);
             } break;
+        case LLM_ARCH_SMOLLM3:
+            {
+                llm = std::make_unique<llm_build_smollm3>(*this, params, gf);
+            } break;
         default:
             GGML_ABORT("fatal error");
     }
@@ -15391,6 +15570,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_CHAMELEON:
         case LLM_ARCH_BAILINGMOE:
         case LLM_ARCH_NEO_BERT:
+        case LLM_ARCH_SMOLLM3:
         case LLM_ARCH_ARCEE:
         case LLM_ARCH_ERNIE4_5:
             return LLAMA_ROPE_TYPE_NORM;