@@ -3267,10 +3267,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     {
         output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);

-        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
         // if output is NULL, init from the input tok embed, duplicated to allow offloading
         if (output == NULL) {
-            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
         }
     }
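Note on the hunk above: the output projection is looked up with TENSOR_NOT_REQUIRED, so checkpoints that tie the lm_head to the input embedding can simply omit it; the fallback then loads the token-embedding matrix a second time under the output role (TENSOR_DUPLICATED) so that copy can be placed or offloaded independently. A minimal standalone sketch of that fallback pattern, using a hypothetical find_tensor() stand-in rather than the real loader API:

#include <cstdio>

// Hypothetical stand-ins (not the llama.cpp loader API), just to show the pattern:
struct tensor { const char * name; };

static tensor tok_embd = { "token_embd.weight" };

// optional lookup, cf. TENSOR_NOT_REQUIRED: returns nullptr when the tensor is absent
static tensor * find_tensor(const char * name) {
    (void) name;
    return nullptr;  // pretend the checkpoint ships no dedicated output.weight
}

int main() {
    tensor * output = find_tensor("output.weight");
    if (output == nullptr) {
        // tied weights: reuse the token embedding as the lm_head; in llama.cpp the
        // TENSOR_DUPLICATED flag registers it a second time so this copy can be
        // offloaded independently of the embedding copy
        output = &tok_embd;
    }
    std::printf("lm_head backed by: %s\n", output->name);
    return 0;
}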
@@ -3313,10 +3313,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     {
         output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);

-        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
         // if output is NULL, init from the input tok embed, duplicated to allow offloading
         if (output == NULL) {
-            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
         }
     }
@@ -3352,56 +3352,29 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

             // out_proj
             layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
-
-            layer.wq = nullptr;
-            layer.wk = nullptr;
-            layer.wv = nullptr;
-            layer.wo = nullptr;
-
         } else {
             // Attention layers

             layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
             layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
             layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
             layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
-            layer.ssm_in = nullptr;
-            layer.ssm_conv1d = nullptr;
-            layer.ssm_conv1d_b = nullptr;
-            layer.ssm_x = nullptr;
-            layer.ssm_dt_norm = nullptr;
-            layer.ssm_dt = nullptr;
-            layer.ssm_dt_b = nullptr;
-            layer.ssm_b_norm = nullptr;
-            layer.ssm_c_norm = nullptr;
-            layer.ssm_a = nullptr;
-            layer.ssm_d = nullptr;
-            layer.ssm_out = nullptr;
         }

         layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

-        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
+        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);

         if (layer.ffn_gate_inp) {
             // MoE
             layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
             layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
             layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
-
-            layer.ffn_gate = nullptr;
-            layer.ffn_down = nullptr;
-            layer.ffn_up = nullptr;
         } else {
             // FFN (no MoE)
             layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
             layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
             layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-
-            layer.ffn_gate_exps = nullptr;
-            layer.ffn_down_exps = nullptr;
-            layer.ffn_up_exps = nullptr;
         }
     }
 } break;
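The assignments dropped in this hunk (the SSM pointers in attention layers, the attention pointers in SSM layers, and the dense-FFN vs MoE expert pointers) presumably rely on the layer struct default-initializing its tensor members to nullptr, so only the branch that is taken needs to assign anything. A sketch of that assumption:

// Why the explicit resets are redundant, assuming the layer struct default-initializes
// its tensor pointers (llama.cpp's llama_layer appears to use in-class "= nullptr"
// initializers):
struct ggml_tensor;  // opaque, from ggml

struct layer_sketch {
    // attention path
    ggml_tensor * wq = nullptr;
    ggml_tensor * wo = nullptr;
    // SSM path
    ggml_tensor * ssm_in  = nullptr;
    ggml_tensor * ssm_out = nullptr;
    // dense FFN vs MoE experts
    ggml_tensor * ffn_gate      = nullptr;
    ggml_tensor * ffn_gate_exps = nullptr;
};
// A default-constructed layer_sketch starts with every pointer null, so the branch
// that is not taken can simply leave its members untouched.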
@@ -10228,7 +10201,7 @@ struct llm_graph_context_mamba : public virtual llm_graph_context {
         // TODO: skip computing output earlier for unused tokens

         y = ggml_add(ctx0, y, ggml_mul(ctx0, cur, layer.ssm_d));
-        y = ggml_mul(ctx0, y, ggml_silu(ctx0, ggml_cont(ctx0, z)));
+        y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);

         // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
         cur = build_lora_mm(layer.ssm_out, y);
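The replacement above fuses the SSM output gating: judging by it being a drop-in substitute for ggml_mul(y, ggml_silu(z)), ggml_swiglu_split(ctx, a, b) computes silu(a) * b element-wise, which avoids materializing the intermediate SiLU tensor in the graph. A self-contained numeric check of the intended equivalence (plain C++, no ggml):

#include <cmath>
#include <cstdio>

// silu(x) = x * sigmoid(x)
static float silu(float x) { return x / (1.0f + std::exp(-x)); }

// reference model of the fused op as implied by the substitution:
// swiglu_split(gate, value) = silu(gate) * value
static float swiglu_split(float gate, float value) { return silu(gate) * value; }

int main() {
    const float z[3] = {-1.0f, 0.5f, 2.0f};   // gate branch (z)
    const float y[3] = { 0.3f, 1.0f, -2.0f};  // value branch (y)
    for (int i = 0; i < 3; ++i) {
        const float unfused = y[i] * silu(z[i]);         // old: ggml_mul(y, ggml_silu(z))
        const float fused   = swiglu_split(z[i], y[i]);  // new: ggml_swiglu_split(z, y)
        std::printf("%+.6f %+.6f\n", unfused, fused);    // identical results
    }
    return 0;
}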
@@ -10352,7 +10325,7 @@ struct llm_graph_context_mamba : public virtual llm_graph_context {
         // TODO: skip computing output earlier for unused tokens

         y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
-        y = ggml_mul(ctx0, y, ggml_silu(ctx0, ggml_cont(ctx0, z)));
+        y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);

         // grouped RMS norm
         y = ggml_reshape_4d(ctx0, y, d_inner / n_group, n_group, n_seq_tokens, n_seqs);
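The reshape at the end of the last hunk sets up the grouped RMS norm: splitting d_inner into n_group groups means the RMS statistic is computed over d_inner / n_group elements per group rather than over the whole inner dimension. A standalone sketch of that computation (illustrative values, not taken from the model):

#include <cmath>
#include <cstdio>

// Grouped RMS norm: normalize each group of size d_inner / n_group independently,
// instead of computing one RMS over the full d_inner vector.
int main() {
    const int d_inner = 8, n_group = 2, group = d_inner / n_group;
    const float eps = 1e-6f;
    float y[d_inner] = {1, 2, 3, 4, 10, 20, 30, 40};

    for (int g = 0; g < n_group; ++g) {
        float ss = 0.0f;
        for (int j = 0; j < group; ++j) ss += y[g * group + j] * y[g * group + j];
        const float scale = 1.0f / std::sqrt(ss / group + eps);  // per-group RMS
        for (int j = 0; j < group; ++j) y[g * group + j] *= scale;
    }

    for (int j = 0; j < d_inner; ++j) std::printf("%.4f ", y[j]);
    std::printf("\n");
    return 0;
}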