@@ -6214,30 +6214,12 @@ struct llm_build_modern_bert : public llm_graph_context {
             ggml_tensor * ffn_inp = cur;
             cb(ffn_inp, "ffn_inp", il);
 
-            // feed-forward network
-            ggml_tensor * ffn_up = build_lora_mm(model.layers[il].ffn_up, cur);
-            cb(ffn_up, "ffn_up", il);
-
-            int64_t split_point = ffn_up->ne[0] / 2;
-            ggml_tensor * output_ffn_up = ggml_cont(ctx0, ggml_view_2d(
-                            ctx0, ffn_up, split_point,
-                            ffn_up->ne[1], ffn_up->nb[1], 0
-                            ));
-            ggml_tensor * output_ffn_gate = ggml_cont(ctx0, ggml_view_2d(
-                            ctx0, ffn_up, split_point,
-                            ffn_up->ne[1], ffn_up->nb[1],
-                            split_point * ggml_element_size(ffn_up)
-                            ));
-
-            // Apply activation function
-            output_ffn_up = ggml_gelu(ctx0, output_ffn_up);
-
-            // Element-wise multiplication
-            ggml_tensor * gated = ggml_mul(ctx0, output_ffn_up, output_ffn_gate);
-            cb(gated, "ffn_gated", il);
-
-            // Final projection
-            cur = build_lora_mm(model.layers[il].ffn_down, gated);
+            cur = build_ffn(cur,
+                    model.layers[il].ffn_up,
+                    NULL, NULL, NULL, NULL, NULL,
+                    model.layers[il].ffn_down,
+                    NULL, NULL, NULL,
+                    LLM_FFN_GEGLU, LLM_FFN_SEQ, il);
 
             // attentions bypass the intermediate layer
             cur = ggml_add(ctx0, cur, ffn_inp);
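
Both sides of this hunk implement the same GEGLU feed-forward block: ffn_up projects to twice the intermediate width, the result is split in half, GELU is applied to the first half, the two halves are multiplied element-wise, and ffn_down projects the gated result back down. The replacement delegates that split-and-gate to build_ffn via LLM_FFN_GEGLU instead of spelling it out with ggml_view_2d/ggml_gelu/ggml_mul. Below is a minimal scalar sketch of the gating for a single token row, plain C++ and illustrative only; gelu_tanh, d_ff, and geglu_row are names invented here, and the tanh GELU approximation is an assumption, not a statement about ggml internals.

#include <cmath>
#include <cstddef>
#include <vector>

// Sketch of the GEGLU gating: ffn_up_row holds 2*d_ff values for one token;
// columns [0, d_ff) are the activated half, columns [d_ff, 2*d_ff) are the gate,
// mirroring the 0 and split_point offsets of the removed ggml_view_2d calls.
static float gelu_tanh(float x) {
    // common tanh approximation of GELU (assumed here for illustration)
    return 0.5f * x * (1.0f + std::tanh(0.7978845608f * (x + 0.044715f * x * x * x)));
}

std::vector<float> geglu_row(const std::vector<float> & ffn_up_row, size_t d_ff) {
    std::vector<float> gated(d_ff);
    for (size_t i = 0; i < d_ff; ++i) {
        gated[i] = gelu_tanh(ffn_up_row[i]) * ffn_up_row[i + d_ff]; // gelu(up) * gate
    }
    return gated; // ffn_down is then applied to this vector
}

The ordering matters: the first half of the up projection is the activated branch and the second half is the gate, which is exactly what the removed view offsets (0 and split_point * ggml_element_size) encoded.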