
Commit d498af3

graph : avoid huge warm-up graphs for MoE models (#14753)
* graph : avoid huge warm-up graphs for MoE models

ggml-ci

* cont : bump max nodes to 8x model tensors
1 parent eacdeb5 · commit d498af3
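Background, per the commit message and the in-code note below: during warm-up the graph routes through all experts so every expert tensor gets loaded, and the expert-aggregation add-chain used to grow with the full expert count. A rough back-of-the-envelope sketch of the effect (layer and expert counts are assumed for illustration, not taken from any particular model):

#include <cstdint>
#include <cstdio>

// Hypothetical estimate, not llama.cpp API: the aggregation loop in
// build_moe_ffn creates roughly one view node and one add node per
// summed expert, per MoE layer.
static uint32_t aggregation_nodes(uint32_t n_layer, uint32_t n_experts_summed) {
    return n_layer * 2 * n_experts_summed;
}

int main() {
    const uint32_t n_layer       = 60;   // assumed MoE layer count
    const uint32_t n_expert      = 128;  // assumed total experts
    const uint32_t n_expert_used = 8;    // assumed configured top-k

    // before: the warm-up graph summed over all experts
    std::printf("warm-up, all experts: ~%u nodes\n", aggregation_nodes(n_layer, n_expert));
    // after: the sum always uses hparams.n_expert_used
    std::printf("warm-up, top-k only:  ~%u nodes\n", aggregation_nodes(n_layer, n_expert_used));
}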

2 files changed: +6 −3 lines changed


src/llama-context.cpp

Lines changed: 1 addition & 1 deletion
@@ -1312,7 +1312,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
 //
 
 uint32_t llama_context::graph_max_nodes() const {
-    return std::max<uint32_t>(65536u, 5u*model.n_tensors());
+    return std::max<uint32_t>(1024u, 8u*model.n_tensors());
 }
 
 llm_graph_result * llama_context::get_gf_res_reserve() const {
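The reserve budget is now proportional to the model's tensor count (8x instead of 5x) while the floor drops from 65536 to 1024 nodes, so small models no longer pre-allocate a 65k-node graph. A minimal sketch of the new formula, with assumed tensor counts:

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Same formula as the patched graph_max_nodes(); tensor counts are assumed.
static uint32_t graph_max_nodes(uint32_t n_tensors) {
    return std::max<uint32_t>(1024u, 8u*n_tensors);
}

int main() {
    std::printf("small model, 100 tensors:  %u nodes\n", graph_max_nodes(100));  // floor wins: 1024
    std::printf("large model, 4000 tensors: %u nodes\n", graph_max_nodes(4000)); // 8x wins: 32000
}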

src/llama-graph.cpp

Lines changed: 5 additions & 2 deletions
@@ -906,8 +906,11 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     }
 
     // aggregate experts
+    // note: here we explicitly use hparams.n_expert_used instead of n_expert_used
+    //       to avoid potentially a large number of add nodes during warmup
+    //       ref: https://github.com/ggml-org/llama.cpp/pull/14753
     ggml_tensor * moe_out = nullptr;
-    for (int i = 0; i < n_expert_used; ++i) {
+    for (uint32_t i = 0; i < hparams.n_expert_used; ++i) {
         ggml_tensor * cur_expert = ggml_view_2d(ctx0, experts, n_embd, n_tokens,
                 experts->nb[2], i*experts->nb[1]);
 

@@ -918,7 +921,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         }
     }
 
-    if (n_expert_used == 1) {
+    if (hparams.n_expert_used == 1) {
         // avoid returning a non-contiguous tensor
         moe_out = ggml_cont(ctx0, moe_out);
     }
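The distinction that makes this safe: warm-up may raise the runtime n_expert_used to n_expert so the routed matmul touches every expert tensor, while the aggregation sum can always use the configured hparams.n_expert_used, since warm-up output is discarded anyway. A hedged sketch of that split; the struct and helper names below are illustrative, not the real llama.cpp types:

#include <cstdint>
#include <cstdio>

// Illustrative stand-in for the relevant llama.cpp hyperparameters.
struct hparams_t {
    uint32_t n_expert      = 128; // total experts in the model (assumed)
    uint32_t n_expert_used = 8;   // configured top-k per token (assumed)
};

// Warm-up routes through all experts so every weight tensor is touched.
static uint32_t experts_to_route(const hparams_t & hp, bool warmup) {
    return warmup ? hp.n_expert : hp.n_expert_used;
}

// The aggregation sum always uses the configured top-k, keeping the
// add-chain short even when warm-up routes through all experts.
static uint32_t experts_to_aggregate(const hparams_t & hp) {
    return hp.n_expert_used;
}

int main() {
    hparams_t hp;
    std::printf("route (warm-up): %u experts\n", experts_to_route(hp, true));
    std::printf("aggregate:       %u experts\n", experts_to_aggregate(hp));
}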
