
Commit d498af3

graph : avoid huge warm-up graphs for MoE models (#14753)
* graph : avoid huge warm-up graphs for MoE models

ggml-ci

* cont : bump max nodes to 8x model tensors
1 parent eacdeb5 · commit d498af3
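Background, per the commit message and the in-code note below: during warm-up the graph routes through all experts so every expert tensor gets loaded, and the expert-aggregation add-chain used to grow with the full expert count. A rough back-of-the-envelope sketch of the effect (layer and expert counts are assumed for illustration, not taken from any particular model):

#include <cstdint>
#include <cstdio>

// Hypothetical estimate, not llama.cpp API: the aggregation loop in
// build_moe_ffn creates roughly one view node and one add node per
// summed expert, per MoE layer.
static uint32_t aggregation_nodes(uint32_t n_layer, uint32_t n_experts_summed) {
    return n_layer * 2 * n_experts_summed;
}

int main() {
    const uint32_t n_layer       = 60;   // assumed MoE layer count
    const uint32_t n_expert      = 128;  // assumed total experts
    const uint32_t n_expert_used = 8;    // assumed configured top-k

    // before: the warm-up graph summed over all experts
    std::printf("warm-up, all experts: ~%u nodes\n", aggregation_nodes(n_layer, n_expert));
    // after: the sum always uses hparams.n_expert_used
    std::printf("warm-up, top-k only:  ~%u nodes\n", aggregation_nodes(n_layer, n_expert_used));
}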

2 files changed: +6 −3 lines changed


src/llama-context.cpp

Lines changed: 1 addition & 1 deletion
@@ -1312,7 +1312,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
 //
 
 uint32_t llama_context::graph_max_nodes() const {
-    return std::max<uint32_t>(65536u, 5u*model.n_tensors());
+    return std::max<uint32_t>(1024u, 8u*model.n_tensors());
 }
 
 llm_graph_result * llama_context::get_gf_res_reserve() const {
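The reserve budget is now proportional to the model's tensor count (8x instead of 5x) while the floor drops from 65536 to 1024 nodes, so small models no longer pre-allocate a 65k-node graph. A minimal sketch of the new formula, with assumed tensor counts:

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Same formula as the patched graph_max_nodes(); tensor counts are assumed.
static uint32_t graph_max_nodes(uint32_t n_tensors) {
    return std::max<uint32_t>(1024u, 8u*n_tensors);
}

int main() {
    std::printf("small model, 100 tensors:  %u nodes\n", graph_max_nodes(100));  // floor wins: 1024
    std::printf("large model, 4000 tensors: %u nodes\n", graph_max_nodes(4000)); // 8x wins: 32000
}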

src/llama-graph.cpp

Lines changed: 5 additions & 2 deletions
@@ -906,8 +906,11 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     }
 
     // aggregate experts
+    // note: here we explicitly use hparams.n_expert_used instead of n_expert_used
+    //       to avoid potentially a large number of add nodes during warmup
+    //       ref: https://github.com/ggml-org/llama.cpp/pull/14753
     ggml_tensor * moe_out = nullptr;
-    for (int i = 0; i < n_expert_used; ++i) {
+    for (uint32_t i = 0; i < hparams.n_expert_used; ++i) {
         ggml_tensor * cur_expert = ggml_view_2d(ctx0, experts, n_embd, n_tokens,
                 experts->nb[2], i*experts->nb[1]);
 

@@ -918,7 +921,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         }
     }
 
-    if (n_expert_used == 1) {
+    if (hparams.n_expert_used == 1) {
         // avoid returning a non-contiguous tensor
         moe_out = ggml_cont(ctx0, moe_out);
     }
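The distinction that makes this safe: warm-up may raise the runtime n_expert_used to n_expert so the routed matmul touches every expert tensor, while the aggregation sum can always use the configured hparams.n_expert_used, since warm-up output is discarded anyway. A hedged sketch of that split; the struct and helper names below are illustrative, not the real llama.cpp types:

#include <cstdint>
#include <cstdio>

// Illustrative stand-in for the relevant llama.cpp hyperparameters.
struct hparams_t {
    uint32_t n_expert      = 128; // total experts in the model (assumed)
    uint32_t n_expert_used = 8;   // configured top-k per token (assumed)
};

// Warm-up routes through all experts so every weight tensor is touched.
static uint32_t experts_to_route(const hparams_t & hp, bool warmup) {
    return warmup ? hp.n_expert : hp.n_expert_used;
}

// The aggregation sum always uses the configured top-k, keeping the
// add-chain short even when warm-up routes through all experts.
static uint32_t experts_to_aggregate(const hparams_t & hp) {
    return hp.n_expert_used;
}

int main() {
    hparams_t hp;
    std::printf("route (warm-up): %u experts\n", experts_to_route(hp, true));
    std::printf("aggregate:       %u experts\n", experts_to_aggregate(hp));
}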
