From adeff7dbd3bacb63d54056cbd7f74aa6b9e3dac4 Mon Sep 17 00:00:00 2001 From: Kawrakow Date: Fri, 29 May 2026 08:38:24 +0000 Subject: [PATCH] Add extra nodes when dealing with MLA and amb --- common/common.cpp | 4 ++++ src/graphs/build_arctic.cpp | 2 +- src/graphs/build_baichuan.cpp | 2 +- src/graphs/build_bailingmoe2.cpp | 2 +- src/graphs/build_bert.cpp | 2 +- src/graphs/build_bitnet.cpp | 4 ++-- src/graphs/build_bloom.cpp | 2 +- src/graphs/build_chatglm.cpp | 2 +- src/graphs/build_codeshell.cpp | 2 +- src/graphs/build_cohere2.cpp | 2 +- src/graphs/build_command_r.cpp | 2 +- src/graphs/build_dbrx.cpp | 2 +- src/graphs/build_deci.cpp | 2 +- src/graphs/build_deepseek2.cpp | 2 +- src/graphs/build_dots1.cpp | 2 +- src/graphs/build_ernie45.cpp | 4 ++-- src/graphs/build_falcon.cpp | 2 +- src/graphs/build_gemma.cpp | 2 +- src/graphs/build_gemma2.cpp | 2 +- src/graphs/build_gemma3.cpp | 2 +- src/graphs/build_gemma4.cpp | 6 +++--- src/graphs/build_glm4.cpp | 5 ++--- src/graphs/build_gpt2.cpp | 2 +- src/graphs/build_gptneox.cpp | 2 +- src/graphs/build_grok.cpp | 2 +- src/graphs/build_hunyuan.cpp | 2 +- src/graphs/build_internlm2.cpp | 2 +- src/graphs/build_jais.cpp | 2 +- src/graphs/build_llama.cpp | 2 +- src/graphs/build_mamba.cpp | 2 +- src/graphs/build_mimo2.cpp | 2 +- src/graphs/build_minicpm.cpp | 2 +- src/graphs/build_minimaxm2.cpp | 2 +- src/graphs/build_mistral3.cpp | 2 +- src/graphs/build_mpt.cpp | 2 +- src/graphs/build_olmo.cpp | 2 +- src/graphs/build_openai.cpp | 2 +- src/graphs/build_openelm.cpp | 2 +- src/graphs/build_orion.cpp | 2 +- src/graphs/build_phi2.cpp | 2 +- src/graphs/build_phi3.cpp | 2 +- src/graphs/build_plamo.cpp | 2 +- src/graphs/build_qwen.cpp | 2 +- src/graphs/build_qwen2.cpp | 6 +++--- src/graphs/build_qwen3.cpp | 8 ++++---- src/graphs/build_qwen35.cpp | 4 ++-- src/graphs/build_qwen3next.cpp | 2 +- src/graphs/build_refact.cpp | 2 +- src/graphs/build_seedoss.cpp | 2 +- src/graphs/build_smollm3.cpp | 2 +- src/graphs/build_stablelm.cpp | 2 +- 
src/graphs/build_starcoder.cpp | 2 +- src/graphs/build_starcoder2.cpp | 2 +- src/graphs/build_step35.cpp | 2 +- src/graphs/build_t5.cpp | 4 ++-- src/graphs/build_xverse.cpp | 2 +- src/llama-build-context.cpp | 5 +++++ src/llama-build-context.h | 2 ++ src/llama-context.h | 2 ++ src/llama.cpp | 33 +++++++++++++++++++++++++++++++- 60 files changed, 112 insertions(+), 69 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index ffb8d5fd..ac2e3226 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1788,6 +1788,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa if (arg == "-amb" || arg == "--attention-max-batch") { CHECK_ARG params.attn_max_batch = std::stoi(argv[i]); + if (params.attn_max_batch > 0 && params.attn_max_batch < 128) { + LLAMA_LOG_WARN("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX amb = %d is too low. Changing to 128\n", params.attn_max_batch); + params.attn_max_batch = 128; + } return true; } if (arg == "-no-fmoe" || arg == "--no-fused-moe") { diff --git a/src/graphs/build_arctic.cpp b/src/graphs/build_arctic.cpp index eded4d9f..aae3d85f 100644 --- a/src/graphs/build_arctic.cpp +++ b/src/graphs/build_arctic.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_arctic() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); // mutable variable, needed during the last layer of the computation to skip unused tokens int32_t n_tokens = this->n_tokens; diff --git a/src/graphs/build_baichuan.cpp b/src/graphs/build_baichuan.cpp index 94b88329..11ef6e2e 100644 --- a/src/graphs/build_baichuan.cpp +++ b/src/graphs/build_baichuan.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_baichuan() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd_head = hparams.n_embd_head_v(0); 
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0)); diff --git a/src/graphs/build_bailingmoe2.cpp b/src/graphs/build_bailingmoe2.cpp index 17022235..6a1e54e9 100644 --- a/src/graphs/build_bailingmoe2.cpp +++ b/src/graphs/build_bailingmoe2.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_bailingmoe2() { - ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd_head = hparams.n_embd_head_v(0); //const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); diff --git a/src/graphs/build_bert.cpp b/src/graphs/build_bert.cpp index 936876df..5574ba8f 100644 --- a/src/graphs/build_bert.cpp +++ b/src/graphs/build_bert.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_bert() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd_head = hparams.n_embd_head_v(0); const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); diff --git a/src/graphs/build_bitnet.cpp b/src/graphs/build_bitnet.cpp index 5247c9b3..9b0a4d72 100644 --- a/src/graphs/build_bitnet.cpp +++ b/src/graphs/build_bitnet.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_bitnet() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd_head = hparams.n_embd_head_v(0); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0)); @@ -148,7 +148,7 @@ ggml_cgraph * llm_build_context::build_bitnet() { } ggml_cgraph * llm_build_context::build_bitnet_158() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); // mutable variable, needed during the last layer of the computation to skip unused tokens int32_t n_tokens = this->n_tokens; diff --git 
a/src/graphs/build_bloom.cpp b/src/graphs/build_bloom.cpp index e6f36827..b712a2ec 100644 --- a/src/graphs/build_bloom.cpp +++ b/src/graphs/build_bloom.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_bloom() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd_head = hparams.n_embd_head_v(0); const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); diff --git a/src/graphs/build_chatglm.cpp b/src/graphs/build_chatglm.cpp index 7880d892..9c3da126 100644 --- a/src/graphs/build_chatglm.cpp +++ b/src/graphs/build_chatglm.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_chatglm() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd_head = hparams.n_embd_head_v(0); const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); diff --git a/src/graphs/build_codeshell.cpp b/src/graphs/build_codeshell.cpp index 2567f0c8..bceb5755 100644 --- a/src/graphs/build_codeshell.cpp +++ b/src/graphs/build_codeshell.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_codeshell() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd_head = hparams.n_embd_head_v(0); const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); diff --git a/src/graphs/build_cohere2.cpp b/src/graphs/build_cohere2.cpp index 56150cfd..aaaa7b47 100644 --- a/src/graphs/build_cohere2.cpp +++ b/src/graphs/build_cohere2.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_cohere2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd_head = hparams.n_embd_head_v(0); GGML_ASSERT(n_embd_head 
== hparams.n_embd_head_k(0)); diff --git a/src/graphs/build_command_r.cpp b/src/graphs/build_command_r.cpp index f3c21481..805d4ffc 100644 --- a/src/graphs/build_command_r.cpp +++ b/src/graphs/build_command_r.cpp @@ -4,7 +4,7 @@ ggml_cgraph * llm_build_context::build_command_r() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd_head = hparams.n_embd_head_v(0); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0)); diff --git a/src/graphs/build_dbrx.cpp b/src/graphs/build_dbrx.cpp index 657aca69..33f14b7d 100644 --- a/src/graphs/build_dbrx.cpp +++ b/src/graphs/build_dbrx.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_dbrx() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); // mutable variable, needed during the last layer of the computation to skip unused tokens int32_t n_tokens = this->n_tokens; diff --git a/src/graphs/build_deci.cpp b/src/graphs/build_deci.cpp index 8c26174b..73038d48 100644 --- a/src/graphs/build_deci.cpp +++ b/src/graphs/build_deci.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_deci() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); // mutable variable, needed during the last layer of the computation to skip unused tokens int32_t n_tokens = this->n_tokens; diff --git a/src/graphs/build_deepseek2.cpp b/src/graphs/build_deepseek2.cpp index 1d78a826..c5129b8f 100644 --- a/src/graphs/build_deepseek2.cpp +++ b/src/graphs/build_deepseek2.cpp @@ -769,7 +769,7 @@ ggml_cgraph * llm_build_context::build_deepseek2() { #else const bool use_f32_attn_precision = lctx.cparams.graph_attn_precision == GGML_TYPE_F32; #endif - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); 
+ ggml_cgraph * gf = new_graph_custom(); // mutable variable, needed during the last layer of the computation to skip unused tokens int32_t n_tokens = this->n_tokens; diff --git a/src/graphs/build_dots1.cpp b/src/graphs/build_dots1.cpp index d22ab464..3ab32eef 100644 --- a/src/graphs/build_dots1.cpp +++ b/src/graphs/build_dots1.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_dots1() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd_head = hparams.n_embd_head_v(0); diff --git a/src/graphs/build_ernie45.cpp b/src/graphs/build_ernie45.cpp index 5d6ba492..f194c7d8 100644 --- a/src/graphs/build_ernie45.cpp +++ b/src/graphs/build_ernie45.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_ernie4_5() { - struct ggml_cgraph* gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd_head = hparams.n_embd_head_v(0); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0)); @@ -119,7 +119,7 @@ ggml_cgraph * llm_build_context::build_ernie4_5() { } ggml_cgraph * llm_build_context::build_ernie4_5_moe() { - struct ggml_cgraph* gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd_head = hparams.n_embd_head_v(0); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0)); diff --git a/src/graphs/build_falcon.cpp b/src/graphs/build_falcon.cpp index c6c267a2..143c1384 100644 --- a/src/graphs/build_falcon.cpp +++ b/src/graphs/build_falcon.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_falcon() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd_head = hparams.n_embd_head_v(0); const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); 
diff --git a/src/graphs/build_gemma.cpp b/src/graphs/build_gemma.cpp index 4c378a32..87280436 100644 --- a/src/graphs/build_gemma.cpp +++ b/src/graphs/build_gemma.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_gemma() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd_head_k = hparams.n_embd_head_k(0); diff --git a/src/graphs/build_gemma2.cpp b/src/graphs/build_gemma2.cpp index 5362e060..c5bfc691 100644 --- a/src/graphs/build_gemma2.cpp +++ b/src/graphs/build_gemma2.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_gemma2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd_head_k = hparams.n_embd_head_k_full; diff --git a/src/graphs/build_gemma3.cpp b/src/graphs/build_gemma3.cpp index d0d3682e..3033acc5 100644 --- a/src/graphs/build_gemma3.cpp +++ b/src/graphs/build_gemma3.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_gemma3() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); struct ggml_tensor * cur; struct ggml_tensor * inpL; diff --git a/src/graphs/build_gemma4.cpp b/src/graphs/build_gemma4.cpp index 64853ea6..8a2c8029 100644 --- a/src/graphs/build_gemma4.cpp +++ b/src/graphs/build_gemma4.cpp @@ -169,7 +169,7 @@ static ggml_cgraph * build_gemma4_graph_parallel(llm_build_context & llm, llama_ int n_device = model.splits.size(); GGML_ASSERT(n_device > 1); GGML_ASSERT(cparams.flash_attn); - auto gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = llm.new_graph_custom(); bool is_moe = hparams.n_expert > 0; @@ -527,7 +527,7 @@ static ggml_cgraph * build_gemma4_graph_parallel(llm_build_context & llm, llama_ } ggml_cgraph * 
llm_build_context::build_gemma4_mtp() { - ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd = hparams.n_embd; const int64_t n_vocab = hparams.n_vocab; @@ -731,7 +731,7 @@ ggml_cgraph * llm_build_context::build_gemma4() { KQ_mask, KQ_mask_swa, n_tokens, cb); } - auto gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); ggml_tensor * inp_per_layer = nullptr; if (model.tok_embd_per_layer) { diff --git a/src/graphs/build_glm4.cpp b/src/graphs/build_glm4.cpp index 305a1732..cb567cc4 100644 --- a/src/graphs/build_glm4.cpp +++ b/src/graphs/build_glm4.cpp @@ -3,8 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_glm4_moe() { - // create a new graph - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd_head = hparams.n_embd_head_v(0); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0)); @@ -159,7 +158,7 @@ ggml_cgraph * llm_build_context::build_glm4_moe() { } ggml_cgraph * llm_build_context::build_glm4() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd_head = hparams.n_embd_head_v(0); const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); diff --git a/src/graphs/build_gpt2.cpp b/src/graphs/build_gpt2.cpp index 15bb22ee..94b75403 100644 --- a/src/graphs/build_gpt2.cpp +++ b/src/graphs/build_gpt2.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_gpt2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd_head = hparams.n_embd_head_v(0); const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); diff --git a/src/graphs/build_gptneox.cpp b/src/graphs/build_gptneox.cpp index 
d11ee3af..e3fcb36f 100644 --- a/src/graphs/build_gptneox.cpp +++ b/src/graphs/build_gptneox.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_gptneox() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd_head = hparams.n_embd_head_v(0); const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); diff --git a/src/graphs/build_grok.cpp b/src/graphs/build_grok.cpp index d96f9f07..4f67fbec 100644 --- a/src/graphs/build_grok.cpp +++ b/src/graphs/build_grok.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_grok() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); // mutable variable, needed during the last layer of the computation to skip unused tokens int32_t n_tokens = this->n_tokens; diff --git a/src/graphs/build_hunyuan.cpp b/src/graphs/build_hunyuan.cpp index 6026258e..23217d5d 100644 --- a/src/graphs/build_hunyuan.cpp +++ b/src/graphs/build_hunyuan.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_hunyuan_moe() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd_head = hparams.n_embd_head_v(0); diff --git a/src/graphs/build_internlm2.cpp b/src/graphs/build_internlm2.cpp index 53969364..8588f699 100644 --- a/src/graphs/build_internlm2.cpp +++ b/src/graphs/build_internlm2.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_internlm2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd_head = hparams.n_embd_head_v(0); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0)); diff --git a/src/graphs/build_jais.cpp b/src/graphs/build_jais.cpp 
index cc872f41..20436fb9 100644 --- a/src/graphs/build_jais.cpp +++ b/src/graphs/build_jais.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_jais() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd_head = hparams.n_embd_head_v(0); const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); diff --git a/src/graphs/build_llama.cpp b/src/graphs/build_llama.cpp index 765c882c..16d04ee9 100644 --- a/src/graphs/build_llama.cpp +++ b/src/graphs/build_llama.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_llama() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); // mutable variable, needed during the last layer of the computation to skip unused tokens int32_t n_tokens = this->n_tokens; diff --git a/src/graphs/build_mamba.cpp b/src/graphs/build_mamba.cpp index 938786b0..b3ef77c3 100644 --- a/src/graphs/build_mamba.cpp +++ b/src/graphs/build_mamba.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_mamba() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const int64_t d_model = n_embd; const int64_t d_conv = hparams.ssm_d_conv; diff --git a/src/graphs/build_mimo2.cpp b/src/graphs/build_mimo2.cpp index 304c0d18..cb5a1ea1 100644 --- a/src/graphs/build_mimo2.cpp +++ b/src/graphs/build_mimo2.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_mimo2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); //const int64_t n_embd_head = hparams.n_embd_head_v(0); //GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0)); diff --git a/src/graphs/build_minicpm.cpp b/src/graphs/build_minicpm.cpp 
index 0d4d131a..3a6a58e6 100644 --- a/src/graphs/build_minicpm.cpp +++ b/src/graphs/build_minicpm.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_minicpm() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd_head = hparams.n_embd_head_v(0); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0)); diff --git a/src/graphs/build_minimaxm2.cpp b/src/graphs/build_minimaxm2.cpp index 8c632616..c6417fa4 100644 --- a/src/graphs/build_minimaxm2.cpp +++ b/src/graphs/build_minimaxm2.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph* llm_build_context::build_minimaxm2() { - ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd_head = hparams.n_embd_head_v(0); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0)); // GGML_ASSERT(n_embd_head == hparams.n_rot); this is wrong in case of minimax, head_dim = 128, n_rot = 64 diff --git a/src/graphs/build_mistral3.cpp b/src/graphs/build_mistral3.cpp index 4be53b23..74230491 100644 --- a/src/graphs/build_mistral3.cpp +++ b/src/graphs/build_mistral3.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_mistral3() { - auto gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd_head = hparams.n_embd_head_v(0); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0)); diff --git a/src/graphs/build_mpt.cpp b/src/graphs/build_mpt.cpp index 32776460..561b07c0 100644 --- a/src/graphs/build_mpt.cpp +++ b/src/graphs/build_mpt.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_mpt() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd_head = hparams.n_embd_head_v(0); 
const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); diff --git a/src/graphs/build_olmo.cpp b/src/graphs/build_olmo.cpp index fe7f29b7..30be0cbc 100644 --- a/src/graphs/build_olmo.cpp +++ b/src/graphs/build_olmo.cpp @@ -9,7 +9,7 @@ // * removed bias // * removed MoE ggml_cgraph * llm_build_context::build_olmo() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); // mutable variable, needed during the last layer of the computation to skip unused tokens int32_t n_tokens = this->n_tokens; diff --git a/src/graphs/build_openai.cpp b/src/graphs/build_openai.cpp index 4229d827..5d96e8df 100644 --- a/src/graphs/build_openai.cpp +++ b/src/graphs/build_openai.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_openai_moe() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd_head = hparams.n_embd_head_v(0); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0)); diff --git a/src/graphs/build_openelm.cpp b/src/graphs/build_openelm.cpp index b22fe9a4..042e6cf4 100644 --- a/src/graphs/build_openelm.cpp +++ b/src/graphs/build_openelm.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_openelm() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd_head = hparams.n_embd_head_v(0); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0)); diff --git a/src/graphs/build_orion.cpp b/src/graphs/build_orion.cpp index a4f880b4..aa7d99de 100644 --- a/src/graphs/build_orion.cpp +++ b/src/graphs/build_orion.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_orion() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const 
int64_t n_embd_head = hparams.n_embd_head_v(0); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0)); diff --git a/src/graphs/build_phi2.cpp b/src/graphs/build_phi2.cpp index 5025413a..67aec7b2 100644 --- a/src/graphs/build_phi2.cpp +++ b/src/graphs/build_phi2.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_phi2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd_head = hparams.n_embd_head_v(0); const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); diff --git a/src/graphs/build_phi3.cpp b/src/graphs/build_phi3.cpp index 9cb5b78e..13b1de14 100644 --- a/src/graphs/build_phi3.cpp +++ b/src/graphs/build_phi3.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_phi3() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd_head = hparams.n_embd_head_v(0); const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); diff --git a/src/graphs/build_plamo.cpp b/src/graphs/build_plamo.cpp index e0788931..f8e1b9fe 100644 --- a/src/graphs/build_plamo.cpp +++ b/src/graphs/build_plamo.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_plamo() { - struct ggml_cgraph * gf = ggml_new_graph(ctx0); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd_head = hparams.n_embd_head_v(0); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0)); diff --git a/src/graphs/build_qwen.cpp b/src/graphs/build_qwen.cpp index 9187e2b4..db402ce5 100644 --- a/src/graphs/build_qwen.cpp +++ b/src/graphs/build_qwen.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_qwen() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd_head = hparams.n_embd_head_v(0); 
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0)); diff --git a/src/graphs/build_qwen2.cpp b/src/graphs/build_qwen2.cpp index 01d757cf..8887fdb0 100644 --- a/src/graphs/build_qwen2.cpp +++ b/src/graphs/build_qwen2.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_qwen2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd_head = hparams.n_embd_head_v(0); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0)); @@ -94,7 +94,7 @@ ggml_cgraph * llm_build_context::build_qwen2() { } ggml_cgraph * llm_build_context::build_qwen2vl() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd_head = hparams.n_embd_head_v(0); @@ -195,7 +195,7 @@ ggml_cgraph * llm_build_context::build_qwen2vl() { } ggml_cgraph * llm_build_context::build_qwen2moe() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); // mutable variable, needed during the last layer of the computation to skip unused tokens int32_t n_tokens = this->n_tokens; diff --git a/src/graphs/build_qwen3.cpp b/src/graphs/build_qwen3.cpp index 4172ddfa..1f44900b 100644 --- a/src/graphs/build_qwen3.cpp +++ b/src/graphs/build_qwen3.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_qwen3() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd_head = hparams.n_embd_head_v(0); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0)); @@ -104,7 +104,7 @@ ggml_cgraph * llm_build_context::build_qwen3() { } ggml_cgraph * llm_build_context::build_qwen3moe() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = 
new_graph_custom(); const int64_t n_embd_head = hparams.n_embd_head_v(0); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0)); @@ -167,7 +167,7 @@ ggml_cgraph * llm_build_context::build_qwen3moe() { } ggml_cgraph * llm_build_context::build_qwen3vl() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd_full = hparams.n_embd; // main embd + deepstack embds const size_t n_deepstack_layers = hparams.n_deepstack_layers; @@ -236,7 +236,7 @@ ggml_cgraph * llm_build_context::build_qwen3vl() { } ggml_cgraph * llm_build_context::build_qwen3vlmoe() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); // mutable variable, needed during the last layer of the computation to skip unused tokens int32_t n_tokens = this->n_tokens; diff --git a/src/graphs/build_qwen35.cpp b/src/graphs/build_qwen35.cpp index 2fd0eecd..9ffc66e0 100644 --- a/src/graphs/build_qwen35.cpp +++ b/src/graphs/build_qwen35.cpp @@ -5,7 +5,7 @@ ggml_cgraph * llm_build_context::build_qwen35moe() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd_head = hparams.n_embd_head_v(0); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0)); @@ -88,7 +88,7 @@ ggml_cgraph * llm_build_context::build_qwen35moe() { ggml_cgraph * llm_build_context::build_qwen35() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd_head = hparams.n_embd_head_v(0); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0)); diff --git a/src/graphs/build_qwen3next.cpp b/src/graphs/build_qwen3next.cpp index 3ae32419..dfe97665 100644 --- a/src/graphs/build_qwen3next.cpp +++ b/src/graphs/build_qwen3next.cpp @@ -5,7 +5,7 @@ ggml_cgraph * llm_build_context::build_qwen3next() { 
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); delta_net delta(lctx, batch); diff --git a/src/graphs/build_refact.cpp b/src/graphs/build_refact.cpp index 3f63d3c1..cf603648 100644 --- a/src/graphs/build_refact.cpp +++ b/src/graphs/build_refact.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_refact() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd_head = hparams.n_embd_head_v(0); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0)); diff --git a/src/graphs/build_seedoss.cpp b/src/graphs/build_seedoss.cpp index a7036e71..4781c0e2 100644 --- a/src/graphs/build_seedoss.cpp +++ b/src/graphs/build_seedoss.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_seedoss() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd_head = hparams.n_embd_head_v(0); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0)); diff --git a/src/graphs/build_smollm3.cpp b/src/graphs/build_smollm3.cpp index 8046d7ea..a92069e4 100644 --- a/src/graphs/build_smollm3.cpp +++ b/src/graphs/build_smollm3.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph* llm_build_context::build_smollm3() { - ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd_head = hparams.n_embd_head_v(0); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0)); // GGML_ASSERT(n_embd_head == hparams.n_rot); this is wrong in case of minimax, head_dim = 128, n_rot = 64 diff --git a/src/graphs/build_stablelm.cpp b/src/graphs/build_stablelm.cpp index 629fe23e..52d028c4 100644 --- a/src/graphs/build_stablelm.cpp +++ b/src/graphs/build_stablelm.cpp @@ -3,7 +3,7 @@ #include 
"../llama-context.h" ggml_cgraph * llm_build_context::build_stablelm() { - struct ggml_cgraph * gf = ggml_new_graph(ctx0); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd_head = hparams.n_embd_head_v(0); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0)); diff --git a/src/graphs/build_starcoder.cpp b/src/graphs/build_starcoder.cpp index 411d0d06..77fab907 100644 --- a/src/graphs/build_starcoder.cpp +++ b/src/graphs/build_starcoder.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_starcoder() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd_head = hparams.n_embd_head_v(0); const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); diff --git a/src/graphs/build_starcoder2.cpp b/src/graphs/build_starcoder2.cpp index b6d1e78e..75a3cef1 100644 --- a/src/graphs/build_starcoder2.cpp +++ b/src/graphs/build_starcoder2.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_starcoder2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd_head = hparams.n_embd_head_v(0); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0)); diff --git a/src/graphs/build_step35.cpp b/src/graphs/build_step35.cpp index 4763112f..d22f3774 100644 --- a/src/graphs/build_step35.cpp +++ b/src/graphs/build_step35.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_step35() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); ggml_tensor * cur; auto inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); auto inp_pos = build_inp_pos(); diff --git a/src/graphs/build_t5.cpp b/src/graphs/build_t5.cpp index c983f807..e1cbca28 100644 --- a/src/graphs/build_t5.cpp +++ 
b/src/graphs/build_t5.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_t5_encoder() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); // mutable variable, needed during the last layer of the computation to skip unused tokens int32_t n_tokens = this->n_tokens; @@ -121,7 +121,7 @@ ggml_cgraph * llm_build_context::build_t5_encoder() { } ggml_cgraph * llm_build_context::build_t5_decoder() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); // mutable variable, needed during the last layer of the computation to skip unused tokens int32_t n_tokens = this->n_tokens; diff --git a/src/graphs/build_xverse.cpp b/src/graphs/build_xverse.cpp index 66a6eefc..a3ef18ff 100644 --- a/src/graphs/build_xverse.cpp +++ b/src/graphs/build_xverse.cpp @@ -3,7 +3,7 @@ #include "../llama-context.h" ggml_cgraph * llm_build_context::build_xverse() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false); + ggml_cgraph * gf = new_graph_custom(); const int64_t n_embd_head = hparams.n_embd_head_v(0); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0)); diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp index 54a649ab..8823dccf 100644 --- a/src/llama-build-context.cpp +++ b/src/llama-build-context.cpp @@ -2935,3 +2935,8 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens int32_t llama_model_n_nextn_layer(const llama_model * model) { return model->hparams.nextn_predict_layers; } + +ggml_cgraph * llm_build_context::new_graph_custom() { + int max_nodes = lctx.max_nodes(n_tokens, n_kv); + return ggml_new_graph_custom(ctx0, max_nodes, false); +} diff --git a/src/llama-build-context.h b/src/llama-build-context.h index 73490c3a..c0e2f1e4 100644 --- a/src/llama-build-context.h +++ b/src/llama-build-context.h @@ 
-509,4 +509,6 @@ llm_expert_gating_func_type gating_op, struct ggml_cgraph * gf, struct ggml_tensor * inp_pos ); + + ggml_cgraph * new_graph_custom(); }; diff --git a/src/llama-context.h b/src/llama-context.h index db0018d7..34745d2d 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -319,5 +319,7 @@ struct llama_context { struct llama_context & lctx); void set_mtp_op_type(llama_mtp_op_type value); + int max_nodes(int n_tokens, int n_kv) const; + }; diff --git a/src/llama.cpp b/src/llama.cpp index a14259cc..c589b3aa 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -696,6 +696,37 @@ llama_context::~llama_context() { ggml_backend_buffer_free(buf_output); } +int llama_context::max_nodes(int n_tokens, int n_kv) const { + int max_nodes = model.max_nodes(n_tokens); + if (model.is_mla_model() && + cparams.mla_attn > 1 && + n_tokens >= 128 && + cparams.attn_max_batch > 0 && + model.split_mode != LLAMA_SPLIT_MODE_GRAPH && + model.layers[0].wkv_b) { + // In this case we perform the attention computation iteratively, and this adds + // 10 nodes per layer per iteration. Although in many cases the 65536 nodes we + // estimate by default are enough to accommodate them, to be safe we add the additional + // number of nodes required for the iterative MLA evaluation. 
+ int n_head = model.hparams.n_head(); + auto wkv_b = model.layers[0].wkv_b; + auto kv_f32_size = wkv_b->ne[1] * n_kv * sizeof(float) / (1024*1024); + if (cparams.attn_max_batch < kv_f32_size) { + int n_max_head = 1; + for (int niter = 2; niter < n_head; ++niter) { + if (n_head % niter == 0 && kv_f32_size/niter <= cparams.attn_max_batch) { + n_max_head = n_head/niter; + break; + } + } + GGML_ASSERT(n_head % n_max_head == 0); + int n_iter = n_head / n_max_head; + max_nodes += (n_iter - 1) * 10 * model.hparams.n_layer; + } + } + return max_nodes; +} + // // kv cache helpers // @@ -7128,7 +7159,7 @@ struct llama_context * llama_init_from_model( } int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_ubatch); - const size_t max_nodes = model->max_nodes(n_tokens); + const size_t max_nodes = ctx->max_nodes(n_tokens, cparams.n_ctx); // buffer used to store the computation graph and the tensor meta data ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));