Add extra nodes when dealing with MLA and amb (#1899)

2026-06-28 04:30:15 -05:00 · 2026-05-29 15:17:24 +03:00 · 2026-05-29 15:17:24 +03:00 · 8960c5ba5e
commit 8960c5ba5e
parent e75337fec3
60 changed files with 112 additions and 69 deletions
--- a/common/common.cpp
+++ b/common/common.cpp
@ -1788,6 +1788,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
    if (arg == "-amb" || arg == "--attention-max-batch") {
        CHECK_ARG
        params.attn_max_batch = std::stoi(argv[i]);
+        if (params.attn_max_batch > 0 && params.attn_max_batch < 128) {
+            LLAMA_LOG_WARN("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX amb = %d is too low. Changing to 128\n", params.attn_max_batch);
+            params.attn_max_batch = 128;
+        }
        return true;
    }
    if (arg == "-no-fmoe" || arg == "--no-fused-moe") {
--- a/src/graphs/build_arctic.cpp
+++ b/src/graphs/build_arctic.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_arctic() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    // mutable variable, needed during the last layer of the computation to skip unused tokens
    int32_t n_tokens = this->n_tokens;
--- a/src/graphs/build_baichuan.cpp
+++ b/src/graphs/build_baichuan.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_baichuan() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    const int64_t n_embd_head = hparams.n_embd_head_v(0);
    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
--- a/src/graphs/build_bailingmoe2.cpp
+++ b/src/graphs/build_bailingmoe2.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_bailingmoe2() {
-    ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
    const int64_t n_embd_head = hparams.n_embd_head_v(0);
    //const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();

--- a/src/graphs/build_bert.cpp
+++ b/src/graphs/build_bert.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_bert() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    const int64_t n_embd_head = hparams.n_embd_head_v(0);
    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
--- a/src/graphs/build_bitnet.cpp
+++ b/src/graphs/build_bitnet.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_bitnet() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    const int64_t n_embd_head = hparams.n_embd_head_v(0);
    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
@ -148,7 +148,7 @@ ggml_cgraph * llm_build_context::build_bitnet() {
 }

 ggml_cgraph * llm_build_context::build_bitnet_158() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    // mutable variable, needed during the last layer of the computation to skip unused tokens
    int32_t n_tokens = this->n_tokens;
--- a/src/graphs/build_bloom.cpp
+++ b/src/graphs/build_bloom.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_bloom() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    const int64_t n_embd_head = hparams.n_embd_head_v(0);
    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
--- a/src/graphs/build_chatglm.cpp
+++ b/src/graphs/build_chatglm.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_chatglm() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    const int64_t n_embd_head = hparams.n_embd_head_v(0);
    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
--- a/src/graphs/build_codeshell.cpp
+++ b/src/graphs/build_codeshell.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_codeshell() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    const int64_t n_embd_head = hparams.n_embd_head_v(0);
    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
--- a/src/graphs/build_cohere2.cpp
+++ b/src/graphs/build_cohere2.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_cohere2() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    const int64_t n_embd_head = hparams.n_embd_head_v(0);
    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
--- a/src/graphs/build_command_r.cpp
+++ b/src/graphs/build_command_r.cpp
@ -4,7 +4,7 @@

 ggml_cgraph * llm_build_context::build_command_r() {

-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    const int64_t n_embd_head = hparams.n_embd_head_v(0);
    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
--- a/src/graphs/build_dbrx.cpp
+++ b/src/graphs/build_dbrx.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_dbrx() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    // mutable variable, needed during the last layer of the computation to skip unused tokens
    int32_t n_tokens = this->n_tokens;
--- a/src/graphs/build_deci.cpp
+++ b/src/graphs/build_deci.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_deci() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    // mutable variable, needed during the last layer of the computation to skip unused tokens
    int32_t n_tokens = this->n_tokens;
--- a/src/graphs/build_deepseek2.cpp
+++ b/src/graphs/build_deepseek2.cpp
@ -769,7 +769,7 @@ ggml_cgraph * llm_build_context::build_deepseek2() {
 #else
    const bool use_f32_attn_precision = lctx.cparams.graph_attn_precision == GGML_TYPE_F32;
 #endif
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    // mutable variable, needed during the last layer of the computation to skip unused tokens
    int32_t n_tokens = this->n_tokens;
--- a/src/graphs/build_dots1.cpp
+++ b/src/graphs/build_dots1.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_dots1() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    const int64_t n_embd_head = hparams.n_embd_head_v(0);

--- a/src/graphs/build_ernie45.cpp
+++ b/src/graphs/build_ernie45.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_ernie4_5() {
-    struct ggml_cgraph* gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
    const int64_t n_embd_head = hparams.n_embd_head_v(0);

    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
@ -119,7 +119,7 @@ ggml_cgraph * llm_build_context::build_ernie4_5() {
 }

 ggml_cgraph * llm_build_context::build_ernie4_5_moe() {
-    struct ggml_cgraph* gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
    const int64_t n_embd_head = hparams.n_embd_head_v(0);

    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
--- a/src/graphs/build_falcon.cpp
+++ b/src/graphs/build_falcon.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_falcon() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    const int64_t n_embd_head = hparams.n_embd_head_v(0);
    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
--- a/src/graphs/build_gemma.cpp
+++ b/src/graphs/build_gemma.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_gemma() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    const int64_t n_embd_head_k = hparams.n_embd_head_k(0);

--- a/src/graphs/build_gemma2.cpp
+++ b/src/graphs/build_gemma2.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_gemma2() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    const int64_t n_embd_head_k = hparams.n_embd_head_k_full;

--- a/src/graphs/build_gemma3.cpp
+++ b/src/graphs/build_gemma3.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_gemma3() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    struct ggml_tensor * cur;
    struct ggml_tensor * inpL;
--- a/src/graphs/build_gemma4.cpp
+++ b/src/graphs/build_gemma4.cpp
@ -169,7 +169,7 @@ static ggml_cgraph * build_gemma4_graph_parallel(llm_build_context & llm, llama_
    int n_device = model.splits.size();
    GGML_ASSERT(n_device > 1);
    GGML_ASSERT(cparams.flash_attn);
-    auto gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = llm.new_graph_custom();

    bool is_moe = hparams.n_expert > 0;

@ -527,7 +527,7 @@ static ggml_cgraph * build_gemma4_graph_parallel(llm_build_context & llm, llama_
 }

 ggml_cgraph * llm_build_context::build_gemma4_mtp() {
-    ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    const int64_t n_embd          = hparams.n_embd;
    const int64_t n_vocab         = hparams.n_vocab;
@ -731,7 +731,7 @@ ggml_cgraph * llm_build_context::build_gemma4() {
                                     KQ_mask, KQ_mask_swa, n_tokens,  cb);
    }

-    auto gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    ggml_tensor * inp_per_layer = nullptr;
    if (model.tok_embd_per_layer) {
--- a/src/graphs/build_glm4.cpp
+++ b/src/graphs/build_glm4.cpp
@ -3,8 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_glm4_moe() {
-    // create a new graph
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    const int64_t n_embd_head = hparams.n_embd_head_v(0);
    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
@ -159,7 +158,7 @@ ggml_cgraph * llm_build_context::build_glm4_moe() {
 }

 ggml_cgraph * llm_build_context::build_glm4() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    const int64_t n_embd_head = hparams.n_embd_head_v(0);
    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
--- a/src/graphs/build_gpt2.cpp
+++ b/src/graphs/build_gpt2.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_gpt2() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    const int64_t n_embd_head = hparams.n_embd_head_v(0);
    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
--- a/src/graphs/build_gptneox.cpp
+++ b/src/graphs/build_gptneox.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_gptneox() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    const int64_t n_embd_head = hparams.n_embd_head_v(0);
    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
--- a/src/graphs/build_grok.cpp
+++ b/src/graphs/build_grok.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_grok() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    // mutable variable, needed during the last layer of the computation to skip unused tokens
    int32_t n_tokens = this->n_tokens;
--- a/src/graphs/build_hunyuan.cpp
+++ b/src/graphs/build_hunyuan.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_hunyuan_moe() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    const int64_t n_embd_head = hparams.n_embd_head_v(0);

--- a/src/graphs/build_internlm2.cpp
+++ b/src/graphs/build_internlm2.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_internlm2() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    const int64_t n_embd_head = hparams.n_embd_head_v(0);
    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
--- a/src/graphs/build_jais.cpp
+++ b/src/graphs/build_jais.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_jais() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    const int64_t n_embd_head = hparams.n_embd_head_v(0);
    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
--- a/src/graphs/build_llama.cpp
+++ b/src/graphs/build_llama.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_llama() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    // mutable variable, needed during the last layer of the computation to skip unused tokens
    int32_t n_tokens = this->n_tokens;
--- a/src/graphs/build_mamba.cpp
+++ b/src/graphs/build_mamba.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_mamba() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    const int64_t d_model = n_embd;
    const int64_t d_conv  = hparams.ssm_d_conv;
--- a/src/graphs/build_mimo2.cpp
+++ b/src/graphs/build_mimo2.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_mimo2() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    //const int64_t n_embd_head = hparams.n_embd_head_v(0);
    //GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
--- a/src/graphs/build_minicpm.cpp
+++ b/src/graphs/build_minicpm.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_minicpm() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    const int64_t n_embd_head = hparams.n_embd_head_v(0);
    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
--- a/src/graphs/build_minimaxm2.cpp
+++ b/src/graphs/build_minimaxm2.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph* llm_build_context::build_minimaxm2() {
-    ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
    const int64_t n_embd_head = hparams.n_embd_head_v(0);
    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
    // GGML_ASSERT(n_embd_head == hparams.n_rot); this is wrong in case of minimax, head_dim = 128, n_rot = 64
--- a/src/graphs/build_mistral3.cpp
+++ b/src/graphs/build_mistral3.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_mistral3() {
-    auto gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
    const int64_t n_embd_head = hparams.n_embd_head_v(0);

    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
--- a/src/graphs/build_mpt.cpp
+++ b/src/graphs/build_mpt.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_mpt() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    const int64_t n_embd_head = hparams.n_embd_head_v(0);
    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
--- a/src/graphs/build_olmo.cpp
+++ b/src/graphs/build_olmo.cpp
@ -9,7 +9,7 @@
 //   * removed bias
 //   * removed MoE
 ggml_cgraph * llm_build_context::build_olmo() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    // mutable variable, needed during the last layer of the computation to skip unused tokens
    int32_t n_tokens = this->n_tokens;
--- a/src/graphs/build_openai.cpp
+++ b/src/graphs/build_openai.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_openai_moe() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    const int64_t n_embd_head = hparams.n_embd_head_v(0);
    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
--- a/src/graphs/build_openelm.cpp
+++ b/src/graphs/build_openelm.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_openelm() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    const int64_t n_embd_head = hparams.n_embd_head_v(0);
    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
--- a/src/graphs/build_orion.cpp
+++ b/src/graphs/build_orion.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_orion() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    const int64_t n_embd_head = hparams.n_embd_head_v(0);
    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
--- a/src/graphs/build_phi2.cpp
+++ b/src/graphs/build_phi2.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_phi2() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    const int64_t n_embd_head = hparams.n_embd_head_v(0);
    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
--- a/src/graphs/build_phi3.cpp
+++ b/src/graphs/build_phi3.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_phi3() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    const int64_t n_embd_head = hparams.n_embd_head_v(0);
    const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
--- a/src/graphs/build_plamo.cpp
+++ b/src/graphs/build_plamo.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_plamo() {
-    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+    ggml_cgraph * gf = new_graph_custom();

    const int64_t n_embd_head = hparams.n_embd_head_v(0);
    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
--- a/src/graphs/build_qwen.cpp
+++ b/src/graphs/build_qwen.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_qwen() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    const int64_t n_embd_head = hparams.n_embd_head_v(0);
    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
--- a/src/graphs/build_qwen2.cpp
+++ b/src/graphs/build_qwen2.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_qwen2() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    const int64_t n_embd_head = hparams.n_embd_head_v(0);
    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
@ -94,7 +94,7 @@ ggml_cgraph * llm_build_context::build_qwen2() {
 }

 ggml_cgraph * llm_build_context::build_qwen2vl() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    const int64_t n_embd_head = hparams.n_embd_head_v(0);

@ -195,7 +195,7 @@ ggml_cgraph * llm_build_context::build_qwen2vl() {
 }

 ggml_cgraph * llm_build_context::build_qwen2moe() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    // mutable variable, needed during the last layer of the computation to skip unused tokens
    int32_t n_tokens = this->n_tokens;
--- a/src/graphs/build_qwen3.cpp
+++ b/src/graphs/build_qwen3.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_qwen3() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    const int64_t n_embd_head = hparams.n_embd_head_v(0);
    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
@ -104,7 +104,7 @@ ggml_cgraph * llm_build_context::build_qwen3() {
 }

 ggml_cgraph * llm_build_context::build_qwen3moe() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    const int64_t n_embd_head = hparams.n_embd_head_v(0);
    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
@ -167,7 +167,7 @@ ggml_cgraph * llm_build_context::build_qwen3moe() {
 }

 ggml_cgraph * llm_build_context::build_qwen3vl() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    const int64_t n_embd_full = hparams.n_embd; // main embd + deepstack embds
    const size_t n_deepstack_layers = hparams.n_deepstack_layers;
@ -236,7 +236,7 @@ ggml_cgraph * llm_build_context::build_qwen3vl() {
 }

 ggml_cgraph * llm_build_context::build_qwen3vlmoe() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    // mutable variable, needed during the last layer of the computation to skip unused tokens
    int32_t n_tokens = this->n_tokens;
--- a/src/graphs/build_qwen35.cpp
+++ b/src/graphs/build_qwen35.cpp
@ -5,7 +5,7 @@

 ggml_cgraph * llm_build_context::build_qwen35moe() {

-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    const int64_t n_embd_head = hparams.n_embd_head_v(0);
    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
@ -88,7 +88,7 @@ ggml_cgraph * llm_build_context::build_qwen35moe() {

 ggml_cgraph * llm_build_context::build_qwen35() {

-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    const int64_t n_embd_head = hparams.n_embd_head_v(0);
    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
--- a/src/graphs/build_qwen3next.cpp
+++ b/src/graphs/build_qwen3next.cpp
@ -5,7 +5,7 @@

 ggml_cgraph * llm_build_context::build_qwen3next() {

-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    delta_net delta(lctx, batch);

--- a/src/graphs/build_refact.cpp
+++ b/src/graphs/build_refact.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_refact() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    const int64_t n_embd_head = hparams.n_embd_head_v(0);
    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
--- a/src/graphs/build_seedoss.cpp
+++ b/src/graphs/build_seedoss.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_seedoss() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    const int64_t n_embd_head = hparams.n_embd_head_v(0);
    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
--- a/src/graphs/build_smollm3.cpp
+++ b/src/graphs/build_smollm3.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph* llm_build_context::build_smollm3() {
-    ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
    const int64_t n_embd_head = hparams.n_embd_head_v(0);
    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
    // GGML_ASSERT(n_embd_head == hparams.n_rot); this is wrong in case of minimax, head_dim = 128, n_rot = 64
--- a/src/graphs/build_stablelm.cpp
+++ b/src/graphs/build_stablelm.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_stablelm() {
-    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+    ggml_cgraph * gf = new_graph_custom();

    const int64_t n_embd_head = hparams.n_embd_head_v(0);
    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
--- a/src/graphs/build_starcoder.cpp
+++ b/src/graphs/build_starcoder.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_starcoder() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    const int64_t n_embd_head = hparams.n_embd_head_v(0);
    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
--- a/src/graphs/build_starcoder2.cpp
+++ b/src/graphs/build_starcoder2.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_starcoder2() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    const int64_t n_embd_head = hparams.n_embd_head_v(0);
    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
--- a/src/graphs/build_step35.cpp
+++ b/src/graphs/build_step35.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_step35() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
    ggml_tensor * cur;
    auto inpL        = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
    auto inp_pos     = build_inp_pos();
--- a/src/graphs/build_t5.cpp
+++ b/src/graphs/build_t5.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_t5_encoder() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    // mutable variable, needed during the last layer of the computation to skip unused tokens
    int32_t n_tokens = this->n_tokens;
@ -121,7 +121,7 @@ ggml_cgraph * llm_build_context::build_t5_encoder() {
 }

 ggml_cgraph * llm_build_context::build_t5_decoder() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    // mutable variable, needed during the last layer of the computation to skip unused tokens
    int32_t n_tokens = this->n_tokens;
--- a/src/graphs/build_xverse.cpp
+++ b/src/graphs/build_xverse.cpp
@ -3,7 +3,7 @@
 #include "../llama-context.h"

 ggml_cgraph * llm_build_context::build_xverse() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();

    const int64_t n_embd_head = hparams.n_embd_head_v(0);
    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
--- a/src/llama-build-context.cpp
+++ b/src/llama-build-context.cpp
@ -2935,3 +2935,8 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
 int32_t llama_model_n_nextn_layer(const llama_model * model) {
    return model->hparams.nextn_predict_layers;
 }
+
+ggml_cgraph * llm_build_context::new_graph_custom() { // allocates a graph in ctx0 whose node capacity comes from lctx.max_nodes()
+    int max_nodes = lctx.max_nodes(n_tokens, n_kv); // capacity is computed from this builder's n_tokens and n_kv
+    return ggml_new_graph_custom(ctx0, max_nodes, false); // NOTE(review): trailing `false` presumably disables gradient storage - confirm against ggml.h
+}
--- a/src/llama-build-context.h
+++ b/src/llama-build-context.h
@ -509,4 +509,6 @@ llm_expert_gating_func_type   gating_op,
        struct ggml_cgraph * gf,
        struct ggml_tensor * inp_pos
    );
+
+    ggml_cgraph * new_graph_custom(); // returns a new graph created in ctx0, sized via lctx.max_nodes(n_tokens, n_kv)
 };
--- a/src/llama-context.h
+++ b/src/llama-context.h
@ -319,5 +319,7 @@ struct llama_context {
        struct llama_context & lctx);
    void set_mtp_op_type(llama_mtp_op_type value);

+    int max_nodes(int n_tokens, int n_kv) const; // graph node capacity: model.max_nodes(n_tokens) plus extra nodes for iterative MLA attention
+
 };

--- a/src/llama.cpp
+++ b/src/llama.cpp
@ -696,6 +696,37 @@ llama_context::~llama_context() {
    ggml_backend_buffer_free(buf_output);
 }

+int llama_context::max_nodes(int n_tokens, int n_kv) const { // node capacity for a graph: the model's estimate plus head-room for iterative MLA attention
+    int max_nodes = model.max_nodes(n_tokens); // baseline estimate supplied by the model
+    if (model.is_mla_model() &&
+        cparams.mla_attn > 1 &&
+        n_tokens >= 128 &&
+        cparams.attn_max_batch > 0 &&
+        model.split_mode != LLAMA_SPLIT_MODE_GRAPH &&
+        model.layers[0].wkv_b) {
+        // In this case we perform the attention computation iteratively, and this adds
+        // 10 nodes per layer per iteration. Although in many cases the 65536 nodes we
+        // estimate by default are enough to accommodate this, to be safe we add the
+        // number of nodes required for the iterative MLA evaluation.
+        int n_head = model.hparams.n_head();
+        auto wkv_b = model.layers[0].wkv_b;
+        auto kv_f32_size = wkv_b->ne[1] * n_kv * sizeof(float) / (1024*1024); // bytes of wkv_b->ne[1] x n_kv floats, converted to MiB (integer division)
+        if (cparams.attn_max_batch < kv_f32_size) { // NOTE(review): attn_max_batch is compared with a MiB figure, so it is presumably a MiB limit - confirm
+            int n_max_head = 1; // value kept when no divisor below satisfies the limit
+            for (int niter = 2; niter < n_head; ++niter) { // look for the smallest iteration count that divides n_head evenly
+                if (n_head % niter == 0 && kv_f32_size/niter <= cparams.attn_max_batch) {
+                    n_max_head = n_head/niter;
+                    break;
+                }
+            }
+            GGML_ASSERT(n_head % n_max_head == 0);
+            int n_iter = n_head / n_max_head; // number of iterations needed to cover all heads
+            max_nodes += (n_iter - 1) * 10 * model.hparams.n_layer; // 10 nodes per layer for each iteration beyond the first
+        }
+    }
+    return max_nodes;
+}
+
 //
 // kv cache helpers
 //
@ -7128,7 +7159,7 @@ struct llama_context * llama_init_from_model(
            }

            int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_ubatch);
-            const size_t max_nodes = model->max_nodes(n_tokens);
+            const size_t max_nodes = ctx->max_nodes(n_tokens, cparams.n_ctx);

            // buffer used to store the computation graph and the tensor meta data
            ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));