From adeff7dbd3bacb63d54056cbd7f74aa6b9e3dac4 Mon Sep 17 00:00:00 2001
From: Kawrakow <iwankawrakow@gmail.com>
Date: Fri, 29 May 2026 08:38:24 +0000
Subject: [PATCH] Add extra nodes when dealing with MLA and amb

---
 common/common.cpp                |  4 ++++
 src/graphs/build_arctic.cpp      |  2 +-
 src/graphs/build_baichuan.cpp    |  2 +-
 src/graphs/build_bailingmoe2.cpp |  2 +-
 src/graphs/build_bert.cpp        |  2 +-
 src/graphs/build_bitnet.cpp      |  4 ++--
 src/graphs/build_bloom.cpp       |  2 +-
 src/graphs/build_chatglm.cpp     |  2 +-
 src/graphs/build_codeshell.cpp   |  2 +-
 src/graphs/build_cohere2.cpp     |  2 +-
 src/graphs/build_command_r.cpp   |  2 +-
 src/graphs/build_dbrx.cpp        |  2 +-
 src/graphs/build_deci.cpp        |  2 +-
 src/graphs/build_deepseek2.cpp   |  2 +-
 src/graphs/build_dots1.cpp       |  2 +-
 src/graphs/build_ernie45.cpp     |  4 ++--
 src/graphs/build_falcon.cpp      |  2 +-
 src/graphs/build_gemma.cpp       |  2 +-
 src/graphs/build_gemma2.cpp      |  2 +-
 src/graphs/build_gemma3.cpp      |  2 +-
 src/graphs/build_gemma4.cpp      |  6 +++---
 src/graphs/build_glm4.cpp        |  5 ++---
 src/graphs/build_gpt2.cpp        |  2 +-
 src/graphs/build_gptneox.cpp     |  2 +-
 src/graphs/build_grok.cpp        |  2 +-
 src/graphs/build_hunyuan.cpp     |  2 +-
 src/graphs/build_internlm2.cpp   |  2 +-
 src/graphs/build_jais.cpp        |  2 +-
 src/graphs/build_llama.cpp       |  2 +-
 src/graphs/build_mamba.cpp       |  2 +-
 src/graphs/build_mimo2.cpp       |  2 +-
 src/graphs/build_minicpm.cpp     |  2 +-
 src/graphs/build_minimaxm2.cpp   |  2 +-
 src/graphs/build_mistral3.cpp    |  2 +-
 src/graphs/build_mpt.cpp         |  2 +-
 src/graphs/build_olmo.cpp        |  2 +-
 src/graphs/build_openai.cpp      |  2 +-
 src/graphs/build_openelm.cpp     |  2 +-
 src/graphs/build_orion.cpp       |  2 +-
 src/graphs/build_phi2.cpp        |  2 +-
 src/graphs/build_phi3.cpp        |  2 +-
 src/graphs/build_plamo.cpp       |  2 +-
 src/graphs/build_qwen.cpp        |  2 +-
 src/graphs/build_qwen2.cpp       |  6 +++---
 src/graphs/build_qwen3.cpp       |  8 ++++----
 src/graphs/build_qwen35.cpp      |  4 ++--
 src/graphs/build_qwen3next.cpp   |  2 +-
 src/graphs/build_refact.cpp      |  2 +-
 src/graphs/build_seedoss.cpp     |  2 +-
 src/graphs/build_smollm3.cpp     |  2 +-
 src/graphs/build_stablelm.cpp    |  2 +-
 src/graphs/build_starcoder.cpp   |  2 +-
 src/graphs/build_starcoder2.cpp  |  2 +-
 src/graphs/build_step35.cpp      |  2 +-
 src/graphs/build_t5.cpp          |  4 ++--
 src/graphs/build_xverse.cpp      |  2 +-
 src/llama-build-context.cpp      |  5 +++++
 src/llama-build-context.h        |  2 ++
 src/llama-context.h              |  2 ++
 src/llama.cpp                    | 33 +++++++++++++++++++++++++++++++-
 60 files changed, 112 insertions(+), 69 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index ffb8d5fd..ac2e3226 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1788,6 +1788,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
     if (arg == "-amb" || arg == "--attention-max-batch") {
         CHECK_ARG
         params.attn_max_batch = std::stoi(argv[i]);
+        if (params.attn_max_batch > 0 && params.attn_max_batch < 128) { // only positive values below 128 are raised; values <= 0 are left exactly as parsed
+            LLAMA_LOG_WARN("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX amb = %d is too low. Changing to 128\n", params.attn_max_batch);
+            params.attn_max_batch = 128; // apply the floor announced by the warning above. NOTE(review): why 128 is the minimum is not shown in this patch - confirm
+        }
         return true;
     }
     if (arg == "-no-fmoe" || arg == "--no-fused-moe") {
diff --git a/src/graphs/build_arctic.cpp b/src/graphs/build_arctic.cpp
index eded4d9f..aae3d85f 100644
--- a/src/graphs/build_arctic.cpp
+++ b/src/graphs/build_arctic.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_arctic() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     // mutable variable, needed during the last layer of the computation to skip unused tokens
     int32_t n_tokens = this->n_tokens;
diff --git a/src/graphs/build_baichuan.cpp b/src/graphs/build_baichuan.cpp
index 94b88329..11ef6e2e 100644
--- a/src/graphs/build_baichuan.cpp
+++ b/src/graphs/build_baichuan.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_baichuan() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     const int64_t n_embd_head = hparams.n_embd_head_v(0);
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
diff --git a/src/graphs/build_bailingmoe2.cpp b/src/graphs/build_bailingmoe2.cpp
index 17022235..6a1e54e9 100644
--- a/src/graphs/build_bailingmoe2.cpp
+++ b/src/graphs/build_bailingmoe2.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_bailingmoe2() {
-    ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
     const int64_t n_embd_head = hparams.n_embd_head_v(0);
     //const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
 
diff --git a/src/graphs/build_bert.cpp b/src/graphs/build_bert.cpp
index 936876df..5574ba8f 100644
--- a/src/graphs/build_bert.cpp
+++ b/src/graphs/build_bert.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_bert() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     const int64_t n_embd_head = hparams.n_embd_head_v(0);
     const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
diff --git a/src/graphs/build_bitnet.cpp b/src/graphs/build_bitnet.cpp
index 5247c9b3..9b0a4d72 100644
--- a/src/graphs/build_bitnet.cpp
+++ b/src/graphs/build_bitnet.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_bitnet() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     const int64_t n_embd_head = hparams.n_embd_head_v(0);
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
@@ -148,7 +148,7 @@ ggml_cgraph * llm_build_context::build_bitnet() {
 }
 
 ggml_cgraph * llm_build_context::build_bitnet_158() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     // mutable variable, needed during the last layer of the computation to skip unused tokens
     int32_t n_tokens = this->n_tokens;
diff --git a/src/graphs/build_bloom.cpp b/src/graphs/build_bloom.cpp
index e6f36827..b712a2ec 100644
--- a/src/graphs/build_bloom.cpp
+++ b/src/graphs/build_bloom.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_bloom() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     const int64_t n_embd_head = hparams.n_embd_head_v(0);
     const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
diff --git a/src/graphs/build_chatglm.cpp b/src/graphs/build_chatglm.cpp
index 7880d892..9c3da126 100644
--- a/src/graphs/build_chatglm.cpp
+++ b/src/graphs/build_chatglm.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_chatglm() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     const int64_t n_embd_head = hparams.n_embd_head_v(0);
     const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
diff --git a/src/graphs/build_codeshell.cpp b/src/graphs/build_codeshell.cpp
index 2567f0c8..bceb5755 100644
--- a/src/graphs/build_codeshell.cpp
+++ b/src/graphs/build_codeshell.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_codeshell() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     const int64_t n_embd_head = hparams.n_embd_head_v(0);
     const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
diff --git a/src/graphs/build_cohere2.cpp b/src/graphs/build_cohere2.cpp
index 56150cfd..aaaa7b47 100644
--- a/src/graphs/build_cohere2.cpp
+++ b/src/graphs/build_cohere2.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_cohere2() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     const int64_t n_embd_head = hparams.n_embd_head_v(0);
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
diff --git a/src/graphs/build_command_r.cpp b/src/graphs/build_command_r.cpp
index f3c21481..805d4ffc 100644
--- a/src/graphs/build_command_r.cpp
+++ b/src/graphs/build_command_r.cpp
@@ -4,7 +4,7 @@
 
 ggml_cgraph * llm_build_context::build_command_r() {
 
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     const int64_t n_embd_head = hparams.n_embd_head_v(0);
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
diff --git a/src/graphs/build_dbrx.cpp b/src/graphs/build_dbrx.cpp
index 657aca69..33f14b7d 100644
--- a/src/graphs/build_dbrx.cpp
+++ b/src/graphs/build_dbrx.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_dbrx() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     // mutable variable, needed during the last layer of the computation to skip unused tokens
     int32_t n_tokens = this->n_tokens;
diff --git a/src/graphs/build_deci.cpp b/src/graphs/build_deci.cpp
index 8c26174b..73038d48 100644
--- a/src/graphs/build_deci.cpp
+++ b/src/graphs/build_deci.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_deci() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     // mutable variable, needed during the last layer of the computation to skip unused tokens
     int32_t n_tokens = this->n_tokens;
diff --git a/src/graphs/build_deepseek2.cpp b/src/graphs/build_deepseek2.cpp
index 1d78a826..c5129b8f 100644
--- a/src/graphs/build_deepseek2.cpp
+++ b/src/graphs/build_deepseek2.cpp
@@ -769,7 +769,7 @@ ggml_cgraph * llm_build_context::build_deepseek2() {
 #else
     const bool use_f32_attn_precision = lctx.cparams.graph_attn_precision == GGML_TYPE_F32;
 #endif
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     // mutable variable, needed during the last layer of the computation to skip unused tokens
     int32_t n_tokens = this->n_tokens;
diff --git a/src/graphs/build_dots1.cpp b/src/graphs/build_dots1.cpp
index d22ab464..3ab32eef 100644
--- a/src/graphs/build_dots1.cpp
+++ b/src/graphs/build_dots1.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_dots1() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     const int64_t n_embd_head = hparams.n_embd_head_v(0);
 
diff --git a/src/graphs/build_ernie45.cpp b/src/graphs/build_ernie45.cpp
index 5d6ba492..f194c7d8 100644
--- a/src/graphs/build_ernie45.cpp
+++ b/src/graphs/build_ernie45.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_ernie4_5() {
-    struct ggml_cgraph* gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
     const int64_t n_embd_head = hparams.n_embd_head_v(0);
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
@@ -119,7 +119,7 @@ ggml_cgraph * llm_build_context::build_ernie4_5() {
 }
 
 ggml_cgraph * llm_build_context::build_ernie4_5_moe() {
-    struct ggml_cgraph* gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
     const int64_t n_embd_head = hparams.n_embd_head_v(0);
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
diff --git a/src/graphs/build_falcon.cpp b/src/graphs/build_falcon.cpp
index c6c267a2..143c1384 100644
--- a/src/graphs/build_falcon.cpp
+++ b/src/graphs/build_falcon.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_falcon() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     const int64_t n_embd_head = hparams.n_embd_head_v(0);
     const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
diff --git a/src/graphs/build_gemma.cpp b/src/graphs/build_gemma.cpp
index 4c378a32..87280436 100644
--- a/src/graphs/build_gemma.cpp
+++ b/src/graphs/build_gemma.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_gemma() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     const int64_t n_embd_head_k = hparams.n_embd_head_k(0);
 
diff --git a/src/graphs/build_gemma2.cpp b/src/graphs/build_gemma2.cpp
index 5362e060..c5bfc691 100644
--- a/src/graphs/build_gemma2.cpp
+++ b/src/graphs/build_gemma2.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_gemma2() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     const int64_t n_embd_head_k = hparams.n_embd_head_k_full;
 
diff --git a/src/graphs/build_gemma3.cpp b/src/graphs/build_gemma3.cpp
index d0d3682e..3033acc5 100644
--- a/src/graphs/build_gemma3.cpp
+++ b/src/graphs/build_gemma3.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_gemma3() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;
diff --git a/src/graphs/build_gemma4.cpp b/src/graphs/build_gemma4.cpp
index 64853ea6..8a2c8029 100644
--- a/src/graphs/build_gemma4.cpp
+++ b/src/graphs/build_gemma4.cpp
@@ -169,7 +169,7 @@ static ggml_cgraph * build_gemma4_graph_parallel(llm_build_context & llm, llama_
     int n_device = model.splits.size();
     GGML_ASSERT(n_device > 1);
     GGML_ASSERT(cparams.flash_attn);
-    auto gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = llm.new_graph_custom();
 
     bool is_moe = hparams.n_expert > 0;
 
@@ -527,7 +527,7 @@ static ggml_cgraph * build_gemma4_graph_parallel(llm_build_context & llm, llama_
 }
 
 ggml_cgraph * llm_build_context::build_gemma4_mtp() {
-    ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     const int64_t n_embd          = hparams.n_embd;
     const int64_t n_vocab         = hparams.n_vocab;
@@ -731,7 +731,7 @@ ggml_cgraph * llm_build_context::build_gemma4() {
                                      KQ_mask, KQ_mask_swa, n_tokens,  cb);
     }
 
-    auto gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     ggml_tensor * inp_per_layer = nullptr;
     if (model.tok_embd_per_layer) {
diff --git a/src/graphs/build_glm4.cpp b/src/graphs/build_glm4.cpp
index 305a1732..cb567cc4 100644
--- a/src/graphs/build_glm4.cpp
+++ b/src/graphs/build_glm4.cpp
@@ -3,8 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_glm4_moe() {
-    // create a new graph
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     const int64_t n_embd_head = hparams.n_embd_head_v(0);
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
@@ -159,7 +158,7 @@ ggml_cgraph * llm_build_context::build_glm4_moe() {
 }
 
 ggml_cgraph * llm_build_context::build_glm4() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     const int64_t n_embd_head = hparams.n_embd_head_v(0);
     const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
diff --git a/src/graphs/build_gpt2.cpp b/src/graphs/build_gpt2.cpp
index 15bb22ee..94b75403 100644
--- a/src/graphs/build_gpt2.cpp
+++ b/src/graphs/build_gpt2.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_gpt2() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     const int64_t n_embd_head = hparams.n_embd_head_v(0);
     const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
diff --git a/src/graphs/build_gptneox.cpp b/src/graphs/build_gptneox.cpp
index d11ee3af..e3fcb36f 100644
--- a/src/graphs/build_gptneox.cpp
+++ b/src/graphs/build_gptneox.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_gptneox() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     const int64_t n_embd_head = hparams.n_embd_head_v(0);
     const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
diff --git a/src/graphs/build_grok.cpp b/src/graphs/build_grok.cpp
index d96f9f07..4f67fbec 100644
--- a/src/graphs/build_grok.cpp
+++ b/src/graphs/build_grok.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_grok() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     // mutable variable, needed during the last layer of the computation to skip unused tokens
     int32_t n_tokens = this->n_tokens;
diff --git a/src/graphs/build_hunyuan.cpp b/src/graphs/build_hunyuan.cpp
index 6026258e..23217d5d 100644
--- a/src/graphs/build_hunyuan.cpp
+++ b/src/graphs/build_hunyuan.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_hunyuan_moe() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     const int64_t n_embd_head = hparams.n_embd_head_v(0);
 
diff --git a/src/graphs/build_internlm2.cpp b/src/graphs/build_internlm2.cpp
index 53969364..8588f699 100644
--- a/src/graphs/build_internlm2.cpp
+++ b/src/graphs/build_internlm2.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_internlm2() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     const int64_t n_embd_head = hparams.n_embd_head_v(0);
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
diff --git a/src/graphs/build_jais.cpp b/src/graphs/build_jais.cpp
index cc872f41..20436fb9 100644
--- a/src/graphs/build_jais.cpp
+++ b/src/graphs/build_jais.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_jais() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     const int64_t n_embd_head = hparams.n_embd_head_v(0);
     const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
diff --git a/src/graphs/build_llama.cpp b/src/graphs/build_llama.cpp
index 765c882c..16d04ee9 100644
--- a/src/graphs/build_llama.cpp
+++ b/src/graphs/build_llama.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_llama() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     // mutable variable, needed during the last layer of the computation to skip unused tokens
     int32_t n_tokens = this->n_tokens;
diff --git a/src/graphs/build_mamba.cpp b/src/graphs/build_mamba.cpp
index 938786b0..b3ef77c3 100644
--- a/src/graphs/build_mamba.cpp
+++ b/src/graphs/build_mamba.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_mamba() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     const int64_t d_model = n_embd;
     const int64_t d_conv  = hparams.ssm_d_conv;
diff --git a/src/graphs/build_mimo2.cpp b/src/graphs/build_mimo2.cpp
index 304c0d18..cb5a1ea1 100644
--- a/src/graphs/build_mimo2.cpp
+++ b/src/graphs/build_mimo2.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_mimo2() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     //const int64_t n_embd_head = hparams.n_embd_head_v(0);
     //GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
diff --git a/src/graphs/build_minicpm.cpp b/src/graphs/build_minicpm.cpp
index 0d4d131a..3a6a58e6 100644
--- a/src/graphs/build_minicpm.cpp
+++ b/src/graphs/build_minicpm.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_minicpm() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     const int64_t n_embd_head = hparams.n_embd_head_v(0);
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
diff --git a/src/graphs/build_minimaxm2.cpp b/src/graphs/build_minimaxm2.cpp
index 8c632616..c6417fa4 100644
--- a/src/graphs/build_minimaxm2.cpp
+++ b/src/graphs/build_minimaxm2.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph* llm_build_context::build_minimaxm2() {
-    ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
     const int64_t n_embd_head = hparams.n_embd_head_v(0);
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
     // GGML_ASSERT(n_embd_head == hparams.n_rot); this is wrong in case of minimax, head_dim = 128, n_rot = 64
diff --git a/src/graphs/build_mistral3.cpp b/src/graphs/build_mistral3.cpp
index 4be53b23..74230491 100644
--- a/src/graphs/build_mistral3.cpp
+++ b/src/graphs/build_mistral3.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_mistral3() {
-    auto gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
     const int64_t n_embd_head = hparams.n_embd_head_v(0);
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
diff --git a/src/graphs/build_mpt.cpp b/src/graphs/build_mpt.cpp
index 32776460..561b07c0 100644
--- a/src/graphs/build_mpt.cpp
+++ b/src/graphs/build_mpt.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_mpt() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     const int64_t n_embd_head = hparams.n_embd_head_v(0);
     const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
diff --git a/src/graphs/build_olmo.cpp b/src/graphs/build_olmo.cpp
index fe7f29b7..30be0cbc 100644
--- a/src/graphs/build_olmo.cpp
+++ b/src/graphs/build_olmo.cpp
@@ -9,7 +9,7 @@
 //   * removed bias
 //   * removed MoE
 ggml_cgraph * llm_build_context::build_olmo() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     // mutable variable, needed during the last layer of the computation to skip unused tokens
     int32_t n_tokens = this->n_tokens;
diff --git a/src/graphs/build_openai.cpp b/src/graphs/build_openai.cpp
index 4229d827..5d96e8df 100644
--- a/src/graphs/build_openai.cpp
+++ b/src/graphs/build_openai.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_openai_moe() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     const int64_t n_embd_head = hparams.n_embd_head_v(0);
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
diff --git a/src/graphs/build_openelm.cpp b/src/graphs/build_openelm.cpp
index b22fe9a4..042e6cf4 100644
--- a/src/graphs/build_openelm.cpp
+++ b/src/graphs/build_openelm.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_openelm() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     const int64_t n_embd_head = hparams.n_embd_head_v(0);
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
diff --git a/src/graphs/build_orion.cpp b/src/graphs/build_orion.cpp
index a4f880b4..aa7d99de 100644
--- a/src/graphs/build_orion.cpp
+++ b/src/graphs/build_orion.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_orion() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     const int64_t n_embd_head = hparams.n_embd_head_v(0);
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
diff --git a/src/graphs/build_phi2.cpp b/src/graphs/build_phi2.cpp
index 5025413a..67aec7b2 100644
--- a/src/graphs/build_phi2.cpp
+++ b/src/graphs/build_phi2.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_phi2() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     const int64_t n_embd_head = hparams.n_embd_head_v(0);
     const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
diff --git a/src/graphs/build_phi3.cpp b/src/graphs/build_phi3.cpp
index 9cb5b78e..13b1de14 100644
--- a/src/graphs/build_phi3.cpp
+++ b/src/graphs/build_phi3.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_phi3() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     const int64_t n_embd_head = hparams.n_embd_head_v(0);
     const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
diff --git a/src/graphs/build_plamo.cpp b/src/graphs/build_plamo.cpp
index e0788931..f8e1b9fe 100644
--- a/src/graphs/build_plamo.cpp
+++ b/src/graphs/build_plamo.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_plamo() {
-    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+    ggml_cgraph * gf = new_graph_custom();
 
     const int64_t n_embd_head = hparams.n_embd_head_v(0);
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
diff --git a/src/graphs/build_qwen.cpp b/src/graphs/build_qwen.cpp
index 9187e2b4..db402ce5 100644
--- a/src/graphs/build_qwen.cpp
+++ b/src/graphs/build_qwen.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_qwen() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     const int64_t n_embd_head = hparams.n_embd_head_v(0);
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
diff --git a/src/graphs/build_qwen2.cpp b/src/graphs/build_qwen2.cpp
index 01d757cf..8887fdb0 100644
--- a/src/graphs/build_qwen2.cpp
+++ b/src/graphs/build_qwen2.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_qwen2() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     const int64_t n_embd_head = hparams.n_embd_head_v(0);
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
@@ -94,7 +94,7 @@ ggml_cgraph * llm_build_context::build_qwen2() {
 }
 
 ggml_cgraph * llm_build_context::build_qwen2vl() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     const int64_t n_embd_head = hparams.n_embd_head_v(0);
 
@@ -195,7 +195,7 @@ ggml_cgraph * llm_build_context::build_qwen2vl() {
 }
 
 ggml_cgraph * llm_build_context::build_qwen2moe() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     // mutable variable, needed during the last layer of the computation to skip unused tokens
     int32_t n_tokens = this->n_tokens;
diff --git a/src/graphs/build_qwen3.cpp b/src/graphs/build_qwen3.cpp
index 4172ddfa..1f44900b 100644
--- a/src/graphs/build_qwen3.cpp
+++ b/src/graphs/build_qwen3.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_qwen3() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     const int64_t n_embd_head = hparams.n_embd_head_v(0);
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
@@ -104,7 +104,7 @@ ggml_cgraph * llm_build_context::build_qwen3() {
 }
 
 ggml_cgraph * llm_build_context::build_qwen3moe() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     const int64_t n_embd_head = hparams.n_embd_head_v(0);
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
@@ -167,7 +167,7 @@ ggml_cgraph * llm_build_context::build_qwen3moe() {
 }
 
 ggml_cgraph * llm_build_context::build_qwen3vl() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     const int64_t n_embd_full = hparams.n_embd; // main embd + deepstack embds
     const size_t n_deepstack_layers = hparams.n_deepstack_layers;
@@ -236,7 +236,7 @@ ggml_cgraph * llm_build_context::build_qwen3vl() {
 }
 
 ggml_cgraph * llm_build_context::build_qwen3vlmoe() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     // mutable variable, needed during the last layer of the computation to skip unused tokens
     int32_t n_tokens = this->n_tokens;
diff --git a/src/graphs/build_qwen35.cpp b/src/graphs/build_qwen35.cpp
index 2fd0eecd..9ffc66e0 100644
--- a/src/graphs/build_qwen35.cpp
+++ b/src/graphs/build_qwen35.cpp
@@ -5,7 +5,7 @@
 
 ggml_cgraph * llm_build_context::build_qwen35moe() {
 
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     const int64_t n_embd_head = hparams.n_embd_head_v(0);
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
@@ -88,7 +88,7 @@ ggml_cgraph * llm_build_context::build_qwen35moe() {
 
 ggml_cgraph * llm_build_context::build_qwen35() {
 
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     const int64_t n_embd_head = hparams.n_embd_head_v(0);
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
diff --git a/src/graphs/build_qwen3next.cpp b/src/graphs/build_qwen3next.cpp
index 3ae32419..dfe97665 100644
--- a/src/graphs/build_qwen3next.cpp
+++ b/src/graphs/build_qwen3next.cpp
@@ -5,7 +5,7 @@
 
 ggml_cgraph * llm_build_context::build_qwen3next() {
 
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     delta_net delta(lctx, batch);
 
diff --git a/src/graphs/build_refact.cpp b/src/graphs/build_refact.cpp
index 3f63d3c1..cf603648 100644
--- a/src/graphs/build_refact.cpp
+++ b/src/graphs/build_refact.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_refact() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     const int64_t n_embd_head = hparams.n_embd_head_v(0);
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
diff --git a/src/graphs/build_seedoss.cpp b/src/graphs/build_seedoss.cpp
index a7036e71..4781c0e2 100644
--- a/src/graphs/build_seedoss.cpp
+++ b/src/graphs/build_seedoss.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_seedoss() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     const int64_t n_embd_head = hparams.n_embd_head_v(0);
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
diff --git a/src/graphs/build_smollm3.cpp b/src/graphs/build_smollm3.cpp
index 8046d7ea..a92069e4 100644
--- a/src/graphs/build_smollm3.cpp
+++ b/src/graphs/build_smollm3.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph* llm_build_context::build_smollm3() {
-    ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
     const int64_t n_embd_head = hparams.n_embd_head_v(0);
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
     // GGML_ASSERT(n_embd_head == hparams.n_rot); this is wrong in case of minimax, head_dim = 128, n_rot = 64
diff --git a/src/graphs/build_stablelm.cpp b/src/graphs/build_stablelm.cpp
index 629fe23e..52d028c4 100644
--- a/src/graphs/build_stablelm.cpp
+++ b/src/graphs/build_stablelm.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_stablelm() {
-    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+    ggml_cgraph * gf = new_graph_custom();
 
     const int64_t n_embd_head = hparams.n_embd_head_v(0);
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
diff --git a/src/graphs/build_starcoder.cpp b/src/graphs/build_starcoder.cpp
index 411d0d06..77fab907 100644
--- a/src/graphs/build_starcoder.cpp
+++ b/src/graphs/build_starcoder.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_starcoder() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     const int64_t n_embd_head = hparams.n_embd_head_v(0);
     const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
diff --git a/src/graphs/build_starcoder2.cpp b/src/graphs/build_starcoder2.cpp
index b6d1e78e..75a3cef1 100644
--- a/src/graphs/build_starcoder2.cpp
+++ b/src/graphs/build_starcoder2.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_starcoder2() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     const int64_t n_embd_head = hparams.n_embd_head_v(0);
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
diff --git a/src/graphs/build_step35.cpp b/src/graphs/build_step35.cpp
index 4763112f..d22f3774 100644
--- a/src/graphs/build_step35.cpp
+++ b/src/graphs/build_step35.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_step35() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
     ggml_tensor * cur;
     auto inpL        = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
     auto inp_pos     = build_inp_pos();
diff --git a/src/graphs/build_t5.cpp b/src/graphs/build_t5.cpp
index c983f807..e1cbca28 100644
--- a/src/graphs/build_t5.cpp
+++ b/src/graphs/build_t5.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_t5_encoder() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     // mutable variable, needed during the last layer of the computation to skip unused tokens
     int32_t n_tokens = this->n_tokens;
@@ -121,7 +121,7 @@ ggml_cgraph * llm_build_context::build_t5_encoder() {
 }
 
 ggml_cgraph * llm_build_context::build_t5_decoder() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     // mutable variable, needed during the last layer of the computation to skip unused tokens
     int32_t n_tokens = this->n_tokens;
diff --git a/src/graphs/build_xverse.cpp b/src/graphs/build_xverse.cpp
index 66a6eefc..a3ef18ff 100644
--- a/src/graphs/build_xverse.cpp
+++ b/src/graphs/build_xverse.cpp
@@ -3,7 +3,7 @@
 #include "../llama-context.h"
 
 ggml_cgraph * llm_build_context::build_xverse() {
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
+    ggml_cgraph * gf = new_graph_custom();
 
     const int64_t n_embd_head = hparams.n_embd_head_v(0);
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp
index 54a649ab..8823dccf 100644
--- a/src/llama-build-context.cpp
+++ b/src/llama-build-context.cpp
@@ -2935,3 +2935,8 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
 int32_t llama_model_n_nextn_layer(const llama_model * model) {
     return model->hparams.nextn_predict_layers;
 }
+
+ggml_cgraph * llm_build_context::new_graph_custom() { // creates the compute graph for this build in ctx0
+    int max_nodes = lctx.max_nodes(n_tokens, n_kv); // node capacity is taken from the context, which is given both n_tokens and n_kv
+    return ggml_new_graph_custom(ctx0, max_nodes, false); // NOTE(review): the trailing `false` is presumably ggml's "grads" flag - confirm against ggml.h
+}
diff --git a/src/llama-build-context.h b/src/llama-build-context.h
index 73490c3a..c0e2f1e4 100644
--- a/src/llama-build-context.h
+++ b/src/llama-build-context.h
@@ -509,4 +509,6 @@ llm_expert_gating_func_type   gating_op,
         struct ggml_cgraph * gf,
         struct ggml_tensor * inp_pos
     );
+
+    ggml_cgraph * new_graph_custom();
 };
diff --git a/src/llama-context.h b/src/llama-context.h
index db0018d7..34745d2d 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -319,5 +319,7 @@ struct llama_context {
         struct llama_context & lctx);
     void set_mtp_op_type(llama_mtp_op_type value);
 
+    int max_nodes(int n_tokens, int n_kv) const;
+
 };
 
diff --git a/src/llama.cpp b/src/llama.cpp
index a14259cc..c589b3aa 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -696,6 +696,37 @@ llama_context::~llama_context() {
     ggml_backend_buffer_free(buf_output);
 }
 
+int llama_context::max_nodes(int n_tokens, int n_kv) const { // returns the graph node budget: the model's estimate plus extra nodes for iterative MLA attention
+    int max_nodes = model.max_nodes(n_tokens); // baseline estimate supplied by the model
+    if (model.is_mla_model() &&
+        cparams.mla_attn > 1 &&
+        n_tokens >= 128 && // NOTE(review): 128 presumably matches the batch-size threshold used by the MLA graph builder - confirm
+        cparams.attn_max_batch > 0 &&
+        model.split_mode != LLAMA_SPLIT_MODE_GRAPH &&
+        model.layers[0].wkv_b) { // layer 0's wkv_b must be non-null; it is dereferenced below
+        // When all of the above holds, the attention computation is performed iteratively,
+        // which adds 10 nodes per layer per iteration. Although in many cases the 65536 nodes
+        // we estimate by default are enough to accommodate them, to be safe we add the
+        // additional number of nodes required for the iterative MLA evaluation.
+        int n_head = model.hparams.n_head();
+        auto wkv_b = model.layers[0].wkv_b;
+        auto kv_f32_size = wkv_b->ne[1] * n_kv * sizeof(float) / (1024*1024); // wkv_b->ne[1] * n_kv floats, in MiB (integer division truncates)
+        if (cparams.attn_max_batch < kv_f32_size) { // extra nodes are only needed when the f32 size exceeds attn_max_batch
+            int n_max_head = 1; // stays 1 if no divisor found by the loop below satisfies the limit
+            for (int niter = 2; niter < n_head; ++niter) { // search for the smallest iteration count that works
+                if (n_head % niter == 0 && kv_f32_size/niter <= cparams.attn_max_batch) { // niter must divide n_head evenly and bring the per-iteration size within the limit
+                    n_max_head = n_head/niter; // n_head split evenly across niter iterations
+                    break;
+                }
+            }
+            GGML_ASSERT(n_head % n_max_head == 0);
+            int n_iter = n_head / n_max_head; // number of attention iterations
+            max_nodes += (n_iter - 1) * 10 * model.hparams.n_layer; // 10 nodes per layer for every iteration beyond the first
+        }
+    }
+    return max_nodes;
+}
+
 //
 // kv cache helpers
 //
@@ -7128,7 +7159,7 @@ struct llama_context * llama_init_from_model(
             }
 
             int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_ubatch);
-            const size_t max_nodes = model->max_nodes(n_tokens);
+            const size_t max_nodes = ctx->max_nodes(n_tokens, cparams.n_ctx);
 
             // buffer used to store the computation graph and the tensor meta data
             ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));