Add extra nodes when dealing with MLA and amb (#1899)

This commit is contained in:
Kawrakow 2026-05-29 15:17:24 +03:00 committed by GitHub
parent e75337fec3
commit 8960c5ba5e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
60 changed files with 112 additions and 69 deletions

View File

@ -1788,6 +1788,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
// Handle "-amb N" / "--attention-max-batch N": parse N into params.attn_max_batch.
if (arg == "-amb" || arg == "--attention-max-batch") {
// NOTE(review): CHECK_ARG is a macro defined elsewhere; presumably it verifies that a
// value follows the flag and advances `i` to it -- confirm against its definition.
CHECK_ARG
// std::stoi throws std::invalid_argument / std::out_of_range on malformed input.
params.attn_max_batch = std::stoi(argv[i]);
// Positive values below 128 are raised to 128 with a warning; values <= 0 are
// left untouched.
if (params.attn_max_batch > 0 && params.attn_max_batch < 128) {
LLAMA_LOG_WARN("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX amb = %d is too low. Changing to 128\n", params.attn_max_batch);
params.attn_max_batch = 128;
}
return true;
}
if (arg == "-no-fmoe" || arg == "--no-fused-moe") {

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_arctic() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_baichuan() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head = hparams.n_embd_head_v(0);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_bailingmoe2() {
ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head = hparams.n_embd_head_v(0);
//const int64_t n_embd_gqa = hparams.n_embd_v_gqa();

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_bert() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head = hparams.n_embd_head_v(0);
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_bitnet() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head = hparams.n_embd_head_v(0);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
@ -148,7 +148,7 @@ ggml_cgraph * llm_build_context::build_bitnet() {
}
ggml_cgraph * llm_build_context::build_bitnet_158() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_bloom() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head = hparams.n_embd_head_v(0);
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_chatglm() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head = hparams.n_embd_head_v(0);
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_codeshell() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head = hparams.n_embd_head_v(0);
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_cohere2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head = hparams.n_embd_head_v(0);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));

View File

@ -4,7 +4,7 @@
ggml_cgraph * llm_build_context::build_command_r() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head = hparams.n_embd_head_v(0);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_dbrx() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_deci() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;

View File

@ -769,7 +769,7 @@ ggml_cgraph * llm_build_context::build_deepseek2() {
#else
const bool use_f32_attn_precision = lctx.cparams.graph_attn_precision == GGML_TYPE_F32;
#endif
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_dots1() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head = hparams.n_embd_head_v(0);

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_ernie4_5() {
struct ggml_cgraph* gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head = hparams.n_embd_head_v(0);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
@ -119,7 +119,7 @@ ggml_cgraph * llm_build_context::build_ernie4_5() {
}
ggml_cgraph * llm_build_context::build_ernie4_5_moe() {
struct ggml_cgraph* gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head = hparams.n_embd_head_v(0);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_falcon() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head = hparams.n_embd_head_v(0);
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_gemma() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head_k = hparams.n_embd_head_k(0);

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_gemma2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head_k = hparams.n_embd_head_k_full;

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_gemma3() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
struct ggml_tensor * cur;
struct ggml_tensor * inpL;

View File

@ -169,7 +169,7 @@ static ggml_cgraph * build_gemma4_graph_parallel(llm_build_context & llm, llama_
int n_device = model.splits.size();
GGML_ASSERT(n_device > 1);
GGML_ASSERT(cparams.flash_attn);
auto gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = llm.new_graph_custom();
bool is_moe = hparams.n_expert > 0;
@ -527,7 +527,7 @@ static ggml_cgraph * build_gemma4_graph_parallel(llm_build_context & llm, llama_
}
ggml_cgraph * llm_build_context::build_gemma4_mtp() {
ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd = hparams.n_embd;
const int64_t n_vocab = hparams.n_vocab;
@ -731,7 +731,7 @@ ggml_cgraph * llm_build_context::build_gemma4() {
KQ_mask, KQ_mask_swa, n_tokens, cb);
}
auto gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
ggml_tensor * inp_per_layer = nullptr;
if (model.tok_embd_per_layer) {

View File

@ -3,8 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_glm4_moe() {
// create a new graph
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head = hparams.n_embd_head_v(0);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
@ -159,7 +158,7 @@ ggml_cgraph * llm_build_context::build_glm4_moe() {
}
ggml_cgraph * llm_build_context::build_glm4() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head = hparams.n_embd_head_v(0);
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_gpt2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head = hparams.n_embd_head_v(0);
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_gptneox() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head = hparams.n_embd_head_v(0);
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_grok() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_hunyuan_moe() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head = hparams.n_embd_head_v(0);

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_internlm2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head = hparams.n_embd_head_v(0);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_jais() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head = hparams.n_embd_head_v(0);
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_llama() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_mamba() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t d_model = n_embd;
const int64_t d_conv = hparams.ssm_d_conv;

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_mimo2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
//const int64_t n_embd_head = hparams.n_embd_head_v(0);
//GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_minicpm() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head = hparams.n_embd_head_v(0);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph* llm_build_context::build_minimaxm2() {
ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head = hparams.n_embd_head_v(0);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
// GGML_ASSERT(n_embd_head == hparams.n_rot); this is wrong in case of minimax, head_dim = 128, n_rot = 64

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_mistral3() {
auto gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head = hparams.n_embd_head_v(0);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_mpt() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head = hparams.n_embd_head_v(0);
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();

View File

@ -9,7 +9,7 @@
// * removed bias
// * removed MoE
ggml_cgraph * llm_build_context::build_olmo() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_openai_moe() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head = hparams.n_embd_head_v(0);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_openelm() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head = hparams.n_embd_head_v(0);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_orion() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head = hparams.n_embd_head_v(0);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_phi2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head = hparams.n_embd_head_v(0);
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_phi3() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head = hparams.n_embd_head_v(0);
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_plamo() {
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head = hparams.n_embd_head_v(0);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_qwen() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head = hparams.n_embd_head_v(0);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_qwen2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head = hparams.n_embd_head_v(0);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
@ -94,7 +94,7 @@ ggml_cgraph * llm_build_context::build_qwen2() {
}
ggml_cgraph * llm_build_context::build_qwen2vl() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head = hparams.n_embd_head_v(0);
@ -195,7 +195,7 @@ ggml_cgraph * llm_build_context::build_qwen2vl() {
}
ggml_cgraph * llm_build_context::build_qwen2moe() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_qwen3() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head = hparams.n_embd_head_v(0);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
@ -104,7 +104,7 @@ ggml_cgraph * llm_build_context::build_qwen3() {
}
ggml_cgraph * llm_build_context::build_qwen3moe() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head = hparams.n_embd_head_v(0);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
@ -167,7 +167,7 @@ ggml_cgraph * llm_build_context::build_qwen3moe() {
}
ggml_cgraph * llm_build_context::build_qwen3vl() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_full = hparams.n_embd; // main embd + deepstack embds
const size_t n_deepstack_layers = hparams.n_deepstack_layers;
@ -236,7 +236,7 @@ ggml_cgraph * llm_build_context::build_qwen3vl() {
}
ggml_cgraph * llm_build_context::build_qwen3vlmoe() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;

View File

@ -5,7 +5,7 @@
ggml_cgraph * llm_build_context::build_qwen35moe() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head = hparams.n_embd_head_v(0);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
@ -88,7 +88,7 @@ ggml_cgraph * llm_build_context::build_qwen35moe() {
ggml_cgraph * llm_build_context::build_qwen35() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head = hparams.n_embd_head_v(0);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));

View File

@ -5,7 +5,7 @@
ggml_cgraph * llm_build_context::build_qwen3next() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
delta_net delta(lctx, batch);

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_refact() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head = hparams.n_embd_head_v(0);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_seedoss() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head = hparams.n_embd_head_v(0);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph* llm_build_context::build_smollm3() {
ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head = hparams.n_embd_head_v(0);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));
// GGML_ASSERT(n_embd_head == hparams.n_rot); this is wrong in case of minimax, head_dim = 128, n_rot = 64

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_stablelm() {
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head = hparams.n_embd_head_v(0);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_starcoder() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head = hparams.n_embd_head_v(0);
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_starcoder2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head = hparams.n_embd_head_v(0);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_step35() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
ggml_tensor * cur;
auto inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
auto inp_pos = build_inp_pos();

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_t5_encoder() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
@ -121,7 +121,7 @@ ggml_cgraph * llm_build_context::build_t5_encoder() {
}
ggml_cgraph * llm_build_context::build_t5_decoder() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;

View File

@ -3,7 +3,7 @@
#include "../llama-context.h"
ggml_cgraph * llm_build_context::build_xverse() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(n_tokens), false);
ggml_cgraph * gf = new_graph_custom();
const int64_t n_embd_head = hparams.n_embd_head_v(0);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k(0));

View File

@ -2935,3 +2935,8 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
// Accessor: reports the `nextn_predict_layers` hyper-parameter stored in the model.
// `model` is dereferenced unconditionally, so it must not be null.
int32_t llama_model_n_nextn_layer(const llama_model * model) {
    const auto & hp = model->hparams;
    return hp.nextn_predict_layers;
}
// Creates the compute graph for the current build in ctx0. The node capacity is
// obtained from the owning llama_context, using this build's n_tokens and n_kv.
// The final `false` is passed through to ggml_new_graph_custom unchanged.
ggml_cgraph * llm_build_context::new_graph_custom() {
    return ggml_new_graph_custom(ctx0, lctx.max_nodes(n_tokens, n_kv), false);
}

View File

@ -509,4 +509,6 @@ llm_expert_gating_func_type gating_op,
struct ggml_cgraph * gf,
struct ggml_tensor * inp_pos
);
ggml_cgraph * new_graph_custom();
};

View File

@ -319,5 +319,7 @@ struct llama_context {
struct llama_context & lctx);
void set_mtp_op_type(llama_mtp_op_type value);
int max_nodes(int n_tokens, int n_kv) const;
};

View File

@ -696,6 +696,37 @@ llama_context::~llama_context() {
ggml_backend_buffer_free(buf_output);
}
// Returns the number of graph nodes to reserve for a batch of `n_tokens` tokens with
// `n_kv` KV entries: the model's own estimate, plus extra room when the MLA attention
// is going to be evaluated iteratively.
int llama_context::max_nodes(int n_tokens, int n_kv) const {
    int node_budget = model.max_nodes(n_tokens);

    // Only the iterative MLA attention path needs more than the model's estimate.
    // The checks are evaluated left to right, so layers[0] is touched only after
    // all the cheaper conditions have passed.
    const bool iterative_mla = model.is_mla_model()
                            && cparams.mla_attn > 1
                            && n_tokens >= 128
                            && cparams.attn_max_batch > 0
                            && model.split_mode != LLAMA_SPLIT_MODE_GRAPH
                            && model.layers[0].wkv_b;
    if (!iterative_mla) {
        return node_budget;
    }

    // Size of wkv_b->ne[1] * n_kv floats, in MiB (integer division).
    const auto kv_f32_mib = model.layers[0].wkv_b->ne[1] * n_kv * sizeof(float) / (1024*1024);
    if (!(cparams.attn_max_batch < kv_f32_mib)) {
        // Already within attn_max_batch: no extra nodes are added.
        return node_budget;
    }

    const int head_count = model.hparams.n_head();

    // Look for the smallest divisor (>= 2) of the head count for which an equal share
    // of the buffer fits into attn_max_batch. If there is none, fall back to one head
    // per pass.
    int heads_per_pass = 1;
    int split = 2;
    while (split < head_count) {
        if (head_count % split == 0 && kv_f32_mib/split <= cparams.attn_max_batch) {
            heads_per_pass = head_count/split;
            break;
        }
        ++split;
    }
    GGML_ASSERT(head_count % heads_per_pass == 0);

    // The iterative evaluation adds 10 nodes per layer per iteration. The default
    // estimate of 65536 nodes is enough to accommodate that in many cases, but to be
    // safe the nodes required by the additional iterations are added explicitly.
    const int pass_count = head_count / heads_per_pass;
    node_budget += (pass_count - 1) * 10 * model.hparams.n_layer;
    return node_budget;
}
//
// kv cache helpers
//
@ -7128,7 +7159,7 @@ struct llama_context * llama_init_from_model(
}
int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_ubatch);
const size_t max_nodes = model->max_nodes(n_tokens);
const size_t max_nodes = ctx->max_nodes(n_tokens, cparams.n_ctx);
// buffer used to store the computation graph and the tensor meta data
ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));