From b0df4c0cfd2cda10738056771714a5290dc95454 Mon Sep 17 00:00:00 2001 From: Michael Wand Date: Sat, 23 May 2026 07:30:31 -0400 Subject: [PATCH] model : add NVFP4 MTP scale tensors (#23563) * Add NVFP4 MTP scale tensors * Link Qwen3.5 MTP tensors * Aligned nullptr --- src/llama-model.cpp | 12 ++++++++++++ src/llama-model.h | 16 ++++++++++------ src/models/qwen35.cpp | 5 +++-- src/models/qwen35moe.cpp | 5 +++-- 4 files changed, 28 insertions(+), 10 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 8bf20a716e..0d21b2a53c 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1334,6 +1334,12 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) { if (!layer.ssm_beta_s && layer.ssm_beta) { layer.ssm_beta_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "scale", i), {1}, TENSOR_NOT_REQUIRED); } + if (!layer.nextn.eh_proj_s && layer.nextn.eh_proj) { + layer.nextn.eh_proj_s = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "scale", i), {1}, TENSOR_NOT_REQUIRED); + } + if (!layer.nextn.shared_head_head_s && layer.nextn.shared_head_head) { + layer.nextn.shared_head_head_s = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "scale", i), {1}, TENSOR_NOT_REQUIRED); + } // input scales if (!layer.wq_in_s && layer.wq) { @@ -1393,6 +1399,12 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) { if (!layer.ssm_beta_in_s && layer.ssm_beta) { layer.ssm_beta_in_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); } + if (!layer.nextn.eh_proj_in_s && layer.nextn.eh_proj) { + layer.nextn.eh_proj_in_s = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); + } + if (!layer.nextn.shared_head_head_in_s && layer.nextn.shared_head_head) { + layer.nextn.shared_head_head_in_s = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); + } } // output scales if (output && output->type == GGML_TYPE_NVFP4) { diff --git a/src/llama-model.h b/src/llama-model.h index 01c87a7527..398a0aa725 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -202,12 +202,16 @@ struct llama_layer_shortconv { }; struct llama_layer_nextn { - struct ggml_tensor * eh_proj = nullptr; - struct ggml_tensor * embed_tokens = nullptr; - struct ggml_tensor * enorm = nullptr; - struct ggml_tensor * hnorm = nullptr; - struct ggml_tensor * shared_head_head = nullptr; - struct ggml_tensor * shared_head_norm = nullptr; + struct ggml_tensor * eh_proj = nullptr; + struct ggml_tensor * eh_proj_s = nullptr; + struct ggml_tensor * eh_proj_in_s = nullptr; + struct ggml_tensor * embed_tokens = nullptr; + struct ggml_tensor * enorm = nullptr; + struct ggml_tensor * hnorm = nullptr; + struct ggml_tensor * shared_head_head = nullptr; + struct ggml_tensor * shared_head_head_s = nullptr; + struct ggml_tensor * shared_head_head_in_s = nullptr; + struct ggml_tensor * shared_head_norm = nullptr; }; struct llama_layer { diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp index adeb0c26e4..04ecc18fcd 100644 --- a/src/models/qwen35.cpp +++ b/src/models/qwen35.cpp @@ -538,7 +538,7 @@ llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr ggml_tensor * concat = ggml_concat(ctx0, e_norm, h_norm, /*dim=*/ 0); cb(concat, "mtp_concat", il); - ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat); + ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat, layer.nextn.eh_proj_s); cb(cur, "mtp_eh_proj", il); ggml_tensor * inpSA = cur; @@ -626,8 +626,9 @@ llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr cb(cur, "mtp_shared_head_norm", -1); ggml_tensor * head_w = layer.nextn.shared_head_head ? layer.nextn.shared_head_head : model.output; + ggml_tensor * head_s = layer.nextn.shared_head_head ? layer.nextn.shared_head_head_s : model.output_s; GGML_ASSERT(head_w && "QWEN35 MTP: missing LM head (nextn.shared_head_head or model.output)"); - cur = build_lora_mm(head_w, cur); + cur = build_lora_mm(head_w, cur, head_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp index e4512116d3..dc24f6ed53 100644 --- a/src/models/qwen35moe.cpp +++ b/src/models/qwen35moe.cpp @@ -602,7 +602,7 @@ llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm ggml_tensor * concat = ggml_concat(ctx0, e_norm, h_norm, /*dim=*/ 0); cb(concat, "mtp_concat", il); - ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat); + ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat, layer.nextn.eh_proj_s); cb(cur, "mtp_eh_proj", il); ggml_tensor * inpSA = cur; @@ -722,8 +722,9 @@ llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm cb(cur, "mtp_shared_head_norm", -1); ggml_tensor * head_w = layer.nextn.shared_head_head ? layer.nextn.shared_head_head : model.output; + ggml_tensor * head_s = layer.nextn.shared_head_head ? layer.nextn.shared_head_head_s : model.output_s; GGML_ASSERT(head_w && "QWEN35MOE MTP: missing LM head (nextn.shared_head_head or model.output)"); - cur = build_lora_mm(head_w, cur); + cur = build_lora_mm(head_w, cur, head_s); cb(cur, "result_output", -1); res->t_logits = cur;