mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-27 23:50:20 -05:00
MTP: clean-up (#9)
* MTP: clean-up * review: use llama_context_type instead of llama_graph_type * review: remove llama_model_has_mtp * review: fix convert issues * convert: fix pycheck * review: formatting * use `mtp-` for identifying mtp models * convert: fix mtp conversion
This commit is contained in:
parent
5c58cc5bdd
commit
3c3aebaaa0
@ -335,11 +335,15 @@ static bool common_params_handle_remote_preset(common_params & params, llama_exa
|
||||
struct handle_model_result {
|
||||
bool found_mmproj = false;
|
||||
common_params_model mmproj;
|
||||
|
||||
bool found_mtp = false;
|
||||
common_params_model mtp;
|
||||
};
|
||||
|
||||
static handle_model_result common_params_handle_model(struct common_params_model & model,
|
||||
const std::string & bearer_token,
|
||||
bool offline) {
|
||||
bool offline,
|
||||
bool search_mtp = false) {
|
||||
handle_model_result result;
|
||||
|
||||
if (!model.docker_repo.empty()) {
|
||||
@ -354,7 +358,7 @@ static handle_model_result common_params_handle_model(struct common_params_model
|
||||
common_download_opts opts;
|
||||
opts.bearer_token = bearer_token;
|
||||
opts.offline = offline;
|
||||
auto download_result = common_download_model(model, opts, true);
|
||||
auto download_result = common_download_model(model, opts, true, search_mtp);
|
||||
|
||||
if (download_result.model_path.empty()) {
|
||||
LOG_ERR("error: failed to download model from Hugging Face\n");
|
||||
@ -368,6 +372,11 @@ static handle_model_result common_params_handle_model(struct common_params_model
|
||||
result.found_mmproj = true;
|
||||
result.mmproj.path = download_result.mmproj_path;
|
||||
}
|
||||
|
||||
if (!download_result.mtp_path.empty()) {
|
||||
result.found_mtp = true;
|
||||
result.mtp.path = download_result.mtp_path;
|
||||
}
|
||||
} else if (!model.url.empty()) {
|
||||
if (model.path.empty()) {
|
||||
auto f = string_split<std::string>(model.url, '#').front();
|
||||
@ -436,7 +445,11 @@ static bool parse_bool_value(const std::string & value) {
|
||||
//
|
||||
|
||||
void common_params_handle_models(common_params & params, llama_example curr_ex) {
|
||||
auto res = common_params_handle_model(params.model, params.hf_token, params.offline);
|
||||
const bool spec_type_mtp = std::find(params.speculative.types.begin(),
|
||||
params.speculative.types.end(),
|
||||
COMMON_SPECULATIVE_TYPE_MTP) != params.speculative.types.end();
|
||||
|
||||
auto res = common_params_handle_model(params.model, params.hf_token, params.offline, spec_type_mtp);
|
||||
if (params.no_mmproj) {
|
||||
params.mmproj = {};
|
||||
} else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
|
||||
@ -450,6 +463,14 @@ void common_params_handle_models(common_params & params, llama_example curr_ex)
|
||||
break;
|
||||
}
|
||||
}
|
||||
// when --spec-type mtp is set and no draft model was provided explicitly,
|
||||
// fall back to the MTP head discovered alongside the -hf model
|
||||
if (spec_type_mtp && res.found_mtp &&
|
||||
params.speculative.draft.mparams.path.empty() &&
|
||||
params.speculative.draft.mparams.hf_repo.empty() &&
|
||||
params.speculative.draft.mparams.url.empty()) {
|
||||
params.speculative.draft.mparams.path = res.mtp.path;
|
||||
}
|
||||
common_params_handle_model(params.speculative.draft.mparams, params.hf_token, params.offline);
|
||||
common_params_handle_model(params.vocoder.model, params.hf_token, params.offline);
|
||||
}
|
||||
|
||||
@ -566,8 +566,11 @@ static hf_cache::hf_files get_split_files(const hf_cache::hf_files & files,
|
||||
return result;
|
||||
}
|
||||
|
||||
static hf_cache::hf_file find_best_mmproj(const hf_cache::hf_files & files,
|
||||
const std::string & model) {
|
||||
// pick the best sibling GGUF whose filename contains `keyword` (e.g. "mmproj" / "mtp"),
|
||||
// preferring deeper shared directory prefix with the model, then closest quantization
|
||||
static hf_cache::hf_file find_best_sibling(const hf_cache::hf_files & files,
|
||||
const std::string & model,
|
||||
const std::string & keyword) {
|
||||
hf_cache::hf_file best;
|
||||
size_t best_depth = 0;
|
||||
int best_diff = 0;
|
||||
@ -579,20 +582,20 @@ static hf_cache::hf_file find_best_mmproj(const hf_cache::hf_files & files,
|
||||
|
||||
for (const auto & f : files) {
|
||||
if (!string_ends_with(f.path, ".gguf") ||
|
||||
f.path.find("mmproj") == std::string::npos) {
|
||||
f.path.find(keyword) == std::string::npos) {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto mmproj_parts = string_split<std::string>(f.path, '/');
|
||||
auto mmproj_dir = mmproj_parts.end() - 1;
|
||||
auto sib_parts = string_split<std::string>(f.path, '/');
|
||||
auto sib_dir = sib_parts.end() - 1;
|
||||
|
||||
auto [_, dir] = std::mismatch(model_parts.begin(), model_dir,
|
||||
mmproj_parts.begin(), mmproj_dir);
|
||||
if (dir != mmproj_dir) {
|
||||
sib_parts.begin(), sib_dir);
|
||||
if (dir != sib_dir) {
|
||||
continue;
|
||||
}
|
||||
|
||||
size_t depth = dir - mmproj_parts.begin();
|
||||
size_t depth = dir - sib_parts.begin();
|
||||
auto bits = extract_quant_bits(f.path);
|
||||
auto diff = std::abs(bits - model_bits);
|
||||
|
||||
@ -606,6 +609,16 @@ static hf_cache::hf_file find_best_mmproj(const hf_cache::hf_files & files,
|
||||
return best;
|
||||
}
|
||||
|
||||
static hf_cache::hf_file find_best_mmproj(const hf_cache::hf_files & files,
|
||||
const std::string & model) {
|
||||
return find_best_sibling(files, model, "mmproj");
|
||||
}
|
||||
|
||||
static hf_cache::hf_file find_best_mtp(const hf_cache::hf_files & files,
|
||||
const std::string & model) {
|
||||
return find_best_sibling(files, model, "mtp-");
|
||||
}
|
||||
|
||||
static bool gguf_filename_is_model(const std::string & filepath) {
|
||||
if (!string_ends_with(filepath, ".gguf")) {
|
||||
return false;
|
||||
@ -617,7 +630,8 @@ static bool gguf_filename_is_model(const std::string & filepath) {
|
||||
}
|
||||
|
||||
return filename.find("mmproj") == std::string::npos &&
|
||||
filename.find("imatrix") == std::string::npos;
|
||||
filename.find("imatrix") == std::string::npos &&
|
||||
filename.find("mtp-") == std::string::npos;
|
||||
}
|
||||
|
||||
static hf_cache::hf_file find_best_model(const hf_cache::hf_files & files,
|
||||
@ -673,11 +687,13 @@ struct hf_plan {
|
||||
hf_cache::hf_file primary;
|
||||
hf_cache::hf_files model_files;
|
||||
hf_cache::hf_file mmproj;
|
||||
hf_cache::hf_file mtp;
|
||||
};
|
||||
|
||||
static hf_plan get_hf_plan(const common_params_model & model,
|
||||
const common_download_opts & opts,
|
||||
bool download_mmproj) {
|
||||
bool download_mmproj,
|
||||
bool download_mtp) {
|
||||
hf_plan plan;
|
||||
hf_cache::hf_files all;
|
||||
|
||||
@ -723,6 +739,10 @@ static hf_plan get_hf_plan(const common_params_model & model,
|
||||
plan.mmproj = find_best_mmproj(all, primary.path);
|
||||
}
|
||||
|
||||
if (download_mtp) {
|
||||
plan.mtp = find_best_mtp(all, primary.path);
|
||||
}
|
||||
|
||||
return plan;
|
||||
}
|
||||
|
||||
@ -756,7 +776,8 @@ static std::vector<download_task> get_url_tasks(const common_params_model & mode
|
||||
|
||||
common_download_model_result common_download_model(const common_params_model & model,
|
||||
const common_download_opts & opts,
|
||||
bool download_mmproj) {
|
||||
bool download_mmproj,
|
||||
bool download_mtp) {
|
||||
common_download_model_result result;
|
||||
std::vector<download_task> tasks;
|
||||
hf_plan hf;
|
||||
@ -764,13 +785,16 @@ common_download_model_result common_download_model(const common_params_model &
|
||||
bool is_hf = !model.hf_repo.empty();
|
||||
|
||||
if (is_hf) {
|
||||
hf = get_hf_plan(model, opts, download_mmproj);
|
||||
hf = get_hf_plan(model, opts, download_mmproj, download_mtp);
|
||||
for (const auto & f : hf.model_files) {
|
||||
tasks.push_back({f.url, f.local_path});
|
||||
}
|
||||
if (!hf.mmproj.path.empty()) {
|
||||
tasks.push_back({hf.mmproj.url, hf.mmproj.local_path});
|
||||
}
|
||||
if (!hf.mtp.path.empty()) {
|
||||
tasks.push_back({hf.mtp.url, hf.mtp.local_path});
|
||||
}
|
||||
} else if (!model.url.empty()) {
|
||||
tasks = get_url_tasks(model);
|
||||
} else {
|
||||
@ -807,6 +831,10 @@ common_download_model_result common_download_model(const common_params_model &
|
||||
if (!hf.mmproj.path.empty()) {
|
||||
result.mmproj_path = hf_cache::finalize_file(hf.mmproj);
|
||||
}
|
||||
|
||||
if (!hf.mtp.path.empty()) {
|
||||
result.mtp_path = hf_cache::finalize_file(hf.mtp);
|
||||
}
|
||||
} else {
|
||||
result.model_path = model.path;
|
||||
}
|
||||
@ -946,7 +974,8 @@ std::vector<common_cached_model_info> common_list_cached_models() {
|
||||
for (const auto & f : files) {
|
||||
auto split = get_gguf_split_info(f.path);
|
||||
if (split.index != 1 || split.tag.empty() ||
|
||||
split.prefix.find("mmproj") != std::string::npos) {
|
||||
split.prefix.find("mmproj") != std::string::npos ||
|
||||
split.prefix.find("MTP") != std::string::npos) {
|
||||
continue;
|
||||
}
|
||||
if (seen.insert(f.repo_id + ":" + split.tag).second) {
|
||||
|
||||
@ -59,6 +59,7 @@ struct common_download_opts {
|
||||
struct common_download_model_result {
|
||||
std::string model_path;
|
||||
std::string mmproj_path;
|
||||
std::string mtp_path;
|
||||
};
|
||||
|
||||
// Download model from HuggingFace repo or URL
|
||||
@ -83,12 +84,14 @@ struct common_download_model_result {
|
||||
// when opts.offline=true, no network requests are made
|
||||
// when download_mmproj=true, searches for mmproj in same directory as model or any parent directory
|
||||
// then with the closest quantization bits
|
||||
// when download_mtp=true, applies the same sibling search for an MTP-head GGUF
|
||||
//
|
||||
// returns result with model_path and mmproj_path (empty on failure)
|
||||
// returns result with model_path, mmproj_path and mtp_path (empty when not found / on failure)
|
||||
common_download_model_result common_download_model(
|
||||
const common_params_model & model,
|
||||
const common_download_opts & opts = {},
|
||||
bool download_mmproj = false
|
||||
bool download_mmproj = false,
|
||||
bool download_mtp = false
|
||||
);
|
||||
|
||||
// returns list of cached models
|
||||
|
||||
@ -1198,7 +1198,7 @@ common_speculative * common_speculative_init(common_params_speculative & params,
|
||||
LOG_WRN("%s: draft model is not specified - cannot use 'draft' type\n", __func__);
|
||||
has_draft_simple = false;
|
||||
}
|
||||
} else if (has_draft_model_path) {
|
||||
} else if (has_draft_model_path && !has_mtp && !has_draft_eagle3) {
|
||||
LOG_WRN("%s: draft model is specified but 'draft' speculative type is not explicitly enabled - enabling it\n", __func__);
|
||||
has_draft_simple = true;
|
||||
}
|
||||
|
||||
@ -95,6 +95,7 @@ class ModelBase:
|
||||
gguf_writer: gguf.GGUFWriter
|
||||
model_name: str | None
|
||||
metadata_override: Path | None
|
||||
metadata: gguf.Metadata
|
||||
dir_model_card: Path
|
||||
remote_hf_model_id: str | None
|
||||
|
||||
@ -5564,26 +5565,59 @@ class _Qwen35MtpMixin:
|
||||
block_count: int
|
||||
tensor_map: gguf.TensorNameMap
|
||||
|
||||
mtp_only: bool = False
|
||||
no_mtp: bool = False
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("mtp_num_hidden_layers", 0)
|
||||
self.block_count = self.hparams["num_hidden_layers"]
|
||||
if not self.no_mtp:
|
||||
self.block_count += self.hparams.get("mtp_num_hidden_layers", 0)
|
||||
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
|
||||
|
||||
@classmethod
|
||||
def filter_tensors(cls, item):
|
||||
name, _ = item
|
||||
if name.startswith("mtp."):
|
||||
if cls.no_mtp:
|
||||
return None
|
||||
return item
|
||||
if cls.mtp_only:
|
||||
# In --mtp mode, drop trunk weights and keep only the shared embeddings/output
|
||||
# tensors that the standalone MTP graph references at inference time.
|
||||
canonical = name.replace("language_model.", "")
|
||||
keep = canonical in (
|
||||
"model.embed_tokens.weight", "model.norm.weight", "lm_head.weight",
|
||||
"embed_tokens.weight", "norm.weight",
|
||||
)
|
||||
if not keep:
|
||||
return None
|
||||
return super().filter_tensors(item) # ty: ignore[unresolved-attribute]
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters() # ty: ignore[unresolved-attribute]
|
||||
if self.no_mtp:
|
||||
return
|
||||
if (n := self.hparams.get("mtp_num_hidden_layers", 0)) > 0:
|
||||
self.gguf_writer.add_nextn_predict_layers(n)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
# Multimodal Qwen3.5/3.6 wrap the text model under `model.language_model.*`.
|
||||
if name.startswith("model.language_model."):
|
||||
name = "model." + name[len("model.language_model."):]
|
||||
elif name.startswith("language_model."):
|
||||
name = name[len("language_model."):]
|
||||
def prepare_metadata(self, vocab_only: bool):
|
||||
# TextModel.prepare_metadata resolves a directory fname_out into a concrete
|
||||
# file path, so snapshot is_dir() first to decide whether to apply the mtp- prefix.
|
||||
from_dir = self.fname_out.is_dir()
|
||||
super().prepare_metadata(vocab_only=vocab_only) # ty: ignore[unresolved-attribute]
|
||||
|
||||
if not self.mtp_only or not from_dir:
|
||||
return
|
||||
|
||||
output_type: str = self.ftype.name.partition("_")[2] # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
|
||||
fname_default: str = gguf.naming_convention(
|
||||
self.metadata.name, self.metadata.basename, self.metadata.finetune, # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
|
||||
self.metadata.version, size_label=None, output_type=output_type, model_type=None) # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
|
||||
self.fname_out = self.fname_out.parent / f"mtp-{fname_default}.gguf"
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
# Remap MTP block tensors to llama.cpp's layer-indexed nextn naming.
|
||||
# HF: mtp.layers.0.* (transformer block at MTP slot 0)
|
||||
# mtp.fc / mtp.pre_fc_norm_embedding / mtp.pre_fc_norm_hidden / mtp.norm
|
||||
if name.startswith("mtp."):
|
||||
n_layer = self.hparams["num_hidden_layers"]
|
||||
if name.find("layers.") != -1:
|
||||
@ -14109,6 +14143,14 @@ def parse_args() -> argparse.Namespace:
|
||||
"--mmproj", action="store_true",
|
||||
help="(Experimental) Export multimodal projector (mmproj) for vision models. This will only work on some vision models. A prefix 'mmproj-' will be added to the output file name.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--mtp", action="store_true",
|
||||
help="(Experimental) Export only the multi-token prediction (MTP) head as a separate GGUF, suitable for use as a speculative draft. Output file name will get a '-MTP' suffix.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--no-mtp", action="store_true",
|
||||
help="(Experimental) Exclude the multi-token prediction (MTP) head from the converted GGUF. Pair with --mtp on a second run to publish trunk and MTP as two files. Note: the split form duplicates embeddings, so the bundled default is more space-efficient overall.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--mistral-format", action="store_true",
|
||||
help="Whether the model is stored following the Mistral format.",
|
||||
@ -14268,6 +14310,20 @@ def main() -> None:
|
||||
else:
|
||||
model_class = MistralModel
|
||||
|
||||
if args.mtp and args.no_mtp:
|
||||
logger.error("--mtp and --no-mtp are mutually exclusive")
|
||||
sys.exit(1)
|
||||
|
||||
if (args.mtp or args.no_mtp) and not issubclass(model_class, _Qwen35MtpMixin):
|
||||
logger.error("--mtp / --no-mtp are only supported for Qwen3.5/3.6 text variants today")
|
||||
sys.exit(1)
|
||||
|
||||
# set on the class so __init__ / filter_tensors see the correct mode
|
||||
if args.no_mtp:
|
||||
model_class.no_mtp = True # ty: ignore[unresolved-attribute]
|
||||
if args.mtp:
|
||||
model_class.mtp_only = True # ty: ignore[unresolved-attribute]
|
||||
|
||||
model_instance = model_class(dir_model, output_type, fname_out,
|
||||
is_big_endian=args.bigendian, use_temp_file=args.use_temp_file,
|
||||
eager=args.no_lazy,
|
||||
|
||||
@ -198,6 +198,11 @@ extern "C" {
|
||||
LLAMA_SPLIT_MODE_TENSOR = 3,
|
||||
};
|
||||
|
||||
enum llama_context_type {
|
||||
LLAMA_CONTEXT_TYPE_DEFAULT = 0,
|
||||
LLAMA_CONTEXT_TYPE_MTP = 1,
|
||||
};
|
||||
|
||||
// TODO: simplify (https://github.com/ggml-org/llama.cpp/pull/9294#pullrequestreview-2286561979)
|
||||
typedef struct llama_token_data {
|
||||
llama_token id; // token id
|
||||
@ -339,6 +344,7 @@ extern "C" {
|
||||
int32_t n_threads; // number of threads to use for generation
|
||||
int32_t n_threads_batch; // number of threads to use for batch processing
|
||||
|
||||
enum llama_context_type ctx_type; // set the context type (e.g. MTP)
|
||||
enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
|
||||
enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
|
||||
enum llama_attention_type attention_type; // attention type to use for embeddings
|
||||
|
||||
@ -41,8 +41,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_QWEN3VLMOE, "qwen3vlmoe" },
|
||||
{ LLM_ARCH_QWEN35, "qwen35" },
|
||||
{ LLM_ARCH_QWEN35MOE, "qwen35moe" },
|
||||
{ LLM_ARCH_QWEN35_MTP, "qwen35_mtp" },
|
||||
{ LLM_ARCH_QWEN35MOE_MTP, "qwen35moe_mtp" },
|
||||
{ LLM_ARCH_PHI2, "phi2" },
|
||||
{ LLM_ARCH_PHI3, "phi3" },
|
||||
{ LLM_ARCH_PHIMOE, "phimoe" },
|
||||
|
||||
@ -45,8 +45,6 @@ enum llm_arch {
|
||||
LLM_ARCH_QWEN3VLMOE,
|
||||
LLM_ARCH_QWEN35,
|
||||
LLM_ARCH_QWEN35MOE,
|
||||
LLM_ARCH_QWEN35_MTP,
|
||||
LLM_ARCH_QWEN35MOE_MTP,
|
||||
LLM_ARCH_PHI2,
|
||||
LLM_ARCH_PHI3,
|
||||
LLM_ARCH_PHIMOE,
|
||||
|
||||
@ -2,6 +2,7 @@
|
||||
|
||||
#include "ggml.h"
|
||||
#include "llama-arch.h"
|
||||
#include "llama-graph.h"
|
||||
#include "llama-impl.h"
|
||||
#include "llama-batch.h"
|
||||
#include "llama-io.h"
|
||||
@ -21,6 +22,14 @@
|
||||
// llama_context
|
||||
//
|
||||
|
||||
static llm_graph_type ctx_type_to_graph_type(llama_context_type ctx_type) {
|
||||
switch (ctx_type) {
|
||||
case LLAMA_CONTEXT_TYPE_DEFAULT: return LLM_GRAPH_TYPE_DEFAULT;
|
||||
case LLAMA_CONTEXT_TYPE_MTP : return LLM_GRAPH_TYPE_DECODER_MTP;
|
||||
}
|
||||
throw std::runtime_error("Unsupported ctx type");
|
||||
}
|
||||
|
||||
llama_context::llama_context(
|
||||
const llama_model & model,
|
||||
llama_context_params params) :
|
||||
@ -66,6 +75,8 @@ llama_context::llama_context(
|
||||
cparams.cb_eval = params.cb_eval;
|
||||
cparams.cb_eval_user_data = params.cb_eval_user_data;
|
||||
|
||||
cparams.ctx_type = params.ctx_type;
|
||||
|
||||
// Initialize backend samplers here so they are part of the sampling graph
|
||||
// before the reserve passes run later in this function. This avoids a later
|
||||
// re-reserve when graph nodes change.
|
||||
@ -279,6 +290,7 @@ llama_context::llama_context(
|
||||
/*.type_k =*/ params.type_k,
|
||||
/*.type_v =*/ params.type_v,
|
||||
/*.swa_full =*/ params.swa_full,
|
||||
/*.ctx_type= */ cparams.ctx_type,
|
||||
};
|
||||
|
||||
memory.reset(model.create_memory(params_mem, cparams));
|
||||
@ -1738,7 +1750,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
||||
}
|
||||
|
||||
ggml_status status;
|
||||
const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status);
|
||||
|
||||
const auto * res = process_ubatch(ubatch, ctx_type_to_graph_type(cparams.ctx_type), mctx.get(), status);
|
||||
|
||||
if (!res) {
|
||||
// the last ubatch failed or was aborted -> remove all positions of that ubatch from the memory module
|
||||
@ -2198,7 +2211,7 @@ ggml_cgraph * llama_context::graph_reserve(
|
||||
|
||||
auto * res = gf_res_reserve.get();
|
||||
|
||||
const auto gparams = graph_params(res, ubatch, mctx, LLM_GRAPH_TYPE_DEFAULT);
|
||||
const auto gparams = graph_params(res, ubatch, mctx, ctx_type_to_graph_type(cparams.ctx_type));
|
||||
|
||||
res->reset();
|
||||
|
||||
@ -3177,7 +3190,7 @@ void llama_context::opt_epoch_iter(
|
||||
|
||||
auto * res = gf_res_prev.get();
|
||||
|
||||
const auto gparams = graph_params(res, ubatch, mctx.get(), LLM_GRAPH_TYPE_DEFAULT);
|
||||
const auto gparams = graph_params(res, ubatch, mctx.get(), ctx_type_to_graph_type(cparams.ctx_type));
|
||||
|
||||
res->reset();
|
||||
|
||||
@ -3280,6 +3293,7 @@ llama_context_params llama_context_default_params() {
|
||||
/*.n_seq_max =*/ 1,
|
||||
/*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
|
||||
/*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
|
||||
/*.ctx_type =*/ LLAMA_CONTEXT_TYPE_DEFAULT,
|
||||
/*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
|
||||
/*.pooling_type =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
|
||||
/*.attention_type =*/ LLAMA_ATTENTION_TYPE_UNSPECIFIED,
|
||||
@ -3383,6 +3397,13 @@ llama_context * llama_init_from_model(
|
||||
model->hparams.pooling_type, params.pooling_type);
|
||||
}
|
||||
|
||||
if (params.ctx_type == LLAMA_CONTEXT_TYPE_MTP &&
|
||||
model->hparams.nextn_predict_layers == 0) {
|
||||
LLAMA_LOG_WARN("%s: context type MTP requested but model doesn't contain MTP layers\n", __func__);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
|
||||
try {
|
||||
auto * ctx = new llama_context(*model, params);
|
||||
return ctx;
|
||||
|
||||
@ -41,6 +41,7 @@ struct llama_cparams {
|
||||
bool kv_unified;
|
||||
bool pipeline_parallel;
|
||||
|
||||
enum llama_context_type ctx_type;
|
||||
enum llama_pooling_type pooling_type;
|
||||
|
||||
ggml_backend_sched_eval_callback cb_eval;
|
||||
|
||||
@ -32,6 +32,7 @@ enum llm_graph_type {
|
||||
LLM_GRAPH_TYPE_DEFAULT,
|
||||
LLM_GRAPH_TYPE_ENCODER,
|
||||
LLM_GRAPH_TYPE_DECODER,
|
||||
LLM_GRAPH_TYPE_DECODER_MTP,
|
||||
};
|
||||
|
||||
enum llm_ffn_op_type {
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
#pragma once
|
||||
|
||||
#include "llama.h"
|
||||
#include "llama-graph.h"
|
||||
|
||||
#include <map>
|
||||
#include <memory>
|
||||
@ -20,6 +21,8 @@ struct llama_memory_params {
|
||||
|
||||
// use full-size SWA cache
|
||||
bool swa_full;
|
||||
|
||||
llama_context_type ctx_type;
|
||||
};
|
||||
|
||||
enum llama_memory_status {
|
||||
|
||||
@ -276,10 +276,6 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params
|
||||
return new llama_model_qwen35(params);
|
||||
case LLM_ARCH_QWEN35MOE:
|
||||
return new llama_model_qwen35moe(params);
|
||||
case LLM_ARCH_QWEN35_MTP:
|
||||
return new llama_model_qwen35_mtp(params);
|
||||
case LLM_ARCH_QWEN35MOE_MTP:
|
||||
return new llama_model_qwen35moe_mtp(params);
|
||||
case LLM_ARCH_MISTRAL3:
|
||||
return new llama_model_mistral3(params);
|
||||
case LLM_ARCH_MIMO2:
|
||||
@ -1409,8 +1405,7 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) {
|
||||
}
|
||||
}
|
||||
|
||||
const bool partial_load = (arch == LLM_ARCH_QWEN35_MTP || arch == LLM_ARCH_QWEN35MOE_MTP);
|
||||
ml.done_getting_tensors(partial_load);
|
||||
ml.done_getting_tensors();
|
||||
|
||||
// populate tensors_by_name
|
||||
for (auto & [_, ctx_ptr] : ml.ctx_map) {
|
||||
@ -1948,6 +1943,12 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
||||
// checks
|
||||
default:
|
||||
{
|
||||
// The MTP head is dense-attention only on hybrid Qwen3.5/3.6, so use a plain
|
||||
// attention KV cache for the MTP context instead of the hybrid wrapper.
|
||||
const bool mtp_on_hybrid_qwen35 =
|
||||
params.ctx_type == LLAMA_CONTEXT_TYPE_MTP &&
|
||||
(arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE);
|
||||
|
||||
if (llm_arch_is_recurrent(arch)) {
|
||||
res = new llama_memory_recurrent(
|
||||
*this,
|
||||
@ -1957,7 +1958,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
||||
std::max((uint32_t) 1, cparams.n_seq_max),
|
||||
cparams.n_seq_max,
|
||||
nullptr);
|
||||
} else if (llm_arch_is_hybrid(arch)) {
|
||||
} else if (llm_arch_is_hybrid(arch) && !mtp_on_hybrid_qwen35) {
|
||||
// The main difference between hybrid architectures is the
|
||||
// layer filters, so pick the right one here
|
||||
llama_memory_hybrid::layer_filter_cb filter_attn = nullptr;
|
||||
@ -1972,6 +1973,14 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
||||
filter_recr = [&](int32_t il) {
|
||||
return hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
|
||||
};
|
||||
} else if (arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE) {
|
||||
const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers;
|
||||
filter_attn = [&, n_main](int32_t il) {
|
||||
return (uint32_t)il < n_main && !hparams.is_recurrent(il);
|
||||
};
|
||||
filter_recr = [&, n_main](int32_t il) {
|
||||
return (uint32_t)il < n_main && hparams.is_recurrent(il);
|
||||
};
|
||||
}
|
||||
|
||||
if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
|
||||
@ -2014,6 +2023,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
||||
}
|
||||
} else {
|
||||
llama_memory_i::layer_reuse_cb reuse = nullptr;
|
||||
llama_kv_cache::layer_filter_cb filter = nullptr;
|
||||
|
||||
if (arch == LLM_ARCH_GEMMA3N || arch == LLM_ARCH_GEMMA4) {
|
||||
reuse = [&](int32_t il) {
|
||||
@ -2025,6 +2035,11 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
||||
};
|
||||
}
|
||||
|
||||
if (mtp_on_hybrid_qwen35) {
|
||||
const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers;
|
||||
filter = [n_main](int32_t il) { return (uint32_t)il >= n_main; };
|
||||
}
|
||||
|
||||
if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
|
||||
GGML_ASSERT(hparams.is_swa_any());
|
||||
|
||||
@ -2040,7 +2055,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
||||
cparams.n_seq_max,
|
||||
cparams.n_ubatch,
|
||||
1,
|
||||
nullptr,
|
||||
filter,
|
||||
reuse);
|
||||
} else {
|
||||
GGML_ASSERT(!hparams.is_swa_any());
|
||||
@ -2057,7 +2072,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
||||
1,
|
||||
hparams.n_swa,
|
||||
hparams.swa_type,
|
||||
nullptr,
|
||||
filter,
|
||||
nullptr);
|
||||
}
|
||||
}
|
||||
@ -2161,6 +2176,7 @@ int32_t llama_model_n_swa(const llama_model * model) {
|
||||
return model->hparams.n_swa;
|
||||
}
|
||||
|
||||
|
||||
uint32_t llama_model_n_cls_out(const struct llama_model * model) {
|
||||
return model->hparams.n_cls_out;
|
||||
}
|
||||
@ -2328,8 +2344,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
case LLM_ARCH_QWEN3VLMOE:
|
||||
case LLM_ARCH_QWEN35:
|
||||
case LLM_ARCH_QWEN35MOE:
|
||||
case LLM_ARCH_QWEN35_MTP:
|
||||
case LLM_ARCH_QWEN35MOE_MTP:
|
||||
return LLAMA_ROPE_TYPE_IMROPE;
|
||||
|
||||
case LLM_ARCH_GLM4:
|
||||
|
||||
@ -1739,6 +1739,10 @@ struct llama_model_qwen35 : public llama_model_base {
|
||||
const llama_model & model;
|
||||
};
|
||||
|
||||
struct graph_mtp : public llm_graph_context {
|
||||
graph_mtp(const llama_model & model, const llm_graph_params & params);
|
||||
};
|
||||
|
||||
std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
|
||||
};
|
||||
|
||||
@ -1781,30 +1785,8 @@ struct llama_model_qwen35moe : public llama_model_base {
|
||||
const llama_model & model;
|
||||
};
|
||||
|
||||
std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
|
||||
};
|
||||
|
||||
|
||||
struct llama_model_qwen35_mtp : public llama_model_base {
|
||||
llama_model_qwen35_mtp(const struct llama_model_params & params) : llama_model_base(params) {}
|
||||
void load_arch_hparams(llama_model_loader & ml) override;
|
||||
void load_arch_tensors(llama_model_loader & ml) override;
|
||||
|
||||
struct graph : public llm_graph_context {
|
||||
graph(const llama_model & model, const llm_graph_params & params);
|
||||
};
|
||||
|
||||
std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
|
||||
};
|
||||
|
||||
|
||||
struct llama_model_qwen35moe_mtp : public llama_model_base {
|
||||
llama_model_qwen35moe_mtp(const struct llama_model_params & params) : llama_model_base(params) {}
|
||||
void load_arch_hparams(llama_model_loader & ml) override;
|
||||
void load_arch_tensors(llama_model_loader & ml) override;
|
||||
|
||||
struct graph : public llm_graph_context {
|
||||
graph(const llama_model & model, const llm_graph_params & params);
|
||||
struct graph_mtp : public llm_graph_context {
|
||||
graph_mtp(const llama_model & model, const llm_graph_params & params);
|
||||
};
|
||||
|
||||
std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
|
||||
|
||||
@ -1,207 +0,0 @@
|
||||
#include "models.h"
|
||||
|
||||
void llama_model_qwen35_mtp::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
|
||||
|
||||
ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
|
||||
GGML_ASSERT(hparams.nextn_predict_layers > 0 && "QWEN35_MTP requires nextn_predict_layers > 0");
|
||||
GGML_ASSERT(hparams.nextn_predict_layers <= hparams.n_layer);
|
||||
|
||||
// only the MTP layers get a KV cache, trunk layers are skipped.
|
||||
hparams.kv_only_nextn = true;
|
||||
hparams.n_layer_kv_from_start = -1;
|
||||
for (uint32_t i = 0; i < hparams.n_layer; ++i) {
|
||||
hparams.recurrent_layer_arr[i] = false;
|
||||
}
|
||||
|
||||
type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
void llama_model_qwen35_mtp::load_arch_tensors(llama_model_loader &) {
|
||||
LLAMA_LOAD_LOCALS;
|
||||
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
|
||||
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, TENSOR_NOT_REQUIRED);
|
||||
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
|
||||
if (output == nullptr) {
|
||||
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
|
||||
}
|
||||
|
||||
const uint32_t n_main = n_layer - hparams.nextn_predict_layers;
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
if (static_cast<uint32_t>(i) < n_main) {
|
||||
continue; // trunk layer — owned by the sibling QWEN35 model
|
||||
}
|
||||
|
||||
auto & layer = layers[i];
|
||||
|
||||
// MTP block looks like a full-attention Qwen3.5 decoder block.
|
||||
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
|
||||
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);
|
||||
|
||||
create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, 0);
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
|
||||
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
|
||||
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
|
||||
|
||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
|
||||
// NextN-specific tensors that define the MTP block.
|
||||
layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, 0);
|
||||
layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, 0);
|
||||
layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, 0);
|
||||
layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
|
||||
layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
|
||||
layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
}
|
||||
|
||||
std::unique_ptr<llm_graph_context> llama_model_qwen35_mtp::build_arch_graph(const llm_graph_params & params) const {
|
||||
return std::make_unique<graph>(*this, params);
|
||||
}
|
||||
|
||||
// LLM_ARCH_QWEN35_MTP draft head for Qwen3.5/3.6 dense series
|
||||
llama_model_qwen35_mtp::graph::graph(const llama_model & model, const llm_graph_params & params)
|
||||
: llm_graph_context(params) {
|
||||
GGML_ASSERT(hparams.nextn_predict_layers > 0 && "QWEN35_MTP requires nextn_predict_layers > 0");
|
||||
GGML_ASSERT(hparams.nextn_predict_layers == 1 && "QWEN35_MTP currently only supports a single MTP block");
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
// The MTP block lives at the source file's original layer index.
|
||||
const int il = (int) hparams.n_layer - (int) hparams.nextn_predict_layers;
|
||||
const auto & layer = model.layers[il];
|
||||
|
||||
GGML_ASSERT(layer.nextn.eh_proj && "MTP block missing nextn.eh_proj");
|
||||
GGML_ASSERT(layer.nextn.enorm && "MTP block missing nextn.enorm");
|
||||
GGML_ASSERT(layer.nextn.hnorm && "MTP block missing nextn.hnorm");
|
||||
|
||||
int sections[4];
|
||||
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
|
||||
|
||||
auto inp = std::make_unique<llm_graph_input_embd>(hparams.n_embd);
|
||||
|
||||
inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
||||
ggml_set_input(inp->tokens);
|
||||
|
||||
inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd, n_tokens);
|
||||
ggml_set_input(inp->embd);
|
||||
ggml_set_name(inp->embd, "mtp_h_input");
|
||||
|
||||
ggml_tensor * tok_embd_w = layer.nextn.embed_tokens ? layer.nextn.embed_tokens : model.tok_embd;
|
||||
|
||||
ggml_tensor * h_input = inp->embd;
|
||||
ggml_tensor * tok_embd = ggml_get_rows(ctx0, tok_embd_w, inp->tokens);
|
||||
cb(tok_embd, "mtp_tok_embd", il);
|
||||
|
||||
res->add_input(std::move(inp));
|
||||
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
auto * inp_attn = build_attn_inp_kv();
|
||||
|
||||
ggml_tensor * h_norm = build_norm(h_input, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il);
|
||||
cb(h_norm, "mtp_hnorm", il);
|
||||
|
||||
ggml_tensor * e_norm = build_norm(tok_embd, layer.nextn.enorm, nullptr, LLM_NORM_RMS, il);
|
||||
cb(e_norm, "mtp_enorm", il);
|
||||
|
||||
ggml_tensor * concat = ggml_concat(ctx0, e_norm, h_norm, /*dim=*/ 0);
|
||||
cb(concat, "mtp_concat", il);
|
||||
|
||||
ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat);
|
||||
cb(cur, "mtp_eh_proj", il);
|
||||
|
||||
ggml_tensor * inpSA = cur;
|
||||
|
||||
cur = build_norm(cur, layer.attn_norm, nullptr, LLM_NORM_RMS, il);
|
||||
cb(cur, "mtp_attn_norm", il);
|
||||
|
||||
ggml_tensor * Qcur_full = build_lora_mm(layer.wq, cur, layer.wq_s);
|
||||
cb(Qcur_full, "mtp_Qcur_full", il);
|
||||
|
||||
ggml_tensor * Qcur = ggml_view_3d(ctx0, Qcur_full,
|
||||
n_embd_head, n_head, n_tokens,
|
||||
ggml_element_size(Qcur_full) * n_embd_head * 2,
|
||||
ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head,
|
||||
0);
|
||||
Qcur = build_norm(Qcur, layer.attn_q_norm, nullptr, LLM_NORM_RMS, il);
|
||||
cb(Qcur, "mtp_Qcur_normed", il);
|
||||
|
||||
ggml_tensor * gate = ggml_view_3d(ctx0, Qcur_full,
|
||||
n_embd_head, n_head, n_tokens,
|
||||
ggml_element_size(Qcur_full) * n_embd_head * 2,
|
||||
ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head,
|
||||
ggml_element_size(Qcur_full) * n_embd_head);
|
||||
gate = ggml_cont_2d(ctx0, gate, n_embd_head * n_head, n_tokens);
|
||||
cb(gate, "mtp_gate", il);
|
||||
|
||||
ggml_tensor * Kcur = build_lora_mm(layer.wk, cur, layer.wk_s);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
Kcur = build_norm(Kcur, layer.attn_k_norm, nullptr, LLM_NORM_RMS, il);
|
||||
cb(Kcur, "mtp_Kcur_normed", il);
|
||||
|
||||
ggml_tensor * Vcur = build_lora_mm(layer.wv, cur, layer.wv_s);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
cb(Vcur, "mtp_Vcur", il);
|
||||
|
||||
Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr,
|
||||
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr,
|
||||
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
|
||||
const float kq_scale = hparams.f_attention_scale == 0.0f
|
||||
? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
nullptr, nullptr, nullptr,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||
cb(cur, "mtp_attn_pregate", il);
|
||||
|
||||
cur = ggml_mul(ctx0, cur, ggml_sigmoid(ctx0, gate));
|
||||
cur = build_lora_mm(layer.wo, cur, layer.wo_s);
|
||||
cb(cur, "mtp_attn_out", il);
|
||||
|
||||
cur = ggml_add(ctx0, cur, inpSA);
|
||||
cb(cur, "mtp_attn_residual", il);
|
||||
|
||||
ggml_tensor * ffn_residual = cur;
|
||||
cur = build_norm(cur, layer.attn_post_norm, nullptr, LLM_NORM_RMS, il);
|
||||
cb(cur, "mtp_attn_post_norm", il);
|
||||
|
||||
cur = build_ffn(cur,
|
||||
layer.ffn_up, nullptr, layer.ffn_up_s,
|
||||
layer.ffn_gate, nullptr, layer.ffn_gate_s,
|
||||
layer.ffn_down, nullptr, layer.ffn_down_s,
|
||||
nullptr,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
||||
cb(cur, "mtp_ffn_out", il);
|
||||
|
||||
cur = ggml_add(ctx0, cur, ffn_residual);
|
||||
cb(cur, "mtp_post_ffn", il);
|
||||
|
||||
// Pre-norm hidden state: used by the AR draft loop to seed the next MTP step.
|
||||
// (In the trunk graph this is `t_h_pre_norm`; the MTP head reuses the same slot.)
|
||||
cb(cur, "h_pre_norm", -1);
|
||||
res->t_h_pre_norm = cur;
|
||||
|
||||
ggml_tensor * head_norm_w = layer.nextn.shared_head_norm
|
||||
? layer.nextn.shared_head_norm
|
||||
: model.output_norm;
|
||||
GGML_ASSERT(head_norm_w && "QWEN35_MTP: missing both nextn.shared_head_norm and output_norm");
|
||||
cur = build_norm(cur, head_norm_w, nullptr, LLM_NORM_RMS, -1);
|
||||
cb(cur, "mtp_shared_head_norm", -1);
|
||||
|
||||
ggml_tensor * head_w = layer.nextn.shared_head_head ? layer.nextn.shared_head_head : model.output;
|
||||
GGML_ASSERT(head_w && "QWEN35_MTP: missing LM head (nextn.shared_head_head or model.output)");
|
||||
cur = build_lora_mm(head_w, cur);
|
||||
cb(cur, "result_output", -1);
|
||||
|
||||
res->t_logits = cur;
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
}
|
||||
@ -15,7 +15,6 @@ void llama_model_qwen35::load_arch_hparams(llama_model_loader & ml) {
|
||||
// NextN/MTP (Qwen3.5/3.6): extra decoder block appended beyond the main stack
|
||||
ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
|
||||
GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
|
||||
hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
|
||||
|
||||
// Mark recurrent layers (linear attention layers). MTP layers are dense
|
||||
// attention-only and must be flagged non-recurrent.
|
||||
@ -36,9 +35,14 @@ void llama_model_qwen35::load_arch_hparams(llama_model_loader & ml) {
|
||||
}
|
||||
}
|
||||
|
||||
void llama_model_qwen35::load_arch_tensors(llama_model_loader &) {
|
||||
void llama_model_qwen35::load_arch_tensors(llama_model_loader & ml) {
|
||||
LLAMA_LOAD_LOCALS;
|
||||
|
||||
const uint32_t n_main = n_layer - hparams.nextn_predict_layers;
|
||||
const bool mtp_only = (hparams.nextn_predict_layers > 0) &&
|
||||
(ml.get_weight("blk.0.attn_norm.weight") == nullptr);
|
||||
const int trunk_flags = mtp_only ? TENSOR_NOT_REQUIRED : 0;
|
||||
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
|
||||
|
||||
// output
|
||||
@ -50,60 +54,85 @@ void llama_model_qwen35::load_arch_tensors(llama_model_loader &) {
|
||||
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
|
||||
}
|
||||
|
||||
// Calculate dimensions from hyperparameters
|
||||
const int64_t head_k_dim = hparams.ssm_d_state;
|
||||
const int64_t head_v_dim = hparams.ssm_d_state;
|
||||
const int64_t n_k_heads = hparams.ssm_n_group;
|
||||
const int64_t n_v_heads = hparams.ssm_dt_rank;
|
||||
const int64_t key_dim = head_k_dim * n_k_heads;
|
||||
const int64_t value_dim = head_v_dim * n_v_heads;
|
||||
const int64_t conv_dim = key_dim * 2 + value_dim;
|
||||
auto load_block_trunk = [&](int il, int flags) {
|
||||
auto & layer = layers[il];
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
auto & layer = layers[i];
|
||||
// Calculate dimensions from hyperparameters
|
||||
const int64_t head_k_dim = hparams.ssm_d_state;
|
||||
const int64_t head_v_dim = hparams.ssm_d_state;
|
||||
const int64_t n_k_heads = hparams.ssm_n_group;
|
||||
const int64_t n_v_heads = hparams.ssm_dt_rank;
|
||||
const int64_t key_dim = head_k_dim * n_k_heads;
|
||||
const int64_t value_dim = head_v_dim * n_v_heads;
|
||||
const int64_t conv_dim = key_dim * 2 + value_dim;
|
||||
|
||||
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
|
||||
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);
|
||||
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", il), { n_embd }, flags);
|
||||
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", il), { n_embd }, flags);
|
||||
|
||||
if (!hparams.is_recurrent(i)) {
|
||||
if (!hparams.is_recurrent(il)) {
|
||||
// Attention layers
|
||||
create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, 0);
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
|
||||
create_tensor_qkv(layer, il, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, flags);
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", il), { n_embd_head_k * n_head, n_embd }, flags);
|
||||
|
||||
// Q/K normalization for attention layers
|
||||
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
|
||||
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
|
||||
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", il), { n_embd_head_k }, flags);
|
||||
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", il), { n_embd_head_k }, flags);
|
||||
} else {
|
||||
// Linear attention (gated delta net) specific tensors
|
||||
// Create tensors with calculated dimensions
|
||||
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED);
|
||||
layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED);
|
||||
layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
|
||||
layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0);
|
||||
layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0);
|
||||
layer.ssm_beta = create_tensor(tn(LLM_TENSOR_SSM_BETA, "weight", i), { n_embd, n_v_heads }, 0);
|
||||
layer.ssm_alpha = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "weight", i), { n_embd, n_v_heads }, 0);
|
||||
layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0);
|
||||
layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0);
|
||||
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", il), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED);
|
||||
layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", il), { n_embd, value_dim }, TENSOR_NOT_REQUIRED);
|
||||
layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", il), { hparams.ssm_d_conv, conv_dim }, flags);
|
||||
layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", il), { hparams.ssm_dt_rank }, flags);
|
||||
layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, il), { hparams.ssm_dt_rank }, flags);
|
||||
layer.ssm_beta = create_tensor(tn(LLM_TENSOR_SSM_BETA, "weight", il), { n_embd, n_v_heads }, flags);
|
||||
layer.ssm_alpha = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "weight", il), { n_embd, n_v_heads }, flags);
|
||||
layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", il), { head_v_dim }, flags);
|
||||
layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", il), { value_dim, n_embd }, flags);
|
||||
}
|
||||
|
||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", il), {n_embd, n_ff}, flags);
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", il), { n_ff, n_embd}, flags);
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", il), {n_embd, n_ff}, flags);
|
||||
};
|
||||
|
||||
// NextN/MTP tensors (preserved but unused) - only bound on MTP layers
|
||||
if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
|
||||
layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, TENSOR_NOT_REQUIRED);
|
||||
layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED);
|
||||
layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED);
|
||||
layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
|
||||
layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
|
||||
layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
auto load_block_mtp = [&](int il) {
|
||||
auto & layer = layers[il];
|
||||
|
||||
// MTP block looks like a full-attention Qwen3.5 decoder block.
|
||||
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", il), { n_embd }, 0);
|
||||
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", il), { n_embd }, 0);
|
||||
|
||||
create_tensor_qkv(layer, il, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, 0);
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", il), { n_embd_head_k * n_head, n_embd }, 0);
|
||||
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", il), { n_embd_head_k }, 0);
|
||||
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", il), { n_embd_head_k }, 0);
|
||||
|
||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", il), {n_embd, n_ff}, 0);
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", il), { n_ff, n_embd}, 0);
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", il), {n_embd, n_ff}, 0);
|
||||
|
||||
// NextN-specific tensors that define the MTP block.
|
||||
layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", il), { 2 * n_embd, n_embd }, 0);
|
||||
layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", il), { n_embd }, 0);
|
||||
layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", il), { n_embd }, 0);
|
||||
layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", il), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
|
||||
layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", il), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
|
||||
layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", il), { n_embd }, TENSOR_NOT_REQUIRED);
|
||||
};
|
||||
|
||||
for (int i = 0; i < (int) n_main; ++i) {
|
||||
load_block_trunk(i, trunk_flags);
|
||||
}
|
||||
for (int i = (int) n_main; i < n_layer; ++i) {
|
||||
load_block_mtp(i);
|
||||
}
|
||||
}
|
||||
|
||||
std::unique_ptr<llm_graph_context> llama_model_qwen35::build_arch_graph(const llm_graph_params & params) const {
|
||||
if (params.gtype == LLM_GRAPH_TYPE_DECODER_MTP) {
|
||||
return std::make_unique<graph_mtp>(*this, params);
|
||||
}
|
||||
return std::make_unique<graph>(*this, params);
|
||||
}
|
||||
|
||||
@ -493,3 +522,146 @@ ggml_tensor * llama_model_qwen35::graph::build_layer_ffn(ggml_tensor * cur, cons
|
||||
|
||||
return cur;
|
||||
}
|
||||
|
||||
// LLM_GRAPH_TYPE_DECODER_MTP draft head for Qwen3.5/3.6 dense series
|
||||
llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_graph_params & params)
|
||||
: llm_graph_context(params) {
|
||||
GGML_ASSERT(hparams.nextn_predict_layers > 0 && "QWEN35 MTP requires nextn_predict_layers > 0");
|
||||
GGML_ASSERT(hparams.nextn_predict_layers == 1 && "QWEN35 MTP currently only supports a single MTP block");
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
// The MTP block lives at the source file's original layer index.
|
||||
const int il = (int) hparams.n_layer - (int) hparams.nextn_predict_layers;
|
||||
const auto & layer = model.layers[il];
|
||||
|
||||
GGML_ASSERT(layer.nextn.eh_proj && "MTP block missing nextn.eh_proj");
|
||||
GGML_ASSERT(layer.nextn.enorm && "MTP block missing nextn.enorm");
|
||||
GGML_ASSERT(layer.nextn.hnorm && "MTP block missing nextn.hnorm");
|
||||
|
||||
int sections[4];
|
||||
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
|
||||
|
||||
auto inp = std::make_unique<llm_graph_input_embd>(hparams.n_embd);
|
||||
|
||||
inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
||||
ggml_set_input(inp->tokens);
|
||||
|
||||
inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd, n_tokens);
|
||||
ggml_set_input(inp->embd);
|
||||
ggml_set_name(inp->embd, "mtp_h_input");
|
||||
|
||||
ggml_tensor * tok_embd_w = layer.nextn.embed_tokens ? layer.nextn.embed_tokens : model.tok_embd;
|
||||
|
||||
ggml_tensor * h_input = inp->embd;
|
||||
ggml_tensor * tok_embd = ggml_get_rows(ctx0, tok_embd_w, inp->tokens);
|
||||
cb(tok_embd, "mtp_tok_embd", il);
|
||||
|
||||
res->add_input(std::move(inp));
|
||||
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
auto * inp_attn = build_attn_inp_kv();
|
||||
|
||||
ggml_tensor * h_norm = build_norm(h_input, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il);
|
||||
cb(h_norm, "mtp_hnorm", il);
|
||||
|
||||
ggml_tensor * e_norm = build_norm(tok_embd, layer.nextn.enorm, nullptr, LLM_NORM_RMS, il);
|
||||
cb(e_norm, "mtp_enorm", il);
|
||||
|
||||
ggml_tensor * concat = ggml_concat(ctx0, e_norm, h_norm, /*dim=*/ 0);
|
||||
cb(concat, "mtp_concat", il);
|
||||
|
||||
ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat);
|
||||
cb(cur, "mtp_eh_proj", il);
|
||||
|
||||
ggml_tensor * inpSA = cur;
|
||||
|
||||
cur = build_norm(cur, layer.attn_norm, nullptr, LLM_NORM_RMS, il);
|
||||
cb(cur, "mtp_attn_norm", il);
|
||||
|
||||
ggml_tensor * Qcur_full = build_lora_mm(layer.wq, cur, layer.wq_s);
|
||||
cb(Qcur_full, "mtp_Qcur_full", il);
|
||||
|
||||
ggml_tensor * Qcur = ggml_view_3d(ctx0, Qcur_full,
|
||||
n_embd_head, n_head, n_tokens,
|
||||
ggml_element_size(Qcur_full) * n_embd_head * 2,
|
||||
ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head,
|
||||
0);
|
||||
Qcur = build_norm(Qcur, layer.attn_q_norm, nullptr, LLM_NORM_RMS, il);
|
||||
cb(Qcur, "mtp_Qcur_normed", il);
|
||||
|
||||
ggml_tensor * gate = ggml_view_3d(ctx0, Qcur_full,
|
||||
n_embd_head, n_head, n_tokens,
|
||||
ggml_element_size(Qcur_full) * n_embd_head * 2,
|
||||
ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head,
|
||||
ggml_element_size(Qcur_full) * n_embd_head);
|
||||
gate = ggml_cont_2d(ctx0, gate, n_embd_head * n_head, n_tokens);
|
||||
cb(gate, "mtp_gate", il);
|
||||
|
||||
ggml_tensor * Kcur = build_lora_mm(layer.wk, cur, layer.wk_s);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
Kcur = build_norm(Kcur, layer.attn_k_norm, nullptr, LLM_NORM_RMS, il);
|
||||
cb(Kcur, "mtp_Kcur_normed", il);
|
||||
|
||||
ggml_tensor * Vcur = build_lora_mm(layer.wv, cur, layer.wv_s);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
cb(Vcur, "mtp_Vcur", il);
|
||||
|
||||
Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr,
|
||||
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr,
|
||||
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
|
||||
const float kq_scale = hparams.f_attention_scale == 0.0f
|
||||
? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
nullptr, nullptr, nullptr,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||
cb(cur, "mtp_attn_pregate", il);
|
||||
|
||||
cur = ggml_mul(ctx0, cur, ggml_sigmoid(ctx0, gate));
|
||||
cur = build_lora_mm(layer.wo, cur, layer.wo_s);
|
||||
cb(cur, "mtp_attn_out", il);
|
||||
|
||||
cur = ggml_add(ctx0, cur, inpSA);
|
||||
cb(cur, "mtp_attn_residual", il);
|
||||
|
||||
ggml_tensor * ffn_residual = cur;
|
||||
cur = build_norm(cur, layer.attn_post_norm, nullptr, LLM_NORM_RMS, il);
|
||||
cb(cur, "mtp_attn_post_norm", il);
|
||||
|
||||
cur = build_ffn(cur,
|
||||
layer.ffn_up, nullptr, layer.ffn_up_s,
|
||||
layer.ffn_gate, nullptr, layer.ffn_gate_s,
|
||||
layer.ffn_down, nullptr, layer.ffn_down_s,
|
||||
nullptr,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
||||
cb(cur, "mtp_ffn_out", il);
|
||||
|
||||
cur = ggml_add(ctx0, cur, ffn_residual);
|
||||
cb(cur, "mtp_post_ffn", il);
|
||||
|
||||
// Pre-norm hidden state: used by the AR draft loop to seed the next MTP step.
|
||||
// (In the trunk graph this is `t_h_pre_norm`; the MTP head reuses the same slot.)
|
||||
cb(cur, "h_pre_norm", -1);
|
||||
res->t_h_pre_norm = cur;
|
||||
|
||||
ggml_tensor * head_norm_w = layer.nextn.shared_head_norm
|
||||
? layer.nextn.shared_head_norm
|
||||
: model.output_norm;
|
||||
GGML_ASSERT(head_norm_w && "QWEN35 MTP: missing both nextn.shared_head_norm and output_norm");
|
||||
cur = build_norm(cur, head_norm_w, nullptr, LLM_NORM_RMS, -1);
|
||||
cb(cur, "mtp_shared_head_norm", -1);
|
||||
|
||||
ggml_tensor * head_w = layer.nextn.shared_head_head ? layer.nextn.shared_head_head : model.output;
|
||||
GGML_ASSERT(head_w && "QWEN35 MTP: missing LM head (nextn.shared_head_head or model.output)");
|
||||
cur = build_lora_mm(head_w, cur);
|
||||
cb(cur, "result_output", -1);
|
||||
|
||||
res->t_logits = cur;
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
}
|
||||
|
||||
@ -1,252 +0,0 @@
|
||||
#include "models.h"
|
||||
|
||||
void llama_model_qwen35moe_mtp::load_arch_hparams(llama_model_loader & ml) {
|
||||
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
|
||||
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
|
||||
|
||||
ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
|
||||
GGML_ASSERT(hparams.nextn_predict_layers > 0 && "QWEN35MOE_MTP requires nextn_predict_layers > 0");
|
||||
GGML_ASSERT(hparams.nextn_predict_layers <= hparams.n_layer);
|
||||
GGML_ASSERT(hparams.n_expert > 0 && "QWEN35MOE_MTP requires n_expert > 0");
|
||||
|
||||
// only the MTP layers get a KV cache, trunk layers are skipped.
|
||||
hparams.kv_only_nextn = true;
|
||||
hparams.n_layer_kv_from_start = -1;
|
||||
for (uint32_t i = 0; i < hparams.n_layer; ++i) {
|
||||
hparams.recurrent_layer_arr[i] = false;
|
||||
}
|
||||
|
||||
type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
void llama_model_qwen35moe_mtp::load_arch_tensors(llama_model_loader &) {
|
||||
LLAMA_LOAD_LOCALS;
|
||||
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
|
||||
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, TENSOR_NOT_REQUIRED);
|
||||
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
|
||||
if (output == nullptr) {
|
||||
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
|
||||
}
|
||||
|
||||
const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
|
||||
const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
|
||||
|
||||
const uint32_t n_main = n_layer - hparams.nextn_predict_layers;
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
if (static_cast<uint32_t>(i) < n_main) {
|
||||
continue; // trunk layer — owned by the sibling QWEN35MOE model
|
||||
}
|
||||
|
||||
auto & layer = layers[i];
|
||||
|
||||
// MTP block looks like a full-attention Qwen3.5 decoder block with MoE FFN.
|
||||
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
|
||||
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);
|
||||
|
||||
create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, 0);
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
|
||||
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
|
||||
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
|
||||
|
||||
// Routed experts
|
||||
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
|
||||
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
|
||||
create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0);
|
||||
|
||||
// Shared experts
|
||||
layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0);
|
||||
layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, 0);
|
||||
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, 0);
|
||||
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, 0);
|
||||
|
||||
// NextN-specific tensors that define the MTP block.
|
||||
layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, 0);
|
||||
layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, 0);
|
||||
layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, 0);
|
||||
layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
|
||||
layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
|
||||
layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
}
|
||||
|
||||
std::unique_ptr<llm_graph_context> llama_model_qwen35moe_mtp::build_arch_graph(const llm_graph_params & params) const {
|
||||
return std::make_unique<graph>(*this, params);
|
||||
}
|
||||
|
||||
// LLM_ARCH_QWEN35MOE_MTP draft head for Qwen3.5/3.6 MoE
|
||||
llama_model_qwen35moe_mtp::graph::graph(const llama_model & model, const llm_graph_params & params)
|
||||
: llm_graph_context(params) {
|
||||
GGML_ASSERT(hparams.nextn_predict_layers > 0 && "QWEN35MOE_MTP requires nextn_predict_layers > 0");
|
||||
GGML_ASSERT(hparams.nextn_predict_layers == 1 && "QWEN35MOE_MTP currently only supports a single MTP block");
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
const int il = (int) hparams.n_layer - (int) hparams.nextn_predict_layers;
|
||||
const auto & layer = model.layers[il];
|
||||
|
||||
GGML_ASSERT(layer.nextn.eh_proj && "MTP block missing nextn.eh_proj");
|
||||
GGML_ASSERT(layer.nextn.enorm && "MTP block missing nextn.enorm");
|
||||
GGML_ASSERT(layer.nextn.hnorm && "MTP block missing nextn.hnorm");
|
||||
GGML_ASSERT(layer.ffn_gate_inp && "MTP block missing ffn_gate_inp");
|
||||
|
||||
int sections[4];
|
||||
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
|
||||
|
||||
auto inp = std::make_unique<llm_graph_input_embd>(hparams.n_embd);
|
||||
|
||||
inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
||||
ggml_set_input(inp->tokens);
|
||||
|
||||
inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd, n_tokens);
|
||||
ggml_set_input(inp->embd);
|
||||
ggml_set_name(inp->embd, "mtp_h_input");
|
||||
|
||||
ggml_tensor * tok_embd_w = layer.nextn.embed_tokens ? layer.nextn.embed_tokens : model.tok_embd;
|
||||
|
||||
ggml_tensor * h_input = inp->embd;
|
||||
ggml_tensor * tok_embd = ggml_get_rows(ctx0, tok_embd_w, inp->tokens);
|
||||
cb(tok_embd, "mtp_tok_embd", il);
|
||||
|
||||
res->add_input(std::move(inp));
|
||||
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
auto * inp_attn = build_attn_inp_kv();
|
||||
|
||||
ggml_tensor * h_norm = build_norm(h_input, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il);
|
||||
cb(h_norm, "mtp_hnorm", il);
|
||||
|
||||
ggml_tensor * e_norm = build_norm(tok_embd, layer.nextn.enorm, nullptr, LLM_NORM_RMS, il);
|
||||
cb(e_norm, "mtp_enorm", il);
|
||||
|
||||
ggml_tensor * concat = ggml_concat(ctx0, e_norm, h_norm, /*dim=*/ 0);
|
||||
cb(concat, "mtp_concat", il);
|
||||
|
||||
ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat);
|
||||
cb(cur, "mtp_eh_proj", il);
|
||||
|
||||
ggml_tensor * inpSA = cur;
|
||||
|
||||
cur = build_norm(cur, layer.attn_norm, nullptr, LLM_NORM_RMS, il);
|
||||
cb(cur, "mtp_attn_norm", il);
|
||||
|
||||
ggml_tensor * Qcur_full = build_lora_mm(layer.wq, cur, layer.wq_s);
|
||||
cb(Qcur_full, "mtp_Qcur_full", il);
|
||||
|
||||
ggml_tensor * Qcur = ggml_view_3d(ctx0, Qcur_full,
|
||||
n_embd_head, n_head, n_tokens,
|
||||
ggml_element_size(Qcur_full) * n_embd_head * 2,
|
||||
ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head,
|
||||
0);
|
||||
Qcur = build_norm(Qcur, layer.attn_q_norm, nullptr, LLM_NORM_RMS, il);
|
||||
cb(Qcur, "mtp_Qcur_normed", il);
|
||||
|
||||
ggml_tensor * gate = ggml_view_3d(ctx0, Qcur_full,
|
||||
n_embd_head, n_head, n_tokens,
|
||||
ggml_element_size(Qcur_full) * n_embd_head * 2,
|
||||
ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head,
|
||||
ggml_element_size(Qcur_full) * n_embd_head);
|
||||
gate = ggml_cont_2d(ctx0, gate, n_embd_head * n_head, n_tokens);
|
||||
cb(gate, "mtp_gate", il);
|
||||
|
||||
ggml_tensor * Kcur = build_lora_mm(layer.wk, cur, layer.wk_s);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
Kcur = build_norm(Kcur, layer.attn_k_norm, nullptr, LLM_NORM_RMS, il);
|
||||
cb(Kcur, "mtp_Kcur_normed", il);
|
||||
|
||||
ggml_tensor * Vcur = build_lora_mm(layer.wv, cur, layer.wv_s);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
cb(Vcur, "mtp_Vcur", il);
|
||||
|
||||
Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr,
|
||||
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr,
|
||||
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
|
||||
const float kq_scale = hparams.f_attention_scale == 0.0f
|
||||
? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
nullptr, nullptr, nullptr,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||
cb(cur, "mtp_attn_pregate", il);
|
||||
|
||||
cur = ggml_mul(ctx0, cur, ggml_sigmoid(ctx0, gate));
|
||||
cur = build_lora_mm(layer.wo, cur, layer.wo_s);
|
||||
cb(cur, "mtp_attn_out", il);
|
||||
|
||||
cur = ggml_add(ctx0, cur, inpSA);
|
||||
cb(cur, "mtp_attn_residual", il);
|
||||
|
||||
ggml_tensor * ffn_residual = cur;
|
||||
cur = build_norm(cur, layer.attn_post_norm, nullptr, LLM_NORM_RMS, il);
|
||||
cb(cur, "mtp_attn_post_norm", il);
|
||||
|
||||
// MoE FFN — routed experts plus gated shared expert (mirrors qwen35moe).
|
||||
ggml_tensor * moe_out =
|
||||
build_moe_ffn(cur,
|
||||
layer.ffn_gate_inp,
|
||||
layer.ffn_up_exps,
|
||||
layer.ffn_gate_exps,
|
||||
layer.ffn_down_exps,
|
||||
nullptr,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, true,
|
||||
hparams.expert_weights_scale,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il,
|
||||
nullptr, layer.ffn_gate_up_exps,
|
||||
layer.ffn_up_exps_s,
|
||||
layer.ffn_gate_exps_s,
|
||||
layer.ffn_down_exps_s);
|
||||
cb(moe_out, "mtp_ffn_moe_out", il);
|
||||
|
||||
if (layer.ffn_up_shexp != nullptr) {
|
||||
ggml_tensor * ffn_shexp =
|
||||
build_ffn(cur,
|
||||
layer.ffn_up_shexp, nullptr, layer.ffn_up_shexp_s,
|
||||
layer.ffn_gate_shexp, nullptr, layer.ffn_gate_shexp_s,
|
||||
layer.ffn_down_shexp, nullptr, layer.ffn_down_shexp_s,
|
||||
nullptr,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
||||
cb(ffn_shexp, "mtp_ffn_shexp", il);
|
||||
|
||||
ggml_tensor * shared_gate = build_lora_mm(layer.ffn_gate_inp_shexp, cur);
|
||||
shared_gate = ggml_sigmoid(ctx0, shared_gate);
|
||||
cb(shared_gate, "mtp_shared_expert_gate_sigmoid", il);
|
||||
|
||||
ffn_shexp = ggml_mul(ctx0, ffn_shexp, shared_gate);
|
||||
cb(ffn_shexp, "mtp_ffn_shexp_gated", il);
|
||||
|
||||
cur = ggml_add(ctx0, moe_out, ffn_shexp);
|
||||
} else {
|
||||
cur = moe_out;
|
||||
}
|
||||
cb(cur, "mtp_ffn_out", il);
|
||||
|
||||
cur = ggml_add(ctx0, cur, ffn_residual);
|
||||
cb(cur, "mtp_post_ffn", il);
|
||||
|
||||
// Pre-norm hidden state: used by the AR draft loop to seed the next MTP step.
|
||||
cb(cur, "h_pre_norm", -1);
|
||||
res->t_h_pre_norm = cur;
|
||||
|
||||
ggml_tensor * head_norm_w = layer.nextn.shared_head_norm
|
||||
? layer.nextn.shared_head_norm
|
||||
: model.output_norm;
|
||||
GGML_ASSERT(head_norm_w && "QWEN35MOE_MTP: missing both nextn.shared_head_norm and output_norm");
|
||||
cur = build_norm(cur, head_norm_w, nullptr, LLM_NORM_RMS, -1);
|
||||
cb(cur, "mtp_shared_head_norm", -1);
|
||||
|
||||
ggml_tensor * head_w = layer.nextn.shared_head_head ? layer.nextn.shared_head_head : model.output;
|
||||
GGML_ASSERT(head_w && "QWEN35MOE_MTP: missing LM head (nextn.shared_head_head or model.output)");
|
||||
cur = build_lora_mm(head_w, cur);
|
||||
cb(cur, "result_output", -1);
|
||||
|
||||
res->t_logits = cur;
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
}
|
||||
@ -18,7 +18,6 @@ void llama_model_qwen35moe::load_arch_hparams(llama_model_loader & ml) {
|
||||
// NextN/MTP (Qwen3.5/3.6): extra decoder block appended beyond the main stack
|
||||
ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
|
||||
GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
|
||||
hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
|
||||
|
||||
// Mark recurrent layers (linear attention layers). MTP layers are dense
|
||||
// attention-only and must be flagged non-recurrent.
|
||||
@ -39,9 +38,14 @@ void llama_model_qwen35moe::load_arch_hparams(llama_model_loader & ml) {
|
||||
}
|
||||
}
|
||||
|
||||
void llama_model_qwen35moe::load_arch_tensors(llama_model_loader &) {
|
||||
void llama_model_qwen35moe::load_arch_tensors(llama_model_loader & ml) {
|
||||
LLAMA_LOAD_LOCALS;
|
||||
|
||||
const uint32_t n_main = n_layer - hparams.nextn_predict_layers;
|
||||
const bool mtp_only = (hparams.nextn_predict_layers > 0) &&
|
||||
(ml.get_weight("blk.0.attn_norm.weight") == nullptr);
|
||||
const int trunk_flags = mtp_only ? TENSOR_NOT_REQUIRED : 0;
|
||||
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
|
||||
|
||||
// output
|
||||
@ -53,70 +57,105 @@ void llama_model_qwen35moe::load_arch_tensors(llama_model_loader &) {
|
||||
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
|
||||
}
|
||||
|
||||
const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
|
||||
auto load_block_trunk = [&](int il, int flags) {
|
||||
auto & layer = layers[il];
|
||||
|
||||
// Calculate dimensions from hyperparameters
|
||||
const int64_t head_k_dim = hparams.ssm_d_state;
|
||||
const int64_t head_v_dim = hparams.ssm_d_state;
|
||||
const int64_t n_k_heads = hparams.ssm_n_group;
|
||||
const int64_t n_v_heads = hparams.ssm_dt_rank;
|
||||
const int64_t key_dim = head_k_dim * n_k_heads;
|
||||
const int64_t value_dim = head_v_dim * n_v_heads;
|
||||
const int64_t conv_dim = key_dim * 2 + value_dim;
|
||||
const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
|
||||
const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
auto & layer = layers[i];
|
||||
// Calculate dimensions from hyperparameters
|
||||
const int64_t head_k_dim = hparams.ssm_d_state;
|
||||
const int64_t head_v_dim = hparams.ssm_d_state;
|
||||
const int64_t n_k_heads = hparams.ssm_n_group;
|
||||
const int64_t n_v_heads = hparams.ssm_dt_rank;
|
||||
const int64_t key_dim = head_k_dim * n_k_heads;
|
||||
const int64_t value_dim = head_v_dim * n_v_heads;
|
||||
const int64_t conv_dim = key_dim * 2 + value_dim;
|
||||
|
||||
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
|
||||
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);
|
||||
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", il), { n_embd }, flags);
|
||||
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", il), { n_embd }, flags);
|
||||
|
||||
if (!hparams.is_recurrent(i)) {
|
||||
if (!hparams.is_recurrent(il)) {
|
||||
// Attention layers
|
||||
create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, 0);
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
|
||||
create_tensor_qkv(layer, il, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, flags);
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", il), { n_embd_head_k * n_head, n_embd }, flags);
|
||||
|
||||
// Q/K normalization for attention layers
|
||||
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
|
||||
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
|
||||
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", il), { n_embd_head_k }, flags);
|
||||
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", il), { n_embd_head_k }, flags);
|
||||
} else {
|
||||
// Linear attention (gated delta net) specific tensors
|
||||
// Create tensors with calculated dimensions
|
||||
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED);
|
||||
layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED);
|
||||
layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
|
||||
layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0);
|
||||
layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0);
|
||||
layer.ssm_beta = create_tensor(tn(LLM_TENSOR_SSM_BETA, "weight", i), { n_embd, n_v_heads }, 0);
|
||||
layer.ssm_alpha = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "weight", i), { n_embd, n_v_heads }, 0);
|
||||
layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0);
|
||||
layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0);
|
||||
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", il), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED);
|
||||
layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", il), { n_embd, value_dim }, TENSOR_NOT_REQUIRED);
|
||||
layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", il), { hparams.ssm_d_conv, conv_dim }, flags);
|
||||
layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", il), { hparams.ssm_dt_rank }, flags);
|
||||
layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, il), { hparams.ssm_dt_rank }, flags);
|
||||
layer.ssm_beta = create_tensor(tn(LLM_TENSOR_SSM_BETA, "weight", il), { n_embd, n_v_heads }, flags);
|
||||
layer.ssm_alpha = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "weight", il), { n_embd, n_v_heads }, flags);
|
||||
layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", il), { head_v_dim }, flags);
|
||||
layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", il), { value_dim, n_embd }, flags);
|
||||
}
|
||||
|
||||
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
|
||||
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
|
||||
create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0);
|
||||
// Routed experts
|
||||
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", il), { n_embd, n_expert }, flags);
|
||||
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", il), { n_ff_exp, n_embd, n_expert }, flags);
|
||||
create_tensor_gate_up_exps(layer, il, n_embd, n_ff_exp, n_expert, flags);
|
||||
|
||||
// Shared experts
|
||||
layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", il), { n_embd }, flags);
|
||||
layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", il), { n_embd, n_ff_shexp }, flags);
|
||||
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", il), { n_embd, n_ff_shexp }, flags);
|
||||
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", il), { n_ff_shexp, n_embd }, flags);
|
||||
};
|
||||
|
||||
auto load_block_mtp = [&](int il) {
|
||||
auto & layer = layers[il];
|
||||
|
||||
const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
|
||||
const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
|
||||
|
||||
layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0);
|
||||
layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, 0);
|
||||
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, 0);
|
||||
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, 0);
|
||||
// MTP block looks like a full-attention Qwen3.5 decoder block with MoE FFN.
|
||||
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", il), { n_embd }, 0);
|
||||
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", il), { n_embd }, 0);
|
||||
|
||||
// NextN/MTP tensors (preserved but unused) - only bound on MTP layers
|
||||
if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
|
||||
layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, TENSOR_NOT_REQUIRED);
|
||||
layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED);
|
||||
layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED);
|
||||
layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
|
||||
layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
|
||||
layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
create_tensor_qkv(layer, il, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, 0);
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", il), { n_embd_head_k * n_head, n_embd }, 0);
|
||||
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", il), { n_embd_head_k }, 0);
|
||||
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", il), { n_embd_head_k }, 0);
|
||||
|
||||
// Routed experts
|
||||
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", il), { n_embd, n_expert }, 0);
|
||||
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", il), { n_ff_exp, n_embd, n_expert }, 0);
|
||||
create_tensor_gate_up_exps(layer, il, n_embd, n_ff_exp, n_expert, 0);
|
||||
|
||||
// Shared experts
|
||||
layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", il), { n_embd }, 0);
|
||||
layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", il), { n_embd, n_ff_shexp }, 0);
|
||||
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", il), { n_embd, n_ff_shexp }, 0);
|
||||
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", il), { n_ff_shexp, n_embd }, 0);
|
||||
|
||||
// NextN-specific tensors that define the MTP block.
|
||||
layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", il), { 2 * n_embd, n_embd }, 0);
|
||||
layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", il), { n_embd }, 0);
|
||||
layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", il), { n_embd }, 0);
|
||||
layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", il), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
|
||||
layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", il), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
|
||||
layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", il), { n_embd }, TENSOR_NOT_REQUIRED);
|
||||
};
|
||||
|
||||
for (int i = 0; i < (int) n_main; ++i) {
|
||||
load_block_trunk(i, trunk_flags);
|
||||
}
|
||||
for (int i = (int) n_main; i < n_layer; ++i) {
|
||||
load_block_mtp(i);
|
||||
}
|
||||
}
|
||||
|
||||
std::unique_ptr<llm_graph_context> llama_model_qwen35moe::build_arch_graph(const llm_graph_params & params) const {
|
||||
if (params.gtype == LLM_GRAPH_TYPE_DECODER_MTP) {
|
||||
return std::make_unique<graph_mtp>(*this, params);
|
||||
}
|
||||
return std::make_unique<graph>(*this, params);
|
||||
}
|
||||
|
||||
@ -547,3 +586,178 @@ ggml_tensor * llama_model_qwen35moe::graph::build_layer_ffn(ggml_tensor * cur, c
|
||||
|
||||
return cur;
|
||||
}
|
||||
|
||||
// LLM_GRAPH_TYPE_DECODER_MTP draft head for Qwen3.5/3.6 MoE
|
||||
llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm_graph_params & params)
|
||||
: llm_graph_context(params) {
|
||||
GGML_ASSERT(hparams.nextn_predict_layers > 0 && "QWEN35MOE MTP requires nextn_predict_layers > 0");
|
||||
GGML_ASSERT(hparams.nextn_predict_layers == 1 && "QWEN35MOE MTP currently only supports a single MTP block");
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
const int il = (int) hparams.n_layer - (int) hparams.nextn_predict_layers;
|
||||
const auto & layer = model.layers[il];
|
||||
|
||||
GGML_ASSERT(layer.nextn.eh_proj && "MTP block missing nextn.eh_proj");
|
||||
GGML_ASSERT(layer.nextn.enorm && "MTP block missing nextn.enorm");
|
||||
GGML_ASSERT(layer.nextn.hnorm && "MTP block missing nextn.hnorm");
|
||||
GGML_ASSERT(layer.ffn_gate_inp && "MTP block missing ffn_gate_inp");
|
||||
|
||||
int sections[4];
|
||||
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
|
||||
|
||||
auto inp = std::make_unique<llm_graph_input_embd>(hparams.n_embd);
|
||||
|
||||
inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
||||
ggml_set_input(inp->tokens);
|
||||
|
||||
inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd, n_tokens);
|
||||
ggml_set_input(inp->embd);
|
||||
ggml_set_name(inp->embd, "mtp_h_input");
|
||||
|
||||
ggml_tensor * tok_embd_w = layer.nextn.embed_tokens ? layer.nextn.embed_tokens : model.tok_embd;
|
||||
|
||||
ggml_tensor * h_input = inp->embd;
|
||||
ggml_tensor * tok_embd = ggml_get_rows(ctx0, tok_embd_w, inp->tokens);
|
||||
cb(tok_embd, "mtp_tok_embd", il);
|
||||
|
||||
res->add_input(std::move(inp));
|
||||
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
auto * inp_attn = build_attn_inp_kv();
|
||||
|
||||
ggml_tensor * h_norm = build_norm(h_input, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il);
|
||||
cb(h_norm, "mtp_hnorm", il);
|
||||
|
||||
ggml_tensor * e_norm = build_norm(tok_embd, layer.nextn.enorm, nullptr, LLM_NORM_RMS, il);
|
||||
cb(e_norm, "mtp_enorm", il);
|
||||
|
||||
ggml_tensor * concat = ggml_concat(ctx0, e_norm, h_norm, /*dim=*/ 0);
|
||||
cb(concat, "mtp_concat", il);
|
||||
|
||||
ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat);
|
||||
cb(cur, "mtp_eh_proj", il);
|
||||
|
||||
ggml_tensor * inpSA = cur;
|
||||
|
||||
cur = build_norm(cur, layer.attn_norm, nullptr, LLM_NORM_RMS, il);
|
||||
cb(cur, "mtp_attn_norm", il);
|
||||
|
||||
ggml_tensor * Qcur_full = build_lora_mm(layer.wq, cur, layer.wq_s);
|
||||
cb(Qcur_full, "mtp_Qcur_full", il);
|
||||
|
||||
ggml_tensor * Qcur = ggml_view_3d(ctx0, Qcur_full,
|
||||
n_embd_head, n_head, n_tokens,
|
||||
ggml_element_size(Qcur_full) * n_embd_head * 2,
|
||||
ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head,
|
||||
0);
|
||||
Qcur = build_norm(Qcur, layer.attn_q_norm, nullptr, LLM_NORM_RMS, il);
|
||||
cb(Qcur, "mtp_Qcur_normed", il);
|
||||
|
||||
ggml_tensor * gate = ggml_view_3d(ctx0, Qcur_full,
|
||||
n_embd_head, n_head, n_tokens,
|
||||
ggml_element_size(Qcur_full) * n_embd_head * 2,
|
||||
ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head,
|
||||
ggml_element_size(Qcur_full) * n_embd_head);
|
||||
gate = ggml_cont_2d(ctx0, gate, n_embd_head * n_head, n_tokens);
|
||||
cb(gate, "mtp_gate", il);
|
||||
|
||||
ggml_tensor * Kcur = build_lora_mm(layer.wk, cur, layer.wk_s);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
Kcur = build_norm(Kcur, layer.attn_k_norm, nullptr, LLM_NORM_RMS, il);
|
||||
cb(Kcur, "mtp_Kcur_normed", il);
|
||||
|
||||
ggml_tensor * Vcur = build_lora_mm(layer.wv, cur, layer.wv_s);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
cb(Vcur, "mtp_Vcur", il);
|
||||
|
||||
Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr,
|
||||
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr,
|
||||
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
|
||||
const float kq_scale = hparams.f_attention_scale == 0.0f
|
||||
? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
nullptr, nullptr, nullptr,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||
cb(cur, "mtp_attn_pregate", il);
|
||||
|
||||
cur = ggml_mul(ctx0, cur, ggml_sigmoid(ctx0, gate));
|
||||
cur = build_lora_mm(layer.wo, cur, layer.wo_s);
|
||||
cb(cur, "mtp_attn_out", il);
|
||||
|
||||
cur = ggml_add(ctx0, cur, inpSA);
|
||||
cb(cur, "mtp_attn_residual", il);
|
||||
|
||||
ggml_tensor * ffn_residual = cur;
|
||||
cur = build_norm(cur, layer.attn_post_norm, nullptr, LLM_NORM_RMS, il);
|
||||
cb(cur, "mtp_attn_post_norm", il);
|
||||
|
||||
// MoE FFN — routed experts plus gated shared expert (mirrors qwen35moe).
|
||||
ggml_tensor * moe_out =
|
||||
build_moe_ffn(cur,
|
||||
layer.ffn_gate_inp,
|
||||
layer.ffn_up_exps,
|
||||
layer.ffn_gate_exps,
|
||||
layer.ffn_down_exps,
|
||||
nullptr,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, true,
|
||||
hparams.expert_weights_scale,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il,
|
||||
nullptr, layer.ffn_gate_up_exps,
|
||||
layer.ffn_up_exps_s,
|
||||
layer.ffn_gate_exps_s,
|
||||
layer.ffn_down_exps_s);
|
||||
cb(moe_out, "mtp_ffn_moe_out", il);
|
||||
|
||||
if (layer.ffn_up_shexp != nullptr) {
|
||||
ggml_tensor * ffn_shexp =
|
||||
build_ffn(cur,
|
||||
layer.ffn_up_shexp, nullptr, layer.ffn_up_shexp_s,
|
||||
layer.ffn_gate_shexp, nullptr, layer.ffn_gate_shexp_s,
|
||||
layer.ffn_down_shexp, nullptr, layer.ffn_down_shexp_s,
|
||||
nullptr,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
||||
cb(ffn_shexp, "mtp_ffn_shexp", il);
|
||||
|
||||
ggml_tensor * shared_gate = build_lora_mm(layer.ffn_gate_inp_shexp, cur);
|
||||
shared_gate = ggml_sigmoid(ctx0, shared_gate);
|
||||
cb(shared_gate, "mtp_shared_expert_gate_sigmoid", il);
|
||||
|
||||
ffn_shexp = ggml_mul(ctx0, ffn_shexp, shared_gate);
|
||||
cb(ffn_shexp, "mtp_ffn_shexp_gated", il);
|
||||
|
||||
cur = ggml_add(ctx0, moe_out, ffn_shexp);
|
||||
} else {
|
||||
cur = moe_out;
|
||||
}
|
||||
cb(cur, "mtp_ffn_out", il);
|
||||
|
||||
cur = ggml_add(ctx0, cur, ffn_residual);
|
||||
cb(cur, "mtp_post_ffn", il);
|
||||
|
||||
// Pre-norm hidden state: used by the AR draft loop to seed the next MTP step.
|
||||
cb(cur, "h_pre_norm", -1);
|
||||
res->t_h_pre_norm = cur;
|
||||
|
||||
ggml_tensor * head_norm_w = layer.nextn.shared_head_norm
|
||||
? layer.nextn.shared_head_norm
|
||||
: model.output_norm;
|
||||
GGML_ASSERT(head_norm_w && "QWEN35MOE MTP: missing both nextn.shared_head_norm and output_norm");
|
||||
cur = build_norm(cur, head_norm_w, nullptr, LLM_NORM_RMS, -1);
|
||||
cb(cur, "mtp_shared_head_norm", -1);
|
||||
|
||||
ggml_tensor * head_w = layer.nextn.shared_head_head ? layer.nextn.shared_head_head : model.output;
|
||||
GGML_ASSERT(head_w && "QWEN35MOE MTP: missing LM head (nextn.shared_head_head or model.output)");
|
||||
cur = build_lora_mm(head_w, cur);
|
||||
cb(cur, "result_output", -1);
|
||||
|
||||
res->t_logits = cur;
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
}
|
||||
|
||||
@ -406,11 +406,7 @@ static bool arch_supported(const llm_arch arch) {
|
||||
if (arch == LLM_ARCH_DEEPSEEK2OCR) {
|
||||
return false;
|
||||
}
|
||||
if (arch == LLM_ARCH_QWEN35_MTP || arch == LLM_ARCH_QWEN35MOE_MTP) {
|
||||
return false; // MTP-only arch; requires a sibling trunk model and cannot run standalone.
|
||||
}
|
||||
|
||||
// FIXME some models are segfaulting with WebGPU:
|
||||
// FIXME some models are segfaulting with WebGPU:
|
||||
#ifdef GGML_USE_WEBGPU
|
||||
if (arch == LLM_ARCH_QWEN3NEXT || arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE || arch == LLM_ARCH_KIMI_LINEAR) {
|
||||
return false;
|
||||
|
||||
@ -756,6 +756,14 @@ private:
|
||||
}
|
||||
|
||||
auto cparams = common_context_params_to_llama(params_dft);
|
||||
|
||||
const bool spec_mtp = std::find(params_base.speculative.types.begin(),
|
||||
params_base.speculative.types.end(),
|
||||
COMMON_SPECULATIVE_TYPE_MTP) != params_base.speculative.types.end();
|
||||
if (spec_mtp) {
|
||||
cparams.ctx_type = LLAMA_CONTEXT_TYPE_MTP;
|
||||
}
|
||||
|
||||
ctx_dft.reset(llama_init_from_model(model_dft.get(), cparams));
|
||||
|
||||
ctx_dft_seq_rm_type = common_context_can_seq_rm(ctx_dft.get());
|
||||
@ -764,36 +772,13 @@ private:
|
||||
params_base.speculative.draft.ctx_dft = ctx_dft.get();
|
||||
} else if (std::find(params_base.speculative.types.begin(), params_base.speculative.types.end(),
|
||||
COMMON_SPECULATIVE_TYPE_MTP) != params_base.speculative.types.end()) {
|
||||
// MTP head lives in the *target* GGUF — load it as a sibling model
|
||||
// with override_arch and feed it through the existing ctx_dft slot.
|
||||
char trunk_arch[64] = {0};
|
||||
llama_model_meta_val_str(model_tgt, "general.architecture", trunk_arch, sizeof(trunk_arch));
|
||||
|
||||
const char * mtp_arch = nullptr;
|
||||
if (std::string(trunk_arch) == "qwen35") {
|
||||
mtp_arch = "qwen35_mtp";
|
||||
} else if (std::string(trunk_arch) == "qwen35moe") {
|
||||
mtp_arch = "qwen35moe_mtp";
|
||||
} else {
|
||||
SRV_ERR("MTP not supported for trunk architecture '%s'\n", trunk_arch);
|
||||
return false;
|
||||
}
|
||||
|
||||
SRV_INF("loading MTP head from '%s' (override_arch=%s)\n",
|
||||
params_base.model.path.c_str(), mtp_arch);
|
||||
|
||||
auto mparams_mtp = common_model_params_to_llama(params_base);
|
||||
mparams_mtp.override_arch = mtp_arch;
|
||||
|
||||
model_dft.reset(llama_model_load_from_file(params_base.model.path.c_str(), mparams_mtp));
|
||||
if (model_dft == nullptr) {
|
||||
SRV_ERR("failed to load MTP head from '%s'\n", params_base.model.path.c_str());
|
||||
return false;
|
||||
}
|
||||
SRV_INF("creating MTP draft context against the target model '%s'\n",
|
||||
params_base.model.path.c_str());
|
||||
|
||||
auto cparams_mtp = common_context_params_to_llama(params_base);
|
||||
cparams_mtp.ctx_type = LLAMA_CONTEXT_TYPE_MTP;
|
||||
|
||||
ctx_dft.reset(llama_init_from_model(model_dft.get(), cparams_mtp));
|
||||
ctx_dft.reset(llama_init_from_model(model_tgt, cparams_mtp));
|
||||
if (ctx_dft == nullptr) {
|
||||
SRV_ERR("%s", "failed to create MTP context\n");
|
||||
return false;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user