From 1f8c603d9c57d1d9b30a9296ece460c508244729 Mon Sep 17 00:00:00 2001 From: Kawrakow Date: Sun, 17 May 2026 13:59:56 +0300 Subject: [PATCH] Quantize: add extra output tensor for MTP (#1810) * Quantize: add extra output tensor for MTP * Consistently use --mtp-requantize-output-tensor --- common/common.cpp | 2 +- examples/quantize/quantize.cpp | 9 +++- include/llama.h | 1 + src/llama-load-tensors.cpp | 16 ++++-- src/llama-quantize.cpp | 91 ++++++++++++++++++++++++++++++++-- src/llama.cpp | 1 + 6 files changed, 109 insertions(+), 11 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 2e36b288..fc5e59ea 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1727,7 +1727,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } return true; } - if (arg == "--mtp-requantized-output-tensor" || arg == "-mtprot") { + if (arg == "--mtp-requantize-output-tensor" || arg == "-mtprot") { CHECK_ARG params.extra_output_type = argv[i]; return true; diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 83f79722..3734a67d 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -151,7 +151,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp // [[noreturn]] static void usage(const char * executable) { - printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--hide-imatrix] [--ignore-imatrix-rules] [--dry-run] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--ffn-gate-inp-type] [--attn-q-type] [--attn-k-type] [--attn-v-type] [--attn-qkv-type] [--attn-output-type] [--ffn-gate-type] [--ffn-down-type] [--ffn-up-type] [--repack] [--repack-pattern] [--keep-split] [--partial-requant] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); + printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] 
[--hide-imatrix] [--ignore-imatrix-rules] [--dry-run] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--extra-output-tensor] [--ffn-gate-inp-type] [--attn-q-type] [--attn-k-type] [--attn-v-type] [--attn-qkv-type] [--attn-output-type] [--ffn-gate-type] [--ffn-down-type] [--ffn-up-type] [--repack] [--repack-pattern] [--keep-split] [--partial-requant] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n"); @@ -163,6 +163,7 @@ static void usage(const char * executable) { printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n"); printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor.\n"); printf(" --token-embedding-type ggml_type: use this ggml_type for the token_embd.weight tensor.\n\n"); + printf(" --extra-output-tensor ggml_type: requantize and add output tensor of that type.\n"); printf(" --ffn-gate-inp-type ggml_type: use this ggml_type for the ffn_gate_inp tensors.\n\n"); printf(" --custom-q regex1=type1,regex2=type2...: use this to specify custom quantization type rules.\n\n"); printf(" --repack Repack all tensors to the corresponding _r4/8 variant if available.\n\n"); @@ -383,6 +384,12 @@ int main(int argc, char ** argv) { } else { usage(argv[0]); } + } else if (strcmp(argv[arg_idx], "--extra-output-tensor") == 0) { + if (arg_idx < argc-1) { + params.extra_output_type = parse_ggml_type(argv[++arg_idx]); + } else { + usage(argv[0]); + } } else if (strcmp(argv[arg_idx], "--token-embedding-type") == 0) { if 
(arg_idx < argc-1) { params.token_embedding_type = parse_ggml_type(argv[++arg_idx]); diff --git a/include/llama.h b/include/llama.h index b847b88f..448fd464 100644 --- a/include/llama.h +++ b/include/llama.h @@ -522,6 +522,7 @@ extern "C" { enum ggml_type ffn_down_type; // feedforward network down type enum ggml_type ffn_up_type; // feedforward network up type enum ggml_type ffn_gate_inp_type; // routed experts probabilities typy (relevant for MoE models only) + enum ggml_type extra_output_type; // type of the extra requantized output tensor "output_extra.weight" (GGML_TYPE_COUNT = do not add one) bool allow_requantize; // allow quantizing non-f32/f16 tensors bool quantize_output_tensor; // quantize output.weight bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored diff --git a/src/llama-load-tensors.cpp b/src/llama-load-tensors.cpp index 033a08dc..b284e248 100644 --- a/src/llama-load-tensors.cpp +++ b/src/llama-load-tensors.cpp @@ -1520,11 +1520,15 @@ bool create_tensors_helper::create_qwen35moe_tensors(const LLM_TN & tn) { if (model.output == NULL) { model.output = create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); } + int flags = llama_model_loader::TENSOR_NOT_REQUIRED; + if (!model.mtp) flags |= llama_model_loader::TENSOR_SKIP; + auto output_mtp = create_tensor(ctx_output, "output_extra.weight", {n_embd, n_vocab}, flags); if (model.mtp) { - model.output_mtp = create_tensor(ctx_output, "output_extra.weight", {n_embd, n_vocab}, - llama_model_loader::TENSOR_NOT_REQUIRED); + model.output_mtp = output_mtp; if (!model.output_mtp) { model.output_mtp = model.output; + } else { + LLAMA_LOG_INFO("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX Using %s as MTP output\n", model.output_mtp->name); } } } @@ -1627,11 +1631,15 @@ bool create_tensors_helper::create_qwen35_tensors(const LLM_TN & tn) { model.output = create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 
llama_model_loader::TENSOR_DUPLICATED); } + int flags = llama_model_loader::TENSOR_NOT_REQUIRED; + if (!model.mtp) flags |= llama_model_loader::TENSOR_SKIP; + auto output_mtp = create_tensor(ctx_output, "output_extra.weight", {n_embd, n_vocab}, flags); if (model.mtp) { - model.output_mtp = create_tensor(ctx_output, "output_extra.weight", {n_embd, n_vocab}, - llama_model_loader::TENSOR_NOT_REQUIRED); + model.output_mtp = output_mtp; if (!model.output_mtp) { model.output_mtp = model.output; + } else { + LLAMA_LOG_INFO("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX Using %s as MTP output\n", model.output_mtp->name); } } } diff --git a/src/llama-quantize.cpp b/src/llama-quantize.cpp index 8ee24ae1..4e6032f6 100644 --- a/src/llama-quantize.cpp +++ b/src/llama-quantize.cpp @@ -1275,8 +1275,44 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s std::vector ctx_outs(n_split, NULL); ctx_outs[0] = ctx_out; + ggml_tensor extra; + ggml_tensor * output_meta = ml.get_tensor_meta("output.weight"); + if (!output_meta) { + output_meta = ml.get_tensor_meta("token_embd.weight"); + } + ggml_tensor * output_tensor = nullptr; + if (params->extra_output_type != GGML_TYPE_COUNT) { + auto meta = ml.get_tensor_meta("output.weight"); + if (!meta) { + meta = ml.get_tensor_meta("token_embd.weight"); + } + if (!meta) { + LLAMA_LOG_WARN("Extra output tensor requested, but 'output.weight' or 'token_embd.weight' not found\n"); + } else { + LLAMA_LOG_INFO("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX Will duplicate %s as %s\n", meta->name, + ggml_type_name(params->extra_output_type)); + auto weights = ml.get_weight(meta->name); + output_tensor = weights->tensor; + extra = *output_tensor; + auto new_type = params->extra_output_type; + extra.type = new_type; + auto tt = ggml_internal_get_type_traits(extra.type); + extra.nb[0] = tt.type_size; + extra.nb[1] = ggml_row_size(extra.type, extra.ne[0]); + extra.nb[2] = extra.nb[3] = extra.nb[1]*extra.ne[1]; + extra.data = nullptr; + 
strcpy(extra.name, "output_extra.weight"); + auto orig_size = ggml_nbytes(output_tensor); + auto new_size = ggml_nbytes(&extra); + if (new_size >= orig_size) { + LLAMA_LOG_INFO("No, duplicating it makes no sense as the new size (%zu) is greater than the original size (%zu)\n", + new_size, orig_size); + output_tensor = nullptr; + } + } + } + // populate the original tensors so we get an initial meta data - int last_split = -1; for (int i = 0; i < ml.n_tensors; ++i) { auto weight = ml.get_weight(i); uint16_t i_split = params->keep_split ? weight->idx : 0; @@ -1285,8 +1321,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s ctx_outs[i_split] = gguf_init_empty(); } gguf_add_tensor(ctx_outs[i_split], tensor); - if (i_split > last_split) { - last_split = i_split; + if (tensor == output_tensor) { + gguf_add_tensor(ctx_outs[i_split], &extra); } } @@ -1520,7 +1556,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // If we've decided to quantize to the same type the tensor is already // in then there's nothing to do. 
- quantize &= tensor->type != new_type; + if (tensor != output_tensor) { + quantize &= tensor->type != new_type; + } } if (!quantize) { @@ -1644,8 +1682,51 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } new_data = work.data(); - do_quantize(nthread, tensor, new_type, f32_data, (char *)new_data, imatrix, workers, + if (params->extra_output_type != GGML_TYPE_COUNT && tensor == output_tensor) { + auto cur_size = ggml_nbytes(tensor); + if (new_type != tensor->type) { + do_quantize(nthread, tensor, new_type, f32_data, (char *)new_data, imatrix, workers, + new_size, chunk_size_multiplier, params); + gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), new_type); + gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), new_data, new_size); + fout.write((const char *) new_data, new_size); + zeros(fout, GGML_PAD(new_size, align) - new_size); + total_size_new += new_size; + LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", cur_size/1024.0/1024.0, new_size/1024.0/1024.0); + } else { + gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), tensor->type); + gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), tensor->data, cur_size); + fout.write((const char *) tensor->data, cur_size); + zeros(fout, GGML_PAD(cur_size, align) - cur_size); + total_size_new += cur_size; + LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", cur_size/1024.0/1024.0, cur_size/1024.0/1024.0); + } + + LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ", + ++idx, ml.n_tensors, + ggml_get_name(tensor), + llama_format_tensor_shape(tensor).c_str(), + ggml_type_name(tensor->type)); + + new_type = params->extra_output_type; + chunk_size_multiplier = 1; + auto [working_type, num_rows] = interleaved_properties(new_type); + if (tensor->ne[1] % num_rows != 0) { + new_type = working_type; + } else { + chunk_size_multiplier = num_rows; + } + LLAMA_LOG_INFO("converting to %s .. 
", ggml_type_name(new_type)); + fflush(stdout); + + do_quantize(nthread, tensor, new_type, f32_data, (char *)new_data, imatrix, workers, + new_size, 1, params); + + name = extra.name; + } else { + do_quantize(nthread, tensor, new_type, f32_data, (char *)new_data, imatrix, workers, new_size, chunk_size_multiplier, params); + } } LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0); diff --git a/src/llama.cpp b/src/llama.cpp index ee6abd76..2eca5226 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -5818,6 +5818,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() { /*.ffn_down_type =*/ GGML_TYPE_COUNT, /*.ffn_up_type =*/ GGML_TYPE_COUNT, /*.ffn_gat_inp_type =*/ GGML_TYPE_COUNT, + /*.extra_output_type =*/ GGML_TYPE_COUNT, /*.allow_requantize =*/ false, /*.quantize_output_tensor =*/ true, /*.only_copy =*/ false,