Quantize: add extra output tensor for MTP (#1810)

* Quantize: add extra output tensor for MTP
* Consistently use --mtp-requantize-output-tensor
2026-06-28 04:30:15 -05:00 · 2026-05-17 13:59:56 +03:00 · 2026-05-17 13:59:56 +03:00 · 1f8c603d9c
commit 1f8c603d9c
parent 3e573cfea6
6 changed files with 109 additions and 11 deletions
--- a/common/common.cpp
+++ b/common/common.cpp
@ -1727,7 +1727,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        }
        return true;
    }
-    if (arg == "--mtp-requantized-output-tensor" || arg == "-mtprot") {
+    if (arg == "--mtp-requantize-output-tensor" || arg == "-mtprot") {
        CHECK_ARG
        params.extra_output_type = argv[i];
        return true;
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@ -151,7 +151,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
 //
 [[noreturn]]
 static void usage(const char * executable) {
-    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--hide-imatrix] [--ignore-imatrix-rules] [--dry-run] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--ffn-gate-inp-type] [--attn-q-type] [--attn-k-type] [--attn-v-type] [--attn-qkv-type] [--attn-output-type] [--ffn-gate-type] [--ffn-down-type] [--ffn-up-type] [--repack] [--repack-pattern] [--keep-split] [--partial-requant] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
+    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--hide-imatrix] [--ignore-imatrix-rules] [--dry-run] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--extra-output-tensor] [--ffn-gate-inp-type] [--attn-q-type] [--attn-k-type] [--attn-v-type] [--attn-qkv-type] [--attn-output-type] [--ffn-gate-type] [--ffn-down-type] [--ffn-up-type] [--repack] [--repack-pattern] [--keep-split] [--partial-requant] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
    printf("  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
    printf("  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
    printf("  --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
@ -163,6 +163,7 @@ static void usage(const char * executable) {
    printf("  --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
    printf("  --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor.\n");
    printf("  --token-embedding-type ggml_type: use this ggml_type for the token_embd.weight tensor.\n\n");
+    printf("  --extra-output-tensor ggml_type: requantize and add output tensor of that type.\n");
    printf("  --ffn-gate-inp-type ggml_type: use this ggml_type for the ffn_gate_inp tensors.\n\n");
    printf("  --custom-q regex1=type1,regex2=type2...: use this to specify custom quantization type rules.\n\n");
    printf("  --repack Repack all tensors to the corresponding _r4/8 variant if available.\n\n");
@ -383,6 +384,12 @@ int main(int argc, char ** argv) {
            } else {
                usage(argv[0]);
            }
+        } else if (strcmp(argv[arg_idx], "--extra-output-tensor") == 0) {
+            if (arg_idx < argc-1) {
+                params.extra_output_type = parse_ggml_type(argv[++arg_idx]);
+            } else {
+                usage(argv[0]);
+            }
        } else if (strcmp(argv[arg_idx], "--token-embedding-type") == 0) {
            if (arg_idx < argc-1) {
                params.token_embedding_type = parse_ggml_type(argv[++arg_idx]);
--- a/include/llama.h
+++ b/include/llama.h
@ -522,6 +522,7 @@ extern "C" {
        enum ggml_type ffn_down_type;        // feedforward network down type
        enum ggml_type ffn_up_type;          // feedforward network up type
        enum ggml_type ffn_gate_inp_type;    // routed experts probabilities typy (relevant for MoE models only)
+        enum ggml_type extra_output_type;    // type of the additional requantized output tensor (output_extra.weight); GGML_TYPE_COUNT = none
        bool allow_requantize;               // allow quantizing non-f32/f16 tensors
        bool quantize_output_tensor;         // quantize output.weight
        bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
--- a/src/llama-load-tensors.cpp
+++ b/src/llama-load-tensors.cpp
@ -1520,11 +1520,15 @@ bool create_tensors_helper::create_qwen35moe_tensors(const LLM_TN & tn) {
        if (model.output == NULL) {
            model.output = create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
        }
+        int flags = llama_model_loader::TENSOR_NOT_REQUIRED;
+        if (!model.mtp) flags |= llama_model_loader::TENSOR_SKIP;
+        auto output_mtp = create_tensor(ctx_output, "output_extra.weight", {n_embd, n_vocab}, flags);
        if (model.mtp) {
-            model.output_mtp = create_tensor(ctx_output, "output_extra.weight", {n_embd, n_vocab},
-                    llama_model_loader::TENSOR_NOT_REQUIRED);
+            model.output_mtp = output_mtp;
            if (!model.output_mtp) {
                model.output_mtp = model.output;
+            } else {
+                LLAMA_LOG_INFO("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX Using %s as MTP output\n", model.output_mtp->name);
            }
        }
    }
@ -1627,11 +1631,15 @@ bool create_tensors_helper::create_qwen35_tensors(const LLM_TN & tn) {
            model.output = create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab},
                    llama_model_loader::TENSOR_DUPLICATED);
        }
+        int flags = llama_model_loader::TENSOR_NOT_REQUIRED;
+        if (!model.mtp) flags |= llama_model_loader::TENSOR_SKIP;
+        auto output_mtp = create_tensor(ctx_output, "output_extra.weight", {n_embd, n_vocab}, flags);
        if (model.mtp) {
-            model.output_mtp = create_tensor(ctx_output, "output_extra.weight", {n_embd, n_vocab},
-                    llama_model_loader::TENSOR_NOT_REQUIRED);
+            model.output_mtp = output_mtp;
            if (!model.output_mtp) {
                model.output_mtp = model.output;
+            } else {
+                LLAMA_LOG_INFO("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX Using %s as MTP output\n", model.output_mtp->name);
            }
        }
    }
--- a/src/llama-quantize.cpp
+++ b/src/llama-quantize.cpp
@ -1275,8 +1275,44 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
    std::vector<gguf_context*> ctx_outs(n_split, NULL);
    ctx_outs[0] = ctx_out;

+    ggml_tensor extra;
+    ggml_tensor * output_meta = ml.get_tensor_meta("output.weight");
+    if (!output_meta) {
+        output_meta = ml.get_tensor_meta("token_embd.weight");
+    }
+    ggml_tensor * output_tensor = nullptr;
+    if (params->extra_output_type != GGML_TYPE_COUNT) {
+        auto meta = ml.get_tensor_meta("output.weight");
+        if (!meta) {
+            meta = ml.get_tensor_meta("token_embd.weight");
+        }
+        if (!meta) {
+            LLAMA_LOG_WARN("Extra output tensor requested, but 'output.weight' or 'token_embd.weight' not found\n");
+        } else {
+            LLAMA_LOG_INFO("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX Will duplicate %s as %s\n", meta->name,
+                    ggml_type_name(params->extra_output_type));
+            auto weights = ml.get_weight(meta->name);
+            output_tensor = weights->tensor;
+            extra = *output_tensor;
+            auto new_type = params->extra_output_type;
+            extra.type = new_type;
+            auto tt = ggml_internal_get_type_traits(extra.type);
+            extra.nb[0] = tt.type_size;
+            extra.nb[1] = ggml_row_size(extra.type, extra.ne[0]);
+            extra.nb[2] = extra.nb[3] = extra.nb[1]*extra.ne[1];
+            extra.data  = nullptr;
+            strcpy(extra.name, "output_extra.weight");
+            auto orig_size = ggml_nbytes(output_tensor);
+            auto new_size  = ggml_nbytes(&extra);
+            if (new_size >= orig_size) {
+                LLAMA_LOG_INFO("No, duplicating it makes no sense as the new size (%zu) is greater than the original size (%zu)\n",
+                        new_size, orig_size);
+                output_tensor = nullptr;
+            }
+        }
+    }
+
    // populate the original tensors so we get an initial meta data
-    int last_split = -1;
    for (int i = 0; i < ml.n_tensors; ++i) {
        auto weight = ml.get_weight(i);
        uint16_t i_split = params->keep_split ? weight->idx : 0;
@ -1285,8 +1321,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
            ctx_outs[i_split] = gguf_init_empty();
        }
        gguf_add_tensor(ctx_outs[i_split], tensor);
-        if (i_split > last_split) {
-            last_split = i_split;
+        if (tensor == output_tensor) {
+            gguf_add_tensor(ctx_outs[i_split], &extra);
        }
    }

@ -1520,7 +1556,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

            // If we've decided to quantize to the same type the tensor is already
            // in then there's nothing to do.
-            quantize &= tensor->type != new_type;
+            if (tensor != output_tensor) {
+                quantize &= tensor->type != new_type;
+            }
        }

        if (!quantize) {
@ -1644,8 +1682,51 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                }
                new_data = work.data();

-                do_quantize(nthread, tensor, new_type, f32_data, (char *)new_data, imatrix, workers,
+                if (params->extra_output_type != GGML_TYPE_COUNT && tensor == output_tensor) {
+                    auto cur_size = ggml_nbytes(tensor);
+                    if (new_type != tensor->type) {
+                        do_quantize(nthread, tensor, new_type, f32_data, (char *)new_data, imatrix, workers,
+                                new_size, chunk_size_multiplier, params);
+                        gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), new_type);
+                        gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), new_data, new_size);
+                        fout.write((const char *) new_data, new_size);
+                        zeros(fout, GGML_PAD(new_size, align) - new_size);
+                        total_size_new += new_size;
+                        LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", cur_size/1024.0/1024.0, new_size/1024.0/1024.0);
+                    } else {
+                        gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), tensor->type);
+                        gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), tensor->data, cur_size);
+                        fout.write((const char *) tensor->data, cur_size);
+                        zeros(fout, GGML_PAD(cur_size, align) - cur_size);
+                        total_size_new += cur_size;
+                        LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", cur_size/1024.0/1024.0, cur_size/1024.0/1024.0);
+                    }
+
+                    LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
+                           ++idx, ml.n_tensors,
+                           ggml_get_name(tensor),
+                           llama_format_tensor_shape(tensor).c_str(),
+                           ggml_type_name(tensor->type));
+
+                    new_type = params->extra_output_type;
+                    chunk_size_multiplier = 1;
+                    auto [working_type, num_rows] = interleaved_properties(new_type);
+                    if (tensor->ne[1] % num_rows != 0) {
+                        new_type = working_type;
+                    } else {
+                        chunk_size_multiplier = num_rows;
+                    }
+                    LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
+                    fflush(stdout);
+
+                    do_quantize(nthread, tensor, new_type, f32_data, (char *)new_data, imatrix, workers,
+                        new_size, 1, params);
+
+                    name = extra.name;
+                } else {
+                    do_quantize(nthread, tensor, new_type, f32_data, (char *)new_data, imatrix, workers,
                            new_size, chunk_size_multiplier, params);
+                }

            }
            LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
--- a/src/llama.cpp
+++ b/src/llama.cpp
@ -5818,6 +5818,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
        /*.ffn_down_type               =*/ GGML_TYPE_COUNT,
        /*.ffn_up_type                 =*/ GGML_TYPE_COUNT,
        /*.ffn_gat_inp_type            =*/ GGML_TYPE_COUNT,
+        /*.extra_output_type           =*/ GGML_TYPE_COUNT,
        /*.allow_requantize            =*/ false,
        /*.quantize_output_tensor      =*/ true,
        /*.only_copy                   =*/ false,