MTP: option to use re-quantized output tensor for better TG performance (#1809)

* Option to use re-quantized output tensor for MTP
* Remove quantize extra output option
* Handle interleaved types
2026-06-28 04:30:15 -05:00 · 2026-05-16 14:40:18 +03:00 · 2026-05-16 14:40:18 +03:00 · 3e573cfea6
commit 3e573cfea6
parent 5cc0d86c76
10 changed files with 312 additions and 77 deletions
--- a/common/common.cpp
+++ b/common/common.cpp
@ -1727,6 +1727,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        }
        return true;
    }
+    if (arg == "--mtp-requantized-output-tensor" || arg == "-mtprot") {
+        CHECK_ARG
+        params.extra_output_type = argv[i];
+        return true;
+    }
    if (arg == "-ctkd" || arg == "--cache-type-k-draft") {
        params.speculative.cache_type_k = argv[++i];
        return true;
@ -3028,6 +3033,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
    options.push_back({ "*",           "-ctv-last,  --cache-type-k-last  TYPE,N", "KV cache data type for the last N layers of K  (default: %s,-1)", params.type_k_last.c_str() });
    options.push_back({ "*",           "-ctv-first, --cache-type-v-first TYPE,N", "KV cache data type for the first N layers of V (default: %s,-1)", params.type_v_first.c_str() });
    options.push_back({ "*",           "-ctk-last,  --cache-type-v-last  TYPE,N", "KV cache data type for the last N layers of V  (default: %s,-1)", params.type_v_last.c_str() });
+    // The long flag must be spelled exactly as gpt_params_find_arg accepts it ("--mtp-requantized-output-tensor").
+    options.push_back({ "*",           "-mtprot, --mtp-requantized-output-tensor type", "Use output requantized to type for MTP (default: %s)", params.extra_output_type.c_str() });
    options.push_back({ "*",           "-ctkd, --cache-type-k-draft TYPE", "KV cache data type for K for the draft model" });
    options.push_back({ "*",           "-ctvd, --cache-type-v-draft TYPE", "KV cache data type for V for the draft model" });

@ -3926,6 +3932,17 @@ static std::pair<int, int> get_batch_ubatch(const gpt_params & params) {
    return {n_batch, n_ubatch};
 }

+// Looks up the ggml_type whose ggml_type_name() equals `arg`.
+// Types are scanned in enum order; GGML_TYPE_COUNT is returned when none matches.
+static ggml_type parse_ggml_type(const char * arg) {
+    int idx = 0;
+    while (idx < GGML_TYPE_COUNT) {
+        const ggml_type candidate = ggml_type(idx);
+        const char * candidate_name = ggml_type_name(candidate);
+        // a type without a name can never match
+        const bool matches = candidate_name != nullptr && strcmp(arg, candidate_name) == 0;
+        if (matches) {
+            return candidate;
+        }
+        ++idx;
+    }
+    return GGML_TYPE_COUNT;
+}
+
 struct llama_model_params common_model_params_to_llama(const gpt_params & params) {
    auto mparams = llama_model_default_params();
    mparams.devices = params.devices.c_str();
@ -3948,6 +3965,9 @@ struct llama_model_params common_model_params_to_llama(const gpt_params & params
    mparams.type_k_last     = kv_cache_type_from_str(params.type_k_last );
    mparams.type_v_first    = kv_cache_type_from_str(params.type_v_first);
    mparams.type_v_last     = kv_cache_type_from_str(params.type_v_last );
+    if (!params.extra_output_type.empty()) {
+        mparams.extra_output_type = parse_ggml_type(params.extra_output_type.c_str());
+    }
    mparams.n_k_first       = params.n_k_first;
    mparams.n_k_last        = params.n_k_last;
    mparams.n_v_first       = params.n_v_first;
--- a/common/common.h
+++ b/common/common.h
@ -443,6 +443,8 @@ struct gpt_params {
    int32_t     n_v_first    = -1;
    int32_t     n_v_last     = -1;

+    std::string extra_output_type = "";
+
    // multimodal models (see examples/mtmd)
    common_params_model mmproj;
    bool mmproj_use_gpu = true;     // use GPU for multimodal model
--- a/ggml/src/iqk/iqk_quantize.cpp
+++ b/ggml/src/iqk/iqk_quantize.cpp
@ -240,6 +240,81 @@ void IQ1BNQuantizer::quantize_one_row_2bn(const float * src, block_iq2_bn * y, i
    }
 }

+// Maps a ggml type to a row count of 4, 8 or 16; every type that is not listed yields 1.
+// iqk_quantize_any() uses the result for the destination type as the number of rows it
+// converts together (its n_interleaved).
+// Two tables are provided: when HAVE_FANCY_SIMD is defined several types report a larger
+// count than in the fallback table (e.g. Q4_K_R4 -> 8 instead of 4, Q8_0_R8 -> 16 instead of 8).
+static inline int num_rows([[maybe_unused]] ggml_type type) {
+#ifdef HAVE_FANCY_SIMD
+    // table used when HAVE_FANCY_SIMD is defined
+    switch (type) {
+        case GGML_TYPE_Q2_K_R4:
+        case GGML_TYPE_Q3_K_R4:
+        case GGML_TYPE_Q6_K_R4:
+        case GGML_TYPE_IQ2_K_R4:
+        case GGML_TYPE_IQ3_K_R4:
+        case GGML_TYPE_IQ4_K_R4:
+        case GGML_TYPE_IQ5_K_R4:
+        case GGML_TYPE_IQ4_KS_R4:
+        case GGML_TYPE_IQ5_KS_R4:
+        case GGML_TYPE_IQ2_XXS_R4:
+        case GGML_TYPE_IQ2_XS_R4:
+        case GGML_TYPE_IQ2_S_R4:
+        case GGML_TYPE_IQ3_XXS_R4:
+        case GGML_TYPE_IQ1_S_R4:
+        case GGML_TYPE_IQ1_M_R4:
+        case GGML_TYPE_IQ3_S_R4: return 4;
+        case GGML_TYPE_IQ4_NL_R4:
+        case GGML_TYPE_Q5_0_R4:
+        case GGML_TYPE_Q6_0_R4:
+        case GGML_TYPE_IQ2_BN_R4:
+        case GGML_TYPE_IQ4_XS_R8:
+        case GGML_TYPE_Q4_K_R4:
+        case GGML_TYPE_Q5_K_R4:
+        case GGML_TYPE_Q8_KV:
+        case GGML_TYPE_Q8_KV_R8:
+        case GGML_TYPE_Q8_K_R8: return 8;
+        case GGML_TYPE_Q4_0_R8:
+        case GGML_TYPE_Q8_0_R8:
+        case GGML_TYPE_Q8_1:
+        case GGML_TYPE_Q8_K_R16:
+        case GGML_TYPE_BF16_R16: return 16;
+        default: return 1;
+    }
+#else
+    // fallback table used when HAVE_FANCY_SIMD is not defined
+    switch (type) {
+        case GGML_TYPE_Q2_K_R4:
+        case GGML_TYPE_Q3_K_R4:
+        case GGML_TYPE_Q4_K_R4:
+        case GGML_TYPE_Q5_K_R4:
+        case GGML_TYPE_Q6_K_R4:
+        case GGML_TYPE_Q5_0_R4:
+        case GGML_TYPE_Q6_0_R4:
+        case GGML_TYPE_IQ4_NL_R4:
+        case GGML_TYPE_IQ2_K_R4:
+        case GGML_TYPE_IQ3_K_R4:
+        case GGML_TYPE_IQ4_K_R4:
+        case GGML_TYPE_IQ5_K_R4:
+        case GGML_TYPE_IQ4_KS_R4:
+        case GGML_TYPE_IQ5_KS_R4:
+        case GGML_TYPE_IQ2_XXS_R4:
+        case GGML_TYPE_IQ2_XS_R4:
+        case GGML_TYPE_IQ2_S_R4:
+        case GGML_TYPE_IQ3_XXS_R4:
+        case GGML_TYPE_IQ3_S_R4:
+        case GGML_TYPE_IQ1_S_R4:
+        case GGML_TYPE_IQ1_M_R4:
+        case GGML_TYPE_IQ2_BN_R4: return 4;
+        case GGML_TYPE_IQ4_XS_R8:
+        case GGML_TYPE_Q4_0_R8:
+        case GGML_TYPE_Q8_0_R8:
+        case GGML_TYPE_Q8_KV:
+        case GGML_TYPE_Q8_KV_R8:
+        case GGML_TYPE_Q8_1:
+        case GGML_TYPE_Q8_K_R8: return 8;
+        case GGML_TYPE_Q8_K_R16:
+        case GGML_TYPE_BF16_R16: return 16;
+        default: return 1;
+    }
+#endif
+}
+
+
 }

 void iqk_quantize_any(int from_type, int to_type,
@ -251,21 +326,28 @@ void iqk_quantize_any(int from_type, int to_type,
    GGML_ASSERT(ggml_type_size(type_x) == nb0);
    auto type_y = ggml_type(to_type);
    auto row_size_y = ggml_row_size(type_y, ne0);
-    int64_t nrows = ne1*ne2*ne3;
+    auto n_interleaved = num_rows(type_y);
+    GGML_ASSERT(ne1 % n_interleaved == 0);
+    int64_t ne1i  = ne1/n_interleaved;
+    int64_t nrows = ne1i*ne2*ne3;
    int64_t nrows_per_thread = (nrows + nth - 1)/nth;
    int64_t first_row = nrows_per_thread*ith;
    if (first_row >= nrows) return;
    int64_t last_row = std::min(first_row + nrows_per_thread, nrows);
    for (int64_t row = first_row; row < last_row; ++row) {
-        int64_t i3 = row/(ne1*ne2);
-        int64_t i2 = (row - i3*ne1*ne2)/ne1;
-        int64_t i1 = row - i3*ne1*ne2 - i2*ne1;
-        const char * cx = (const char *)x + i1*nb1 + i2*nb2 + i3*nb3;
+        int64_t i3 = row/(ne1i*ne2);
+        int64_t i2 = (row - i3*ne1i*ne2)/ne1i;
+        int64_t i1 = row - i3*ne1i*ne2 - i2*ne1i;
+        auto cx = (const char *)x + i1*n_interleaved*nb1 + i2*nb2 + i3*nb3;
+        auto cy = (char *)y + (i3*ne1*ne2 + i2*ne1 + i1*n_interleaved)*row_size_y;
        // TODO: special case common types such as f16, q8_0
        //       (although the performance gains may be too small to justify the added complexity)
-        to_float((const void *)cx, (float *)work_buffer, ne0);
-        auto cy = (char *)y + (i3*ne1*ne2 + i2*ne1 + i1)*row_size_y;
-        from_float((const float *)work_buffer, (void *)cy, ne0);
+        if (type_x != GGML_TYPE_F32) {
+            to_float((const void *)cx, (float *)work_buffer, ne0*n_interleaved);
+            from_float((const float *)work_buffer, (void *)cy, ne0*n_interleaved);
+        } else {
+            from_float((const float *)cx, (void *)cy, ne0*n_interleaved);
+        }
    }
 }

--- a/include/llama.h
+++ b/include/llama.h
@ -394,6 +394,8 @@ extern "C" {
        int32_t n_v_first;
        int32_t n_v_last;

+        enum ggml_type extra_output_type;
+
        // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
        const float * tensor_split;

--- a/src/graphs/build_qwen35.cpp
+++ b/src/graphs/build_qwen35.cpp
@ -238,7 +238,7 @@ struct ggml_tensor * llm_build_context::build_qwen35moe_mtp(

    cb(cur, "result_norm", -1);

-    cur = build_output(lctx, ctx0, cur, model.output, mtp_layer.nextn.shared_head_norm, cb);
+    cur = build_output(lctx, ctx0, cur, model.output_mtp, mtp_layer.nextn.shared_head_norm, cb);
    cb(cur, "result_output", -1);

    return cur;
@ -317,7 +317,7 @@ struct ggml_tensor * llm_build_context::build_qwen35_mtp(
    cb(cur, "result_norm", -1);

    //cur = build_output(lctx, ctx0, cur, model.output, nullptr, cb);
-    cur = build_output(lctx, ctx0, cur, model.output, mtp_layer.nextn.shared_head_norm, cb);
+    cur = build_output(lctx, ctx0, cur, model.output_mtp, mtp_layer.nextn.shared_head_norm, cb);
    cb(cur, "result_output", -1);

    return cur;
--- a/src/llama-load-tensors.cpp
+++ b/src/llama-load-tensors.cpp
@ -1520,6 +1520,13 @@ bool create_tensors_helper::create_qwen35moe_tensors(const LLM_TN & tn) {
        if (model.output == NULL) {
            model.output = create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
        }
+        if (model.mtp) {
+            model.output_mtp = create_tensor(ctx_output, "output_extra.weight", {n_embd, n_vocab},
+                    llama_model_loader::TENSOR_NOT_REQUIRED);
+            if (!model.output_mtp) {
+                model.output_mtp = model.output;
+            }
+        }
    }

    const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
@ -1614,9 +1621,18 @@ bool create_tensors_helper::create_qwen35_tensors(const LLM_TN & tn) {
    // output
    {
        model.output_norm = create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-        model.output      = create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+        model.output      = create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab},
+                llama_model_loader::TENSOR_NOT_REQUIRED);
        if (model.output == NULL) {
-            model.output = create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+            model.output = create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab},
+                    llama_model_loader::TENSOR_DUPLICATED);
+        }
+        if (model.mtp) {
+            model.output_mtp = create_tensor(ctx_output, "output_extra.weight", {n_embd, n_vocab},
+                    llama_model_loader::TENSOR_NOT_REQUIRED);
+            if (!model.output_mtp) {
+                model.output_mtp = model.output;
+            }
        }
    }

--- a/src/llama-model-loader.h
+++ b/src/llama-model-loader.h
@ -70,6 +70,10 @@ struct llama_model_loader {
            offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);

            if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size()) {
+                auto data_offset = gguf_get_data_offset(gguf_ctx);
+                auto tensor_offset = gguf_get_tensor_offset(gguf_ctx, tensor_idx);
+                fprintf(stderr, "Error while loading tensor %s: offs = %zu (%zu, %zu), size: %zu, file size: %zu\n", name,
+                        offs, data_offset, tensor_offset, ggml_nbytes(tensor), file->size());
                throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", name));
            }
        }
--- a/src/llama-model.h
+++ b/src/llama-model.h
@ -415,6 +415,9 @@ struct llama_model {
    struct ggml_tensor * output;
    struct ggml_tensor * output_b;
    struct ggml_tensor * output_norm_enc;
+    struct ggml_tensor * output_mtp = nullptr;
+
+    std::unique_ptr<ggml_tensor> output_mtp_ptr;

    llama_split_tensor split_output;
    llama_split_tensor split_output_norm;
--- a/src/llama-quantize.cpp
+++ b/src/llama-quantize.cpp
@ -924,6 +924,72 @@ static llama_ftype repacked_ftype(llama_ftype ftype) {
    return ftype;
 }

+// Quantizes the f32 data of `tensor` into `new_data` using `new_type`.
+//
+//   nthread               - number of threads that may be used
+//   tensor                - supplies the shape: ne[2] matrices of ne[1] rows with ne[0] values each
+//   f32_data / new_data   - source floats / destination buffer
+//   imatrix               - optional importance data, ne[0] values per ne[2] slice (may be nullptr)
+//   workers               - thread container used for the spawned workers
+//   new_size              - out: sum of the sizes reported by the quantization calls
+//   chunk_size_multiplier - scales the chunk size used by the per-chunk path
+//   params                - only params->user_data is read
+//
+// Throws std::runtime_error when ggml_validate_row_data() rejects a slice in the per-slice path.
+static void do_quantize(int nthread, const ggml_tensor * tensor, ggml_type new_type, const float * f32_data, char * new_data,
+        const float * imatrix, std::vector<std::thread> & workers, size_t & new_size, int chunk_size_multiplier,
+        const llama_model_quantize_params * params) {
+    // Always start from zero: the per-slice path below only adds to new_size, so without this
+    // reset its result would depend on whatever value the caller left in the variable.
+    new_size = 0;
+    if (nthread > 1 && (tensor->ne[2] % nthread == 0 || tensor->ne[2] >= 2*nthread)) {
+        // Per-slice path: each worker repeatedly grabs the next ne[2] slice from a shared counter.
+        std::mutex mutex;
+        int counter = 0;
+        bool valid = true;
+        auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, tensor, imatrix, user_data = params->user_data] () {
+            int ne2 = tensor->ne[2];
+            auto row_size = ggml_row_size(new_type, tensor->ne[0]);
+            auto matrix_size = row_size * tensor->ne[1];
+            size_t local_size = 0;
+            while (true) {
+                std::unique_lock<std::mutex> lock(mutex);
+                int i02 = counter++;
+                if (i02 >= ne2) {
+                    // no slices left: publish this worker's total while the lock is still held
+                    if (local_size > 0) {
+                        new_size += local_size;
+                    }
+                    break;
+                }
+                lock.unlock();
+                auto this_imatrix = imatrix ? imatrix + i02 * tensor->ne[0] : nullptr;
+                auto this_data = (char *)new_data + i02*matrix_size;
+                auto this_size = ggml_quantize_chunk(new_type, f32_data + i02*tensor->ne[0]*tensor->ne[1], this_data,
+                        0, tensor->ne[1], tensor->ne[0], this_imatrix, user_data);
+                local_size += this_size;
+
+                // validate the quantized data
+                if (!ggml_validate_row_data(new_type, this_data, matrix_size)) {
+                    lock.lock();
+                    valid = false;
+                    break;
+                }
+            }
+        };
+        for (int it = 0; it < nthread; ++it) workers.emplace_back(compute);
+        for (auto & w : workers) w.join();
+        workers.clear();
+        if (!valid) {
+            throw std::runtime_error("quantized data validation failed");
+        }
+    } else {
+        // Per-chunk path: slices are handled one after another, threads split each slice into chunks.
+        static const int64_t min_chunk_size = 32 * 512;
+        const int64_t n_per_row = tensor->ne[0];
+        const int64_t nrows     = tensor->ne[1];
+        const int64_t chunk_size = (n_per_row >= min_chunk_size
+                                 ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) * chunk_size_multiplier;
+
+        const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
+        const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
+        const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;
+
+        // quantize each expert separately since they have different importance matrices
+        for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
+            const float * f32_data_03 = f32_data + i03 * nelements_matrix;
+            void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
+            const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
+
+            new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size,
+                    nrows, n_per_row, imatrix_03, params->user_data, workers, nthread_use);
+        }
+    }
+}
+
 static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
    ggml_type default_type;
    llama_ftype ftype = params->ftype;
@ -1210,6 +1276,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
    ctx_outs[0] = ctx_out;

    // populate the original tensors so we get an initial meta data
+    int last_split = -1;
    for (int i = 0; i < ml.n_tensors; ++i) {
        auto weight = ml.get_weight(i);
        uint16_t i_split = params->keep_split ? weight->idx : 0;
@ -1218,6 +1285,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
            ctx_outs[i_split] = gguf_init_empty();
        }
        gguf_add_tensor(ctx_outs[i_split], tensor);
+        if (i_split > last_split) {
+            last_split = i_split;
+        }
    }

    // Set split info if needed
@ -1290,7 +1360,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
            continue;
        }

-        const std::string name = ggml_get_name(tensor);
+        std::string name = ggml_get_name(tensor);

        if (!ml.use_mmap) {
            if (read_data.size() < ggml_nbytes(tensor)) {
@ -1450,7 +1520,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

            // If we've decided to quantize to the same type the tensor is already
            // in then there's nothing to do.
-            quantize = tensor->type != new_type;
+            quantize &= tensor->type != new_type;
        }

        if (!quantize) {
@ -1566,72 +1636,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                    f32_data = (float *) f32_conv_buf.data();
                }

-                if (work.size() < (size_t)nelements * 4) {
-                    work.resize(nelements * 4); // upper bound on size
+                auto expected_size = ggml_row_size(new_type, tensor->ne[0])*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
+
+                if (work.size() < expected_size) { //(size_t)nelements * 4) {
+                    //work.resize(nelements * 4); // upper bound on size
+                    work.resize(expected_size); // upper bound on size
                }
                new_data = work.data();

-                const int64_t n_per_row = tensor->ne[0];
-                const int64_t nrows = tensor->ne[1];
+                do_quantize(nthread, tensor, new_type, f32_data, (char *)new_data, imatrix, workers,
+                            new_size, chunk_size_multiplier, params);

-                if (nthread > 1 && (tensor->ne[2] % nthread == 0 || tensor->ne[2] >= 2*nthread)) {
-                    std::mutex mutex;
-                    int counter = 0;
-                    bool valid = true;
-                    auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, tensor, imatrix, user_data = params->user_data] () {
-                        int ne2 = tensor->ne[2];
-                        auto row_size = ggml_row_size(new_type, tensor->ne[0]);
-                        auto matrix_size = row_size * tensor->ne[1];
-                        size_t local_size = 0;
-                        while (true) {
-                            std::unique_lock<std::mutex> lock(mutex);
-                            int i02 = counter++;
-                            if (i02 >= ne2) {
-                                if (local_size > 0) {
-                                    new_size += local_size;
-                                }
-                                break;
-                            }
-                            lock.unlock();
-                            auto this_imatrix = imatrix ? imatrix + i02 * tensor->ne[0] : nullptr;
-                            auto this_data = (char *)new_data + i02*matrix_size;
-                            auto this_size = ggml_quantize_chunk(new_type, f32_data + i02*tensor->ne[0]*tensor->ne[1], this_data, 0, tensor->ne[1], tensor->ne[0],
-                                    this_imatrix, user_data);
-                            local_size += this_size;
-
-                            // validate the quantized data
-                            if (!ggml_validate_row_data(new_type, this_data, matrix_size)) {
-                                lock.lock();
-                                valid = false;
-                                break;
-                            }
-                        }
-                    };
-                    for (int it = 0; it < nthread; ++it) workers.emplace_back(std::thread(compute));
-                    for (auto & w : workers) w.join();
-                    workers.clear();
-                    if (!valid) {
-                        throw std::runtime_error("quantized data validation failed");
-                    }
-                } else {
-                static const int64_t min_chunk_size = 32 * 512;
-                const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) *
-                    chunk_size_multiplier;
-
-                const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
-                const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
-                const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;
-
-                // quantize each expert separately since they have different importance matrices
-                new_size = 0;
-                for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
-                    const float * f32_data_03 = f32_data + i03 * nelements_matrix;
-                    void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
-                    const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
-
-                    new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, params->user_data, workers, nthread_use);
-                }
-                }
            }
            LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
        }
--- a/src/llama.cpp
+++ b/src/llama.cpp
@ -2132,6 +2132,91 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {

 }

+// Creates a copy of the output tensor requantized to `new_type` and installs it as
+// model.output_mtp (the tensor struct is owned through model.output_mtp_ptr).
+//
+// The model is left untouched when:
+//   - new_type is GGML_TYPE_COUNT or the model has no output tensor,
+//   - model.output_mtp already points at a tensor other than model.output,
+//   - the output tensor already has type new_type,
+//   - the requantized tensor would not be smaller than the original,
+//   - the backend buffer for the new tensor cannot be allocated.
+// If the row count is not a multiple of the interleaving reported by interleaved_properties(),
+// the alternative type it reports is used instead of new_type.
+static void llm_requantize_output_tensor(llama_model & model, ggml_type new_type) {
+    if (new_type == GGML_TYPE_COUNT || !model.output) return;
+    if (model.output_mtp && model.output_mtp != model.output) {
+        LLAMA_LOG_WARN("%s: MTP output tensor is already present => not requantizing\n", __func__);
+        return;
+    }
+    if (model.output->type == new_type) {
+        LLAMA_LOG_WARN("%s: output tensor is already of type %s => not requantizing\n", __func__, ggml_type_name(new_type));
+        // actually stop here, as the message says
+        return;
+    }
+    auto [other_type, n_interleaved] = interleaved_properties(new_type);
+    if (model.output->ne[1] % n_interleaved != 0) {
+        // ne[1] is int64_t: print it via long long so the format is correct where long is 32-bit
+        LLAMA_LOG_WARN("%s: number of rows %lld is not a multiple of %d row interleaving for %s\n", __func__,
+                (long long)model.output->ne[1], n_interleaved, ggml_type_name(new_type));
+        LLAMA_LOG_WARN("%s: using %s instead of %s\n", __func__, ggml_type_name(other_type), ggml_type_name(new_type));
+        new_type = other_type;
+        n_interleaved = 1;
+    }
+    const size_t nbytes_orig = ggml_nbytes(model.output);
+    const size_t row_size    = ggml_row_size(new_type, model.output->ne[0]);
+    const size_t nbytes_new  = row_size*ggml_nrows(model.output);
+    if (nbytes_new >= nbytes_orig) {
+        LLAMA_LOG_WARN("%s: if requantized to %s the output tensor size would be %zu, which is >= the current size %zu => not requantizing\n", __func__, ggml_type_name(new_type), nbytes_new, nbytes_orig);
+        return;
+    }
+
+    LLAMA_LOG_INFO("====== Creating extra output tensor of type %s for MTP usage. Additional memory required is %.2f MiB\n",
+            ggml_type_name(new_type), nbytes_new/1024./1024.);
+
+    bool is_host = ggml_backend_buffer_is_host(model.output->buffer);
+
+    // Source data: read in place for host buffers, otherwise fetch a copy from the backend.
+    auto tensor_data = model.output->data;
+    std::vector<char> tensor_data_buf;
+    if (!is_host) {
+        tensor_data_buf.resize(nbytes_orig);
+        ggml_backend_tensor_get(model.output, tensor_data_buf.data(), 0, nbytes_orig);
+        tensor_data = tensor_data_buf.data();
+    }
+
+    // Start from a copy of the original tensor struct and patch type, strides and storage.
+    auto tt_new  = ggml_internal_get_type_traits(new_type);
+    auto new_output = std::make_unique<ggml_tensor>(*model.output);
+    new_output->type = new_type;
+    new_output->nb[0] = tt_new.type_size;
+    new_output->nb[1] = row_size;
+    new_output->nb[2] = new_output->nb[1] * new_output->ne[1];
+    new_output->nb[3] = new_output->nb[2] * new_output->ne[2];
+    GGML_ASSERT(ggml_nbytes(new_output.get()) == nbytes_new);
+    // NOTE(review): this buffer is not freed in this function and model.output_mtp_ptr owns only
+    // the tensor struct - confirm that the buffer is released elsewhere.
+    new_output->buffer = ggml_backend_buft_alloc_buffer(ggml_backend_buffer_get_type(model.output->buffer), nbytes_new);
+    if (!new_output->buffer) {
+        // allocation failed: keep using the original output tensor instead of dereferencing null
+        LLAMA_LOG_WARN("%s: failed to allocate %zu bytes for the requantized output tensor => not requantizing\n", __func__, nbytes_new);
+        return;
+    }
+    new_output->data   = ggml_backend_buffer_get_base(new_output->buffer);
+    new_output->op     = GGML_OP_NONE;
+    for (int j = 0; j < GGML_MAX_SRC; ++j) new_output->src[j] = nullptr;
+    ggml_set_name(new_output.get(), "output_extra.weight");
+    ggml_backend_buffer_set_usage(new_output->buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+
+    // Destination: write directly into host buffers, otherwise stage in memory and upload below.
+    std::vector<char> new_data_buf;
+    char * new_data = (char *)new_output->data;
+    if (!is_host) {
+        new_data_buf.resize(nbytes_new);
+        new_data = new_data_buf.data();
+    }
+
+    int nthread = std::max<int>(1, std::thread::hardware_concurrency()/2);
+
+    // n_interleaved is a structured binding; C++17 does not allow capturing those directly,
+    // so it is copied into the closure with an init-capture.
+    auto compute = [t = model.output, tensor_data, new_data, nthread, new_type, n_rows = n_interleaved] (int ith) {
+        std::vector<float> work(t->ne[0]*n_rows);
+        auto tt_orig = ggml_internal_get_type_traits(t->type);
+        auto tt_new  = ggml_internal_get_type_traits(new_type);
+        iqk_quantize_any(int(t->type), int(new_type),
+                t->ne[0], t->ne[1], t->ne[2], t->ne[3],
+                t->nb[0], t->nb[1], t->nb[2], t->nb[3],
+                tensor_data, new_data, work.data(), tt_orig.to_float, tt_new.from_float, ith, nthread);
+    };
+    // nthread-1 helper threads plus the calling thread, which takes the last index
+    std::vector<std::thread> workers(nthread-1);
+    for (int it = 0; it < nthread-1; ++it) workers[it] = std::thread(compute, it);
+    compute(nthread-1);
+    for (auto & w : workers) w.join();
+
+    if (!is_host) {
+        ggml_backend_tensor_set(new_output.get(), new_data, 0, nbytes_new);
+    }
+
+    model.output_mtp_ptr = std::move(new_output);
+    model.output_mtp     = model.output_mtp_ptr.get();
+}
+
 static void llm_prepare_mla(llama_model & model, int mla) {
    if (model.arch != LLM_ARCH_DEEPSEEK2 && model.arch != LLM_ARCH_GLM_DSA && model.arch != LLM_ARCH_MISTRAL4) return;
    const auto& hparams = model.hparams;
@ -2768,6 +2853,7 @@ static bool llm_load_tensors(
        const float * tensor_split,
        ggml_type cache_type_k,
        ggml_type cache_type_v,
+        ggml_type extra_output_type,
        uint32_t max_ctx_size,
        int n_seq_max,
        int n_ubatch,
@ -3364,6 +3450,9 @@ static bool llm_load_tensors(
    if (model.arch == LLM_ARCH_GEMMA4) {
        llm_scale_gate_inp_s(model, use_mmap_buffer);
    }
+    if ((model.arch == LLM_ARCH_QWEN35 || model.arch == LLM_ARCH_QWEN35MOE) && extra_output_type != GGML_TYPE_COUNT) {
+        llm_requantize_output_tensor(model, extra_output_type);
+    }

    if (use_mmap_buffer) {
        for (auto & mapping : ml.mappings) {
@ -3521,7 +3610,8 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam

        if (!llm_load_tensors(
            ml, model, params.n_gpu_layers, params.mla, params.split_mode, params.main_gpu, params.max_gpu, params.tensor_split,
-            params.type_k, params.type_v, params.max_ctx_size, params.n_seq_max, params.n_ubatch, params.amb, params.fit_margin,
+            params.type_k, params.type_v, params.extra_output_type,
+            params.max_ctx_size, params.n_seq_max, params.n_ubatch, params.amb, params.fit_margin,
            params.worst_graph_tokens, params.flash_attn,
            params.use_mlock, params.validate_quants, params.mtp, params.fit, params.dry_run,
            params.progress_callback, params.progress_callback_user_data
@ -5617,6 +5707,7 @@ struct llama_model_params llama_model_default_params() {
        /*.n_last_k                    =*/ -1,
        /*.n_first_v                   =*/ -1,
        /*.n_last_v                    =*/ -1,
+        /*.extra_output_type           =*/ GGML_TYPE_COUNT,
        /*.tensor_split                =*/ nullptr,
        /*.rpc_servers                 =*/ nullptr,
        /*.progress_callback           =*/ nullptr,