MTP: option to use re-quantized output tensor for better TG performance (#1809)

* Option to use re-quantized output tensor for MTP

* Remove quantize extra output option

* Handle interleaved types
This commit is contained in:
Kawrakow 2026-05-16 14:40:18 +03:00 committed by GitHub
parent 5cc0d86c76
commit 3e573cfea6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 312 additions and 77 deletions

View File

@ -1727,6 +1727,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
}
return true;
}
if (arg == "--mtp-requantized-output-tensor" || arg == "-mtprot") {
CHECK_ARG
params.extra_output_type = argv[i];
return true;
}
if (arg == "-ctkd" || arg == "--cache-type-k-draft") {
params.speculative.cache_type_k = argv[++i];
return true;
@ -3028,6 +3033,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", "-ctv-last, --cache-type-k-last TYPE,N", "KV cache data type for the last N layers of K (default: %s,-1)", params.type_k_last.c_str() });
options.push_back({ "*", "-ctv-first, --cache-type-v-first TYPE,N", "KV cache data type for the first N layers of V (default: %s,-1)", params.type_v_first.c_str() });
options.push_back({ "*", "-ctk-last, --cache-type-v-last TYPE,N", "KV cache data type for the last N layers of V (default: %s,-1)", params.type_v_last.c_str() });
// The long option spelled here must match what gpt_params_find_arg accepts ("--mtp-requantized-output-tensor").
options.push_back({ "*", "-mtprot, --mtp-requantized-output-tensor TYPE", "Use output requantized to type for MTP (default: %s)", params.extra_output_type.c_str() });
options.push_back({ "*", "-ctkd, --cache-type-k-draft TYPE", "KV cache data type for K for the draft model" });
options.push_back({ "*", "-ctvd, --cache-type-v-draft TYPE", "KV cache data type for V for the draft model" });
@ -3926,6 +3932,17 @@ static std::pair<int, int> get_batch_ubatch(const gpt_params & params) {
return {n_batch, n_ubatch};
}
// Translates a type name into the corresponding ggml_type.
//
// Every enumerator below GGML_TYPE_COUNT is probed via ggml_type_name(); the first one whose
// name compares equal to `arg` (exact, case-sensitive strcmp match) is returned.
// GGML_TYPE_COUNT is returned as the "not found" marker.
static ggml_type parse_ggml_type(const char * arg) {
    int idx = 0;
    while (idx < GGML_TYPE_COUNT) {
        const ggml_type candidate = ggml_type(idx);
        const char * candidate_name = ggml_type_name(candidate);
        // Some enumerators may have no name; those can never match.
        const bool matches = candidate_name && strcmp(arg, candidate_name) == 0;
        if (matches) {
            return candidate;
        }
        ++idx;
    }
    return GGML_TYPE_COUNT;
}
struct llama_model_params common_model_params_to_llama(const gpt_params & params) {
auto mparams = llama_model_default_params();
mparams.devices = params.devices.c_str();
@ -3948,6 +3965,9 @@ struct llama_model_params common_model_params_to_llama(const gpt_params & params
mparams.type_k_last = kv_cache_type_from_str(params.type_k_last );
mparams.type_v_first = kv_cache_type_from_str(params.type_v_first);
mparams.type_v_last = kv_cache_type_from_str(params.type_v_last );
if (!params.extra_output_type.empty()) {
mparams.extra_output_type = parse_ggml_type(params.extra_output_type.c_str());
}
mparams.n_k_first = params.n_k_first;
mparams.n_k_last = params.n_k_last;
mparams.n_v_first = params.n_v_first;

View File

@ -443,6 +443,8 @@ struct gpt_params {
int32_t n_v_first = -1;
int32_t n_v_last = -1;
std::string extra_output_type = "";
// multimodal models (see examples/mtmd)
common_params_model mmproj;
bool mmproj_use_gpu = true; // use GPU for multimodal model

View File

@ -240,6 +240,81 @@ void IQ1BNQuantizer::quantize_one_row_2bn(const float * src, block_iq2_bn * y, i
}
}
// Returns the row-group size associated with `type`; any type not listed maps to 1.
//
// Most entries are the same in every build. Two groups of types report a larger value when
// HAVE_FANCY_SIMD is defined, which is expressed through the two build-dependent constants below.
static inline int num_rows([[maybe_unused]] ggml_type type) {
#ifdef HAVE_FANCY_SIMD
    const int n_group_a = 8;   // 4 without HAVE_FANCY_SIMD
    const int n_group_b = 16;  // 8 without HAVE_FANCY_SIMD
#else
    const int n_group_a = 4;   // 8 with HAVE_FANCY_SIMD
    const int n_group_b = 8;   // 16 with HAVE_FANCY_SIMD
#endif
    switch (type) {
        // Always 4, independent of the build.
        case GGML_TYPE_Q2_K_R4:
        case GGML_TYPE_Q3_K_R4:
        case GGML_TYPE_Q6_K_R4:
        case GGML_TYPE_IQ2_K_R4:
        case GGML_TYPE_IQ3_K_R4:
        case GGML_TYPE_IQ4_K_R4:
        case GGML_TYPE_IQ5_K_R4:
        case GGML_TYPE_IQ4_KS_R4:
        case GGML_TYPE_IQ5_KS_R4:
        case GGML_TYPE_IQ2_XXS_R4:
        case GGML_TYPE_IQ2_XS_R4:
        case GGML_TYPE_IQ2_S_R4:
        case GGML_TYPE_IQ3_XXS_R4:
        case GGML_TYPE_IQ3_S_R4:
        case GGML_TYPE_IQ1_S_R4:
        case GGML_TYPE_IQ1_M_R4:
            return 4;
        // 8 with HAVE_FANCY_SIMD, 4 otherwise.
        case GGML_TYPE_Q4_K_R4:
        case GGML_TYPE_Q5_K_R4:
        case GGML_TYPE_Q5_0_R4:
        case GGML_TYPE_Q6_0_R4:
        case GGML_TYPE_IQ4_NL_R4:
        case GGML_TYPE_IQ2_BN_R4:
            return n_group_a;
        // Always 8, independent of the build.
        case GGML_TYPE_IQ4_XS_R8:
        case GGML_TYPE_Q8_KV:
        case GGML_TYPE_Q8_KV_R8:
        case GGML_TYPE_Q8_K_R8:
            return 8;
        // 16 with HAVE_FANCY_SIMD, 8 otherwise.
        case GGML_TYPE_Q4_0_R8:
        case GGML_TYPE_Q8_0_R8:
        case GGML_TYPE_Q8_1:
            return n_group_b;
        // Always 16, independent of the build.
        case GGML_TYPE_Q8_K_R16:
        case GGML_TYPE_BF16_R16:
            return 16;
        default:
            return 1;
    }
}
}
void iqk_quantize_any(int from_type, int to_type,
@ -251,21 +326,28 @@ void iqk_quantize_any(int from_type, int to_type,
GGML_ASSERT(ggml_type_size(type_x) == nb0);
auto type_y = ggml_type(to_type);
auto row_size_y = ggml_row_size(type_y, ne0);
int64_t nrows = ne1*ne2*ne3;
auto n_interleaved = num_rows(type_y);
GGML_ASSERT(ne1 % n_interleaved == 0);
int64_t ne1i = ne1/n_interleaved;
int64_t nrows = ne1i*ne2*ne3;
int64_t nrows_per_thread = (nrows + nth - 1)/nth;
int64_t first_row = nrows_per_thread*ith;
if (first_row >= nrows) return;
int64_t last_row = std::min(first_row + nrows_per_thread, nrows);
for (int64_t row = first_row; row < last_row; ++row) {
int64_t i3 = row/(ne1*ne2);
int64_t i2 = (row - i3*ne1*ne2)/ne1;
int64_t i1 = row - i3*ne1*ne2 - i2*ne1;
const char * cx = (const char *)x + i1*nb1 + i2*nb2 + i3*nb3;
int64_t i3 = row/(ne1i*ne2);
int64_t i2 = (row - i3*ne1i*ne2)/ne1i;
int64_t i1 = row - i3*ne1i*ne2 - i2*ne1i;
auto cx = (const char *)x + i1*n_interleaved*nb1 + i2*nb2 + i3*nb3;
auto cy = (char *)y + (i3*ne1*ne2 + i2*ne1 + i1*n_interleaved)*row_size_y;
// TODO: special case common types such as f16, q8_0
// (although the performance gains may be too small to justify the added complexity)
to_float((const void *)cx, (float *)work_buffer, ne0);
auto cy = (char *)y + (i3*ne1*ne2 + i2*ne1 + i1)*row_size_y;
from_float((const float *)work_buffer, (void *)cy, ne0);
if (type_x != GGML_TYPE_F32) {
to_float((const void *)cx, (float *)work_buffer, ne0*n_interleaved);
from_float((const float *)work_buffer, (void *)cy, ne0*n_interleaved);
} else {
from_float((const float *)cx, (void *)cy, ne0*n_interleaved);
}
}
}

View File

@ -394,6 +394,8 @@ extern "C" {
int32_t n_v_first;
int32_t n_v_last;
enum ggml_type extra_output_type;
// proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
const float * tensor_split;

View File

@ -238,7 +238,7 @@ struct ggml_tensor * llm_build_context::build_qwen35moe_mtp(
cb(cur, "result_norm", -1);
cur = build_output(lctx, ctx0, cur, model.output, mtp_layer.nextn.shared_head_norm, cb);
cur = build_output(lctx, ctx0, cur, model.output_mtp, mtp_layer.nextn.shared_head_norm, cb);
cb(cur, "result_output", -1);
return cur;
@ -317,7 +317,7 @@ struct ggml_tensor * llm_build_context::build_qwen35_mtp(
cb(cur, "result_norm", -1);
//cur = build_output(lctx, ctx0, cur, model.output, nullptr, cb);
cur = build_output(lctx, ctx0, cur, model.output, mtp_layer.nextn.shared_head_norm, cb);
cur = build_output(lctx, ctx0, cur, model.output_mtp, mtp_layer.nextn.shared_head_norm, cb);
cb(cur, "result_output", -1);
return cur;

View File

@ -1520,6 +1520,13 @@ bool create_tensors_helper::create_qwen35moe_tensors(const LLM_TN & tn) {
if (model.output == NULL) {
model.output = create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
}
if (model.mtp) {
model.output_mtp = create_tensor(ctx_output, "output_extra.weight", {n_embd, n_vocab},
llama_model_loader::TENSOR_NOT_REQUIRED);
if (!model.output_mtp) {
model.output_mtp = model.output;
}
}
}
const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
@ -1614,9 +1621,18 @@ bool create_tensors_helper::create_qwen35_tensors(const LLM_TN & tn) {
// output
{
model.output_norm = create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
model.output = create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
model.output = create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab},
llama_model_loader::TENSOR_NOT_REQUIRED);
if (model.output == NULL) {
model.output = create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
model.output = create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab},
llama_model_loader::TENSOR_DUPLICATED);
}
if (model.mtp) {
model.output_mtp = create_tensor(ctx_output, "output_extra.weight", {n_embd, n_vocab},
llama_model_loader::TENSOR_NOT_REQUIRED);
if (!model.output_mtp) {
model.output_mtp = model.output;
}
}
}

View File

@ -70,6 +70,10 @@ struct llama_model_loader {
offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size()) {
auto data_offset = gguf_get_data_offset(gguf_ctx);
auto tensor_offset = gguf_get_tensor_offset(gguf_ctx, tensor_idx);
fprintf(stderr, "Error while loading tensor %s: offs = %zu (%zu, %zu), size: %zu, file size: %zu\n", name,
offs, data_offset, tensor_offset, ggml_nbytes(tensor), file->size());
throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", name));
}
}

View File

@ -415,6 +415,9 @@ struct llama_model {
struct ggml_tensor * output;
struct ggml_tensor * output_b;
struct ggml_tensor * output_norm_enc;
struct ggml_tensor * output_mtp = nullptr;
std::unique_ptr<ggml_tensor> output_mtp_ptr;
llama_split_tensor split_output;
llama_split_tensor split_output_norm;

View File

@ -924,6 +924,72 @@ static llama_ftype repacked_ftype(llama_ftype ftype) {
return ftype;
}
// Quantizes the f32 data of `tensor` to `new_type`.
//
//   nthread               number of threads that may be used
//   tensor                supplies the shape (ne[0] x ne[1] x ne[2]) of the data being quantized
//   f32_data              source values, laid out slice after slice along ne[2]
//   new_data              destination buffer for the quantized data
//   imatrix               optional importance matrix (may be null); advanced by ne[0] entries per ne[2] slice
//   workers               thread container reused for the worker threads
//   new_size              out: total number of bytes produced (always reset here before accumulating)
//   chunk_size_multiplier scales the chunk size used by the sequential-slice path
//   params                only params->user_data is read and forwarded to the quantization routines
//
// Two strategies are used:
//   * When more than one thread is available and ne[2] is either a multiple of nthread or at least
//     2*nthread, whole ne[0] x ne[1] slices are handed out to the threads through a mutex-protected
//     counter. Each slice is quantized with ggml_quantize_chunk and checked with ggml_validate_row_data.
//   * Otherwise the slices are processed one after the other and llama_tensor_quantize_internal
//     is given the threads to use within each slice.
//
// Throws std::runtime_error when validation fails in the slice-parallel path.
static void do_quantize(int nthread, const ggml_tensor * tensor, ggml_type new_type, const float * f32_data, char * new_data,
        const float * imatrix, std::vector<std::thread> & workers, size_t & new_size, int chunk_size_multiplier,
        const llama_model_quantize_params * params) {
    // Both strategies accumulate into new_size, so start from a known value instead of relying on
    // whatever the caller left in the out-parameter.
    new_size = 0;
    if (nthread > 1 && (tensor->ne[2] % nthread == 0 || tensor->ne[2] >= 2*nthread)) {
        std::mutex mutex;
        int counter = 0;
        bool valid = true;
        auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, tensor, imatrix, user_data = params->user_data] () {
            int ne2 = tensor->ne[2];
            auto row_size = ggml_row_size(new_type, tensor->ne[0]);
            auto matrix_size = row_size * tensor->ne[1];
            size_t local_size = 0;
            while (true) {
                // The lock protects the slice counter, new_size and valid.
                std::unique_lock<std::mutex> lock(mutex);
                int i02 = counter++;
                if (i02 >= ne2) {
                    // No slices left: publish this thread's byte count while still holding the lock.
                    if (local_size > 0) {
                        new_size += local_size;
                    }
                    break;
                }
                lock.unlock();
                auto this_imatrix = imatrix ? imatrix + i02 * tensor->ne[0] : nullptr;
                auto this_data = (char *)new_data + i02*matrix_size;
                auto this_size = ggml_quantize_chunk(new_type, f32_data + i02*tensor->ne[0]*tensor->ne[1], this_data,
                        0, tensor->ne[1], tensor->ne[0], this_imatrix, user_data);
                local_size += this_size;
                // validate the quantized data
                if (!ggml_validate_row_data(new_type, this_data, matrix_size)) {
                    lock.lock();
                    valid = false;
                    break;
                }
            }
        };
        for (int it = 0; it < nthread; ++it) workers.emplace_back(std::thread(compute));
        for (auto & w : workers) w.join();
        workers.clear();
        if (!valid) {
            throw std::runtime_error("quantized data validation failed");
        }
    } else {
        static const int64_t min_chunk_size = 32 * 512;
        const int64_t n_per_row = tensor->ne[0];
        const int64_t nrows = tensor->ne[1];
        // A chunk is a whole number of rows covering at least min_chunk_size elements, scaled by the multiplier.
        const int64_t chunk_size = (n_per_row >= min_chunk_size
                ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) * chunk_size_multiplier;
        const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
        const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
        const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;
        // quantize each expert separately since they have different importance matrices
        for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
            const float * f32_data_03 = f32_data + i03 * nelements_matrix;
            void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
            const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
            new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size,
                    nrows, n_per_row, imatrix_03, params->user_data, workers, nthread_use);
        }
    }
}
static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
ggml_type default_type;
llama_ftype ftype = params->ftype;
@ -1210,6 +1276,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
ctx_outs[0] = ctx_out;
// populate the original tensors so we get an initial meta data
int last_split = -1;
for (int i = 0; i < ml.n_tensors; ++i) {
auto weight = ml.get_weight(i);
uint16_t i_split = params->keep_split ? weight->idx : 0;
@ -1218,6 +1285,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
ctx_outs[i_split] = gguf_init_empty();
}
gguf_add_tensor(ctx_outs[i_split], tensor);
if (i_split > last_split) {
last_split = i_split;
}
}
// Set split info if needed
@ -1290,7 +1360,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
continue;
}
const std::string name = ggml_get_name(tensor);
std::string name = ggml_get_name(tensor);
if (!ml.use_mmap) {
if (read_data.size() < ggml_nbytes(tensor)) {
@ -1450,7 +1520,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
// If we've decided to quantize to the same type the tensor is already
// in then there's nothing to do.
quantize = tensor->type != new_type;
quantize &= tensor->type != new_type;
}
if (!quantize) {
@ -1566,72 +1636,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
f32_data = (float *) f32_conv_buf.data();
}
if (work.size() < (size_t)nelements * 4) {
work.resize(nelements * 4); // upper bound on size
auto expected_size = ggml_row_size(new_type, tensor->ne[0])*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
if (work.size() < expected_size) { //(size_t)nelements * 4) {
//work.resize(nelements * 4); // upper bound on size
work.resize(expected_size); // upper bound on size
}
new_data = work.data();
const int64_t n_per_row = tensor->ne[0];
const int64_t nrows = tensor->ne[1];
do_quantize(nthread, tensor, new_type, f32_data, (char *)new_data, imatrix, workers,
new_size, chunk_size_multiplier, params);
if (nthread > 1 && (tensor->ne[2] % nthread == 0 || tensor->ne[2] >= 2*nthread)) {
std::mutex mutex;
int counter = 0;
bool valid = true;
auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, tensor, imatrix, user_data = params->user_data] () {
int ne2 = tensor->ne[2];
auto row_size = ggml_row_size(new_type, tensor->ne[0]);
auto matrix_size = row_size * tensor->ne[1];
size_t local_size = 0;
while (true) {
std::unique_lock<std::mutex> lock(mutex);
int i02 = counter++;
if (i02 >= ne2) {
if (local_size > 0) {
new_size += local_size;
}
break;
}
lock.unlock();
auto this_imatrix = imatrix ? imatrix + i02 * tensor->ne[0] : nullptr;
auto this_data = (char *)new_data + i02*matrix_size;
auto this_size = ggml_quantize_chunk(new_type, f32_data + i02*tensor->ne[0]*tensor->ne[1], this_data, 0, tensor->ne[1], tensor->ne[0],
this_imatrix, user_data);
local_size += this_size;
// validate the quantized data
if (!ggml_validate_row_data(new_type, this_data, matrix_size)) {
lock.lock();
valid = false;
break;
}
}
};
for (int it = 0; it < nthread; ++it) workers.emplace_back(std::thread(compute));
for (auto & w : workers) w.join();
workers.clear();
if (!valid) {
throw std::runtime_error("quantized data validation failed");
}
} else {
static const int64_t min_chunk_size = 32 * 512;
const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) *
chunk_size_multiplier;
const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;
// quantize each expert separately since they have different importance matrices
new_size = 0;
for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
const float * f32_data_03 = f32_data + i03 * nelements_matrix;
void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, params->user_data, workers, nthread_use);
}
}
}
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
}

View File

@ -2132,6 +2132,91 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
}
// Builds a copy of model.output re-quantized to `new_type` and installs it as model.output_mtp
// (the tensor struct is owned by model.output_mtp_ptr).
//
// Nothing is done (apart from a warning where applicable) when:
//   * new_type is GGML_TYPE_COUNT or the model has no output tensor,
//   * a separate MTP output tensor is already present,
//   * the output tensor already has the requested type,
//   * the re-quantized tensor would not be smaller than the current output tensor,
//   * the buffer for the new tensor cannot be allocated.
//
// If the number of rows of the output tensor is not a multiple of the row interleaving reported by
// interleaved_properties() for new_type, the alternative type reported by that helper is used instead.
//
// The conversion itself is done by iqk_quantize_any on max(1, hardware_concurrency/2) threads.
// For tensors living in non-host buffers the data is staged through temporary host vectors.
static void llm_requantize_output_tensor(llama_model & model, ggml_type new_type) {
    if (new_type == GGML_TYPE_COUNT || !model.output) return;
    if (model.output_mtp && model.output_mtp != model.output) {
        LLAMA_LOG_WARN("%s: MTP output tensor is already present => not requantizing\n", __func__);
        return;
    }
    if (model.output->type == new_type) {
        LLAMA_LOG_WARN("%s: output tensor is already of type %s => not requantizing\n", __func__, ggml_type_name(new_type));
        // The message promises that nothing is done, so stop here.
        return;
    }
    auto [other_type, n_interleaved] = interleaved_properties(new_type);
    if (model.output->ne[1] % n_interleaved != 0) {
        // ne[1] is 64-bit; print it via long long so the format is correct on every platform.
        LLAMA_LOG_WARN("%s: number of rows %lld is not a multiple of %d row interleaving for %s\n", __func__,
                (long long)model.output->ne[1], n_interleaved, ggml_type_name(new_type));
        LLAMA_LOG_WARN("%s: using %s instead of %s\n", __func__, ggml_type_name(other_type), ggml_type_name(new_type));
        new_type = other_type;
        n_interleaved = 1;
    }
    auto nbytes_orig = ggml_nbytes(model.output);
    auto row_size = ggml_row_size(new_type, model.output->ne[0]);
    auto nbytes_new = row_size*ggml_nrows(model.output);
    if (nbytes_new >= nbytes_orig) {
        LLAMA_LOG_WARN("%s: if requantized to %s the output tensor size would be %zu, which is >= the current size %zu => not requantizing\n", __func__, ggml_type_name(new_type), nbytes_new, nbytes_orig);
        return;
    }
    LLAMA_LOG_INFO("====== Creating extra output tensor of type %s for MTP usage. Additional memory required is %.2f MiB\n",
            ggml_type_name(new_type), nbytes_new/1024./1024.);

    // Source data: read in place for host buffers, otherwise fetched into a temporary host copy.
    bool is_host = ggml_backend_buffer_is_host(model.output->buffer);
    auto tensor_data = model.output->data;
    std::vector<char> tensor_data_buf;
    if (!is_host) {
        tensor_data_buf.resize(nbytes_orig);
        ggml_backend_tensor_get(model.output, tensor_data_buf.data(), 0, nbytes_orig);
        tensor_data = tensor_data_buf.data();
    }

    // Start from a copy of the output tensor's metadata and patch type and strides for the new type.
    auto tt_new = ggml_internal_get_type_traits(new_type);
    auto new_output = std::make_unique<ggml_tensor>(*model.output);
    new_output->type = new_type;
    new_output->nb[0] = tt_new.type_size;
    new_output->nb[1] = row_size;
    new_output->nb[2] = new_output->nb[1] * new_output->ne[1];
    new_output->nb[3] = new_output->nb[2] * new_output->ne[2];
    GGML_ASSERT(ggml_nbytes(new_output.get()) == nbytes_new);

    // The new tensor gets its own buffer of the same buffer type as the original output tensor.
    auto new_buffer = ggml_backend_buft_alloc_buffer(ggml_backend_buffer_get_type(model.output->buffer), nbytes_new);
    if (!new_buffer) {
        LLAMA_LOG_WARN("%s: failed to allocate %zu bytes for the requantized output tensor => not requantizing\n", __func__, nbytes_new);
        return;
    }
    // NOTE(review): nothing visible here releases this buffer again (output_mtp_ptr only owns the
    // ggml_tensor struct) - confirm it is freed elsewhere when the model is destroyed.
    new_output->buffer = new_buffer;
    new_output->data = ggml_backend_buffer_get_base(new_output->buffer);
    new_output->op = GGML_OP_NONE;
    for (int j = 0; j < GGML_MAX_SRC; ++j) new_output->src[j] = nullptr;
    ggml_set_name(new_output.get(), "output_extra.weight");
    ggml_backend_buffer_set_usage(new_output->buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);

    // Destination: write directly into the new tensor for host buffers, otherwise into a staging vector.
    std::vector<char> new_data_buf;
    char * new_data = (char *)new_output->data;
    if (!is_host) {
        new_data_buf.resize(nbytes_new);
        new_data = new_data_buf.data();
    }

    int nthread = std::max<int>(1, std::thread::hardware_concurrency()/2);
    // n_interleaved is a structured binding; it is passed via an init-capture because naming a
    // structured binding in a plain capture is not valid C++17.
    auto compute = [t = model.output, tensor_data, new_data, nthread, new_type, n_interleaved = n_interleaved] (int ith) {
        // Per-thread scratch space for the f32 values of one group of interleaved rows.
        std::vector<float> work(t->ne[0]*n_interleaved);
        auto tt_orig = ggml_internal_get_type_traits(t->type);
        auto tt_new = ggml_internal_get_type_traits(new_type);
        iqk_quantize_any(int(t->type), int(new_type),
                t->ne[0], t->ne[1], t->ne[2], t->ne[3],
                t->nb[0], t->nb[1], t->nb[2], t->nb[3],
                tensor_data, new_data, work.data(), tt_orig.to_float, tt_new.from_float, ith, nthread);
    };
    // nthread-1 helper threads; the calling thread takes the last share itself.
    std::vector<std::thread> workers(nthread-1);
    for (int it = 0; it < nthread-1; ++it) workers[it] = std::thread(compute, it);
    compute(nthread-1);
    for (auto & w : workers) w.join();

    if (!is_host) {
        // Upload the staged result into the backend buffer of the new tensor.
        ggml_backend_tensor_set(new_output.get(), new_data, 0, nbytes_new);
    }

    model.output_mtp_ptr = std::move(new_output);
    model.output_mtp = model.output_mtp_ptr.get();
}
static void llm_prepare_mla(llama_model & model, int mla) {
if (model.arch != LLM_ARCH_DEEPSEEK2 && model.arch != LLM_ARCH_GLM_DSA && model.arch != LLM_ARCH_MISTRAL4) return;
const auto& hparams = model.hparams;
@ -2768,6 +2853,7 @@ static bool llm_load_tensors(
const float * tensor_split,
ggml_type cache_type_k,
ggml_type cache_type_v,
ggml_type extra_output_type,
uint32_t max_ctx_size,
int n_seq_max,
int n_ubatch,
@ -3364,6 +3450,9 @@ static bool llm_load_tensors(
if (model.arch == LLM_ARCH_GEMMA4) {
llm_scale_gate_inp_s(model, use_mmap_buffer);
}
if ((model.arch == LLM_ARCH_QWEN35 || model.arch == LLM_ARCH_QWEN35MOE) && extra_output_type != GGML_TYPE_COUNT) {
llm_requantize_output_tensor(model, extra_output_type);
}
if (use_mmap_buffer) {
for (auto & mapping : ml.mappings) {
@ -3521,7 +3610,8 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
if (!llm_load_tensors(
ml, model, params.n_gpu_layers, params.mla, params.split_mode, params.main_gpu, params.max_gpu, params.tensor_split,
params.type_k, params.type_v, params.max_ctx_size, params.n_seq_max, params.n_ubatch, params.amb, params.fit_margin,
params.type_k, params.type_v, params.extra_output_type,
params.max_ctx_size, params.n_seq_max, params.n_ubatch, params.amb, params.fit_margin,
params.worst_graph_tokens, params.flash_attn,
params.use_mlock, params.validate_quants, params.mtp, params.fit, params.dry_run,
params.progress_callback, params.progress_callback_user_data
@ -5617,6 +5707,7 @@ struct llama_model_params llama_model_default_params() {
/*.n_last_k =*/ -1,
/*.n_first_v =*/ -1,
/*.n_last_v =*/ -1,
/*.extra_output_type =*/ GGML_TYPE_COUNT,
/*.tensor_split =*/ nullptr,
/*.rpc_servers =*/ nullptr,
/*.progress_callback =*/ nullptr,