Quantize: add extra output tensor for MTP (#1810)

* Quantize: add extra output tensor for MTP

* Consistently use --mtp-requantize-output-tensor
This commit is contained in:
Kawrakow 2026-05-17 13:59:56 +03:00 committed by GitHub
parent 3e573cfea6
commit 1f8c603d9c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 109 additions and 11 deletions

View File

@ -1727,7 +1727,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
}
return true;
}
if (arg == "--mtp-requantized-output-tensor" || arg == "-mtprot") {
if (arg == "--mtp-requantize-output-tensor" || arg == "-mtprot") {
CHECK_ARG
params.extra_output_type = argv[i];
return true;

View File

@ -151,7 +151,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
//
[[noreturn]]
static void usage(const char * executable) {
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--hide-imatrix] [--ignore-imatrix-rules] [--dry-run] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--ffn-gate-inp-type] [--attn-q-type] [--attn-k-type] [--attn-v-type] [--attn-qkv-type] [--attn-output-type] [--ffn-gate-type] [--ffn-down-type] [--ffn-up-type] [--repack] [--repack-pattern] [--keep-split] [--partial-requant] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--hide-imatrix] [--ignore-imatrix-rules] [--dry-run] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--extra-output-tensor] [--ffn-gate-inp-type] [--attn-q-type] [--attn-k-type] [--attn-v-type] [--attn-qkv-type] [--attn-output-type] [--ffn-gate-type] [--ffn-down-type] [--ffn-up-type] [--repack] [--repack-pattern] [--keep-split] [--partial-requant] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
@ -163,6 +163,7 @@ static void usage(const char * executable) {
printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor.\n");
printf(" --token-embedding-type ggml_type: use this ggml_type for the token_embd.weight tensor.\n\n");
printf(" --extra-output-tensor ggml_type: requantize and add output tensor of that type.\n");
printf(" --ffn-gate-inp-type ggml_type: use this ggml_type for the ffn_gate_inp tensors.\n\n");
printf(" --custom-q regex1=type1,regex2=type2...: use this to specify custom quantization type rules.\n\n");
printf(" --repack Repack all tensors to the corresponding _r4/8 variant if available.\n\n");
@ -383,6 +384,12 @@ int main(int argc, char ** argv) {
} else {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--extra-output-tensor") == 0) {
if (arg_idx < argc-1) {
params.extra_output_type = parse_ggml_type(argv[++arg_idx]);
} else {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--token-embedding-type") == 0) {
if (arg_idx < argc-1) {
params.token_embedding_type = parse_ggml_type(argv[++arg_idx]);

View File

@ -522,6 +522,7 @@ extern "C" {
enum ggml_type ffn_down_type; // feedforward network down type
enum ggml_type ffn_up_type; // feedforward network up type
enum ggml_type ffn_gate_inp_type; // routed experts probabilities type (relevant for MoE models only)
enum ggml_type extra_output_type; // type of the extra, requantized copy of the output tensor (GGML_TYPE_COUNT: no extra tensor is added)
bool allow_requantize; // allow quantizing non-f32/f16 tensors
bool quantize_output_tensor; // quantize output.weight
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored

View File

@ -1520,11 +1520,15 @@ bool create_tensors_helper::create_qwen35moe_tensors(const LLM_TN & tn) {
if (model.output == NULL) {
model.output = create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
}
int flags = llama_model_loader::TENSOR_NOT_REQUIRED;
if (!model.mtp) flags |= llama_model_loader::TENSOR_SKIP;
auto output_mtp = create_tensor(ctx_output, "output_extra.weight", {n_embd, n_vocab}, flags);
if (model.mtp) {
model.output_mtp = create_tensor(ctx_output, "output_extra.weight", {n_embd, n_vocab},
llama_model_loader::TENSOR_NOT_REQUIRED);
model.output_mtp = output_mtp;
if (!model.output_mtp) {
model.output_mtp = model.output;
} else {
LLAMA_LOG_INFO("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX Using %s as MTP output\n", model.output_mtp->name);
}
}
}
@ -1627,11 +1631,15 @@ bool create_tensors_helper::create_qwen35_tensors(const LLM_TN & tn) {
model.output = create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab},
llama_model_loader::TENSOR_DUPLICATED);
}
int flags = llama_model_loader::TENSOR_NOT_REQUIRED;
if (!model.mtp) flags |= llama_model_loader::TENSOR_SKIP;
auto output_mtp = create_tensor(ctx_output, "output_extra.weight", {n_embd, n_vocab}, flags);
if (model.mtp) {
model.output_mtp = create_tensor(ctx_output, "output_extra.weight", {n_embd, n_vocab},
llama_model_loader::TENSOR_NOT_REQUIRED);
model.output_mtp = output_mtp;
if (!model.output_mtp) {
model.output_mtp = model.output;
} else {
LLAMA_LOG_INFO("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX Using %s as MTP output\n", model.output_mtp->name);
}
}
}

View File

@ -1275,8 +1275,44 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
std::vector<gguf_context*> ctx_outs(n_split, NULL);
ctx_outs[0] = ctx_out;
// Descriptor for the optional additional output tensor. It is only filled in below when
// params->extra_output_type is set and a source tensor is found; otherwise it stays uninitialized
// and output_tensor remains nullptr.
ggml_tensor extra;
// NOTE(review): output_meta is not referenced anywhere in the visible hunks, and the same
// "output.weight" -> "token_embd.weight" lookup is repeated for `meta` below - confirm it is still needed.
ggml_tensor * output_meta = ml.get_tensor_meta("output.weight");
if (!output_meta) {
output_meta = ml.get_tensor_meta("token_embd.weight");
}
// Source tensor that gets duplicated as the extra output tensor; nullptr means "no extra tensor".
ggml_tensor * output_tensor = nullptr;
if (params->extra_output_type != GGML_TYPE_COUNT) {
// Prefer "output.weight" as the source and fall back to "token_embd.weight" when it is absent.
auto meta = ml.get_tensor_meta("output.weight");
if (!meta) {
meta = ml.get_tensor_meta("token_embd.weight");
}
if (!meta) {
LLAMA_LOG_WARN("Extra output tensor requested, but 'output.weight' or 'token_embd.weight' not found\n");
} else {
// NOTE(review): the "XXXX..." prefix looks like a leftover debugging marker - confirm it is intended.
LLAMA_LOG_INFO("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX Will duplicate %s as %s\n", meta->name,
ggml_type_name(params->extra_output_type));
auto weights = ml.get_weight(meta->name);
output_tensor = weights->tensor;
// Start from a copy of the source tensor struct, then overwrite the type and the byte strides
// so that they describe the data in the requested type.
extra = *output_tensor;
auto new_type = params->extra_output_type;
extra.type = new_type;
auto tt = ggml_internal_get_type_traits(extra.type);
extra.nb[0] = tt.type_size;
extra.nb[1] = ggml_row_size(extra.type, extra.ne[0]);
// nb[2] and nb[3] are set to the same value - presumably the tensor is 2-D (ne[2] == 1); TODO confirm.
extra.nb[2] = extra.nb[3] = extra.nb[1]*extra.ne[1];
// The copy must not alias the source tensor's data pointer.
extra.data = nullptr;
strcpy(extra.name, "output_extra.weight");
// Compare the byte size in the requested type against the source tensor's current size and
// drop the request (by resetting output_tensor) when the copy would not be smaller.
auto orig_size = ggml_nbytes(output_tensor);
auto new_size = ggml_nbytes(&extra);
if (new_size >= orig_size) {
// NOTE(review): the message says "greater", but this branch is also taken when the sizes are equal.
LLAMA_LOG_INFO("No, duplicating it makes no sense as the new size (%zu) is greater than the original size (%zu)\n",
new_size, orig_size);
output_tensor = nullptr;
}
}
}
// populate the original tensors so we get an initial meta data
int last_split = -1;
for (int i = 0; i < ml.n_tensors; ++i) {
auto weight = ml.get_weight(i);
uint16_t i_split = params->keep_split ? weight->idx : 0;
@ -1285,8 +1321,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
ctx_outs[i_split] = gguf_init_empty();
}
gguf_add_tensor(ctx_outs[i_split], tensor);
if (i_split > last_split) {
last_split = i_split;
if (tensor == output_tensor) {
gguf_add_tensor(ctx_outs[i_split], &extra);
}
}
@ -1520,7 +1556,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
// If we've decided to quantize to the same type the tensor is already
// in then there's nothing to do.
quantize &= tensor->type != new_type;
if (tensor != output_tensor) {
quantize &= tensor->type != new_type;
}
}
if (!quantize) {
@ -1644,8 +1682,51 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
}
new_data = work.data();
do_quantize(nthread, tensor, new_type, f32_data, (char *)new_data, imatrix, workers,
if (params->extra_output_type != GGML_TYPE_COUNT && tensor == output_tensor) {
auto cur_size = ggml_nbytes(tensor);
if (new_type != tensor->type) {
do_quantize(nthread, tensor, new_type, f32_data, (char *)new_data, imatrix, workers,
new_size, chunk_size_multiplier, params);
gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), new_type);
gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), new_data, new_size);
fout.write((const char *) new_data, new_size);
zeros(fout, GGML_PAD(new_size, align) - new_size);
total_size_new += new_size;
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", cur_size/1024.0/1024.0, new_size/1024.0/1024.0);
} else {
gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), tensor->type);
gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), tensor->data, cur_size);
fout.write((const char *) tensor->data, cur_size);
zeros(fout, GGML_PAD(cur_size, align) - cur_size);
total_size_new += cur_size;
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", cur_size/1024.0/1024.0, cur_size/1024.0/1024.0);
}
LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
++idx, ml.n_tensors,
ggml_get_name(tensor),
llama_format_tensor_shape(tensor).c_str(),
ggml_type_name(tensor->type));
new_type = params->extra_output_type;
chunk_size_multiplier = 1;
auto [working_type, num_rows] = interleaved_properties(new_type);
if (tensor->ne[1] % num_rows != 0) {
new_type = working_type;
} else {
chunk_size_multiplier = num_rows;
}
LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
fflush(stdout);
do_quantize(nthread, tensor, new_type, f32_data, (char *)new_data, imatrix, workers,
new_size, 1, params);
name = extra.name;
} else {
do_quantize(nthread, tensor, new_type, f32_data, (char *)new_data, imatrix, workers,
new_size, chunk_size_multiplier, params);
}
}
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);

View File

@ -5818,6 +5818,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
/*.ffn_down_type =*/ GGML_TYPE_COUNT,
/*.ffn_up_type =*/ GGML_TYPE_COUNT,
/*.ffn_gate_inp_type =*/ GGML_TYPE_COUNT,
/*.extra_output_type =*/ GGML_TYPE_COUNT, // GGML_TYPE_COUNT: no extra output tensor requested
/*.allow_requantize =*/ false,
/*.quantize_output_tensor =*/ true,
/*.only_copy =*/ false,