From 40d8cb196a5221b0c09bc8aefd84b9edc7431d50 Mon Sep 17 00:00:00 2001 From: Justin Martin Date: Sat, 23 May 2026 10:52:34 +0000 Subject: [PATCH] llama-quantize: enable --extra-output-tensor with COPY (#1871) --- src/llama-quantize.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/llama-quantize.cpp b/src/llama-quantize.cpp index 4e6032f6..1f538882 100644 --- a/src/llama-quantize.cpp +++ b/src/llama-quantize.cpp @@ -1446,6 +1446,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // do not quantize relative position bias (T5) quantize &= name.find("attn_rel_b.weight") == std::string::npos; + // quantize the extra output tensor + quantize = tensor == output_tensor || quantize; + enum ggml_type new_type; void * new_data = nullptr; size_t new_size = 0; @@ -1516,6 +1519,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) { new_type = params->output_tensor_type; } + else if (params->only_copy && tensor == output_tensor) { + new_type = tensor->type; + } if (params->ffn_gate_inp_type < GGML_TYPE_COUNT && name.find("ffn_gate_inp.weight") != std::string::npos) { new_type = params->ffn_gate_inp_type; }