From 40d8cb196a5221b0c09bc8aefd84b9edc7431d50 Mon Sep 17 00:00:00 2001
From: Justin Martin <jaming@protonmail.com>
Date: Sat, 23 May 2026 10:52:34 +0000
Subject: [PATCH] llama-quantize: enable --extra-output-tensor with COPY
 (#1871)

---
 src/llama-quantize.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/llama-quantize.cpp b/src/llama-quantize.cpp
index 4e6032f6..1f538882 100644
--- a/src/llama-quantize.cpp
+++ b/src/llama-quantize.cpp
@@ -1446,6 +1446,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // do not quantize relative position bias (T5)
         quantize &= name.find("attn_rel_b.weight") == std::string::npos;
 
+        // force the extra output tensor through the quantization path even if a filter above cleared the flag
+        quantize = tensor == output_tensor || quantize;
+
         enum ggml_type new_type;
         void * new_data = nullptr;
         size_t new_size = 0;
@@ -1516,6 +1519,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
                 new_type = params->output_tensor_type;
             }
+            else if (params->only_copy && tensor == output_tensor) {
+                new_type = tensor->type;
+            }
             if (params->ffn_gate_inp_type < GGML_TYPE_COUNT && name.find("ffn_gate_inp.weight") != std::string::npos) {
                 new_type = params->ffn_gate_inp_type;
             }