diff --git a/common/common.cpp b/common/common.cpp index dccfe1db..2e36b288 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1727,6 +1727,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } return true; } + if (arg == "--mtp-requantized-output-tensor" || arg == "-mtprot") { + CHECK_ARG + params.extra_output_type = argv[i]; + return true; + } if (arg == "-ctkd" || arg == "--cache-type-k-draft") { params.speculative.cache_type_k = argv[++i]; return true; @@ -3028,6 +3033,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "*", "-ctv-last, --cache-type-k-last TYPE,N", "KV cache data type for the last N layers of K (default: %s,-1)", params.type_k_last.c_str() }); options.push_back({ "*", "-ctv-first, --cache-type-v-first TYPE,N", "KV cache data type for the first N layers of V (default: %s,-1)", params.type_v_first.c_str() }); options.push_back({ "*", "-ctk-last, --cache-type-v-last TYPE,N", "KV cache data type for the last N layers of V (default: %s,-1)", params.type_v_last.c_str() }); + options.push_back({ "*", "-mtprot, --mtp-requantize-output-tensor type", "Use output requantized to type for MTP (default: %s)", params.extra_output_type.c_str() }); options.push_back({ "*", "-ctkd, --cache-type-k-draft TYPE", "KV cache data type for K for the draft model" }); options.push_back({ "*", "-ctvd, --cache-type-v-draft TYPE", "KV cache data type for V for the draft model" }); @@ -3926,6 +3932,17 @@ static std::pair get_batch_ubatch(const gpt_params & params) { return {n_batch, n_ubatch}; } +static ggml_type parse_ggml_type(const char * arg) { + for (int j = 0; j < GGML_TYPE_COUNT; ++j) { + auto type = ggml_type(j); + const auto * name = ggml_type_name(type); + if (name && strcmp(arg, name) == 0) { + return type; + } + } + return GGML_TYPE_COUNT; +} + struct llama_model_params common_model_params_to_llama(const gpt_params & params) { auto mparams = 
llama_model_default_params();
     mparams.devices = params.devices.c_str();
@@ -3948,6 +3965,9 @@ struct llama_model_params common_model_params_to_llama(const gpt_params & params
     mparams.type_k_last = kv_cache_type_from_str(params.type_k_last );
     mparams.type_v_first = kv_cache_type_from_str(params.type_v_first);
     mparams.type_v_last = kv_cache_type_from_str(params.type_v_last );
+    if (!params.extra_output_type.empty()) {
+        mparams.extra_output_type = parse_ggml_type(params.extra_output_type.c_str());
+    }
     mparams.n_k_first = params.n_k_first;
     mparams.n_k_last = params.n_k_last;
     mparams.n_v_first = params.n_v_first;
diff --git a/common/common.h b/common/common.h
index 2ea38928..0dbc29b7 100644
--- a/common/common.h
+++ b/common/common.h
@@ -443,6 +443,8 @@ struct gpt_params {
     int32_t n_v_first = -1;
     int32_t n_v_last = -1;
 
+    std::string extra_output_type = "";
+
     // multimodal models (see examples/mtmd)
     common_params_model mmproj;
     bool mmproj_use_gpu = true; // use GPU for multimodal model
diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp
index 7f183468..40df1ea6 100644
--- a/ggml/src/iqk/iqk_quantize.cpp
+++ b/ggml/src/iqk/iqk_quantize.cpp
@@ -240,6 +240,81 @@ void IQ1BNQuantizer::quantize_one_row_2bn(const float * src, block_iq2_bn * y, i
     }
 }
 
+static inline int num_rows([[maybe_unused]] ggml_type type) { // row-group size for `type` (1, 4, 8 or 16); iqk_quantize_any uses it as the number of rows converted together
+#ifdef HAVE_FANCY_SIMD
+    switch (type) { // table used when HAVE_FANCY_SIMD is defined
+        case GGML_TYPE_Q2_K_R4:
+        case GGML_TYPE_Q3_K_R4:
+        case GGML_TYPE_Q6_K_R4:
+        case GGML_TYPE_IQ2_K_R4:
+        case GGML_TYPE_IQ3_K_R4:
+        case GGML_TYPE_IQ4_K_R4:
+        case GGML_TYPE_IQ5_K_R4:
+        case GGML_TYPE_IQ4_KS_R4:
+        case GGML_TYPE_IQ5_KS_R4:
+        case GGML_TYPE_IQ2_XXS_R4:
+        case GGML_TYPE_IQ2_XS_R4:
+        case GGML_TYPE_IQ2_S_R4:
+        case GGML_TYPE_IQ3_XXS_R4:
+        case GGML_TYPE_IQ1_S_R4:
+        case GGML_TYPE_IQ1_M_R4:
+        case GGML_TYPE_IQ3_S_R4: return 4;
+        case GGML_TYPE_IQ4_NL_R4: // NOTE(review): this and several other types below map to a larger group here than in the #else table
+        case GGML_TYPE_Q5_0_R4:
+        case GGML_TYPE_Q6_0_R4:
+        case GGML_TYPE_IQ2_BN_R4:
+        case GGML_TYPE_IQ4_XS_R8:
+        case GGML_TYPE_Q4_K_R4:
+        case GGML_TYPE_Q5_K_R4:
+        case GGML_TYPE_Q8_KV:
+        case GGML_TYPE_Q8_KV_R8:
+        case GGML_TYPE_Q8_K_R8: return 8;
+        case GGML_TYPE_Q4_0_R8:
+        case GGML_TYPE_Q8_0_R8:
+        case GGML_TYPE_Q8_1:
+        case GGML_TYPE_Q8_K_R16:
+        case GGML_TYPE_BF16_R16: return 16;
+        default: return 1; // every type not listed above
+    }
+#else
+    switch (type) { // table used when HAVE_FANCY_SIMD is not defined
+        case GGML_TYPE_Q2_K_R4:
+        case GGML_TYPE_Q3_K_R4:
+        case GGML_TYPE_Q4_K_R4:
+        case GGML_TYPE_Q5_K_R4:
+        case GGML_TYPE_Q6_K_R4:
+        case GGML_TYPE_Q5_0_R4:
+        case GGML_TYPE_Q6_0_R4:
+        case GGML_TYPE_IQ4_NL_R4:
+        case GGML_TYPE_IQ2_K_R4:
+        case GGML_TYPE_IQ3_K_R4:
+        case GGML_TYPE_IQ4_K_R4:
+        case GGML_TYPE_IQ5_K_R4:
+        case GGML_TYPE_IQ4_KS_R4:
+        case GGML_TYPE_IQ5_KS_R4:
+        case GGML_TYPE_IQ2_XXS_R4:
+        case GGML_TYPE_IQ2_XS_R4:
+        case GGML_TYPE_IQ2_S_R4:
+        case GGML_TYPE_IQ3_XXS_R4:
+        case GGML_TYPE_IQ3_S_R4:
+        case GGML_TYPE_IQ1_S_R4:
+        case GGML_TYPE_IQ1_M_R4:
+        case GGML_TYPE_IQ2_BN_R4: return 4;
+        case GGML_TYPE_IQ4_XS_R8:
+        case GGML_TYPE_Q4_0_R8:
+        case GGML_TYPE_Q8_0_R8:
+        case GGML_TYPE_Q8_KV:
+        case GGML_TYPE_Q8_KV_R8:
+        case GGML_TYPE_Q8_1:
+        case GGML_TYPE_Q8_K_R8: return 8;
+        case GGML_TYPE_Q8_K_R16:
+        case GGML_TYPE_BF16_R16: return 16;
+        default: return 1; // every type not listed above
+    }
+#endif
+}
+
+
 }
 
 void iqk_quantize_any(int from_type, int to_type,
@@ -251,21 +326,28 @@ void iqk_quantize_any(int from_type, int to_type,
     GGML_ASSERT(ggml_type_size(type_x) == nb0);
     auto type_y = ggml_type(to_type);
     auto row_size_y = ggml_row_size(type_y, ne0);
-    int64_t nrows = ne1*ne2*ne3;
+    auto n_interleaved = num_rows(type_y);
+    GGML_ASSERT(ne1 % n_interleaved == 0);
+    int64_t ne1i = ne1/n_interleaved;
+    int64_t nrows = ne1i*ne2*ne3;
     int64_t nrows_per_thread = (nrows + nth - 1)/nth;
     int64_t first_row = nrows_per_thread*ith;
     if (first_row >= nrows) return;
     int64_t last_row = std::min(first_row + nrows_per_thread, nrows);
     for (int64_t row = first_row; row < last_row; ++row) {
-        int64_t i3 = row/(ne1*ne2);
-        int64_t i2 = (row - i3*ne1*ne2)/ne1;
-        int64_t i1 = row - i3*ne1*ne2 - i2*ne1;
-        const char
* cx = (const char *)x + i1*nb1 + i2*nb2 + i3*nb3; + int64_t i3 = row/(ne1i*ne2); + int64_t i2 = (row - i3*ne1i*ne2)/ne1i; + int64_t i1 = row - i3*ne1i*ne2 - i2*ne1i; + auto cx = (const char *)x + i1*n_interleaved*nb1 + i2*nb2 + i3*nb3; + auto cy = (char *)y + (i3*ne1*ne2 + i2*ne1 + i1*n_interleaved)*row_size_y; // TODO: special case common types such as f16, q8_0 // (although the performance gains may be too small to justify the added complexity) - to_float((const void *)cx, (float *)work_buffer, ne0); - auto cy = (char *)y + (i3*ne1*ne2 + i2*ne1 + i1)*row_size_y; - from_float((const float *)work_buffer, (void *)cy, ne0); + if (type_x != GGML_TYPE_F32) { + to_float((const void *)cx, (float *)work_buffer, ne0*n_interleaved); + from_float((const float *)work_buffer, (void *)cy, ne0*n_interleaved); + } else { + from_float((const float *)cx, (void *)cy, ne0*n_interleaved); + } } } diff --git a/include/llama.h b/include/llama.h index 42539c70..b847b88f 100644 --- a/include/llama.h +++ b/include/llama.h @@ -394,6 +394,8 @@ extern "C" { int32_t n_v_first; int32_t n_v_last; + enum ggml_type extra_output_type; + // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices() const float * tensor_split; diff --git a/src/graphs/build_qwen35.cpp b/src/graphs/build_qwen35.cpp index 93c18793..2fd0eecd 100644 --- a/src/graphs/build_qwen35.cpp +++ b/src/graphs/build_qwen35.cpp @@ -238,7 +238,7 @@ struct ggml_tensor * llm_build_context::build_qwen35moe_mtp( cb(cur, "result_norm", -1); - cur = build_output(lctx, ctx0, cur, model.output, mtp_layer.nextn.shared_head_norm, cb); + cur = build_output(lctx, ctx0, cur, model.output_mtp, mtp_layer.nextn.shared_head_norm, cb); cb(cur, "result_output", -1); return cur; @@ -317,7 +317,7 @@ struct ggml_tensor * llm_build_context::build_qwen35_mtp( cb(cur, "result_norm", -1); //cur = build_output(lctx, ctx0, cur, model.output, nullptr, cb); - cur = build_output(lctx, ctx0, cur, model.output, 
mtp_layer.nextn.shared_head_norm, cb); + cur = build_output(lctx, ctx0, cur, model.output_mtp, mtp_layer.nextn.shared_head_norm, cb); cb(cur, "result_output", -1); return cur; diff --git a/src/llama-load-tensors.cpp b/src/llama-load-tensors.cpp index b1dd9b50..033a08dc 100644 --- a/src/llama-load-tensors.cpp +++ b/src/llama-load-tensors.cpp @@ -1520,6 +1520,13 @@ bool create_tensors_helper::create_qwen35moe_tensors(const LLM_TN & tn) { if (model.output == NULL) { model.output = create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); } + if (model.mtp) { + model.output_mtp = create_tensor(ctx_output, "output_extra.weight", {n_embd, n_vocab}, + llama_model_loader::TENSOR_NOT_REQUIRED); + if (!model.output_mtp) { + model.output_mtp = model.output; + } + } } const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used; @@ -1614,9 +1621,18 @@ bool create_tensors_helper::create_qwen35_tensors(const LLM_TN & tn) { // output { model.output_norm = create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); - model.output = create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); + model.output = create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, + llama_model_loader::TENSOR_NOT_REQUIRED); if (model.output == NULL) { - model.output = create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); + model.output = create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, + llama_model_loader::TENSOR_DUPLICATED); + } + if (model.mtp) { + model.output_mtp = create_tensor(ctx_output, "output_extra.weight", {n_embd, n_vocab}, + llama_model_loader::TENSOR_NOT_REQUIRED); + if (!model.output_mtp) { + model.output_mtp = model.output; + } } } diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h index 
175c2072..f8c09c0e 100644 --- a/src/llama-model-loader.h +++ b/src/llama-model-loader.h @@ -70,6 +70,10 @@ struct llama_model_loader { offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx); if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size()) { + auto data_offset = gguf_get_data_offset(gguf_ctx); + auto tensor_offset = gguf_get_tensor_offset(gguf_ctx, tensor_idx); + fprintf(stderr, "Error while loading tensor %s: offs = %zu (%zu, %zu), size: %zu, file size: %zu\n", name, + offs, data_offset, tensor_offset, ggml_nbytes(tensor), file->size()); throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", name)); } } diff --git a/src/llama-model.h b/src/llama-model.h index decdbb2b..9691b962 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -415,6 +415,9 @@ struct llama_model { struct ggml_tensor * output; struct ggml_tensor * output_b; struct ggml_tensor * output_norm_enc; + struct ggml_tensor * output_mtp = nullptr; + + std::unique_ptr output_mtp_ptr; llama_split_tensor split_output; llama_split_tensor split_output_norm; diff --git a/src/llama-quantize.cpp b/src/llama-quantize.cpp index 6ccc054d..8ee24ae1 100644 --- a/src/llama-quantize.cpp +++ b/src/llama-quantize.cpp @@ -924,6 +924,72 @@ static llama_ftype repacked_ftype(llama_ftype ftype) { return ftype; } +static void do_quantize(int nthread, const ggml_tensor * tensor, ggml_type new_type, const float * f32_data, char * new_data, + const float * imatrix, std::vector & workers, size_t & new_size, int chunk_size_multiplier, + const llama_model_quantize_params * params) { + if (nthread > 1 && (tensor->ne[2] % nthread == 0 || tensor->ne[2] >= 2*nthread)) { + std::mutex mutex; + int counter = 0; + bool valid = true; + auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, tensor, imatrix, user_data = params->user_data] () { + int ne2 = tensor->ne[2]; + auto row_size = 
ggml_row_size(new_type, tensor->ne[0]); + auto matrix_size = row_size * tensor->ne[1]; + size_t local_size = 0; + while (true) { + std::unique_lock lock(mutex); + int i02 = counter++; + if (i02 >= ne2) { + if (local_size > 0) { + new_size += local_size; + } + break; + } + lock.unlock(); + auto this_imatrix = imatrix ? imatrix + i02 * tensor->ne[0] : nullptr; + auto this_data = (char *)new_data + i02*matrix_size; + auto this_size = ggml_quantize_chunk(new_type, f32_data + i02*tensor->ne[0]*tensor->ne[1], this_data, + 0, tensor->ne[1], tensor->ne[0], this_imatrix, user_data); + local_size += this_size; + + // validate the quantized data + if (!ggml_validate_row_data(new_type, this_data, matrix_size)) { + lock.lock(); + valid = false; + break; + } + } + }; + for (int it = 0; it < nthread; ++it) workers.emplace_back(std::thread(compute)); + for (auto & w : workers) w.join(); + workers.clear(); + if (!valid) { + throw std::runtime_error("quantized data validation failed"); + } + } else { + static const int64_t min_chunk_size = 32 * 512; + const int64_t n_per_row = tensor->ne[0]; + const int64_t nrows = tensor->ne[1]; + const int64_t chunk_size = (n_per_row >= min_chunk_size + ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) * chunk_size_multiplier; + + const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1]; + const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size; + const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1; + + // quantize each expert separately since they have different importance matrices + new_size = 0; + for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) { + const float * f32_data_03 = f32_data + i03 * nelements_matrix; + void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows; + const float * imatrix_03 = imatrix ? 
imatrix + i03 * n_per_row : nullptr; + + new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size, + nrows, n_per_row, imatrix_03, params->user_data, workers, nthread_use); + } + } +} + static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) { ggml_type default_type; llama_ftype ftype = params->ftype; @@ -1210,6 +1276,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s ctx_outs[0] = ctx_out; // populate the original tensors so we get an initial meta data + int last_split = -1; for (int i = 0; i < ml.n_tensors; ++i) { auto weight = ml.get_weight(i); uint16_t i_split = params->keep_split ? weight->idx : 0; @@ -1218,6 +1285,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s ctx_outs[i_split] = gguf_init_empty(); } gguf_add_tensor(ctx_outs[i_split], tensor); + if (i_split > last_split) { + last_split = i_split; + } } // Set split info if needed @@ -1290,7 +1360,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s continue; } - const std::string name = ggml_get_name(tensor); + std::string name = ggml_get_name(tensor); if (!ml.use_mmap) { if (read_data.size() < ggml_nbytes(tensor)) { @@ -1450,7 +1520,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // If we've decided to quantize to the same type the tensor is already // in then there's nothing to do. 
- quantize = tensor->type != new_type; + quantize &= tensor->type != new_type; } if (!quantize) { @@ -1566,72 +1636,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s f32_data = (float *) f32_conv_buf.data(); } - if (work.size() < (size_t)nelements * 4) { - work.resize(nelements * 4); // upper bound on size + auto expected_size = ggml_row_size(new_type, tensor->ne[0])*tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; + + if (work.size() < expected_size) { //(size_t)nelements * 4) { + //work.resize(nelements * 4); // upper bound on size + work.resize(expected_size); // upper bound on size } new_data = work.data(); - const int64_t n_per_row = tensor->ne[0]; - const int64_t nrows = tensor->ne[1]; + do_quantize(nthread, tensor, new_type, f32_data, (char *)new_data, imatrix, workers, + new_size, chunk_size_multiplier, params); - if (nthread > 1 && (tensor->ne[2] % nthread == 0 || tensor->ne[2] >= 2*nthread)) { - std::mutex mutex; - int counter = 0; - bool valid = true; - auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, tensor, imatrix, user_data = params->user_data] () { - int ne2 = tensor->ne[2]; - auto row_size = ggml_row_size(new_type, tensor->ne[0]); - auto matrix_size = row_size * tensor->ne[1]; - size_t local_size = 0; - while (true) { - std::unique_lock lock(mutex); - int i02 = counter++; - if (i02 >= ne2) { - if (local_size > 0) { - new_size += local_size; - } - break; - } - lock.unlock(); - auto this_imatrix = imatrix ? 
imatrix + i02 * tensor->ne[0] : nullptr; - auto this_data = (char *)new_data + i02*matrix_size; - auto this_size = ggml_quantize_chunk(new_type, f32_data + i02*tensor->ne[0]*tensor->ne[1], this_data, 0, tensor->ne[1], tensor->ne[0], - this_imatrix, user_data); - local_size += this_size; - - // validate the quantized data - if (!ggml_validate_row_data(new_type, this_data, matrix_size)) { - lock.lock(); - valid = false; - break; - } - } - }; - for (int it = 0; it < nthread; ++it) workers.emplace_back(std::thread(compute)); - for (auto & w : workers) w.join(); - workers.clear(); - if (!valid) { - throw std::runtime_error("quantized data validation failed"); - } - } else { - static const int64_t min_chunk_size = 32 * 512; - const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) * - chunk_size_multiplier; - - const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1]; - const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size; - const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1; - - // quantize each expert separately since they have different importance matrices - new_size = 0; - for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) { - const float * f32_data_03 = f32_data + i03 * nelements_matrix; - void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows; - const float * imatrix_03 = imatrix ? 
imatrix + i03 * n_per_row : nullptr; - - new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, params->user_data, workers, nthread_use); - } - } } LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0); } diff --git a/src/llama.cpp b/src/llama.cpp index 635a0fd1..ee6abd76 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2132,6 +2132,91 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { } +static void llm_requantize_output_tensor(llama_model & model, ggml_type new_type) { + if (new_type == GGML_TYPE_COUNT || !model.output) return; + if (model.output_mtp && model.output_mtp != model.output) { + LLAMA_LOG_WARN("%s: MTP output tensor is already present => not requantizing\n", __func__); + return; + } + if (model.output->type == new_type) { + LLAMA_LOG_WARN("%s: output tensor is already of type %s => not requantizing\n", __func__, ggml_type_name(new_type)); + } + auto [other_type, n_interleaved] = interleaved_properties(new_type); + if (model.output->ne[1] % n_interleaved != 0) { + LLAMA_LOG_WARN("%s: number of rows %ld is not a multiple of %d row interleaving for %s\n", __func__, + model.output->ne[1], n_interleaved, ggml_type_name(new_type)); + LLAMA_LOG_WARN("%s: using %s instead of %s\n", __func__, ggml_type_name(other_type), ggml_type_name(new_type)); + new_type = other_type; + n_interleaved = 1; + } + auto nbytes_orig = ggml_nbytes(model.output); + auto row_size = ggml_row_size(new_type, model.output->ne[0]); + auto nbytes_new = row_size*ggml_nrows(model.output); + if (nbytes_new >= nbytes_orig) { + LLAMA_LOG_WARN("%s: if requantized to %s the output tensor size would be %zu, which is >= the current size %zu => not requantizing\n", __func__, ggml_type_name(new_type), nbytes_new, nbytes_orig); + return; + } + + LLAMA_LOG_INFO("====== Creating extra output tensor of type %s for MTP usage. 
Additional memory required is %.2f MiB\n", + ggml_type_name(new_type), nbytes_new/1024./1024.); + + bool is_host = ggml_backend_buffer_is_host(model.output->buffer); + + auto tensor_data = model.output->data; + std::vector tensor_data_buf; + if (!is_host) { + tensor_data_buf.resize(nbytes_orig); + ggml_backend_tensor_get(model.output, tensor_data_buf.data(), 0, nbytes_orig); + tensor_data = tensor_data_buf.data(); + } + + auto tt_new = ggml_internal_get_type_traits(new_type); + auto new_output = std::make_unique(*model.output); + new_output->type = new_type; + new_output->nb[0] = tt_new.type_size; + new_output->nb[1] = row_size; + new_output->nb[2] = new_output->nb[1] * new_output->ne[1]; + new_output->nb[3] = new_output->nb[2] * new_output->ne[2]; + GGML_ASSERT(ggml_nbytes(new_output.get()) == nbytes_new); + new_output->buffer = ggml_backend_buft_alloc_buffer(ggml_backend_buffer_get_type(model.output->buffer), nbytes_new); + new_output->data = ggml_backend_buffer_get_base(new_output->buffer); + new_output->op = GGML_OP_NONE; + for (int j = 0; j < GGML_MAX_SRC; ++j) new_output->src[j] = nullptr; + ggml_set_name(new_output.get(), "output_extra.weight"); + ggml_backend_buffer_set_usage(new_output->buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + + std::vector new_data_buf; + char * new_data = (char *)new_output->data; + if (!is_host) { + new_data_buf.resize(nbytes_new); + new_data = new_data_buf.data(); + } + + int nthread = std::max(1, std::thread::hardware_concurrency()/2); + + auto compute = [t = model.output, tensor_data, new_data, nthread, new_type, n_interleaved] (int ith) { + std::vector work(t->ne[0]*n_interleaved); + auto tt_orig = ggml_internal_get_type_traits(t->type); + auto tt_new = ggml_internal_get_type_traits(new_type); + iqk_quantize_any(int(t->type), int(new_type), + t->ne[0], t->ne[1], t->ne[2], t->ne[3], + t->nb[0], t->nb[1], t->nb[2], t->nb[3], + tensor_data, new_data, work.data(), tt_orig.to_float, tt_new.from_float, ith, nthread); + }; + 
std::vector workers(nthread-1); + for (int it = 0; it < nthread-1; ++it) workers[it] = std::thread(compute, it); + compute(nthread-1); + for (auto & w : workers) w.join(); + + if (!is_host) { + ggml_backend_tensor_set(new_output.get(), new_data, 0, nbytes_new); + } + + model.output_mtp_ptr = std::move(new_output); + model.output_mtp = model.output_mtp_ptr.get(); + +} + static void llm_prepare_mla(llama_model & model, int mla) { if (model.arch != LLM_ARCH_DEEPSEEK2 && model.arch != LLM_ARCH_GLM_DSA && model.arch != LLM_ARCH_MISTRAL4) return; const auto& hparams = model.hparams; @@ -2768,6 +2853,7 @@ static bool llm_load_tensors( const float * tensor_split, ggml_type cache_type_k, ggml_type cache_type_v, + ggml_type extra_output_type, uint32_t max_ctx_size, int n_seq_max, int n_ubatch, @@ -3364,6 +3450,9 @@ static bool llm_load_tensors( if (model.arch == LLM_ARCH_GEMMA4) { llm_scale_gate_inp_s(model, use_mmap_buffer); } + if ((model.arch == LLM_ARCH_QWEN35 || model.arch == LLM_ARCH_QWEN35MOE) && extra_output_type != GGML_TYPE_COUNT) { + llm_requantize_output_tensor(model, extra_output_type); + } if (use_mmap_buffer) { for (auto & mapping : ml.mappings) { @@ -3521,7 +3610,8 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam if (!llm_load_tensors( ml, model, params.n_gpu_layers, params.mla, params.split_mode, params.main_gpu, params.max_gpu, params.tensor_split, - params.type_k, params.type_v, params.max_ctx_size, params.n_seq_max, params.n_ubatch, params.amb, params.fit_margin, + params.type_k, params.type_v, params.extra_output_type, + params.max_ctx_size, params.n_seq_max, params.n_ubatch, params.amb, params.fit_margin, params.worst_graph_tokens, params.flash_attn, params.use_mlock, params.validate_quants, params.mtp, params.fit, params.dry_run, params.progress_callback, params.progress_callback_user_data @@ -5617,6 +5707,7 @@ struct llama_model_params llama_model_default_params() { /*.n_last_k =*/ -1, /*.n_first_v =*/ -1, 
/*.n_last_v =*/ -1, + /*.extra_output_type =*/ GGML_TYPE_COUNT, /*.tensor_split =*/ nullptr, /*.rpc_servers =*/ nullptr, /*.progress_callback =*/ nullptr,