mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-06-28 04:30:15 -05:00
MTP: option to use re-quantized output tensor for better TG performance (#1809)
* Option to use re-quantized output tensor for MTP * Remove quantize extra output option * Handle interleaved types
This commit is contained in:
parent
5cc0d86c76
commit
3e573cfea6
@ -1727,6 +1727,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
||||
}
|
||||
return true;
|
||||
}
|
||||
if (arg == "--mtp-requantized-output-tensor" || arg == "-mtprot") {
|
||||
CHECK_ARG
|
||||
params.extra_output_type = argv[i];
|
||||
return true;
|
||||
}
|
||||
if (arg == "-ctkd" || arg == "--cache-type-k-draft") {
|
||||
params.speculative.cache_type_k = argv[++i];
|
||||
return true;
|
||||
@ -3028,6 +3033,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
|
||||
options.push_back({ "*", "-ctv-last, --cache-type-k-last TYPE,N", "KV cache data type for the last N layers of K (default: %s,-1)", params.type_k_last.c_str() });
|
||||
options.push_back({ "*", "-ctv-first, --cache-type-v-first TYPE,N", "KV cache data type for the first N layers of V (default: %s,-1)", params.type_v_first.c_str() });
|
||||
options.push_back({ "*", "-ctk-last, --cache-type-v-last TYPE,N", "KV cache data type for the last N layers of V (default: %s,-1)", params.type_v_last.c_str() });
|
||||
// The long option name must match what the argument parser accepts ("--mtp-requantized-output-tensor").
options.push_back({ "*", "-mtprot, --mtp-requantized-output-tensor TYPE", "Use output requantized to TYPE for MTP (default: %s)", params.extra_output_type.c_str() });
|
||||
options.push_back({ "*", "-ctkd, --cache-type-k-draft TYPE", "KV cache data type for K for the draft model" });
|
||||
options.push_back({ "*", "-ctvd, --cache-type-v-draft TYPE", "KV cache data type for V for the draft model" });
|
||||
|
||||
@ -3926,6 +3932,17 @@ static std::pair<int, int> get_batch_ubatch(const gpt_params & params) {
|
||||
return {n_batch, n_ubatch};
|
||||
}
|
||||
|
||||
// Looks up a ggml type by its textual name (as reported by ggml_type_name).
// Scans every type id below GGML_TYPE_COUNT and returns the first one whose
// name equals `arg`; returns GGML_TYPE_COUNT when nothing matches.
static ggml_type parse_ggml_type(const char * arg) {
    ggml_type found = GGML_TYPE_COUNT;
    for (int idx = 0; idx < GGML_TYPE_COUNT && found == GGML_TYPE_COUNT; ++idx) {
        const auto candidate = ggml_type(idx);
        const auto * candidate_name = ggml_type_name(candidate);
        // Types without a name are skipped.
        if (candidate_name == nullptr) {
            continue;
        }
        if (strcmp(arg, candidate_name) == 0) {
            found = candidate;
        }
    }
    return found;
}
|
||||
|
||||
struct llama_model_params common_model_params_to_llama(const gpt_params & params) {
|
||||
auto mparams = llama_model_default_params();
|
||||
mparams.devices = params.devices.c_str();
|
||||
@ -3948,6 +3965,9 @@ struct llama_model_params common_model_params_to_llama(const gpt_params & params
|
||||
mparams.type_k_last = kv_cache_type_from_str(params.type_k_last );
|
||||
mparams.type_v_first = kv_cache_type_from_str(params.type_v_first);
|
||||
mparams.type_v_last = kv_cache_type_from_str(params.type_v_last );
|
||||
if (!params.extra_output_type.empty()) {
|
||||
mparams.extra_output_type = parse_ggml_type(params.extra_output_type.c_str());
|
||||
}
|
||||
mparams.n_k_first = params.n_k_first;
|
||||
mparams.n_k_last = params.n_k_last;
|
||||
mparams.n_v_first = params.n_v_first;
|
||||
|
||||
@ -443,6 +443,8 @@ struct gpt_params {
|
||||
int32_t n_v_first = -1;
|
||||
int32_t n_v_last = -1;
|
||||
|
||||
std::string extra_output_type = "";
|
||||
|
||||
// multimodal models (see examples/mtmd)
|
||||
common_params_model mmproj;
|
||||
bool mmproj_use_gpu = true; // use GPU for multimodal model
|
||||
|
||||
@ -240,6 +240,81 @@ void IQ1BNQuantizer::quantize_one_row_2bn(const float * src, block_iq2_bn * y, i
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the number of rows that are handled together as one group for the
// given type (used by the caller as the row-interleaving factor); 1 for any
// type not listed here.
// For a subset of the types the group size is twice as large when
// HAVE_FANCY_SIMD is defined; that is expressed through `simd_factor`.
static inline int num_rows([[maybe_unused]] ggml_type type) {
#ifdef HAVE_FANCY_SIMD
    constexpr int simd_factor = 2;
#else
    constexpr int simd_factor = 1;
#endif
    switch (type) {
        // Always 4 rows per group.
        case GGML_TYPE_Q2_K_R4:
        case GGML_TYPE_Q3_K_R4:
        case GGML_TYPE_Q6_K_R4:
        case GGML_TYPE_IQ2_K_R4:
        case GGML_TYPE_IQ3_K_R4:
        case GGML_TYPE_IQ4_K_R4:
        case GGML_TYPE_IQ5_K_R4:
        case GGML_TYPE_IQ4_KS_R4:
        case GGML_TYPE_IQ5_KS_R4:
        case GGML_TYPE_IQ2_XXS_R4:
        case GGML_TYPE_IQ2_XS_R4:
        case GGML_TYPE_IQ2_S_R4:
        case GGML_TYPE_IQ3_XXS_R4:
        case GGML_TYPE_IQ3_S_R4:
        case GGML_TYPE_IQ1_S_R4:
        case GGML_TYPE_IQ1_M_R4:
            return 4;

        // 4 rows per group, 8 when HAVE_FANCY_SIMD is defined.
        case GGML_TYPE_IQ4_NL_R4:
        case GGML_TYPE_Q5_0_R4:
        case GGML_TYPE_Q6_0_R4:
        case GGML_TYPE_IQ2_BN_R4:
        case GGML_TYPE_Q4_K_R4:
        case GGML_TYPE_Q5_K_R4:
            return 4*simd_factor;

        // Always 8 rows per group.
        case GGML_TYPE_IQ4_XS_R8:
        case GGML_TYPE_Q8_KV:
        case GGML_TYPE_Q8_KV_R8:
        case GGML_TYPE_Q8_K_R8:
            return 8;

        // 8 rows per group, 16 when HAVE_FANCY_SIMD is defined.
        case GGML_TYPE_Q4_0_R8:
        case GGML_TYPE_Q8_0_R8:
        case GGML_TYPE_Q8_1:
            return 8*simd_factor;

        // Always 16 rows per group.
        case GGML_TYPE_Q8_K_R16:
        case GGML_TYPE_BF16_R16:
            return 16;

        default:
            return 1;
    }
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
void iqk_quantize_any(int from_type, int to_type,
|
||||
@ -251,21 +326,28 @@ void iqk_quantize_any(int from_type, int to_type,
|
||||
GGML_ASSERT(ggml_type_size(type_x) == nb0);
|
||||
auto type_y = ggml_type(to_type);
|
||||
auto row_size_y = ggml_row_size(type_y, ne0);
|
||||
int64_t nrows = ne1*ne2*ne3;
|
||||
auto n_interleaved = num_rows(type_y);
|
||||
GGML_ASSERT(ne1 % n_interleaved == 0);
|
||||
int64_t ne1i = ne1/n_interleaved;
|
||||
int64_t nrows = ne1i*ne2*ne3;
|
||||
int64_t nrows_per_thread = (nrows + nth - 1)/nth;
|
||||
int64_t first_row = nrows_per_thread*ith;
|
||||
if (first_row >= nrows) return;
|
||||
int64_t last_row = std::min(first_row + nrows_per_thread, nrows);
|
||||
for (int64_t row = first_row; row < last_row; ++row) {
|
||||
int64_t i3 = row/(ne1*ne2);
|
||||
int64_t i2 = (row - i3*ne1*ne2)/ne1;
|
||||
int64_t i1 = row - i3*ne1*ne2 - i2*ne1;
|
||||
const char * cx = (const char *)x + i1*nb1 + i2*nb2 + i3*nb3;
|
||||
int64_t i3 = row/(ne1i*ne2);
|
||||
int64_t i2 = (row - i3*ne1i*ne2)/ne1i;
|
||||
int64_t i1 = row - i3*ne1i*ne2 - i2*ne1i;
|
||||
auto cx = (const char *)x + i1*n_interleaved*nb1 + i2*nb2 + i3*nb3;
|
||||
auto cy = (char *)y + (i3*ne1*ne2 + i2*ne1 + i1*n_interleaved)*row_size_y;
|
||||
// TODO: special case common types such as f16, q8_0
|
||||
// (although the performance gains may be too small to justify the added complexity)
|
||||
to_float((const void *)cx, (float *)work_buffer, ne0);
|
||||
auto cy = (char *)y + (i3*ne1*ne2 + i2*ne1 + i1)*row_size_y;
|
||||
from_float((const float *)work_buffer, (void *)cy, ne0);
|
||||
if (type_x != GGML_TYPE_F32) {
|
||||
to_float((const void *)cx, (float *)work_buffer, ne0*n_interleaved);
|
||||
from_float((const float *)work_buffer, (void *)cy, ne0*n_interleaved);
|
||||
} else {
|
||||
from_float((const float *)cx, (void *)cy, ne0*n_interleaved);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -394,6 +394,8 @@ extern "C" {
|
||||
int32_t n_v_first;
|
||||
int32_t n_v_last;
|
||||
|
||||
enum ggml_type extra_output_type;
|
||||
|
||||
// proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
|
||||
const float * tensor_split;
|
||||
|
||||
|
||||
@ -238,7 +238,7 @@ struct ggml_tensor * llm_build_context::build_qwen35moe_mtp(
|
||||
|
||||
cb(cur, "result_norm", -1);
|
||||
|
||||
cur = build_output(lctx, ctx0, cur, model.output, mtp_layer.nextn.shared_head_norm, cb);
|
||||
cur = build_output(lctx, ctx0, cur, model.output_mtp, mtp_layer.nextn.shared_head_norm, cb);
|
||||
cb(cur, "result_output", -1);
|
||||
|
||||
return cur;
|
||||
@ -317,7 +317,7 @@ struct ggml_tensor * llm_build_context::build_qwen35_mtp(
|
||||
cb(cur, "result_norm", -1);
|
||||
|
||||
//cur = build_output(lctx, ctx0, cur, model.output, nullptr, cb);
|
||||
cur = build_output(lctx, ctx0, cur, model.output, mtp_layer.nextn.shared_head_norm, cb);
|
||||
cur = build_output(lctx, ctx0, cur, model.output_mtp, mtp_layer.nextn.shared_head_norm, cb);
|
||||
cb(cur, "result_output", -1);
|
||||
|
||||
return cur;
|
||||
|
||||
@ -1520,6 +1520,13 @@ bool create_tensors_helper::create_qwen35moe_tensors(const LLM_TN & tn) {
|
||||
if (model.output == NULL) {
|
||||
model.output = create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
||||
}
|
||||
if (model.mtp) {
|
||||
model.output_mtp = create_tensor(ctx_output, "output_extra.weight", {n_embd, n_vocab},
|
||||
llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||
if (!model.output_mtp) {
|
||||
model.output_mtp = model.output;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
|
||||
@ -1614,9 +1621,18 @@ bool create_tensors_helper::create_qwen35_tensors(const LLM_TN & tn) {
|
||||
// output
|
||||
{
|
||||
model.output_norm = create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
||||
model.output = create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||
model.output = create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab},
|
||||
llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||
if (model.output == NULL) {
|
||||
model.output = create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
||||
model.output = create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab},
|
||||
llama_model_loader::TENSOR_DUPLICATED);
|
||||
}
|
||||
if (model.mtp) {
|
||||
model.output_mtp = create_tensor(ctx_output, "output_extra.weight", {n_embd, n_vocab},
|
||||
llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||
if (!model.output_mtp) {
|
||||
model.output_mtp = model.output;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -70,6 +70,10 @@ struct llama_model_loader {
|
||||
offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
|
||||
|
||||
if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size()) {
|
||||
auto data_offset = gguf_get_data_offset(gguf_ctx);
|
||||
auto tensor_offset = gguf_get_tensor_offset(gguf_ctx, tensor_idx);
|
||||
fprintf(stderr, "Error while loading tensor %s: offs = %zu (%zu, %zu), size: %zu, file size: %zu\n", name,
|
||||
offs, data_offset, tensor_offset, ggml_nbytes(tensor), file->size());
|
||||
throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", name));
|
||||
}
|
||||
}
|
||||
|
||||
@ -415,6 +415,9 @@ struct llama_model {
|
||||
struct ggml_tensor * output;
|
||||
struct ggml_tensor * output_b;
|
||||
struct ggml_tensor * output_norm_enc;
|
||||
struct ggml_tensor * output_mtp = nullptr;
|
||||
|
||||
std::unique_ptr<ggml_tensor> output_mtp_ptr;
|
||||
|
||||
llama_split_tensor split_output;
|
||||
llama_split_tensor split_output_norm;
|
||||
|
||||
@ -924,6 +924,72 @@ static llama_ftype repacked_ftype(llama_ftype ftype) {
|
||||
return ftype;
|
||||
}
|
||||
|
||||
static void do_quantize(int nthread, const ggml_tensor * tensor, ggml_type new_type, const float * f32_data, char * new_data,
|
||||
const float * imatrix, std::vector<std::thread> & workers, size_t & new_size, int chunk_size_multiplier,
|
||||
const llama_model_quantize_params * params) {
|
||||
if (nthread > 1 && (tensor->ne[2] % nthread == 0 || tensor->ne[2] >= 2*nthread)) {
|
||||
std::mutex mutex;
|
||||
int counter = 0;
|
||||
bool valid = true;
|
||||
auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, tensor, imatrix, user_data = params->user_data] () {
|
||||
int ne2 = tensor->ne[2];
|
||||
auto row_size = ggml_row_size(new_type, tensor->ne[0]);
|
||||
auto matrix_size = row_size * tensor->ne[1];
|
||||
size_t local_size = 0;
|
||||
while (true) {
|
||||
std::unique_lock<std::mutex> lock(mutex);
|
||||
int i02 = counter++;
|
||||
if (i02 >= ne2) {
|
||||
if (local_size > 0) {
|
||||
new_size += local_size;
|
||||
}
|
||||
break;
|
||||
}
|
||||
lock.unlock();
|
||||
auto this_imatrix = imatrix ? imatrix + i02 * tensor->ne[0] : nullptr;
|
||||
auto this_data = (char *)new_data + i02*matrix_size;
|
||||
auto this_size = ggml_quantize_chunk(new_type, f32_data + i02*tensor->ne[0]*tensor->ne[1], this_data,
|
||||
0, tensor->ne[1], tensor->ne[0], this_imatrix, user_data);
|
||||
local_size += this_size;
|
||||
|
||||
// validate the quantized data
|
||||
if (!ggml_validate_row_data(new_type, this_data, matrix_size)) {
|
||||
lock.lock();
|
||||
valid = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
};
|
||||
for (int it = 0; it < nthread; ++it) workers.emplace_back(std::thread(compute));
|
||||
for (auto & w : workers) w.join();
|
||||
workers.clear();
|
||||
if (!valid) {
|
||||
throw std::runtime_error("quantized data validation failed");
|
||||
}
|
||||
} else {
|
||||
static const int64_t min_chunk_size = 32 * 512;
|
||||
const int64_t n_per_row = tensor->ne[0];
|
||||
const int64_t nrows = tensor->ne[1];
|
||||
const int64_t chunk_size = (n_per_row >= min_chunk_size
|
||||
? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) * chunk_size_multiplier;
|
||||
|
||||
const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
|
||||
const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
|
||||
const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;
|
||||
|
||||
// quantize each expert separately since they have different importance matrices
|
||||
new_size = 0;
|
||||
for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
|
||||
const float * f32_data_03 = f32_data + i03 * nelements_matrix;
|
||||
void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
|
||||
const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
|
||||
|
||||
new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size,
|
||||
nrows, n_per_row, imatrix_03, params->user_data, workers, nthread_use);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
|
||||
ggml_type default_type;
|
||||
llama_ftype ftype = params->ftype;
|
||||
@ -1210,6 +1276,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
ctx_outs[0] = ctx_out;
|
||||
|
||||
// populate the original tensors so we get an initial meta data
|
||||
int last_split = -1;
|
||||
for (int i = 0; i < ml.n_tensors; ++i) {
|
||||
auto weight = ml.get_weight(i);
|
||||
uint16_t i_split = params->keep_split ? weight->idx : 0;
|
||||
@ -1218,6 +1285,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
ctx_outs[i_split] = gguf_init_empty();
|
||||
}
|
||||
gguf_add_tensor(ctx_outs[i_split], tensor);
|
||||
if (i_split > last_split) {
|
||||
last_split = i_split;
|
||||
}
|
||||
}
|
||||
|
||||
// Set split info if needed
|
||||
@ -1290,7 +1360,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
continue;
|
||||
}
|
||||
|
||||
const std::string name = ggml_get_name(tensor);
|
||||
std::string name = ggml_get_name(tensor);
|
||||
|
||||
if (!ml.use_mmap) {
|
||||
if (read_data.size() < ggml_nbytes(tensor)) {
|
||||
@ -1450,7 +1520,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
|
||||
// If we've decided to quantize to the same type the tensor is already
|
||||
// in then there's nothing to do.
|
||||
quantize = tensor->type != new_type;
|
||||
quantize &= tensor->type != new_type;
|
||||
}
|
||||
|
||||
if (!quantize) {
|
||||
@ -1566,72 +1636,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
f32_data = (float *) f32_conv_buf.data();
|
||||
}
|
||||
|
||||
if (work.size() < (size_t)nelements * 4) {
|
||||
work.resize(nelements * 4); // upper bound on size
|
||||
auto expected_size = ggml_row_size(new_type, tensor->ne[0])*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
|
||||
|
||||
if (work.size() < expected_size) { //(size_t)nelements * 4) {
|
||||
//work.resize(nelements * 4); // upper bound on size
|
||||
work.resize(expected_size); // upper bound on size
|
||||
}
|
||||
new_data = work.data();
|
||||
|
||||
const int64_t n_per_row = tensor->ne[0];
|
||||
const int64_t nrows = tensor->ne[1];
|
||||
do_quantize(nthread, tensor, new_type, f32_data, (char *)new_data, imatrix, workers,
|
||||
new_size, chunk_size_multiplier, params);
|
||||
|
||||
if (nthread > 1 && (tensor->ne[2] % nthread == 0 || tensor->ne[2] >= 2*nthread)) {
|
||||
std::mutex mutex;
|
||||
int counter = 0;
|
||||
bool valid = true;
|
||||
auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, tensor, imatrix, user_data = params->user_data] () {
|
||||
int ne2 = tensor->ne[2];
|
||||
auto row_size = ggml_row_size(new_type, tensor->ne[0]);
|
||||
auto matrix_size = row_size * tensor->ne[1];
|
||||
size_t local_size = 0;
|
||||
while (true) {
|
||||
std::unique_lock<std::mutex> lock(mutex);
|
||||
int i02 = counter++;
|
||||
if (i02 >= ne2) {
|
||||
if (local_size > 0) {
|
||||
new_size += local_size;
|
||||
}
|
||||
break;
|
||||
}
|
||||
lock.unlock();
|
||||
auto this_imatrix = imatrix ? imatrix + i02 * tensor->ne[0] : nullptr;
|
||||
auto this_data = (char *)new_data + i02*matrix_size;
|
||||
auto this_size = ggml_quantize_chunk(new_type, f32_data + i02*tensor->ne[0]*tensor->ne[1], this_data, 0, tensor->ne[1], tensor->ne[0],
|
||||
this_imatrix, user_data);
|
||||
local_size += this_size;
|
||||
|
||||
// validate the quantized data
|
||||
if (!ggml_validate_row_data(new_type, this_data, matrix_size)) {
|
||||
lock.lock();
|
||||
valid = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
};
|
||||
for (int it = 0; it < nthread; ++it) workers.emplace_back(std::thread(compute));
|
||||
for (auto & w : workers) w.join();
|
||||
workers.clear();
|
||||
if (!valid) {
|
||||
throw std::runtime_error("quantized data validation failed");
|
||||
}
|
||||
} else {
|
||||
static const int64_t min_chunk_size = 32 * 512;
|
||||
const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) *
|
||||
chunk_size_multiplier;
|
||||
|
||||
const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
|
||||
const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
|
||||
const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;
|
||||
|
||||
// quantize each expert separately since they have different importance matrices
|
||||
new_size = 0;
|
||||
for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
|
||||
const float * f32_data_03 = f32_data + i03 * nelements_matrix;
|
||||
void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
|
||||
const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
|
||||
|
||||
new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, params->user_data, workers, nthread_use);
|
||||
}
|
||||
}
|
||||
}
|
||||
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
|
||||
}
|
||||
|
||||
@ -2132,6 +2132,91 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
||||
|
||||
}
|
||||
|
||||
// Creates a second copy of the model's output tensor, re-quantized to `new_type`, and
// installs it as model.output_mtp (owned through model.output_mtp_ptr).
//
// Nothing is done (and model.output_mtp is left untouched) when:
//   - new_type is GGML_TYPE_COUNT or the model has no output tensor,
//   - a distinct MTP output tensor is already present,
//   - the output tensor already has type new_type,
//   - no conversion routine is available for the source/target type pair,
//   - the re-quantized tensor would not be smaller than the original,
//   - the backend buffer for the new tensor cannot be allocated.
// If the number of rows is not a multiple of the row interleaving of new_type, the
// alternative type reported by interleaved_properties() is used instead.
static void llm_requantize_output_tensor(llama_model & model, ggml_type new_type) {
    if (new_type == GGML_TYPE_COUNT || !model.output) return;
    if (model.output_mtp && model.output_mtp != model.output) {
        LLAMA_LOG_WARN("%s: MTP output tensor is already present => not requantizing\n", __func__);
        return;
    }
    if (model.output->type == new_type) {
        LLAMA_LOG_WARN("%s: output tensor is already of type %s => not requantizing\n", __func__, ggml_type_name(new_type));
        // Actually stop here, as the message says, instead of falling through to the size check.
        return;
    }
    auto [other_type, n_interleaved] = interleaved_properties(new_type);
    if (model.output->ne[1] % n_interleaved != 0) {
        // ne[1] is 64-bit; print it through long long so the format is correct on every platform.
        LLAMA_LOG_WARN("%s: number of rows %lld is not a multiple of %d row interleaving for %s\n", __func__,
                (long long)model.output->ne[1], n_interleaved, ggml_type_name(new_type));
        LLAMA_LOG_WARN("%s: using %s instead of %s\n", __func__, ggml_type_name(other_type), ggml_type_name(new_type));
        new_type = other_type;
        n_interleaved = 1;
    }

    const auto tt_orig = ggml_internal_get_type_traits(model.output->type);
    const auto tt_new  = ggml_internal_get_type_traits(new_type);
    // The conversion goes source -> f32 -> new_type. An f32 source is fed to from_float
    // directly, so to_float is only needed for other source types.
    if (!tt_new.from_float || (model.output->type != GGML_TYPE_F32 && !tt_orig.to_float)) {
        LLAMA_LOG_WARN("%s: no conversion routine available from %s to %s => not requantizing\n", __func__,
                ggml_type_name(model.output->type), ggml_type_name(new_type));
        return;
    }

    auto nbytes_orig = ggml_nbytes(model.output);
    auto row_size = ggml_row_size(new_type, model.output->ne[0]);
    auto nbytes_new = row_size*ggml_nrows(model.output);
    if (nbytes_new >= nbytes_orig) {
        LLAMA_LOG_WARN("%s: if requantized to %s the output tensor size would be %zu, which is >= the current size %zu => not requantizing\n", __func__, ggml_type_name(new_type), nbytes_new, nbytes_orig);
        return;
    }

    LLAMA_LOG_INFO("====== Creating extra output tensor of type %s for MTP usage. Additional memory required is %.2f MiB\n",
            ggml_type_name(new_type), nbytes_new/1024./1024.);

    bool is_host = ggml_backend_buffer_is_host(model.output->buffer);

    // For a non-host buffer the source data is first copied into a temporary host buffer.
    auto tensor_data = model.output->data;
    std::vector<char> tensor_data_buf;
    if (!is_host) {
        tensor_data_buf.resize(nbytes_orig);
        ggml_backend_tensor_get(model.output, tensor_data_buf.data(), 0, nbytes_orig);
        tensor_data = tensor_data_buf.data();
    }

    // Start from a copy of the original tensor descriptor and patch type and strides.
    auto new_output = std::make_unique<ggml_tensor>(*model.output);
    new_output->type = new_type;
    new_output->nb[0] = tt_new.type_size;
    new_output->nb[1] = row_size;
    new_output->nb[2] = new_output->nb[1] * new_output->ne[1];
    new_output->nb[3] = new_output->nb[2] * new_output->ne[2];
    GGML_ASSERT(ggml_nbytes(new_output.get()) == nbytes_new);

    // NOTE(review): nothing in this function releases this buffer; output_mtp_ptr only owns
    // the ggml_tensor struct. Confirm the buffer is freed together with the model.
    auto new_buffer = ggml_backend_buft_alloc_buffer(ggml_backend_buffer_get_type(model.output->buffer), nbytes_new);
    if (!new_buffer) {
        LLAMA_LOG_WARN("%s: failed to allocate %zu bytes for the extra output tensor => not requantizing\n", __func__, nbytes_new);
        return;
    }
    new_output->buffer = new_buffer;
    new_output->data = ggml_backend_buffer_get_base(new_output->buffer);
    new_output->op = GGML_OP_NONE;
    for (int j = 0; j < GGML_MAX_SRC; ++j) new_output->src[j] = nullptr;
    ggml_set_name(new_output.get(), "output_extra.weight");
    ggml_backend_buffer_set_usage(new_output->buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);

    // For a non-host buffer the result is produced in host memory and uploaded afterwards.
    std::vector<char> new_data_buf;
    char * new_data = (char *)new_output->data;
    if (!is_host) {
        new_data_buf.resize(nbytes_new);
        new_data = new_data_buf.data();
    }

    int nthread = std::max<int>(1, std::thread::hardware_concurrency()/2);

    // n_interleaved is a structured binding; it is copied through an init-capture because
    // capturing a structured binding by name is not valid C++17.
    auto compute = [t = model.output, tensor_data, new_data, nthread, new_type, rows_per_group = n_interleaved,
                    to_float = tt_orig.to_float, from_float = tt_new.from_float] (int ith) {
        // Per-thread f32 scratch buffer: one group of interleaved rows.
        std::vector<float> work(t->ne[0]*rows_per_group);
        iqk_quantize_any(int(t->type), int(new_type),
                t->ne[0], t->ne[1], t->ne[2], t->ne[3],
                t->nb[0], t->nb[1], t->nb[2], t->nb[3],
                tensor_data, new_data, work.data(), to_float, from_float, ith, nthread);
    };
    // nthread-1 helper threads take indices 0..nthread-2; the calling thread takes the last one.
    std::vector<std::thread> workers(nthread-1);
    for (int it = 0; it < nthread-1; ++it) workers[it] = std::thread(compute, it);
    compute(nthread-1);
    for (auto & w : workers) w.join();

    if (!is_host) {
        ggml_backend_tensor_set(new_output.get(), new_data, 0, nbytes_new);
    }

    model.output_mtp_ptr = std::move(new_output);
    model.output_mtp = model.output_mtp_ptr.get();

}
|
||||
|
||||
static void llm_prepare_mla(llama_model & model, int mla) {
|
||||
if (model.arch != LLM_ARCH_DEEPSEEK2 && model.arch != LLM_ARCH_GLM_DSA && model.arch != LLM_ARCH_MISTRAL4) return;
|
||||
const auto& hparams = model.hparams;
|
||||
@ -2768,6 +2853,7 @@ static bool llm_load_tensors(
|
||||
const float * tensor_split,
|
||||
ggml_type cache_type_k,
|
||||
ggml_type cache_type_v,
|
||||
ggml_type extra_output_type,
|
||||
uint32_t max_ctx_size,
|
||||
int n_seq_max,
|
||||
int n_ubatch,
|
||||
@ -3364,6 +3450,9 @@ static bool llm_load_tensors(
|
||||
if (model.arch == LLM_ARCH_GEMMA4) {
|
||||
llm_scale_gate_inp_s(model, use_mmap_buffer);
|
||||
}
|
||||
if ((model.arch == LLM_ARCH_QWEN35 || model.arch == LLM_ARCH_QWEN35MOE) && extra_output_type != GGML_TYPE_COUNT) {
|
||||
llm_requantize_output_tensor(model, extra_output_type);
|
||||
}
|
||||
|
||||
if (use_mmap_buffer) {
|
||||
for (auto & mapping : ml.mappings) {
|
||||
@ -3521,7 +3610,8 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
|
||||
|
||||
if (!llm_load_tensors(
|
||||
ml, model, params.n_gpu_layers, params.mla, params.split_mode, params.main_gpu, params.max_gpu, params.tensor_split,
|
||||
params.type_k, params.type_v, params.max_ctx_size, params.n_seq_max, params.n_ubatch, params.amb, params.fit_margin,
|
||||
params.type_k, params.type_v, params.extra_output_type,
|
||||
params.max_ctx_size, params.n_seq_max, params.n_ubatch, params.amb, params.fit_margin,
|
||||
params.worst_graph_tokens, params.flash_attn,
|
||||
params.use_mlock, params.validate_quants, params.mtp, params.fit, params.dry_run,
|
||||
params.progress_callback, params.progress_callback_user_data
|
||||
@ -5617,6 +5707,7 @@ struct llama_model_params llama_model_default_params() {
|
||||
/*.n_last_k =*/ -1,
|
||||
/*.n_first_v =*/ -1,
|
||||
/*.n_last_v =*/ -1,
|
||||
/*.extra_output_type =*/ GGML_TYPE_COUNT,
|
||||
/*.tensor_split =*/ nullptr,
|
||||
/*.rpc_servers =*/ nullptr,
|
||||
/*.progress_callback =*/ nullptr,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user