From e5355e98954ccb49b84b4a3e635fbf6838a48f7f Mon Sep 17 00:00:00 2001 From: Kawrakow Date: Thu, 23 Apr 2026 09:05:39 +0200 Subject: [PATCH] Quantization options (#1677) --- examples/benchmark/benchmark-matmult.cpp | 4 +- examples/quantize-stats/quantize-stats.cpp | 2 +- examples/quantize/quantize.cpp | 8 + ggml/include/ggml.h | 8 +- ggml/src/CMakeLists.txt | 3 - ggml/src/ggml-quants.c | 130 ++++-- ggml/src/ggml-quants.h | 46 +- ggml/src/ggml.c | 139 +++--- ggml/src/iqk/iqk_quantize.cpp | 505 ++++++++++++--------- ggml/src/iqk/iqk_quantize.h | 96 ++-- include/llama.h | 2 + src/llama-quantize.cpp | 16 +- src/llama.cpp | 1 + 13 files changed, 568 insertions(+), 392 deletions(-) diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp index 47cb16c6..b56a64b1 100644 --- a/examples/benchmark/benchmark-matmult.cpp +++ b/examples/benchmark/benchmark-matmult.cpp @@ -192,7 +192,7 @@ int main(int argc, char ** argv) { // Set up a the benchmark matrices // printf("Creating new tensor q11 & Running quantize\n"); struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey); - ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements/m11->ne[0], m11->ne[0], nullptr); + ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements/m11->ne[0], m11->ne[0], nullptr, nullptr); // Set up a the compute graph // printf("Creating new tensor q31\n"); @@ -205,7 +205,7 @@ int main(int argc, char ** argv) { // Set up a second graph computation to make sure we override the CPU cache lines // printf("Creating new tensor q12 & Running quantize\n"); struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey); - ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements/m12->ne[0], m12->ne[0], nullptr); + ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements/m12->ne[0], m12->ne[0], nullptr, nullptr); // printf("Creating new tensor q32\n"); struct 
ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2); diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index 7e680ef3..d801bc15 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -1337,7 +1337,7 @@ static void analyze_iq4ks(const char * name, int nrows, int n_per_row, const flo int last = std::min(first + chunk, nrows); for (int row = first; row < last; ++row) { auto xr = values + row*n_per_row; - ggml_quantize_chunk(GGML_TYPE_IQ4_KS, xr, (void *)Q.data(), 0, 1, n_per_row, nullptr); + ggml_quantize_chunk(GGML_TYPE_IQ4_KS, xr, (void *)Q.data(), 0, 1, n_per_row, nullptr, nullptr); const float * dptr = (const float *)Q.data(); const float d = *dptr; const block_iq4_ks * iq4 = (const block_iq4_ks *)(dptr + 1); diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 51a66a8d..83f79722 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -167,6 +167,8 @@ static void usage(const char * executable) { printf(" --custom-q regex1=type1,regex2=type2...: use this to specify custom quantization type rules.\n\n"); printf(" --repack Repack all tensors to the corresponding _r4/8 variant if available.\n\n"); printf(" --repack-pattern Comma separated list of regexs to use for matching tensor names to be repacked.\n\n"); + printf(" --symmetric-q40 Use [-7:7] range for Q4_0 quantization (turns off imatrix)\n\n"); + printf(" --slow-iq2ks Use the original very slow IQ2_KS quantization method.\n\n"); printf("Additional specific tensor quantization types used in the custom quant scheme 'CQS (default is Q2_K):\n"); printf(" --attn-q-type ggml_type: use this ggml_type for the attn_q.weight tensor.\n"); printf(" --attn-k-type ggml_type: use this ggml_type for the attn_k.weight tensor.\n"); @@ -348,6 +350,8 @@ int main(int argc, char ** argv) { std::vector included_weights, excluded_weights; std::vector kv_overrides; std::vector 
custom_quants; + quantize_user_data user_data = { false, false }; + params.user_data = &user_data; std::vector repack_patterns; @@ -360,6 +364,10 @@ int main(int argc, char ** argv) { params.ignore_imatrix_rules = true; } else if (strcmp(argv[arg_idx], "--dry-run") == 0) { params.dry_run = true; + } else if (strcmp(argv[arg_idx], "--symmetric-q40") == 0) { + user_data.symmetric_q4_0 = true; + } else if (strcmp(argv[arg_idx], "--slow-iq2ks") == 0) { + user_data.slow_iq2_ks = true; } else if (strcmp(argv[arg_idx], "--repack") == 0) { params.only_repack = true; } else if (strcmp(argv[arg_idx], "--repack-pattern") == 0) { diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 2854614e..0d164166 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2934,6 +2934,11 @@ extern "C" { // some quantization type cannot be used without an importance matrix GGML_API bool ggml_quantize_requires_imatrix(enum ggml_type type); + struct quantize_user_data { + bool symmetric_q4_0; + bool slow_iq2_ks; + }; + // calls ggml_quantize_init internally (i.e. 
can allocate memory) GGML_API size_t ggml_quantize_chunk( enum ggml_type type, @@ -2942,7 +2947,8 @@ extern "C" { int64_t start, int64_t nrows, int64_t n_per_row, - const float * imatrix); + const float * imatrix, + const struct quantize_user_data * user_data); // // gguf diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 0d4f4023..078876dd 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -300,9 +300,6 @@ if (GGML_IQK_MUL_MAT) message(STATUS "Disabling IQK Flash Attention kernels") endif() endif() -if (IQK_SLOW_IQ2KS_QUANTIZE) - set_source_files_properties(iqk/iqk_quantize.cpp PROPERTIES COMPILE_DEFINITIONS IQK_SLOW_IQ2KS_QUANTIZE) -endif() if (GGML_CUDA) cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 6f9007e0..8209ea93 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -2464,7 +2464,9 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri } } -size_t quantize_q2_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_q2_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights, + const struct quantize_user_data * user_data) { + GGML_UNUSED(user_data); size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row); if (!quant_weights) { quantize_row_q2_K_ref(src, dst, (int64_t)nrow*n_per_row); @@ -2696,7 +2698,9 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri } } -size_t quantize_q3_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_q3_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights, + const struct quantize_user_data * user_data) { + GGML_UNUSED(user_data); size_t row_size = 
ggml_row_size(GGML_TYPE_Q3_K, n_per_row); if (!quant_weights) { quantize_row_q3_K_ref(src, dst, (int64_t)nrow*n_per_row); @@ -2901,7 +2905,9 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri } } -size_t quantize_q4_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_q4_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights, + const struct quantize_user_data * user_data) { + GGML_UNUSED(user_data); size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row); if (!quant_weights) { quantize_row_q4_K_ref(src, dst, (int64_t)nrow*n_per_row); @@ -3132,7 +3138,9 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri } } -size_t quantize_q5_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_q5_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights, + const struct quantize_user_data * user_data) { + GGML_UNUSED(user_data); size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row); if (!quant_weights) { quantize_row_q5_K_ref(src, dst, (int64_t)nrow*n_per_row); @@ -3341,7 +3349,9 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri } } -size_t quantize_q6_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_q6_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights, + const struct quantize_user_data * user_data) { + GGML_UNUSED(user_data); size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row); if (!quant_weights) { quantize_row_q6_K_ref(src, dst, (int64_t)nrow*n_per_row); @@ -3389,7 +3399,45 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri } } 
-size_t quantize_q4_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +static void quantize_row_q4_0_symmetric(const float * restrict x, block_q4_0 * restrict y, int64_t k) { + static const int qk = QK4_0; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + for (int j = 0; j < qk; j++) { + const float v = fabsf(x[i*qk + j]); + amax = MAX(amax, v); + } + + const float d = amax / 7; + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = GGML_FP32_TO_FP16(d); + + for (int j = 0; j < qk/2; ++j) { + const float x0 = x[i*qk + 0 + j]*id; + const float x1 = x[i*qk + qk/2 + j]*id; + + const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f)); + const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f)); + + y[i].qs[j] = xi0; + y[i].qs[j] |= xi1 << 4; + } + } +} + +size_t quantize_q4_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights, + const struct quantize_user_data * user_data) { + GGML_UNUSED(user_data); + if (user_data && user_data->symmetric_q4_0) { + quantize_row_q4_0_symmetric(src, dst, (int64_t)nrow*n_per_row); + return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row); + } if (!quant_weights) { quantize_row_q4_0_ref(src, dst, (int64_t)nrow*n_per_row); return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row); @@ -3434,7 +3482,9 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri } } -size_t quantize_q4_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_q4_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights, + const struct quantize_user_data * user_data) { + GGML_UNUSED(user_data); if (!quant_weights) { quantize_row_q4_1_ref(src, dst, (int64_t)nrow*n_per_row); return nrow * ggml_row_size(GGML_TYPE_Q4_1, n_per_row); @@ -3488,7 
+3538,9 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri } } -size_t quantize_q5_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_q5_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights, + const struct quantize_user_data * user_data) { + GGML_UNUSED(user_data); if (!quant_weights) { quantize_row_q5_0_ref(src, dst, (int64_t)nrow*n_per_row); return nrow * ggml_row_size(GGML_TYPE_Q5_0, n_per_row); @@ -3541,7 +3593,9 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri } } -size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights, + const struct quantize_user_data * user_data) { + GGML_UNUSED(user_data); if (!quant_weights) { quantize_row_q5_1_ref(src, dst, (int64_t)nrow*n_per_row); return nrow * ggml_row_size(GGML_TYPE_Q5_1, n_per_row); @@ -3593,7 +3647,9 @@ static void quantize_row_q6_0_impl(const float * restrict x, block_q6_0 * restri } } -size_t quantize_q6_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_q6_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights, + const struct quantize_user_data * user_data) { + GGML_UNUSED(user_data); size_t row_size = ggml_row_size(GGML_TYPE_Q6_0, n_per_row); char * qrow = (char *)dst; for (int64_t row = 0; row < nrow; ++row) { @@ -3604,7 +3660,9 @@ size_t quantize_q6_0(const float * restrict src, void * restrict dst, int64_t nr return nrow * row_size; } -size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * 
quant_weights) { +size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights, + const struct quantize_user_data * user_data) { + GGML_UNUSED(user_data); (void)quant_weights; // not used const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row); quantize_row_q8_0_ref(src, dst, (int64_t)nrow*n_per_row); @@ -13342,7 +13400,9 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v } } -size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights, + const struct quantize_user_data * user_data) { + GGML_UNUSED(user_data); GGML_ASSERT(n_per_row%QK_K == 0); int64_t nblock = n_per_row/QK_K; char * qrow = (char *)dst; @@ -13362,10 +13422,12 @@ void quantize_row_iq2_xxs(const float * restrict x, void * restrict vy, int64_t void quantize_row_iq2_xxs_ref(const float * restrict x, block_iq2_xxs * restrict y, int64_t k) { assert(k % QK_K == 0); - quantize_iq2_xxs(x, y, 1, k, NULL); + quantize_iq2_xxs(x, y, 1, k, NULL, NULL); } -size_t quantize_iq2_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_iq2_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights, + const struct quantize_user_data * user_data) { + GGML_UNUSED(user_data); GGML_ASSERT(n_per_row%QK_K == 0); int64_t nblock = n_per_row/QK_K; char * qrow = (char *)dst; @@ -13385,7 +13447,7 @@ void quantize_row_iq2_xs(const float * restrict x, void * restrict vy, int64_t k void quantize_row_iq2_xs_ref(const float * restrict x, block_iq2_xs * restrict y, int64_t k) { assert(k % QK_K == 0); - quantize_iq2_xs(x, y, 1, k, NULL); + quantize_iq2_xs(x, y, 1, k, NULL, NULL); } // @@ 
-13818,7 +13880,9 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v } } -size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights, + const struct quantize_user_data * user_data) { + GGML_UNUSED(user_data); GGML_ASSERT(n_per_row%QK_K == 0); int64_t nblock = n_per_row/QK_K; char * qrow = (char *)dst; @@ -14024,7 +14088,9 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo } #define IQ3S_BLOCK_SIZE 32 -size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights, + const struct quantize_user_data * user_data) { + GGML_UNUSED(user_data); GGML_ASSERT(n_per_row%QK_K == 0); int64_t nblock = n_per_row/QK_K; float scales[QK_K/IQ3S_BLOCK_SIZE]; @@ -14054,7 +14120,7 @@ void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int64_t k) void quantize_row_iq3_s_ref(const float * restrict x, block_iq3_s * restrict y, int64_t k) { assert(k % QK_K == 0); - quantize_iq3_s(x, y, 1, k, NULL); + quantize_iq3_s(x, y, 1, k, NULL, NULL); } @@ -14358,7 +14424,9 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy } } -size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights, + const struct quantize_user_data * user_data) { + GGML_UNUSED(user_data); GGML_ASSERT(n_per_row%QK_K == 0); float scales[QK_K/IQ1S_BLOCK_SIZE]; float weight[IQ1S_BLOCK_SIZE]; @@ -14383,7 
+14451,7 @@ void quantize_row_iq1_s_ref (const float * GGML_RESTRICT x, block_iq1_s * GGM float qw[QK_K]; for (int j = 0; j < QK_K; ++j) qw[j] = 1; for (int ibl = 0; ibl < nblock; ++ibl) { - quantize_iq1_s(x + ibl*QK_K, &y[ibl], 1, QK_K, qw); + quantize_iq1_s(x + ibl*QK_K, &y[ibl], 1, QK_K, qw, NULL); } } @@ -14660,7 +14728,9 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy } } -size_t quantize_iq1_m(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_iq1_m(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights, + const struct quantize_user_data * user_data) { + GGML_UNUSED(user_data); GGML_ASSERT(n_per_row%QK_K == 0); float scales[QK_K/IQ1M_BLOCK_SIZE]; float weight[IQ1M_BLOCK_SIZE]; @@ -14683,7 +14753,7 @@ void quantize_row_iq1_m_ref (const float * GGML_RESTRICT x, block_iq1_m * GGM float qw[QK_K]; for (int j = 0; j < QK_K; ++j) qw[j] = 1; for (int ibl = 0; ibl < nblock; ++ibl) { - quantize_iq1_m(x + ibl*QK_K, &y[ibl], 1, QK_K, qw); + quantize_iq1_m(x + ibl*QK_K, &y[ibl], 1, QK_K, qw, NULL); } } @@ -14881,7 +14951,9 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block } } -size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights, + const struct quantize_user_data * user_data) { + GGML_UNUSED(user_data); GGML_ASSERT(n_per_row%QK4_NL == 0); int64_t nblock = n_per_row/QK4_NL; char * qrow = (char *)dst; @@ -14923,7 +14995,9 @@ void quantize_row_iq4_nl_ref(const float * restrict x, block_iq4_nl * restrict y quantize_row_iq4_nl(x, y, k); } -size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * 
quant_weights) { +size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights, + const struct quantize_user_data * user_data) { + GGML_UNUSED(user_data); GGML_ASSERT(n_per_row%QK_K == 0); int64_t nblock = n_per_row/QK_K; char * qrow = (char *)dst; @@ -14951,7 +15025,7 @@ void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int64_t k void quantize_row_iq4_xs_ref(const float * restrict x, block_iq4_xs * restrict y, int64_t k) { assert(k % QK_K == 0); - quantize_iq4_xs(x, y, 1, k, NULL); + quantize_iq4_xs(x, y, 1, k, NULL, NULL); } // =============================== 2.5625 bpw @@ -15124,7 +15198,9 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy } } -size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights, + const struct quantize_user_data * user_data) { + GGML_UNUSED(user_data); GGML_ASSERT(n_per_row%QK_K == 0); int64_t nblock = n_per_row/QK_K; char * qrow = (char *)dst; @@ -15138,7 +15214,7 @@ size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int64_t n void quantize_row_iq2_s_ref(const float * restrict x, block_iq2_s * restrict y, int64_t k) { assert(k % QK_K == 0); - quantize_iq2_s(x, y, 1, k, NULL); + quantize_iq2_s(x, y, 1, k, NULL, NULL); } void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int64_t k) { diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h index 7c8e2110..3e9d6d2b 100644 --- a/ggml/src/ggml-quants.h +++ b/ggml/src/ggml-quants.h @@ -123,29 +123,31 @@ void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT 
vy, size_t by, int nrc); void ggml_vec_dot_iq1_bn_q8_K64(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization") -size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_iq1_m (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_iq1_bn (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +struct quantize_user_data; -size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * 
imatrix); -size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_q6_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +// Quantization utilizing an importance matrix (a.k.a. 
"Activation aWare Quantization") +size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data); +size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data); +size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data); +size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data); +size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data); +size_t quantize_iq1_m (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data); +size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data); +size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data); +size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data); +size_t quantize_iq1_bn (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data); + +size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, 
int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data); +size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data); +size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data); +size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data); +size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data); +size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data); +size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data); +size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data); +size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data); +size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data); +size_t quantize_q6_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data); void iq2xs_init_impl(enum ggml_type type); void iq2xs_free_impl(enum 
ggml_type type); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 23d0cc60..59842a2c 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -28455,7 +28455,8 @@ size_t ggml_quantize_chunk( int64_t start, int64_t nrows, int64_t n_per_row, - const float * imatrix) { + const float * imatrix, + const struct quantize_user_data * user_data) { const int64_t n = (int64_t) nrows * n_per_row; if (ggml_quantize_requires_imatrix(type)) { @@ -28473,74 +28474,74 @@ size_t ggml_quantize_chunk( size_t result = 0; switch (type) { - case GGML_TYPE_Q4_0: result = quantize_q4_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q4_1: result = quantize_q4_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q5_0: result = quantize_q5_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q5_1: result = quantize_q5_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q6_0: result = quantize_q6_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q8_0: result = quantize_q8_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q8_KV: result = quantize_q8_KV(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q2_K: result = quantize_q2_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q2_K_R4: result = quantize_q2_k_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q3_K: result = quantize_q3_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q3_K_R4: result = quantize_q3_k_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q4_K: 
result = quantize_q4_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q4_K_R4: result = quantize_q4_k_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q5_K: result = quantize_q5_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q5_K_R4: result = quantize_q5_k_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q6_K: result = quantize_q6_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q6_K_R4: result = quantize_q6_k_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q8_K_R8: result = quantize_q8_k_r8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q8_K_R16:result = quantize_q8_k_r16(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q8_KV_R8:result = quantize_q8_KV_r8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_IQ2_XXS: result = quantize_iq2_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_IQ2_XXS_R4:result = quantize_iq2_xxs_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_IQ2_XS: result = quantize_iq2_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_IQ2_XS_R4:result = quantize_iq2_xs_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_IQ3_XXS: result = quantize_iq3_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_IQ3_XXS_R4:result = quantize_iq3_xxs_r4(src + start, (char *) dst + start_row * row_size, nrows, 
n_per_row, imatrix); break; - case GGML_TYPE_IQ3_S: result = quantize_iq3_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_IQ3_S_R4:result = quantize_iq3_s_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_IQ2_S: result = quantize_iq2_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_IQ2_S_R4:result = quantize_iq2_s_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_IQ1_S_R4:result = quantize_iq1_s_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_IQ1_M_R4:result = quantize_iq1_m_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_IQ1_S: result = quantize_iq1_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_IQ1_BN: result = quantize_iq1_bn (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_IQ2_BN: result = quantize_iq2_bn (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_IQ2_BN_R4:result = quantize_iq2_bn_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_IQ4_NL_R4: result = quantize_iq4_nl_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_IQ4_XS_R8: result = quantize_iq4_xs_r8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q4_0_R8: result = 
quantize_q4_0_r8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q5_0_R4: result = quantize_q5_0_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q6_0_R4: result = quantize_q6_0_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q8_0_R8: result = quantize_q8_0_r8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_MXFP4: result = quantize_mxfp4 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_IQ4_KS: result = quantize_iq4_ks (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_IQ4_KS_R4:result = quantize_iq4_ks_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_IQ5_KS_R4:result = quantize_iq5_ks_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_IQ4_KSS: result = quantize_iq4_kss(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_IQ5_KS: result = quantize_iq5_ks (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_IQ2_K: result = quantize_iq2_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_IQ2_K_R4:result = quantize_iq2_k_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_IQ2_KS: result = quantize_iq2_ks (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_IQ1_KT: result = quantize_iq1_kt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, 
imatrix); break; - case GGML_TYPE_IQ2_KT: result = quantize_iq2_kt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_IQ3_KT: result = quantize_iq3_kt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_IQ4_KT: result = quantize_iq4_kt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q1_0_G128: result = quantize_q1_0_g128(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_IQ3_K: result = quantize_iq3_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_IQ3_KS: result = quantize_iq3_ks (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_IQ2_KL: result = quantize_iq2_kl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_IQ4_K: result = quantize_iq4_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_IQ3_K_R4:result = quantize_iq3_k_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_IQ4_K_R4:result = quantize_iq4_k_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_IQ5_K: result = quantize_iq5_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_IQ5_K_R4:result = quantize_iq5_k_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_IQ6_K: result = quantize_iq6_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q4_0: result = quantize_q4_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_Q4_1: result = quantize_q4_1(src + start, (char *) 
dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_Q5_0: result = quantize_q5_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_Q5_1: result = quantize_q5_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_Q6_0: result = quantize_q6_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_Q8_0: result = quantize_q8_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_Q8_KV: result = quantize_q8_KV(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_Q2_K: result = quantize_q2_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_Q2_K_R4: result = quantize_q2_k_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_Q3_K: result = quantize_q3_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_Q3_K_R4: result = quantize_q3_k_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_Q4_K: result = quantize_q4_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_Q4_K_R4: result = quantize_q4_k_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_Q5_K: result = quantize_q5_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_Q5_K_R4: result = quantize_q5_k_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_Q6_K: result = quantize_q6_K(src + 
start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_Q6_K_R4: result = quantize_q6_k_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_Q8_K_R8: result = quantize_q8_k_r8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_Q8_K_R16:result = quantize_q8_k_r16(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_Q8_KV_R8:result = quantize_q8_KV_r8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_IQ2_XXS: result = quantize_iq2_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_IQ2_XXS_R4:result = quantize_iq2_xxs_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_IQ2_XS: result = quantize_iq2_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_IQ2_XS_R4:result = quantize_iq2_xs_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_IQ3_XXS: result = quantize_iq3_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_IQ3_XXS_R4:result = quantize_iq3_xxs_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_IQ3_S: result = quantize_iq3_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_IQ3_S_R4:result = quantize_iq3_s_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_IQ2_S: result = quantize_iq2_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, 
imatrix, user_data); break; + case GGML_TYPE_IQ2_S_R4:result = quantize_iq2_s_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_IQ1_S_R4:result = quantize_iq1_s_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_IQ1_M_R4:result = quantize_iq1_m_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_IQ1_S: result = quantize_iq1_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_IQ1_BN: result = quantize_iq1_bn (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_IQ2_BN: result = quantize_iq2_bn (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_IQ2_BN_R4:result = quantize_iq2_bn_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_IQ4_NL_R4: result = quantize_iq4_nl_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_IQ4_XS_R8: result = quantize_iq4_xs_r8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_Q4_0_R8: result = quantize_q4_0_r8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_Q5_0_R4: result = quantize_q5_0_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_Q6_0_R4: result = 
quantize_q6_0_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_Q8_0_R8: result = quantize_q8_0_r8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_MXFP4: result = quantize_mxfp4 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_IQ4_KS: result = quantize_iq4_ks (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_IQ4_KS_R4:result = quantize_iq4_ks_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_IQ5_KS_R4:result = quantize_iq5_ks_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_IQ4_KSS: result = quantize_iq4_kss(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_IQ5_KS: result = quantize_iq5_ks (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_IQ2_K: result = quantize_iq2_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_IQ2_K_R4:result = quantize_iq2_k_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_IQ2_KS: result = quantize_iq2_ks (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_IQ1_KT: result = quantize_iq1_kt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_IQ2_KT: result = quantize_iq2_kt (src + start, (char *) dst + start_row * row_size, nrows, 
n_per_row, imatrix, user_data); break; + case GGML_TYPE_IQ3_KT: result = quantize_iq3_kt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_IQ4_KT: result = quantize_iq4_kt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_Q1_0_G128: result = quantize_q1_0_g128(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_IQ3_K: result = quantize_iq3_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_IQ3_KS: result = quantize_iq3_ks (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_IQ2_KL: result = quantize_iq2_kl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_IQ4_K: result = quantize_iq4_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_IQ3_K_R4:result = quantize_iq3_k_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_IQ4_K_R4:result = quantize_iq4_k_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_IQ5_K: result = quantize_iq5_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_IQ5_K_R4:result = quantize_iq5_k_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; + case GGML_TYPE_IQ6_K: result = quantize_iq6_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break; case GGML_TYPE_Q4_0_4_4: result = quantize_q4_0_4x4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_Q4_0_4_8: result = quantize_q4_0_4x8(src 
+ start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_Q4_0_8_8: result = quantize_q4_0_8x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp index 1d718049..7f183468 100644 --- a/ggml/src/iqk/iqk_quantize.cpp +++ b/ggml/src/iqk/iqk_quantize.cpp @@ -41,11 +41,11 @@ inline int nearest_int(float fval) { return (i & 0x007fffff) - 0x00400000; } -typedef void (*quantize_func_t)(const float * src, void * qdata, int n_per_row, const float * imatrix); +typedef void (*quantize_func_t)(const float * src, void * qdata, int n_per_row, const float * imatrix, const quantize_user_data * user_data); struct QHelper { - QHelper(const float * imatrix, int n_per_row, int block_size) : m_imatrix(imatrix), - m_n_per_row(n_per_row), m_block_size(block_size) { + QHelper(const float * imatrix, const quantize_user_data * user_data, int n_per_row, int block_size) : + m_imatrix(imatrix), m_user_data(user_data), m_n_per_row(n_per_row), m_block_size(block_size) { if (m_imatrix) { m_weight.resize(m_n_per_row); } @@ -78,13 +78,14 @@ struct QHelper { auto cdst = (char *)dst; for (int row = 0; row < nrows; ++row) { auto weights = row_weights(src); - qfunc(src, cdst, m_n_per_row, weights); + qfunc(src, cdst, m_n_per_row, weights, m_user_data); src += m_n_per_row; cdst += row_size; } } private: const float * m_imatrix; + const quantize_user_data * m_user_data; const int m_n_per_row; const int m_block_size; std::vector m_weight; @@ -92,12 +93,12 @@ private: template size_t quantize_repack(ggml_type type, const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix, - const Func& q_func, const RepackFunc& repack) { + const quantize_user_data * user_data, const Func& q_func, const RepackFunc& repack) { GGML_ASSERT(nrows%n_repack == 0); GGML_ASSERT(n_per_row%QK_K == 0); auto row_size = ggml_row_size(type, n_per_row); std::vector 
qtmp(n_repack*row_size); - QHelper helper(imatrix, n_per_row, block_size); + QHelper helper(imatrix, user_data, n_per_row, block_size); char * qrow = (char *)dst; for (int row = 0; row < nrows; row += n_repack) { helper.quantize(n_repack, src, qtmp.data(), row_size, q_func); @@ -269,7 +270,8 @@ void iqk_quantize_any(int from_type, int to_type, } -size_t quantize_iq1_bn(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { +size_t quantize_iq1_bn(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix, + const quantize_user_data *) { IQ1BNQuantizer iq1bn; auto row_size = ggml_row_size(GGML_TYPE_IQ1_BN, n_per_row); auto qrow = (char *)dst; @@ -281,11 +283,11 @@ size_t quantize_iq1_bn(const float * src, void * dst, int64_t nrows, int64_t n_p } void quantize_row_iq1_bn_ref(const float * x, block_iq1_bn * y, int64_t k) { - quantize_iq1_bn(x, y, 1, k, nullptr); + quantize_iq1_bn(x, y, 1, k, nullptr, nullptr); } void quantize_row_iq1_bn(const float * x, void * y, int64_t k) { - quantize_iq1_bn(x, y, 1, k, nullptr); + quantize_iq1_bn(x, y, 1, k, nullptr, nullptr); } void dequantize_row_iq1_bn(const block_iq1_bn * x, float * y, int64_t k) { @@ -311,7 +313,8 @@ void dequantize_row_iq1_bn(const block_iq1_bn * x, float * y, int64_t k) { } } -size_t quantize_iq2_bn(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { +size_t quantize_iq2_bn(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { IQ1BNQuantizer iq1bn; auto row_size = ggml_row_size(GGML_TYPE_IQ2_BN, n_per_row); auto qrow = (char *)dst; @@ -323,11 +326,11 @@ size_t quantize_iq2_bn(const float * src, void * dst, int64_t nrows, int64_t n_p } void quantize_row_iq2_bn_ref(const float * x, block_iq2_bn * y, int64_t k) { - quantize_iq2_bn(x, y, 1, k, nullptr); + quantize_iq2_bn(x, y, 1, k, nullptr, nullptr); } void 
quantize_row_iq2_bn(const float * x, void * y, int64_t k) { - quantize_iq2_bn(x, y, 1, k, nullptr); + quantize_iq2_bn(x, y, 1, k, nullptr, nullptr); } void dequantize_row_iq2_bn(const block_iq2_bn * x, float * y, int64_t k) { @@ -1102,7 +1105,8 @@ inline int best_index_iq2nl(const int8_t * values, float x) { return x - values[idx] < values[idx+1] - x ? idx : idx + 1; } -void quantize_row_iq2_k_impl(const float * x, void * vy, int n_per_row, const float * quant_weights) { +void quantize_row_iq2_k_impl(const float * x, void * vy, int n_per_row, const float * quant_weights, + [[maybe_unused]] const quantize_user_data * user_data) { constexpr int kBlockSize = 16; @@ -1249,7 +1253,7 @@ void quantize_row_iq2_k_impl(const float * x, void * vy, int n_per_row, const fl void quantize_row_iq2_k_ref(const float * x, block_iq2_k * y, int64_t k) { assert(k % QK_K == 0); - quantize_iq2_k(x, (void *)y, 1, k, nullptr); + quantize_iq2_k(x, (void *)y, 1, k, nullptr, nullptr); } void quantize_row_iq2_k(const float * x, void * vy, int64_t k) { @@ -1258,9 +1262,10 @@ void quantize_row_iq2_k(const float * x, void * vy, int64_t k) { quantize_row_iq2_k_ref(x, y, k); } -size_t quantize_iq2_k(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { +size_t quantize_iq2_k(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix, + const quantize_user_data * user_data) { GGML_ASSERT(n_per_row%QK_K == 0); - QHelper helper(imatrix, n_per_row, 16); + QHelper helper(imatrix, user_data, n_per_row, 16); auto row_size = ggml_row_size(GGML_TYPE_IQ2_K, n_per_row); helper.quantize(nrows, src, dst, row_size, quantize_row_iq2_k_impl); return nrows * row_size; @@ -1316,7 +1321,7 @@ void vec_dot_iq2_k_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, } namespace { -#if defined(__AVX2__) && !defined(IQK_SLOW_IQ2KS_QUANTIZE) +#if defined(__AVX2__) inline void to_values_i32(__m256i idx, __m256i ivalues, __m256i * iv) { auto ival = 
_mm256_shuffle_epi8(ivalues, idx); auto ival_1 = _mm256_srli_si256(ival, 8); @@ -1446,7 +1451,7 @@ float compute_1block_iq2ks_rmse(float d, const __m256 * vx, const __m256 * vw, c this_idx = idx; return hsum_float_8(vrmse); } -void quantize_row_iq2_ks_impl(const float * x, void * vy, int n_per_row, const float * quant_weights, float * all_scales, float * all_sw, int8_t * all_Ls) { +void quantize_row_iq2_ks_fast_impl(const float * x, void * vy, int n_per_row, const float * quant_weights, float * all_scales, float * all_sw, int8_t * all_Ls) { constexpr int kBlockSize = 32; @@ -1601,7 +1606,7 @@ void quantize_row_iq2_ks_impl(const float * x, void * vy, int n_per_row, const f float sumq2 = hsum_float_8(vsumq2); *dptr = GGML_FP32_TO_FP16(1.000f*(sumq2 > 0 ? sumqx/sumq2 : d)); } -#else +#endif void quantize_row_iq2_ks_impl(const float * x, void * vy, int n_per_row, const float * quant_weights, float * all_scales, float * all_sw, int8_t * all_Ls) { constexpr int kBlockSize = 32; @@ -1751,12 +1756,11 @@ void quantize_row_iq2_ks_impl(const float * x, void * vy, int n_per_row, const f } *dptr = GGML_FP32_TO_FP16(1.030f*(sumq2 > 0 ? 
sumqx/sumq2 : d)); } -#endif } void quantize_row_iq2_ks_ref(const float * x, block_iq2_ks * y, int64_t k) { assert(k % QK_K == 0); - quantize_iq2_ks(x, (void *)y, 1, k, nullptr); + quantize_iq2_ks(x, (void *)y, 1, k, nullptr, nullptr); } void quantize_row_iq2_ks(const float * x, void * vy, int64_t k) { @@ -1765,17 +1769,25 @@ void quantize_row_iq2_ks(const float * x, void * vy, int64_t k) { quantize_row_iq2_ks_ref(x, y, k); } -size_t quantize_iq2_ks(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { +size_t quantize_iq2_ks(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix, + const quantize_user_data * user_data) { constexpr int kBlockSize = 32; GGML_ASSERT(n_per_row%QK_K == 0); auto row_size = ggml_row_size(GGML_TYPE_IQ2_KS, n_per_row); int nblock = n_per_row/QK_K; std::vector all_scales(nblock*(QK_K/kBlockSize)), all_sw(nblock*(QK_K/kBlockSize)); std::vector all_Ls(nblock*(QK_K/kBlockSize)); - auto q_func = [&all_scales, &all_sw, &all_Ls] (const float * x, void * vy, int n_per_row, const float * imatrix) { + auto q_func = [&all_scales, &all_sw, &all_Ls] (const float * x, void * vy, int n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { +#ifdef __AVX2__ + if (user_data && !user_data->slow_iq2_ks) { + quantize_row_iq2_ks_fast_impl(x, vy, n_per_row, imatrix, all_scales.data(), all_sw.data(), all_Ls.data()); + return; + } +#endif quantize_row_iq2_ks_impl(x, vy, n_per_row, imatrix, all_scales.data(), all_sw.data(), all_Ls.data()); }; - QHelper helper(imatrix, n_per_row, kBlockSize); + QHelper helper(imatrix, user_data, n_per_row, kBlockSize); helper.quantize(nrows, src, dst, row_size, q_func); return nrows * row_size; } @@ -2121,7 +2133,7 @@ void quantize_row_iq2_kl_impl(const float * x, void * vy, int n_per_row, const f void quantize_row_iq2_kl_ref(const float * x, block_iq2_kl * y, int64_t k) { assert(k % QK_K == 0); - quantize_iq2_kl(x, (void *)y, 1, 
k, nullptr); + quantize_iq2_kl(x, (void *)y, 1, k, nullptr, nullptr); } void quantize_row_iq2_kl(const float * x, void * vy, int64_t k) { @@ -2130,16 +2142,18 @@ void quantize_row_iq2_kl(const float * x, void * vy, int64_t k) { quantize_row_iq2_kl_ref(x, y, k); } -size_t quantize_iq2_kl(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { +size_t quantize_iq2_kl(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix, + const quantize_user_data * user_data) { constexpr int kBlockSize = 32; GGML_ASSERT(n_per_row%QK_K == 0); auto row_size = ggml_row_size(GGML_TYPE_IQ2_KL, n_per_row); int nblock = n_per_row/QK_K; std::vector all_scales(nblock*(QK_K/kBlockSize)); - auto q_func = [&all_scales] (const float * x, void * vy, int n_per_row, const float * imatrix) { + auto q_func = [&all_scales] (const float * x, void * vy, int n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { quantize_row_iq2_kl_impl(x, vy, n_per_row, imatrix, all_scales.data()); }; - QHelper helper(imatrix, n_per_row, kBlockSize); + QHelper helper(imatrix, user_data, n_per_row, kBlockSize); helper.quantize(nrows, src, dst, row_size, q_func); return nrows * row_size; } @@ -2196,7 +2210,8 @@ void vec_dot_iq2_kl_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx // namespace { -static void quantize_row_iq3_k_impl(const float * x, void * vy, int n_per_row, const float * quant_weights) { +static void quantize_row_iq3_k_impl(const float * x, void * vy, int n_per_row, const float * quant_weights, + [[maybe_unused]] const quantize_user_data * user_data) { constexpr int ntry = 3; @@ -2416,7 +2431,7 @@ static void quantize_row_iq3_k_impl(const float * x, void * vy, int n_per_row, c void quantize_row_iq3_k_ref(const float * x, block_iq3_k * y, int64_t k) { assert(k % QK_K == 0); - quantize_iq3_k(x, (void *)y, 1, k, nullptr); + quantize_iq3_k(x, (void *)y, 1, k, nullptr, nullptr); } void 
quantize_row_iq3_k(const float * x, void * vy, int64_t k) { @@ -2425,9 +2440,10 @@ void quantize_row_iq3_k(const float * x, void * vy, int64_t k) { quantize_row_iq3_k_ref(x, y, k); } -size_t quantize_iq3_k(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { +size_t quantize_iq3_k(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix, + const quantize_user_data * user_data) { GGML_ASSERT(n_per_row%QK_K == 0); - QHelper helper(imatrix, n_per_row, 16); + QHelper helper(imatrix, user_data, n_per_row, 16); auto row_size = ggml_row_size(GGML_TYPE_IQ3_K, n_per_row); helper.quantize(nrows, src, dst, row_size, quantize_row_iq3_k_impl); return nrows * row_size; @@ -2650,21 +2666,23 @@ static void quantize_row_iq3_ks_impl(const int super_block_size, const int block } void quantize_row_iq3_ks_ref(const float * x, block_iq3_ks * y, int64_t k) { - quantize_iq3_ks(x, (void *)y, 1, k, nullptr); + quantize_iq3_ks(x, (void *)y, 1, k, nullptr, nullptr); } void quantize_row_iq3_ks(const float * x, void * y, int64_t k) { - quantize_iq3_ks(x, (void *)y, 1, k, nullptr); + quantize_iq3_ks(x, (void *)y, 1, k, nullptr, nullptr); } -size_t quantize_iq3_ks(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { +size_t quantize_iq3_ks(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix, + const quantize_user_data * user_data) { constexpr int kBlockSize = 32; GGML_ASSERT(n_per_row%QK_K == 0); float weight[kBlockSize]; std::vector all_scales(n_per_row/kBlockSize); auto row_size = ggml_row_size(GGML_TYPE_IQ3_KS, n_per_row); - QHelper helper(imatrix, n_per_row, kBlockSize); - auto q_func = [&all_scales, &weight, block_size = kBlockSize] (const float * x, void * vy, int n_per_row, const float * imatrix) { + QHelper helper(imatrix, user_data, n_per_row, kBlockSize); + auto q_func = [&all_scales, &weight, block_size = kBlockSize] (const float * x, void * vy, int 
n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { quantize_row_iq3_ks_impl(QK_K, block_size, n_per_row, x, (char *)vy, all_scales.data(), weight, iq3nl_values, imatrix, 5); }; helper.quantize(nrows, src, dst, row_size, q_func); @@ -2975,7 +2993,7 @@ static void quantize_row_iq4_k_impl_bs16(const int super_block_size, const int b void quantize_row_iq4_k_ref(const float * x, block_iq4_k * y, int64_t k) { assert(k % QK_K == 0); - quantize_iq4_k(x, (void *)y, 1, k, nullptr); + quantize_iq4_k(x, (void *)y, 1, k, nullptr, nullptr); } void quantize_row_iq4_k(const float * x, void * vy, int64_t k) { @@ -2984,12 +3002,14 @@ void quantize_row_iq4_k(const float * x, void * vy, int64_t k) { quantize_row_iq4_k_ref(x, y, k); } -size_t quantize_iq4_k(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { +size_t quantize_iq4_k(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix, + const quantize_user_data * user_data) { GGML_ASSERT(n_per_row%QK_K == 0); uint8_t L[QK_K]; float weight[16]; float scales[QK_K/16]; - auto q_func = [&L, &weight, &scales] (const float * x, void * vy, int n_per_row, const float * imatrix) { + auto q_func = [&L, &weight, &scales] (const float * x, void * vy, int n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { block_iq4_k * iq4 = (block_iq4_k *)vy; int nblock = n_per_row/QK_K; for (int ibl = 0; ibl < nblock; ++ibl) { @@ -2999,7 +3019,7 @@ size_t quantize_iq4_k(const float * src, void * dst, int64_t nrows, int64_t n_pe } }; auto row_size = ggml_row_size(GGML_TYPE_IQ4_K, n_per_row); - QHelper helper(imatrix, n_per_row, 16); + QHelper helper(imatrix, user_data, n_per_row, 16); helper.quantize(nrows, src, dst, row_size, q_func); return nrows * row_size; } @@ -3131,7 +3151,8 @@ inline int best_index_iq5nl(const int8_t * values, float x) { return ix < 32 ? ix : x - values[ix-32] < values[ix-31] - x ? 
ix-32 : ix-31; } -void quantize_row_iq5_k_impl(const float * x, void * vy, int n_per_row, const float * quant_weights) { +void quantize_row_iq5_k_impl(const float * x, void * vy, int n_per_row, const float * quant_weights, + [[maybe_unused]] const quantize_user_data * user_data) { const int ntry = 5; const float step = 1.f; @@ -3315,7 +3336,7 @@ void quantize_row_iq5_k_impl(const float * x, void * vy, int n_per_row, const fl void quantize_row_iq5_k_ref(const float * x, block_iq5_k * y, int64_t k) { assert(k % QK_K == 0); - quantize_iq5_k(x, (void *)y, 1, k, nullptr); + quantize_iq5_k(x, (void *)y, 1, k, nullptr, nullptr); } void quantize_row_iq5_k(const float * x, void * vy, int64_t k) { @@ -3324,9 +3345,10 @@ void quantize_row_iq5_k(const float * x, void * vy, int64_t k) { quantize_row_iq5_k_ref(x, y, k); } -size_t quantize_iq5_k(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { +size_t quantize_iq5_k(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix, + const quantize_user_data * user_data) { GGML_ASSERT(n_per_row%QK_K == 0); - QHelper helper(imatrix, n_per_row, 16); + QHelper helper(imatrix, user_data, n_per_row, 16); auto row_size = ggml_row_size(GGML_TYPE_IQ5_K, n_per_row); helper.quantize(nrows, src, dst, row_size, quantize_row_iq5_k_impl); return nrows * row_size; @@ -3674,7 +3696,7 @@ void quantize_row_iq6_k_impl(const float * x, void * vy, int n_per_row, const fl void quantize_row_iq6_k_ref(const float * x, block_iq6_k * y, int64_t k) { assert(k % QK_K == 0); - quantize_iq6_k(x, (void *)y, 1, k, nullptr); + quantize_iq6_k(x, (void *)y, 1, k, nullptr, nullptr); } void quantize_row_iq6_k(const float * x, void * vy, int64_t k) { @@ -3683,18 +3705,20 @@ void quantize_row_iq6_k(const float * x, void * vy, int64_t k) { quantize_row_iq6_k_ref(x, y, k); } -size_t quantize_iq6_k(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { +size_t 
quantize_iq6_k(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix, + const quantize_user_data * user_data) { GGML_ASSERT(n_per_row%QK_K == 0); float values[128]; for (int i = 0; i < 64; ++i) { values[i] = iq6nl_values[i]; values[i+64] = values[i] + S_IQ6K; } - auto q_func = [values] (const float * x, void * vy, int n_per_row, const float * imatrix) { + auto q_func = [values] (const float * x, void * vy, int n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { quantize_row_iq6_k_impl(x, vy, n_per_row, imatrix, values, values + 64); }; auto row_size = ggml_row_size(GGML_TYPE_IQ6_K, n_per_row); - QHelper helper(imatrix, n_per_row, 16); + QHelper helper(imatrix, user_data, n_per_row, 16); helper.quantize(nrows, src, dst, row_size, q_func); return nrows * row_size; } @@ -4113,14 +4137,15 @@ static void quantize_row_mxfp4_impl(int n_per_row, const float * x, char * cy, } void quantize_row_mxfp4_ref(const float * x, block_mxfp4 * y, int64_t k) { - quantize_mxfp4(x, (void *)y, 1, k, nullptr); + quantize_mxfp4(x, (void *)y, 1, k, nullptr, nullptr); } void quantize_row_mxfp4(const float * x, void * y, int64_t k) { - quantize_mxfp4(x, (void *)y, 1, k, nullptr); + quantize_mxfp4(x, (void *)y, 1, k, nullptr, nullptr); } -size_t quantize_mxfp4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { +size_t quantize_mxfp4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { constexpr int kBlockSize = QK_MXFP4; GGML_ASSERT(n_per_row%kBlockSize == 0); auto row_size = ggml_row_size(GGML_TYPE_MXFP4, n_per_row); @@ -4351,21 +4376,23 @@ static void quantize_row_iq4_k_impl_bs128(const int super_block_size, const int } void quantize_row_iq4_ks_ref(const float * x, block_iq4_ks * y, int64_t k) { - quantize_iq4_ks(x, (void *)y, 1, k, nullptr); + quantize_iq4_ks(x, (void *)y, 1, k, nullptr, 
nullptr); } void quantize_row_iq4_ks(const float * x, void * y, int64_t k) { - quantize_iq4_ks(x, (void *)y, 1, k, nullptr); + quantize_iq4_ks(x, (void *)y, 1, k, nullptr, nullptr); } -size_t quantize_iq4_ks(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { +size_t quantize_iq4_ks(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix, + const quantize_user_data * user_data) { constexpr int kBlockSize = 32; GGML_ASSERT(n_per_row%QK_K == 0); auto row_size = ggml_row_size(GGML_TYPE_IQ4_KS, n_per_row); float weight[kBlockSize]; std::vector all_scales(n_per_row/kBlockSize); - QHelper helper(imatrix, n_per_row, kBlockSize); - auto q_func = [&all_scales, &weight, block_size = kBlockSize] (const float * x, void * vy, int n_per_row, const float * imatrix) { + QHelper helper(imatrix, user_data, n_per_row, kBlockSize); + auto q_func = [&all_scales, &weight, block_size = kBlockSize] (const float * x, void * vy, int n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { quantize_row_iq4_k_impl_bs128(QK_K, block_size, n_per_row, x, (char *)vy, all_scales.data(), weight, iq4k_values, imatrix, 7); }; helper.quantize(nrows, src, dst, row_size, q_func); @@ -4592,21 +4619,23 @@ static void quantize_row_iq5_ks_impl(const int super_block_size, const int block } void quantize_row_iq5_ks_ref(const float * x, block_iq5_ks * y, int64_t k) { - quantize_iq5_ks(x, (void *)y, 1, k, nullptr); + quantize_iq5_ks(x, (void *)y, 1, k, nullptr, nullptr); } void quantize_row_iq5_ks(const float * x, void * y, int64_t k) { - quantize_iq5_ks(x, (void *)y, 1, k, nullptr); + quantize_iq5_ks(x, (void *)y, 1, k, nullptr, nullptr); } -size_t quantize_iq5_ks(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { +size_t quantize_iq5_ks(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix, + const quantize_user_data * user_data) { 
constexpr int kBlockSize = 32; GGML_ASSERT(n_per_row%QK_K == 0); auto row_size = ggml_row_size(GGML_TYPE_IQ5_KS, n_per_row); float weight[kBlockSize]; std::vector all_scales(n_per_row/kBlockSize); - QHelper helper(imatrix, n_per_row, kBlockSize); - auto q_func = [&all_scales, &weight, block_size = kBlockSize] (const float * x, void * vy, int n_per_row, const float * imatrix) { + QHelper helper(imatrix, user_data, n_per_row, kBlockSize); + auto q_func = [&all_scales, &weight, block_size = kBlockSize] (const float * x, void * vy, int n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { quantize_row_iq5_ks_impl(QK_K, block_size, n_per_row, x, (char *)vy, all_scales.data(), weight, iq5nl_values, imatrix, 5); }; helper.quantize(nrows, src, dst, row_size, q_func); @@ -4951,15 +4980,17 @@ static void quantize_row_iq4_kss_impl(int n_per_row, const float * x, char * cy, } } -size_t quantize_iq4_kss(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { +size_t quantize_iq4_kss(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix, + const quantize_user_data * user_data) { constexpr int kBlockSize = 32; GGML_ASSERT(n_per_row%QK_K == 0); auto row_size = ggml_row_size(GGML_TYPE_IQ4_KSS, n_per_row); std::vector all_scales(n_per_row/kBlockSize); float weight[kBlockSize]; auto table = scramble_table(); - QHelper helper(imatrix, n_per_row, kBlockSize); - auto q_func = [&all_scales, &weight, table] (const float * x, void * vy, int n_per_row, const float * imatrix) { + QHelper helper(imatrix, user_data, n_per_row, kBlockSize); + auto q_func = [&all_scales, &weight, table] (const float * x, void * vy, int n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { quantize_row_iq4_kss_impl(n_per_row, x, (char *)vy, all_scales.data(), weight, iq4k_values, imatrix, table, 7); }; helper.quantize(nrows, src, dst, row_size, q_func); @@ -4967,11 
+4998,11 @@ size_t quantize_iq4_kss(const float * src, void * dst, int64_t nrows, int64_t n_ } void quantize_row_iq4_kss_ref(const float * x, block_iq4_kss * y, int64_t k) { - quantize_iq4_kss(x, y, 1, k, nullptr); + quantize_iq4_kss(x, y, 1, k, nullptr, nullptr); } void quantize_row_iq4_kss(const float * x, void * y, int64_t k) { - quantize_iq4_kss(x, (block_iq4_kss *)y, 1, k, nullptr); + quantize_iq4_kss(x, (block_iq4_kss *)y, 1, k, nullptr, nullptr); } void dequantize_row_iq4_kss(const block_iq4_kss * x, float * y, int64_t k) { @@ -5019,12 +5050,12 @@ void vec_dot_iq4_kss_q8_k(int n, float * s, size_t bs, const void * vx, size_t b // void quantize_row_iq4_nl_r4_ref(const float * x, block_iq4_nl_r4 * y, int64_t k) { // we assume we are called with 4 rows - quantize_iq4_nl_r4(x, (void *)y, 4, k/4, nullptr); + quantize_iq4_nl_r4(x, (void *)y, 4, k/4, nullptr, nullptr); } void quantize_row_iq4_nl_r4(const float * x, void * y, int64_t k) { // we assume we are called with 4 rows - quantize_iq4_nl_r4(x, y, 4, k/4, nullptr); + quantize_iq4_nl_r4(x, y, 4, k/4, nullptr, nullptr); } static void repack_iq4_nl(int nrows, int n_per_row, const block_iq4_nl * x, block_iq4_nl_r4 * y, [[maybe_unused]] bool online) { @@ -5048,13 +5079,15 @@ static void repack_iq4_nl(int nrows, int n_per_row, const block_iq4_nl * x, bloc } } -size_t quantize_iq4_nl_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { +size_t quantize_iq4_nl_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix, + const quantize_user_data * user_data) { GGML_ASSERT(nrows%4 == 0); auto row_size_nl = ggml_row_size(GGML_TYPE_IQ4_NL, n_per_row); std::vector qtmp(4*row_size_nl); - QHelper helper(imatrix, n_per_row, 32); - auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix) { - quantize_iq4_nl(x, (char *)vy, 1, n_per_row, imatrix); + QHelper helper(imatrix, user_data, n_per_row, 32); + auto q_func = [] (const float 
* x, void * vy, int n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { + quantize_iq4_nl(x, (char *)vy, 1, n_per_row, imatrix, nullptr); }; char * qrow = (char *)dst; for (int row = 0; row < nrows; row += 4) { @@ -5107,12 +5140,12 @@ void vec_dot_iq4_nl_r4_q8_0(int n, float * s, size_t bs, const void * vx, size_t // void quantize_row_q4_0_r8_ref(const float * x, block_iq4_nl_r8 * y, int64_t k) { // we assume we are called with 8 rows - quantize_q4_0_r8(x, (void *)y, 8, k/8, nullptr); + quantize_q4_0_r8(x, (void *)y, 8, k/8, nullptr, nullptr); } void quantize_row_q4_0_r8(const float * x, void * y, int64_t k) { // we assume we are called with 8 rows - quantize_q4_0_r8(x, y, 8, k/8, nullptr); + quantize_q4_0_r8(x, y, 8, k/8, nullptr, nullptr); } static void repack_q4_0(int nrows, int n_per_row, const block_q4_0 * x, block_iq4_nl_r8 * y, [[maybe_unused]] bool online) { @@ -5161,13 +5194,15 @@ static void modify_q4_0_r8(int64_t k, char * cy) { } #endif -size_t quantize_q4_0_r8(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { +size_t quantize_q4_0_r8(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix, + const quantize_user_data * user_data) { GGML_ASSERT(nrows%8 == 0); auto row_size_nl = ggml_row_size(GGML_TYPE_Q4_0, n_per_row); std::vector qtmp(8*row_size_nl); - QHelper helper(imatrix, n_per_row, 32); - auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix) { - quantize_q4_0(x, (char *)vy, 1, n_per_row, imatrix); + QHelper helper(imatrix, user_data, n_per_row, 32); + auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { + quantize_q4_0(x, (char *)vy, 1, n_per_row, imatrix, nullptr); }; char * qrow = (char *)dst; for (int row = 0; row < nrows; row += 8) { @@ -5217,12 +5252,12 @@ void vec_dot_q4_0_r8_q8_0(int n, float * s, size_t bs, const 
void * vx, size_t b // void quantize_row_q8_0_r8_ref(const float * x, block_q8_0_r8 * y, int64_t k) { // we assume we are called with 4 rows - quantize_q8_0_r8(x, (void *)y, 8, k/8, nullptr); + quantize_q8_0_r8(x, (void *)y, 8, k/8, nullptr, nullptr); } void quantize_row_q8_0_r8(const float * x, void * y, int64_t k) { // we assume we are called with 4 rows - quantize_q8_0_r8(x, y, 8, k/8, nullptr); + quantize_q8_0_r8(x, y, 8, k/8, nullptr, nullptr); } static void repack_q8_0(int nrows, int n_per_row, const block_q8_0 * x, block_q8_0_r8 * y, [[maybe_unused]] bool online) { @@ -5267,13 +5302,14 @@ static void modify_q8_0_r8(int64_t k, char * cy) { } #endif -size_t quantize_q8_0_r8(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { +size_t quantize_q8_0_r8(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { GGML_ASSERT(nrows%8 == 0); auto row_size_0 = ggml_row_size(GGML_TYPE_Q8_0, n_per_row); std::vector qtmp(8*row_size_0); char * qrow = (char *)dst; for (int row = 0; row < nrows; row += 8) { - quantize_q8_0(src, qtmp.data(), 8, n_per_row, imatrix); + quantize_q8_0(src, qtmp.data(), 8, n_per_row, imatrix, nullptr); repack_q8_0(8, n_per_row, (const block_q8_0 *)qtmp.data(), (block_q8_0_r8 *)qrow, false); src += 8*n_per_row; qrow += 8*row_size_0; @@ -5316,12 +5352,12 @@ void vec_dot_q8_0_r8_q8_0(int n, float * s, size_t bs, const void * vx, size_t b // void quantize_row_q5_0_r4_ref(const float * x, block_q5_0_r4 * y, int64_t k) { // we assume we are called with 4 rows - quantize_q5_0_r4(x, (void *)y, 4, k/4, nullptr); + quantize_q5_0_r4(x, (void *)y, 4, k/4, nullptr, nullptr); } void quantize_row_q5_0_r4(const float * x, void * y, int64_t k) { // we assume we are called with 4 rows - quantize_q5_0_r4(x, y, 4, k/4, nullptr); + quantize_q5_0_r4(x, y, 4, k/4, nullptr, nullptr); } static inline void convert_q5_0(const block_q5_0& x, uint8_t * 
L) { @@ -5364,13 +5400,14 @@ static void repack_q5_0(int nrows, int n_per_row, const block_q5_0 * x, block_q5 } } -size_t quantize_q5_0_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { +size_t quantize_q5_0_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { GGML_ASSERT(nrows%4 == 0); auto row_size_0 = ggml_row_size(GGML_TYPE_Q5_0, n_per_row); std::vector qtmp(4*row_size_0); char * qrow = (char *)dst; for (int row = 0; row < nrows; row += 4) { - quantize_q5_0(src, qtmp.data(), 4, n_per_row, imatrix); + quantize_q5_0(src, qtmp.data(), 4, n_per_row, imatrix, user_data); repack_q5_0(4, n_per_row, (const block_q5_0 *)qtmp.data(), (block_q5_0_r4 *)qrow, false); src += 4*n_per_row; qrow += 4*row_size_0; @@ -5417,12 +5454,12 @@ void vec_dot_q5_0_r4_q8_0(int n, float * s, size_t bs, const void * vx, size_t b // void quantize_row_q6_0_r4_ref(const float * x, block_q6_0_r4 * y, int64_t k) { // we assume we are called with 4 rows - quantize_q6_0_r4(x, (void *)y, 4, k/4, nullptr); + quantize_q6_0_r4(x, (void *)y, 4, k/4, nullptr, nullptr); } void quantize_row_q6_0_r4(const float * x, void * y, int64_t k) { // we assume we are called with 4 rows - quantize_q6_0_r4(x, y, 4, k/4, nullptr); + quantize_q6_0_r4(x, y, 4, k/4, nullptr, nullptr); } static inline void convert_q6_0(const block_q6_0& x, uint8_t * L) { @@ -5461,14 +5498,16 @@ static void repack_q6_0(int nrows, int n_per_row, const block_q6_0 * x, block_q6 } } -size_t quantize_q6_0_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { +size_t quantize_q6_0_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix, + const quantize_user_data * user_data) { GGML_ASSERT(nrows%4 == 0); auto row_size_0 = ggml_row_size(GGML_TYPE_Q6_0, n_per_row); std::vector qtmp(4*row_size_0); char * qrow = (char *)dst; - QHelper 
helper(imatrix, n_per_row, 32); - auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix) { - quantize_q6_0(x, (char *)vy, 1, n_per_row, imatrix); + QHelper helper(imatrix, user_data, n_per_row, 32); + auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { + quantize_q6_0(x, (char *)vy, 1, n_per_row, imatrix, user_data); }; for (int row = 0; row < nrows; row += 4) { helper.quantize(4, src, qtmp.data(), row_size_0, q_func); @@ -5518,11 +5557,11 @@ void vec_dot_q6_0_r4_q8_0(int n, float * s, size_t bs, const void * vx, size_t b // void quantize_row_iq4_xs_r8_ref(const float * x, block_iq4_xs_r8 * y, int64_t k) { - quantize_iq4_xs_r8(x, (void *)y, 8, k/8, nullptr); + quantize_iq4_xs_r8(x, (void *)y, 8, k/8, nullptr, nullptr); } void quantize_row_iq4_xs_r8(const float * x, void * y, int64_t k) { - quantize_iq4_xs_r8(x, y, 8, k/8, nullptr); + quantize_iq4_xs_r8(x, y, 8, k/8, nullptr, nullptr); } static void repack_iq4_xs(int nrows, int n_per_row, const block_iq4_xs * x, block_iq4_xs_r8 * y, [[maybe_unused]] bool online) { @@ -5557,11 +5596,13 @@ static void repack_iq4_xs(int nrows, int n_per_row, const block_iq4_xs * x, bloc } } -size_t quantize_iq4_xs_r8(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { - auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix) { - quantize_iq4_xs(x, (char *)vy, 1, n_per_row, imatrix); +size_t quantize_iq4_xs_r8(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { + auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { + quantize_iq4_xs(x, (char *)vy, 1, n_per_row, imatrix, nullptr); }; - return quantize_repack<32, block_iq4_xs, block_iq4_xs_r8, 8>(GGML_TYPE_IQ4_XS, src, dst, 
nrows, n_per_row, imatrix, + return quantize_repack<32, block_iq4_xs, block_iq4_xs_r8, 8>(GGML_TYPE_IQ4_XS, src, dst, nrows, n_per_row, imatrix, user_data, q_func, repack_iq4_xs); } @@ -5603,11 +5644,11 @@ void vec_dot_iq4_xs_r8_q8_k(int n, float * s, size_t bs, const void * vx, size_t // void quantize_row_iq4_ks_r4_ref(const float * x, block_iq4_ks_r4 * y, int64_t k) { - quantize_iq4_ks_r4(x, (void *)y, 4, k/4, nullptr); + quantize_iq4_ks_r4(x, (void *)y, 4, k/4, nullptr, nullptr); } void quantize_row_iq4_ks_r4(const float * x, void * y, int64_t k) { - quantize_iq4_ks_r4(x, y, 4, k/4, nullptr); + quantize_iq4_ks_r4(x, y, 4, k/4, nullptr, nullptr); } static void repack_iq4_ks(int nrows, int n_per_row, const block_iq4_ks * x, block_iq4_ks_r4 * y, [[maybe_unused]] bool online) { @@ -5644,14 +5685,15 @@ static void repack_iq4_ks(int nrows, int n_per_row, const block_iq4_ks * x, bloc } } -size_t quantize_iq4_ks_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { +size_t quantize_iq4_ks_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { GGML_ASSERT(nrows%4 == 0); GGML_ASSERT(n_per_row%QK_K == 0); char * qcur = (char *)dst; auto row_size = ggml_row_size(GGML_TYPE_IQ4_KS, n_per_row); std::vector qtmp(4*row_size); for (int row = 0; row < nrows; row += 4) { - quantize_iq4_ks(src, (void *)qtmp.data(), 4, n_per_row, imatrix); + quantize_iq4_ks(src, (void *)qtmp.data(), 4, n_per_row, imatrix, user_data); repack_iq4_ks(4, n_per_row, (const block_iq4_ks *)qtmp.data(), (block_iq4_ks_r4 *)qcur, false); qcur += 4*row_size; src += 4*n_per_row; @@ -5703,11 +5745,11 @@ void vec_dot_iq4_ks_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t // ========================================= iq2_bn_r4 // void quantize_row_iq2_bn_r4_ref(const float * x, block_iq2_bn * y, int64_t k) { - quantize_iq2_bn_r4(x, (void *)y, 4, k/4, nullptr); + 
quantize_iq2_bn_r4(x, (void *)y, 4, k/4, nullptr, nullptr); } void quantize_row_iq2_bn_r4(const float * x, void * y, int64_t k) { - quantize_iq2_bn_r4(x, y, 4, k/4, nullptr); + quantize_iq2_bn_r4(x, y, 4, k/4, nullptr, nullptr); } namespace { @@ -5772,14 +5814,15 @@ void repack_iq2_bn(int nrows, int n_per_row, const char * x, char * y, [[maybe_u } } -size_t quantize_iq2_bn_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { +size_t quantize_iq2_bn_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { GGML_ASSERT(nrows%4 == 0); GGML_ASSERT(n_per_row%QK_IQ1BN == 0); char * qcur = (char *)dst; auto row_size = ggml_row_size(GGML_TYPE_IQ2_BN, n_per_row); std::vector qtmp(4*row_size); for (int row = 0; row < nrows; row += 4) { - quantize_iq2_bn(src, (void *)qtmp.data(), 4, n_per_row, imatrix); + quantize_iq2_bn(src, (void *)qtmp.data(), 4, n_per_row, imatrix, user_data); repack_iq2_bn(4, n_per_row, qtmp.data(), qcur, false); qcur += 4*row_size; src += 4*n_per_row; @@ -5826,11 +5869,11 @@ void vec_dot_iq2_bn_r4_q8_K64(int n, float * s, size_t bs, const void * vx, size // void quantize_row_q4_k_r4_ref(const float * x, block_q4_k_r4 * y, int64_t k) { - quantize_q4_k_r4(x, (void *)y, 4, k/4, nullptr); + quantize_q4_k_r4(x, (void *)y, 4, k/4, nullptr, nullptr); } void quantize_row_q4_k_r4(const float * x, void * y, int64_t k) { - quantize_q4_k_r4(x, y, 4, k/4, nullptr); + quantize_q4_k_r4(x, y, 4, k/4, nullptr, nullptr); } namespace { @@ -5887,11 +5930,13 @@ static void repack_q4_k(int nrows, int n_per_row, const block_q4_K * x, block_q4 } } -size_t quantize_q4_k_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { - auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix) { - quantize_q4_K(x, (char *)vy, 1, n_per_row, imatrix); +size_t quantize_q4_k_r4(const float * src, 
void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { + auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { + quantize_q4_K(x, (char *)vy, 1, n_per_row, imatrix, user_data); }; - return quantize_repack<32, block_q4_K, block_q4_k_r4, 4>(GGML_TYPE_Q4_K, src, dst, nrows, n_per_row, imatrix, + return quantize_repack<32, block_q4_K, block_q4_k_r4, 4>(GGML_TYPE_Q4_K, src, dst, nrows, n_per_row, imatrix, user_data, q_func, repack_q4_k); } @@ -5940,11 +5985,11 @@ void vec_dot_q4_k_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t b // void quantize_row_q6_k_r4_ref(const float * x, block_q6_k_r4 * y, int64_t k) { - quantize_q6_k_r4(x, (void *)y, 4, k/4, nullptr); + quantize_q6_k_r4(x, (void *)y, 4, k/4, nullptr, nullptr); } void quantize_row_q6_k_r4(const float * x, void * y, int64_t k) { - quantize_q6_k_r4(x, y, 4, k/4, nullptr); + quantize_q6_k_r4(x, y, 4, k/4, nullptr, nullptr); } namespace { @@ -5996,11 +6041,13 @@ static void repack_q6_k(int nrows, int n_per_row, const block_q6_K * x, block_q6 } } -size_t quantize_q6_k_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { - auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix) { - quantize_q6_K(x, (char *)vy, 1, n_per_row, imatrix); +size_t quantize_q6_k_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { + auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { + quantize_q6_K(x, (char *)vy, 1, n_per_row, imatrix, user_data); }; - return quantize_repack<16, block_q6_K, block_q6_k_r4, 4>(GGML_TYPE_Q6_K, src, dst, nrows, n_per_row, imatrix, + return quantize_repack<16, block_q6_K, block_q6_k_r4, 
4>(GGML_TYPE_Q6_K, src, dst, nrows, n_per_row, imatrix, user_data, q_func, repack_q6_k); } @@ -6052,11 +6099,11 @@ void vec_dot_q6_k_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t b // void quantize_row_q5_k_r4_ref(const float * x, block_q5_k_r4 * y, int64_t k) { - quantize_q5_k_r4(x, (void *)y, 4, k/4, nullptr); + quantize_q5_k_r4(x, (void *)y, 4, k/4, nullptr, nullptr); } void quantize_row_q5_k_r4(const float * x, void * y, int64_t k) { - quantize_q5_k_r4(x, y, 4, k/4, nullptr); + quantize_q5_k_r4(x, y, 4, k/4, nullptr, nullptr); } namespace { @@ -6107,11 +6154,13 @@ static void repack_q5_k(int nrows, int n_per_row, const block_q5_K * x, block_q5 } } -size_t quantize_q5_k_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { - auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix) { - quantize_q5_K(x, (char *)vy, 1, n_per_row, imatrix); +size_t quantize_q5_k_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { + auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { + quantize_q5_K(x, (char *)vy, 1, n_per_row, imatrix, user_data); }; - return quantize_repack<32, block_q5_K, block_q5_k_r4, 4>(GGML_TYPE_Q5_K, src, dst, nrows, n_per_row, imatrix, + return quantize_repack<32, block_q5_K, block_q5_k_r4, 4>(GGML_TYPE_Q5_K, src, dst, nrows, n_per_row, imatrix, user_data, q_func, repack_q5_k); } @@ -6164,11 +6213,11 @@ void vec_dot_q5_k_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t b // void quantize_row_q3_k_r4_ref(const float * x, block_q3_k_r4 * y, int64_t k) { - quantize_q3_k_r4(x, (void *)y, 4, k/4, nullptr); + quantize_q3_k_r4(x, (void *)y, 4, k/4, nullptr, nullptr); } void quantize_row_q3_k_r4(const float * x, void * y, int64_t k) { - quantize_q3_k_r4(x, y, 4, k/4, nullptr); + 
quantize_q3_k_r4(x, y, 4, k/4, nullptr, nullptr); } namespace { @@ -6236,11 +6285,13 @@ static void repack_q3_k(int nrows, int n_per_row, const block_q3_K * x, block_q3 } } -size_t quantize_q3_k_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { - auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix) { - quantize_q3_K(x, (char *)vy, 1, n_per_row, imatrix); +size_t quantize_q3_k_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { + auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { + quantize_q3_K(x, (char *)vy, 1, n_per_row, imatrix, user_data); }; - return quantize_repack<16, block_q3_K, block_q3_k_r4, 4>(GGML_TYPE_Q3_K, src, dst, nrows, n_per_row, imatrix, + return quantize_repack<16, block_q3_K, block_q3_k_r4, 4>(GGML_TYPE_Q3_K, src, dst, nrows, n_per_row, imatrix, user_data, q_func, repack_q3_k); } @@ -6293,11 +6344,11 @@ void vec_dot_q3_k_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t b // void quantize_row_q2_k_r4_ref(const float * x, block_q2_k_r4 * y, int64_t k) { - quantize_q3_k_r4(x, (void *)y, 4, k/4, nullptr); + quantize_q2_k_r4(x, (void *)y, 4, k/4, nullptr, nullptr); } void quantize_row_q2_k_r4(const float * x, void * y, int64_t k) { - quantize_q2_k_r4(x, y, 4, k/4, nullptr); + quantize_q2_k_r4(x, y, 4, k/4, nullptr, nullptr); } namespace { @@ -6345,11 +6396,13 @@ static void repack_q2_k(int nrows, int n_per_row, const block_q2_K * x, block_q2 } } -size_t quantize_q2_k_r4(const float * src, void * dst, int64_t nrows, int64_t
n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { + auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { + quantize_q2_K(x, (char *)vy, 1, n_per_row, imatrix, user_data); }; - return quantize_repack<16, block_q2_K, block_q2_k_r4, 4>(GGML_TYPE_Q2_K, src, dst, nrows, n_per_row, imatrix, + return quantize_repack<16, block_q2_K, block_q2_k_r4, 4>(GGML_TYPE_Q2_K, src, dst, nrows, n_per_row, imatrix, user_data, q_func, repack_q2_k); } @@ -6401,11 +6454,11 @@ void vec_dot_q2_k_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t b // void quantize_row_iq4_k_r4_ref(const float * x, block_iq4_k_r4 * y, int64_t k) { - quantize_iq4_k_r4(x, (void *)y, 4, k/4, nullptr); + quantize_iq4_k_r4(x, (void *)y, 4, k/4, nullptr, nullptr); } void quantize_row_iq4_k_r4(const float * x, void * y, int64_t k) { - quantize_iq4_k_r4(x, y, 4, k/4, nullptr); + quantize_iq4_k_r4(x, y, 4, k/4, nullptr, nullptr); } static void repack_iq4_k(int nrows, int n_per_row, const block_iq4_k * x, block_iq4_k_r4 * y, [[maybe_unused]] bool online) { @@ -6453,14 +6506,15 @@ static void repack_iq4_k(int nrows, int n_per_row, const block_iq4_k * x, block_ } } -size_t quantize_iq4_k_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { +size_t quantize_iq4_k_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { GGML_ASSERT(nrows%4 == 0); GGML_ASSERT(n_per_row%QK_K == 0); char * qcur = (char *)dst; auto row_size = ggml_row_size(GGML_TYPE_IQ4_K, n_per_row); std::vector qtmp(4*row_size); for (int row = 0; row < nrows; row += 4) { - quantize_iq4_k(src, (void *)qtmp.data(), 4, n_per_row, imatrix); + quantize_iq4_k(src, (void *)qtmp.data(), 4, n_per_row, imatrix, user_data); repack_iq4_k(4, n_per_row, (const block_iq4_k *)qtmp.data(), 
(block_iq4_k_r4 *)qcur, false); qcur += 4*row_size; src += 4*n_per_row; @@ -6515,11 +6569,11 @@ void vec_dot_iq4_k_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t // void quantize_row_iq5_k_r4_ref(const float * x, block_iq5_k_r4 * y, int64_t k) { - quantize_iq5_k_r4(x, (void *)y, 4, k/4, nullptr); + quantize_iq5_k_r4(x, (void *)y, 4, k/4, nullptr, nullptr); } void quantize_row_iq5_k_r4(const float * x, void * y, int64_t k) { - quantize_iq5_k_r4(x, y, 4, k/4, nullptr); + quantize_iq5_k_r4(x, y, 4, k/4, nullptr, nullptr); } namespace { @@ -6590,14 +6644,15 @@ static void repack_iq5_k(int nrows, int n_per_row, const block_iq5_k * x, block_ } } -size_t quantize_iq5_k_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { +size_t quantize_iq5_k_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { GGML_ASSERT(nrows%4 == 0); GGML_ASSERT(n_per_row%QK_K == 0); char * qcur = (char *)dst; auto row_size = ggml_row_size(GGML_TYPE_IQ5_K, n_per_row); std::vector qtmp(4*row_size); for (int row = 0; row < nrows; row += 4) { - quantize_iq5_k(src, (void *)qtmp.data(), 4, n_per_row, imatrix); + quantize_iq5_k(src, (void *)qtmp.data(), 4, n_per_row, imatrix, user_data); repack_iq5_k(4, n_per_row, (const block_iq5_k *)qtmp.data(), (block_iq5_k_r4 *)qcur, false); qcur += 4*row_size; src += 4*n_per_row; @@ -6652,11 +6707,11 @@ void vec_dot_iq5_k_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t // void quantize_row_iq5_ks_r4_ref(const float * x, block_iq5_ks_r4 * y, int64_t k) { - quantize_iq5_ks_r4(x, (void *)y, 4, k/4, nullptr); + quantize_iq5_ks_r4(x, (void *)y, 4, k/4, nullptr, nullptr); } void quantize_row_iq5_ks_r4(const float * x, void * y, int64_t k) { - quantize_iq5_ks_r4(x, y, 4, k/4, nullptr); + quantize_iq5_ks_r4(x, y, 4, k/4, nullptr, nullptr); } static void repack_iq5_ks(int nrows, int n_per_row, const block_iq5_ks * 
x, block_iq5_ks_r4 * y, [[maybe_unused]] bool online) { @@ -6697,14 +6752,15 @@ static void repack_iq5_ks(int nrows, int n_per_row, const block_iq5_ks * x, bloc } } -size_t quantize_iq5_ks_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { +size_t quantize_iq5_ks_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { GGML_ASSERT(nrows%4 == 0); GGML_ASSERT(n_per_row%QK_K == 0); char * qcur = (char *)dst; auto row_size = ggml_row_size(GGML_TYPE_IQ5_KS, n_per_row); std::vector qtmp(4*row_size); for (int row = 0; row < nrows; row += 4) { - quantize_iq5_ks(src, (void *)qtmp.data(), 4, n_per_row, imatrix); + quantize_iq5_ks(src, (void *)qtmp.data(), 4, n_per_row, imatrix, user_data); repack_iq5_ks(4, n_per_row, (const block_iq5_ks *)qtmp.data(), (block_iq5_ks_r4 *)qcur, false); qcur += 4*row_size; src += 4*n_per_row; @@ -6772,11 +6828,11 @@ void vec_dot_iq5_ks_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t // void quantize_row_q8_k_r8_ref(const float * x, block_q8_k_r8 * y, int64_t k) { - quantize_q8_k_r8(x, (void *)y, 8, k/8, nullptr); + quantize_q8_k_r8(x, (void *)y, 8, k/8, nullptr, nullptr); } void quantize_row_q8_k_r8(const float * x, void * y, int64_t k) { - quantize_q8_k_r8(x, y, 8, k/8, nullptr); + quantize_q8_k_r8(x, y, 8, k/8, nullptr, nullptr); } static void repack_q8_k(int nrows, int n_per_row, const block_q8_K * x, block_q8_k_r8 * y, [[maybe_unused]] bool online) { @@ -6819,7 +6875,8 @@ static void modify_q8_k_r8(int64_t k, char * cy) { } #endif -size_t quantize_q8_k_r8(const float * src, void * dst, int64_t nrows, int64_t n_per_row, [[maybe_unused]] const float * imatrix) { +size_t quantize_q8_k_r8(const float * src, void * dst, int64_t nrows, int64_t n_per_row, [[maybe_unused]] const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { GGML_ASSERT(nrows%8 == 0); 
GGML_ASSERT(n_per_row%QK_K == 0); char * qcur = (char *)dst; @@ -6870,11 +6927,11 @@ void vec_dot_q8_k_r8_q8_k(int n, float * s, size_t bs, const void * vx, size_t b // void quantize_row_q8_k_r16_ref(const float * x, block_q8_k_r16 * y, int64_t k) { - quantize_q8_k_r16(x, (void *)y, 16, k/16, nullptr); + quantize_q8_k_r16(x, (void *)y, 16, k/16, nullptr, nullptr); } void quantize_row_q8_k_r16(const float * x, void * y, int64_t k) { - quantize_q8_k_r16(x, y, 16, k/16, nullptr); + quantize_q8_k_r16(x, y, 16, k/16, nullptr, nullptr); } static void repack_q16_k(int nrows, int n_per_row, const block_q8_K * x, block_q8_k_r16 * y, [[maybe_unused]] bool online) { @@ -6903,7 +6960,8 @@ static void repack_q16_k(int nrows, int n_per_row, const block_q8_K * x, block_q } } -size_t quantize_q8_k_r16(const float * src, void * dst, int64_t nrows, int64_t n_per_row, [[maybe_unused]] const float * imatrix) { +size_t quantize_q8_k_r16(const float * src, void * dst, int64_t nrows, int64_t n_per_row, [[maybe_unused]] const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { GGML_ASSERT(nrows%16 == 0); GGML_ASSERT(n_per_row%QK_K == 0); char * qcur = (char *)dst; @@ -6956,11 +7014,11 @@ void vec_dot_q8_k_r16_q8_k(int n, float * s, size_t bs, const void * vx, size_t // void quantize_row_q8_KV_r8_ref(const float * x, void * y, int64_t k) { - quantize_q8_KV_r8(x, y, 8, k/8, nullptr); + quantize_q8_KV_r8(x, y, 8, k/8, nullptr, nullptr); } void quantize_row_q8_KV_r8(const float * x, void * y, int64_t k) { - quantize_q8_KV_r8(x, y, 8, k/8, nullptr); + quantize_q8_KV_r8(x, y, 8, k/8, nullptr, nullptr); } static void repack_q8_KV(int nrows, int n_per_row, const char * cx, char * cy, [[maybe_unused]] bool online) { @@ -7051,7 +7109,8 @@ static void modify_q8_KV_r8(int64_t k, char * cy) { } #endif -size_t quantize_q8_KV_r8(const float * src, void * dst, int64_t nrows, int64_t n_per_row, [[maybe_unused]] const float * imatrix) { +size_t quantize_q8_KV_r8(const float * src, 
void * dst, int64_t nrows, int64_t n_per_row, [[maybe_unused]] const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { GGML_ASSERT(nrows%8 == 0); GGML_ASSERT(n_per_row%16 == 0); char * qcur = (char *)dst; @@ -7059,7 +7118,7 @@ size_t quantize_q8_KV_r8(const float * src, void * dst, int64_t nrows, int64_t n auto row_size_1 = ggml_row_size(GGML_TYPE_Q8_KV_R8, n_per_row); std::vector qtmp(8*row_size_0); for (int row = 0; row < nrows; row += 8) { - quantize_q8_KV(src, (void *)qtmp.data(), 8, n_per_row, imatrix); + quantize_q8_KV(src, (void *)qtmp.data(), 8, n_per_row, imatrix, user_data); repack_q8_KV(8, n_per_row, qtmp.data(), qcur, false); qcur += 8*row_size_1; src += 8*n_per_row; @@ -7137,11 +7196,11 @@ void repack_bf16_bf16_r16(const void * GGML_RESTRICT src, void * GGML_RESTRICT d // void quantize_row_iq3_k_r4_ref(const float * x, block_iq3_k_r4 * y, int64_t k) { - quantize_iq3_k_r4(x, (void *)y, 4, k/4, nullptr); + quantize_iq3_k_r4(x, (void *)y, 4, k/4, nullptr, nullptr); } void quantize_row_iq3_k_r4(const float * x, void * y, int64_t k) { - quantize_iq3_k_r4(x, y, 4, k/4, nullptr); + quantize_iq3_k_r4(x, y, 4, k/4, nullptr, nullptr); } namespace { @@ -7207,14 +7266,15 @@ static void repack_iq3_k(int nrows, int n_per_row, const block_iq3_k * x, block_ } } -size_t quantize_iq3_k_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { +size_t quantize_iq3_k_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { GGML_ASSERT(nrows%4 == 0); GGML_ASSERT(n_per_row%QK_K == 0); char * qcur = (char *)dst; auto row_size = ggml_row_size(GGML_TYPE_IQ3_K, n_per_row); std::vector qtmp(4*row_size); for (int row = 0; row < nrows; row += 4) { - quantize_iq3_k(src, (void *)qtmp.data(), 4, n_per_row, imatrix); + quantize_iq3_k(src, (void *)qtmp.data(), 4, n_per_row, imatrix, user_data); repack_iq3_k(4, n_per_row, (const 
block_iq3_k *)qtmp.data(), (block_iq3_k_r4 *)qcur, false); qcur += 4*row_size; src += 4*n_per_row; @@ -7273,11 +7333,11 @@ void vec_dot_iq3_k_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t // void quantize_row_iq2_k_r4_ref(const float * x, block_iq2_k_r4 * y, int64_t k) { - quantize_iq2_k_r4(x, (void *)y, 4, k/4, nullptr); + quantize_iq2_k_r4(x, (void *)y, 4, k/4, nullptr, nullptr); } void quantize_row_iq2_k_r4(const float * x, void * y, int64_t k) { - quantize_iq2_k_r4(x, y, 4, k/4, nullptr); + quantize_iq2_k_r4(x, y, 4, k/4, nullptr, nullptr); } namespace { @@ -7332,14 +7392,15 @@ static void repack_iq2_k(int nrows, int n_per_row, const block_iq2_k * x, block_ } } -size_t quantize_iq2_k_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { +size_t quantize_iq2_k_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { GGML_ASSERT(nrows%4 == 0); GGML_ASSERT(n_per_row%QK_K == 0); char * qcur = (char *)dst; auto row_size = ggml_row_size(GGML_TYPE_IQ2_K, n_per_row); std::vector qtmp(4*row_size); for (int row = 0; row < nrows; row += 4) { - quantize_iq2_k(src, (void *)qtmp.data(), 4, n_per_row, imatrix); + quantize_iq2_k(src, (void *)qtmp.data(), 4, n_per_row, imatrix, user_data); repack_iq2_k(4, n_per_row, (const block_iq2_k *)qtmp.data(), (block_iq2_k_r4 *)qcur, false); qcur += 4*row_size; src += 4*n_per_row; @@ -7412,11 +7473,11 @@ inline uint8_t scrambled_sign(uint8_t s) { // void quantize_row_iq2_xxs_r4_ref(const float * x, block_iq2_xxs_r4 * y, int64_t k) { - quantize_iq2_xxs_r4(x, (void *)y, 4, k/4, nullptr); + quantize_iq2_xxs_r4(x, (void *)y, 4, k/4, nullptr, nullptr); } void quantize_row_iq2_xxs_r4(const float * x, void * y, int64_t k) { - quantize_iq2_xxs_r4(x, y, 4, k/4, nullptr); + quantize_iq2_xxs_r4(x, y, 4, k/4, nullptr, nullptr); } static void repack_iq2_xxs(int nrows, int n_per_row, const block_iq2_xxs * x, 
block_iq2_xxs_r4 * y, [[maybe_unused]] bool online) { @@ -7452,11 +7513,13 @@ static void repack_iq2_xxs(int nrows, int n_per_row, const block_iq2_xxs * x, bl } } -size_t quantize_iq2_xxs_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { - auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix) { - quantize_iq2_xxs(x, (char *)vy, 1, n_per_row, imatrix); +size_t quantize_iq2_xxs_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { + auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { + quantize_iq2_xxs(x, (char *)vy, 1, n_per_row, imatrix, user_data); }; - return quantize_repack<32, block_iq2_xxs, block_iq2_xxs_r4, 4>(GGML_TYPE_IQ2_XXS, src, dst, nrows, n_per_row, imatrix, + return quantize_repack<32, block_iq2_xxs, block_iq2_xxs_r4, 4>(GGML_TYPE_IQ2_XXS, src, dst, nrows, n_per_row, imatrix, user_data, q_func, repack_iq2_xxs); } @@ -7505,11 +7568,11 @@ void vec_dot_iq2_xxs_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_ // void quantize_row_iq2_xs_r4_ref(const float * x, block_iq2_xs_r4 * y, int64_t k) { - quantize_iq2_xs_r4(x, (void *)y, 4, k/4, nullptr); + quantize_iq2_xs_r4(x, (void *)y, 4, k/4, nullptr, nullptr); } void quantize_row_iq2_xs_r4(const float * x, void * y, int64_t k) { - quantize_iq2_xs_r4(x, y, 4, k/4, nullptr); + quantize_iq2_xs_r4(x, y, 4, k/4, nullptr, nullptr); } static void repack_iq2_xs(int nrows, int n_per_row, const block_iq2_xs * x, block_iq2_xs_r4 * y, [[maybe_unused]] bool online) { @@ -7537,11 +7600,13 @@ static void repack_iq2_xs(int nrows, int n_per_row, const block_iq2_xs * x, bloc } } -size_t quantize_iq2_xs_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { - auto q_func = [] (const float * x, void * vy, int n_per_row, const 
float * imatrix) { - quantize_iq2_xs(x, (char *)vy, 1, n_per_row, imatrix); +size_t quantize_iq2_xs_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { + auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { + quantize_iq2_xs(x, (char *)vy, 1, n_per_row, imatrix, user_data); }; - return quantize_repack<16, block_iq2_xs, block_iq2_xs_r4, 4>(GGML_TYPE_IQ2_XS, src, dst, nrows, n_per_row, imatrix, + return quantize_repack<16, block_iq2_xs, block_iq2_xs_r4, 4>(GGML_TYPE_IQ2_XS, src, dst, nrows, n_per_row, imatrix, user_data, q_func, repack_iq2_xs); } @@ -7585,11 +7650,11 @@ void vec_dot_iq2_xs_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t // void quantize_row_iq2_s_r4_ref(const float * x, block_iq2_s_r4 * y, int64_t k) { - quantize_iq2_s_r4(x, (void *)y, 4, k/4, nullptr); + quantize_iq2_s_r4(x, (void *)y, 4, k/4, nullptr, nullptr); } void quantize_row_iq2_s_r4(const float * x, void * y, int64_t k) { - quantize_iq2_s_r4(x, y, 4, k/4, nullptr); + quantize_iq2_s_r4(x, y, 4, k/4, nullptr, nullptr); } static void repack_iq2_s(int nrows, int n_per_row, const block_iq2_s * x, block_iq2_s_r4 * y, [[maybe_unused]] bool online) { @@ -7618,11 +7683,13 @@ static void repack_iq2_s(int nrows, int n_per_row, const block_iq2_s * x, block_ } } -size_t quantize_iq2_s_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { - auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix) { - quantize_iq2_s(x, (char *)vy, 1, n_per_row, imatrix); +size_t quantize_iq2_s_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { + auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix, + [[maybe_unused]] const 
quantize_user_data * user_data) { + quantize_iq2_s(x, (char *)vy, 1, n_per_row, imatrix, user_data); }; - return quantize_repack<16, block_iq2_s, block_iq2_s_r4, 4>(GGML_TYPE_IQ2_S, src, dst, nrows, n_per_row, imatrix, + return quantize_repack<16, block_iq2_s, block_iq2_s_r4, 4>(GGML_TYPE_IQ2_S, src, dst, nrows, n_per_row, imatrix, user_data, q_func, repack_iq2_s); } @@ -7665,11 +7732,11 @@ void vec_dot_iq2_s_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t // void quantize_row_iq3_xxs_r4_ref(const float * x, block_iq3_xxs_r4 * y, int64_t k) { - quantize_iq3_xxs_r4(x, (void *)y, 4, k/4, nullptr); + quantize_iq3_xxs_r4(x, (void *)y, 4, k/4, nullptr, nullptr); } void quantize_row_iq3_xxs_r4(const float * x, void * y, int64_t k) { - quantize_iq3_xxs_r4(x, y, 4, k/4, nullptr); + quantize_iq3_xxs_r4(x, y, 4, k/4, nullptr, nullptr); } namespace { @@ -7708,11 +7775,13 @@ static void repack_iq3_xxs(int nrows, int n_per_row, const block_iq3_xxs * x, bl } } -size_t quantize_iq3_xxs_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { - auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix) { - quantize_iq3_xxs(x, (char *)vy, 1, n_per_row, imatrix); +size_t quantize_iq3_xxs_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { + auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { + quantize_iq3_xxs(x, (char *)vy, 1, n_per_row, imatrix, user_data); }; - return quantize_repack<32, block_iq3_xxs, block_iq3_xxs_r4, 4>(GGML_TYPE_IQ3_XXS, src, dst, nrows, n_per_row, imatrix, + return quantize_repack<32, block_iq3_xxs, block_iq3_xxs_r4, 4>(GGML_TYPE_IQ3_XXS, src, dst, nrows, n_per_row, imatrix, user_data, q_func, repack_iq3_xxs); } @@ -7761,11 +7830,11 @@ void vec_dot_iq3_xxs_r4_q8_k(int n, float * s, size_t bs, const 
void * vx, size_ // void quantize_row_iq3_s_r4_ref(const float * x, block_iq3_s_r4 * y, int64_t k) { - quantize_iq3_s_r4(x, (void *)y, 4, k/4, nullptr); + quantize_iq3_s_r4(x, (void *)y, 4, k/4, nullptr, nullptr); } void quantize_row_iq3_s_r4(const float * x, void * y, int64_t k) { - quantize_iq3_s_r4(x, y, 4, k/4, nullptr); + quantize_iq3_s_r4(x, y, 4, k/4, nullptr, nullptr); } static void repack_iq3_s(int nrows, int n_per_row, const block_iq3_s * x, block_iq3_s_r4 * y, [[maybe_unused]] bool online) { @@ -7806,11 +7875,13 @@ static void repack_iq3_s(int nrows, int n_per_row, const block_iq3_s * x, block_ } } -size_t quantize_iq3_s_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { - auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix) { - quantize_iq3_s(x, (char *)vy, 1, n_per_row, imatrix); +size_t quantize_iq3_s_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { + auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { + quantize_iq3_s(x, (char *)vy, 1, n_per_row, imatrix, user_data); }; - return quantize_repack<16, block_iq3_s, block_iq3_s_r4, 4>(GGML_TYPE_IQ3_S, src, dst, nrows, n_per_row, imatrix, + return quantize_repack<16, block_iq3_s, block_iq3_s_r4, 4>(GGML_TYPE_IQ3_S, src, dst, nrows, n_per_row, imatrix, user_data, q_func, repack_iq3_s); } @@ -7851,14 +7922,15 @@ void vec_dot_iq3_s_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t } void quantize_row_iq1_s_r4_ref(const float * x, block_iq1_s_r4 * y, int64_t k) { - quantize_iq1_s_r4(x, y, 4, k/4, nullptr); + quantize_iq1_s_r4(x, y, 4, k/4, nullptr, nullptr); } void quantize_row_iq1_s_r4(const float * x, void * y, int64_t k) { - quantize_iq1_s_r4(x, y, 4, k/4, nullptr); + quantize_iq1_s_r4(x, y, 4, k/4, nullptr, nullptr); } -size_t 
quantize_iq1_s_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { +size_t quantize_iq1_s_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { constexpr int kBlockSize = 32; GGML_ASSERT(nrows%4 == 0); GGML_ASSERT(n_per_row%kBlockSize == 0); @@ -7982,14 +8054,15 @@ void vec_dot_iq1_s_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t } void quantize_row_iq1_m_r4_ref(const float * x, block_iq1_m_r4 * y, int64_t k) { - quantize_iq1_m_r4(x, y, 4, k/4, nullptr); + quantize_iq1_m_r4(x, y, 4, k/4, nullptr, nullptr); } void quantize_row_iq1_m_r4(const float * x, void * y, int64_t k) { - quantize_iq1_m_r4(x, y, 4, k/4, nullptr); + quantize_iq1_m_r4(x, y, 4, k/4, nullptr, nullptr); } -size_t quantize_iq1_m_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { +size_t quantize_iq1_m_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { constexpr int kBlockSize = 32; GGML_ASSERT(nrows%4 == 0); GGML_ASSERT(n_per_row%kBlockSize == 0); @@ -8135,7 +8208,8 @@ void quantize_row_q8_KV_ref(const float * x, void * y, int64_t k) { quantize_row_q8_KV(x, y, k); } -size_t quantize_q8_KV(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { +size_t quantize_q8_KV(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { (void)imatrix; auto row_size = ggml_row_size(GGML_TYPE_Q8_KV, n_per_row); auto q = (char *)dst; @@ -9195,7 +9269,7 @@ void quantize_row_iq1_kt_impl(const float * x, void * vy, int n_per_row, const f void quantize_row_iq1_kt_ref(const float * GGML_RESTRICT x, block_iq1_kt * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); - quantize_iq1_kt(x, (void *)y, 1, k, nullptr); + 
quantize_iq1_kt(x, (void *)y, 1, k, nullptr, nullptr); } void quantize_row_iq1_kt(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { @@ -9204,7 +9278,8 @@ void quantize_row_iq1_kt(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, quantize_row_iq1_kt_ref(x, y, k); } -size_t quantize_iq1_kt(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { +size_t quantize_iq1_kt(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { GGML_ASSERT(n_per_row%QK_K == 0); auto row_size = ggml_row_size(GGML_TYPE_IQ1_KT, n_per_row); std::vector scales(n_per_row/QuantizerIQ1KT::kBlockSize); @@ -9475,7 +9550,7 @@ void quantize_row_iq2_kt_impl(const float * x, void * vy, int n_per_row, const f void quantize_row_iq2_kt_ref(const float * GGML_RESTRICT x, block_iq2_kt * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); - quantize_iq2_kt(x, (void *)y, 1, k, nullptr); + quantize_iq2_kt(x, (void *)y, 1, k, nullptr, nullptr); } void quantize_row_iq2_kt(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { @@ -9484,7 +9559,8 @@ void quantize_row_iq2_kt(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, quantize_row_iq2_kt_ref(x, y, k); } -size_t quantize_iq2_kt(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { +size_t quantize_iq2_kt(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { GGML_ASSERT(n_per_row%QK_K == 0); auto row_size = ggml_row_size(GGML_TYPE_IQ2_KT, n_per_row); std::vector scales(n_per_row/QuantizerIQ2KT::kBlockSize); @@ -9745,7 +9821,7 @@ void quantize_row_iq3_kt_impl(const float * x, void * vy, int n_per_row, const f void quantize_row_iq3_kt_ref(const float * x, block_iq3_kt * y, int64_t k) { assert(k % QK_K == 0); - quantize_iq3_kt(x, (void *)y, 1, k, nullptr); + 
quantize_iq3_kt(x, (void *)y, 1, k, nullptr, nullptr); } void quantize_row_iq3_kt(const float * x, void * vy, int64_t k) { @@ -9754,7 +9830,8 @@ void quantize_row_iq3_kt(const float * x, void * vy, int64_t k) { quantize_row_iq3_kt_ref(x, y, k); } -size_t quantize_iq3_kt(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { +size_t quantize_iq3_kt(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { GGML_ASSERT(n_per_row%QK_K == 0); auto row_size = ggml_row_size(GGML_TYPE_IQ3_KT, n_per_row); std::vector scales(n_per_row/QuantizerIQ3KT::kBlockSize); @@ -10009,7 +10086,7 @@ void quantize_row_iq4_kt_impl(const float * x, void * vy, int n_per_row, const f void quantize_row_iq4_kt_ref(const float * GGML_RESTRICT x, block_iq4_kt * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); - quantize_iq4_kt(x, (void *)y, 1, k, nullptr); + quantize_iq4_kt(x, (void *)y, 1, k, nullptr, nullptr); } void quantize_row_iq4_kt(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { @@ -10018,7 +10095,8 @@ void quantize_row_iq4_kt(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, quantize_row_iq4_kt_ref(x, y, k); } -size_t quantize_iq4_kt(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) { +size_t quantize_iq4_kt(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { GGML_ASSERT(n_per_row%QK_K == 0); auto row_size = ggml_row_size(GGML_TYPE_IQ4_KT, n_per_row); std::vector scales(n_per_row/QuantizerIQ4KT::kBlockSize); @@ -10101,7 +10179,8 @@ void quantize_row_q1_0_g128(const float * x, void * vy, int64_t k) { } } -size_t quantize_q1_0_g128(const float * src, void * dst, int64_t nrows, int64_t n_per_row, [[maybe_unused]] const float * imatrix) { +size_t quantize_q1_0_g128(const float * src, void * dst, int64_t nrows, 
int64_t n_per_row, [[maybe_unused]] const float * imatrix, + [[maybe_unused]] const quantize_user_data * user_data) { GGML_ASSERT(n_per_row % QK1_0_G128 == 0); int64_t ntot = nrows * n_per_row; quantize_row_q1_0_g128(src, dst, ntot); diff --git a/ggml/src/iqk/iqk_quantize.h b/ggml/src/iqk/iqk_quantize.h index d85c7599..b5fb8a31 100644 --- a/ggml/src/iqk/iqk_quantize.h +++ b/ggml/src/iqk/iqk_quantize.h @@ -19,285 +19,287 @@ extern "C" { #define GGML_RESTRICT restrict #endif +struct quantize_user_data; + void quantize_row_iq2_k_ref(const float * GGML_RESTRICT x, block_iq2_k * GGML_RESTRICT y, int64_t k); void quantize_row_iq2_k(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_iq2_k(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq2_k(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * use_data); void dequantize_row_iq2_k(const block_iq2_k * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_iq2_k_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void quantize_row_iq3_k_ref(const float * GGML_RESTRICT x, block_iq3_k * GGML_RESTRICT y, int64_t k); void quantize_row_iq3_k(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_iq3_k(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq3_k(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * use_data); void dequantize_row_iq3_k(const block_iq3_k * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_iq3_k_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT 
vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void quantize_row_iq3_ks_ref(const float * GGML_RESTRICT x, block_iq3_ks * GGML_RESTRICT y, int64_t k); void quantize_row_iq3_ks(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_iq3_ks(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq3_ks(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * use_data); void dequantize_row_iq3_ks(const block_iq3_ks * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_iq3_ks_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void quantize_row_iq4_k_ref(const float * GGML_RESTRICT x, block_iq4_k * GGML_RESTRICT y, int64_t k); void quantize_row_iq4_k(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_iq4_k(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq4_k(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * use_data); void dequantize_row_iq4_k(const block_iq4_k * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_iq4_k_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void quantize_row_iq5_k_ref(const float * GGML_RESTRICT x, block_iq5_k * GGML_RESTRICT y, int64_t k); void quantize_row_iq5_k(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_iq5_k(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq5_k(const float 
* GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * use_data); void dequantize_row_iq5_k(const block_iq5_k * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_iq5_k_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void quantize_row_iq6_k_ref(const float * GGML_RESTRICT x, block_iq6_k * GGML_RESTRICT y, int64_t k); void quantize_row_iq6_k(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_iq6_k(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq6_k(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * use_data); void dequantize_row_iq6_k(const block_iq6_k * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_iq6_k_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void quantize_row_iq4_ks_ref(const float * GGML_RESTRICT x, block_iq4_ks * GGML_RESTRICT y, int64_t k); void quantize_row_iq4_ks(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_iq4_ks(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq4_ks(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * use_data); void dequantize_row_iq4_ks(const block_iq4_ks * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_iq4_ks_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void 
quantize_row_iq4_kss_ref(const float * GGML_RESTRICT x, block_iq4_kss * GGML_RESTRICT y, int64_t k); void quantize_row_iq4_kss(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_iq4_kss(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq4_kss(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * use_data); void dequantize_row_iq4_kss(const block_iq4_kss * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_iq4_kss_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void quantize_row_mxfp4_ref(const float * GGML_RESTRICT x, block_mxfp4 * GGML_RESTRICT y, int64_t k); void quantize_row_mxfp4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_mxfp4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_mxfp4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * use_data); void dequantize_row_mxfp4(const block_mxfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_mxfp4_q8_0_x4(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void quantize_row_iq2_ks_ref(const float * GGML_RESTRICT x, block_iq2_ks * GGML_RESTRICT y, int64_t k); void quantize_row_iq2_ks(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_iq2_ks(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq2_ks(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t 
nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * use_data); void dequantize_row_iq2_ks(const block_iq2_ks * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_iq2_ks_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void quantize_row_iq2_kl_ref(const float * GGML_RESTRICT x, block_iq2_kl * GGML_RESTRICT y, int64_t k); void quantize_row_iq2_kl(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_iq2_kl(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq2_kl(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * use_data); void dequantize_row_iq2_kl(const block_iq2_kl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_iq2_kl_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void quantize_row_iq1_kt_ref(const float * GGML_RESTRICT x, block_iq1_kt * GGML_RESTRICT y, int64_t k); void quantize_row_iq1_kt(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_iq1_kt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq1_kt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * use_data); void dequantize_row_iq1_kt(const block_iq1_kt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_iq1_kt_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void quantize_row_iq2_kt_ref(const float * GGML_RESTRICT x, 
block_iq2_kt * GGML_RESTRICT y, int64_t k); void quantize_row_iq2_kt(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_iq2_kt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq2_kt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * use_data); void dequantize_row_iq2_kt(const block_iq2_kt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_iq2_kt_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void quantize_row_iq3_kt_ref(const float * GGML_RESTRICT x, block_iq3_kt * GGML_RESTRICT y, int64_t k); void quantize_row_iq3_kt(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_iq3_kt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq3_kt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * use_data); void dequantize_row_iq3_kt(const block_iq3_kt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_iq3_kt_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void quantize_row_iq4_kt_ref(const float * GGML_RESTRICT x, block_iq4_kt * GGML_RESTRICT y, int64_t k); void quantize_row_iq4_kt(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_iq4_kt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq4_kt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const 
struct quantize_user_data * use_data); void dequantize_row_iq4_kt(const block_iq4_kt * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_iq4_kt_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void quantize_row_iq5_ks_ref(const float * GGML_RESTRICT x, block_iq5_ks * GGML_RESTRICT y, int64_t k); void quantize_row_iq5_ks(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_iq5_ks(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq5_ks(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * use_data); void dequantize_row_iq5_ks(const block_iq5_ks * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_iq5_ks_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void quantize_row_iq4_nl_r4_ref(const float * GGML_RESTRICT x, block_iq4_nl_r4 * GGML_RESTRICT y, int64_t k); void quantize_row_iq4_nl_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_iq4_nl_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq4_nl_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * use_data); void dequantize_row_iq4_nl_r4(const block_iq4_nl_r4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_iq4_nl_r4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void quantize_row_q4_0_r8_ref(const float * GGML_RESTRICT x, block_iq4_nl_r8 * GGML_RESTRICT y, 
int64_t k); void quantize_row_q4_0_r8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_q4_0_r8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_q4_0_r8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * use_data); void dequantize_row_q4_0_r8(const block_iq4_nl_r8 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_q4_0_r8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void quantize_row_q8_0_r8_ref(const float * GGML_RESTRICT x, block_q8_0_r8 * GGML_RESTRICT y, int64_t k); void quantize_row_q8_0_r8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_q8_0_r8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_q8_0_r8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * use_data); void dequantize_row_q8_0_r8(const block_q8_0_r8 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_q8_0_r8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void quantize_row_q5_0_r4_ref(const float * GGML_RESTRICT x, block_q5_0_r4 * GGML_RESTRICT y, int64_t k); void quantize_row_q5_0_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_q5_0_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_q5_0_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct 
quantize_user_data * use_data); void dequantize_row_q5_0_r4(const block_q5_0_r4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_q5_0_r4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void quantize_row_q6_0_r4_ref(const float * GGML_RESTRICT x, block_q6_0_r4 * GGML_RESTRICT y, int64_t k); void quantize_row_q6_0_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_q6_0_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_q6_0_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * use_data); void dequantize_row_q6_0_r4(const block_q6_0_r4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_q6_0_r4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void quantize_row_iq4_xs_r8_ref(const float * GGML_RESTRICT x, block_iq4_xs_r8 * GGML_RESTRICT y, int64_t k); void quantize_row_iq4_xs_r8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_iq4_xs_r8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq4_xs_r8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * use_data); void dequantize_row_iq4_xs_r8(const block_iq4_xs_r8 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_iq4_xs_r8_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void quantize_row_iq2_bn_ref (const float * GGML_RESTRICT x, block_iq2_bn * GGML_RESTRICT y, 
int64_t k); void quantize_row_iq2_bn (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void dequantize_row_iq2_bn (const block_iq2_bn * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -size_t quantize_iq2_bn (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq2_bn (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * use_data); void vec_dot_iq2_bn_q8_K64(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void quantize_row_iq2_bn_r4_ref (const float * GGML_RESTRICT x, block_iq2_bn * GGML_RESTRICT y, int64_t k); void quantize_row_iq2_bn_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void dequantize_row_iq2_bn_r4(const block_iq2_bn * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -size_t quantize_iq2_bn_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq2_bn_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * use_data); void vec_dot_iq2_bn_r4_q8_K64(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void quantize_row_q3_k_r4_ref(const float * GGML_RESTRICT x, block_q3_k_r4 * GGML_RESTRICT y, int64_t k); void quantize_row_q3_k_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_q3_k_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_q3_k_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const 
struct quantize_user_data * use_data); void dequantize_row_q3_k_r4(const block_q3_k_r4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_q3_k_r4_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void quantize_row_q2_k_r4_ref(const float * GGML_RESTRICT x, block_q2_k_r4 * GGML_RESTRICT y, int64_t k); void quantize_row_q2_k_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_q2_k_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_q2_k_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * use_data); void dequantize_row_q2_k_r4(const block_q2_k_r4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_q2_k_r4_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void quantize_row_q4_k_r4_ref(const float * GGML_RESTRICT x, block_q4_k_r4 * GGML_RESTRICT y, int64_t k); void quantize_row_q4_k_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_q4_k_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_q4_k_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * use_data); void dequantize_row_q4_k_r4(const block_q4_k_r4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_q4_k_r4_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void quantize_row_q5_k_r4_ref(const float * GGML_RESTRICT x, block_q5_k_r4 * GGML_RESTRICT y, int64_t k); 
void quantize_row_q5_k_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_q5_k_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_q5_k_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * use_data); void dequantize_row_q5_k_r4(const block_q5_k_r4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_q5_k_r4_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void quantize_row_q6_k_r4_ref(const float * GGML_RESTRICT x, block_q6_k_r4 * GGML_RESTRICT y, int64_t k); void quantize_row_q6_k_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_q6_k_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_q6_k_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * use_data); void dequantize_row_q6_k_r4(const block_q6_k_r4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_q6_k_r4_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void quantize_row_iq5_k_r4_ref(const float * GGML_RESTRICT x, block_iq5_k_r4 * GGML_RESTRICT y, int64_t k); void quantize_row_iq5_k_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_iq5_k_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq5_k_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct 
quantize_user_data * use_data); void dequantize_row_iq5_k_r4(const block_iq5_k_r4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_iq5_k_r4_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void quantize_row_iq4_k_r4_ref(const float * GGML_RESTRICT x, block_iq4_k_r4 * GGML_RESTRICT y, int64_t k); void quantize_row_iq4_k_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_iq4_k_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq4_k_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * use_data); void dequantize_row_iq4_k_r4(const block_iq4_k_r4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_iq4_k_r4_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void quantize_row_iq3_k_r4_ref(const float * GGML_RESTRICT x, block_iq3_k_r4 * GGML_RESTRICT y, int64_t k); void quantize_row_iq3_k_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_iq3_k_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq3_k_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * use_data); void dequantize_row_iq3_k_r4(const block_iq3_k_r4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_iq3_k_r4_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void quantize_row_iq2_k_r4_ref(const float * GGML_RESTRICT x, block_iq2_k_r4 * GGML_RESTRICT 
y, int64_t k); void quantize_row_iq2_k_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_iq2_k_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq2_k_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * use_data); void dequantize_row_iq2_k_r4(const block_iq2_k_r4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_iq2_k_r4_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void quantize_row_iq4_ks_r4_ref(const float * GGML_RESTRICT x, block_iq4_ks_r4 * GGML_RESTRICT y, int64_t k); void quantize_row_iq4_ks_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_iq4_ks_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq4_ks_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * use_data); void dequantize_row_iq4_ks_r4(const block_iq4_ks_r4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_iq4_ks_r4_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void quantize_row_iq5_ks_r4_ref(const float * GGML_RESTRICT x, block_iq5_ks_r4 * GGML_RESTRICT y, int64_t k); void quantize_row_iq5_ks_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_iq5_ks_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq5_ks_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float 
* imatrix, const struct quantize_user_data * use_data); void dequantize_row_iq5_ks_r4(const block_iq5_ks_r4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_iq5_ks_r4_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void quantize_row_iq2_xxs_r4_ref(const float * GGML_RESTRICT x, block_iq2_xxs_r4 * GGML_RESTRICT y, int64_t k); void quantize_row_iq2_xxs_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_iq2_xxs_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq2_xxs_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * use_data); void dequantize_row_iq2_xxs_r4(const block_iq2_xxs_r4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_iq2_xxs_r4_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void quantize_row_iq2_xs_r4_ref(const float * GGML_RESTRICT x, block_iq2_xs_r4 * GGML_RESTRICT y, int64_t k); void quantize_row_iq2_xs_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_iq2_xs_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq2_xs_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * use_data); void dequantize_row_iq2_xs_r4(const block_iq2_xs_r4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_iq2_xs_r4_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void quantize_row_iq2_s_r4_ref(const float 
* GGML_RESTRICT x, block_iq2_s_r4 * GGML_RESTRICT y, int64_t k); void quantize_row_iq2_s_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_iq2_s_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq2_s_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * use_data); void dequantize_row_iq2_s_r4(const block_iq2_s_r4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_iq2_s_r4_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void quantize_row_iq3_xxs_r4_ref(const float * GGML_RESTRICT x, block_iq3_xxs_r4 * GGML_RESTRICT y, int64_t k); void quantize_row_iq3_xxs_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_iq3_xxs_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq3_xxs_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * use_data); void dequantize_row_iq3_xxs_r4(const block_iq3_xxs_r4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_iq3_xxs_r4_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void quantize_row_iq3_s_r4_ref(const float * GGML_RESTRICT x, block_iq3_s_r4 * GGML_RESTRICT y, int64_t k); void quantize_row_iq3_s_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_iq3_s_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq3_s_r4(const float * GGML_RESTRICT src, void * 
GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * use_data); void dequantize_row_iq3_s_r4(const block_iq3_s_r4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_iq3_s_r4_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void quantize_row_iq1_s_r4_ref(const float * GGML_RESTRICT x, block_iq1_s_r4 * GGML_RESTRICT y, int64_t k); void quantize_row_iq1_s_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_iq1_s_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq1_s_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * use_data); void dequantize_row_iq1_s_r4(const block_iq1_s_r4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_iq1_s_r4_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void quantize_row_iq1_m_r4_ref(const float * GGML_RESTRICT x, block_iq1_m_r4 * GGML_RESTRICT y, int64_t k); void quantize_row_iq1_m_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_iq1_m_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq1_m_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * use_data); void dequantize_row_iq1_m_r4(const block_iq1_m_r4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_iq1_m_r4_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void 
quantize_row_q8_k_r8_ref(const float * GGML_RESTRICT x, block_q8_k_r8 * GGML_RESTRICT y, int64_t k); void quantize_row_q8_k_r8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_q8_k_r8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_q8_k_r8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * use_data); void dequantize_row_q8_k_r8(const block_q8_k_r8 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_q8_k_r8_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void quantize_row_q8_k_r16_ref(const float * GGML_RESTRICT x, block_q8_k_r16 * GGML_RESTRICT y, int64_t k); void quantize_row_q8_k_r16(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_q8_k_r16(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_q8_k_r16(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * use_data); void dequantize_row_q8_k_r16(const block_q8_k_r16 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_q8_k_r16_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void quantize_row_q8_KV_ref(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_q8_KV(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_q8_KV(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_q8_KV(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, 
int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * use_data); void dequantize_row_q8_KV(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_q8_KV_q8_KV(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void quantize_row_q8_KV_r8_ref(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_q8_KV_r8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_q8_KV_r8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_q8_KV_r8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * use_data); void dequantize_row_q8_KV_r8(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_q8_KV_r8_q8_KV(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void quantize_row_q1_0_g128_ref(const float * GGML_RESTRICT x, block_q1_0_g128 * GGML_RESTRICT y, int64_t k); void quantize_row_q1_0_g128(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -size_t quantize_q1_0_g128(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_q1_0_g128(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * use_data); void dequantize_row_q1_0_g128(const block_q1_0_g128 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); void vec_dot_q1_0_g128_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); diff --git a/include/llama.h b/include/llama.h 
index 7cc9d73b..732bdde7 100644 --- a/include/llama.h +++ b/include/llama.h @@ -505,6 +505,7 @@ extern "C" { }; // model quantization parameters + struct quantize_user_data; typedef struct llama_model_quantize_params { int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() enum llama_ftype ftype; // quantize to this llama_ftype @@ -532,6 +533,7 @@ extern "C" { void * kv_overrides; // pointer to vector containing overrides void * custom_quants; // pointer to vector containing custom quantization rules void * repack_pattern; // pointer to a vector containing regexes to be used for matching tensor names. Can be null + struct quantize_user_data * user_data; // so we can pass extra data to the quantization functions } llama_model_quantize_params; // grammar types diff --git a/src/llama-quantize.cpp b/src/llama-quantize.cpp index ca5853e2..00d2796a 100644 --- a/src/llama-quantize.cpp +++ b/src/llama-quantize.cpp @@ -831,10 +831,11 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n return new_type; } -static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector & workers, const int nthread) { +static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, + const float * imatrix, const quantize_user_data * user_data, std::vector & workers, const int nthread) { if (nthread < 2) { // single-thread - size_t new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix); + size_t new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix, user_data); if (!ggml_validate_row_data(new_type, new_data, new_size)) { throw std::runtime_error("quantized data validation failed"); } @@ 
-846,7 +847,7 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa size_t new_size = 0; bool valid = true; auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, chunk_size, - nrows, n_per_row, imatrix]() { + nrows, n_per_row, imatrix, user_data]() { const int64_t nrows_per_chunk = chunk_size / n_per_row; size_t local_size = 0; while (true) { @@ -860,7 +861,7 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa } lock.unlock(); const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk); - size_t this_size = ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix); + size_t this_size = ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix, user_data); local_size += this_size; // validate the quantized data @@ -1574,7 +1575,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s std::mutex mutex; int counter = 0; bool valid = true; - auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, tensor, imatrix] () { + auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, tensor, imatrix, user_data = params->user_data] () { int ne2 = tensor->ne[2]; auto row_size = ggml_row_size(new_type, tensor->ne[0]); auto matrix_size = row_size * tensor->ne[1]; @@ -1591,7 +1592,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s lock.unlock(); auto this_imatrix = imatrix ? 
imatrix + i02 * tensor->ne[0] : nullptr; auto this_data = (char *)new_data + i02*matrix_size; - auto this_size = ggml_quantize_chunk(new_type, f32_data + i02*tensor->ne[0]*tensor->ne[1], this_data, 0, tensor->ne[1], tensor->ne[0], this_imatrix); + auto this_size = ggml_quantize_chunk(new_type, f32_data + i02*tensor->ne[0]*tensor->ne[1], this_data, 0, tensor->ne[1], tensor->ne[0], + this_imatrix, user_data); local_size += this_size; // validate the quantized data @@ -1624,7 +1626,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows; const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr; - new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use); + new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, params->user_data, workers, nthread_use); } } } diff --git a/src/llama.cpp b/src/llama.cpp index be435db9..b9e62e0d 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -5165,6 +5165,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() { /*.kv_overrides =*/ nullptr, /*.custom_quants =*/ nullptr, /*.repack_pattern =*/ nullptr, + /*.user_data =*/ nullptr, }; return result;