From e5355e98954ccb49b84b4a3e635fbf6838a48f7f Mon Sep 17 00:00:00 2001
From: Kawrakow <iwankawrakow@gmail.com>
Date: Thu, 23 Apr 2026 09:05:39 +0200
Subject: [PATCH] Quantization options (#1677)

---
 examples/benchmark/benchmark-matmult.cpp   |   4 +-
 examples/quantize-stats/quantize-stats.cpp |   2 +-
 examples/quantize/quantize.cpp             |   8 +
 ggml/include/ggml.h                        |   8 +-
 ggml/src/CMakeLists.txt                    |   3 -
 ggml/src/ggml-quants.c                     | 130 ++++--
 ggml/src/ggml-quants.h                     |  46 +-
 ggml/src/ggml.c                            | 139 +++---
 ggml/src/iqk/iqk_quantize.cpp              | 505 ++++++++++++---------
 ggml/src/iqk/iqk_quantize.h                |  96 ++--
 include/llama.h                            |   2 +
 src/llama-quantize.cpp                     |  16 +-
 src/llama.cpp                              |   1 +
 13 files changed, 568 insertions(+), 392 deletions(-)

diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp
index 47cb16c6..b56a64b1 100644
--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@@ -192,7 +192,7 @@ int main(int argc, char ** argv)  {
     // Set up a the benchmark matrices
     // printf("Creating new tensor q11 & Running quantize\n");
     struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
-    ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements/m11->ne[0], m11->ne[0], nullptr);
+    ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements/m11->ne[0], m11->ne[0], nullptr, nullptr);
 
     // Set up a the compute graph
     // printf("Creating new tensor q31\n");
@@ -205,7 +205,7 @@ int main(int argc, char ** argv)  {
     // Set up a second graph computation to make sure we override the CPU cache lines
     // printf("Creating new tensor q12 & Running quantize\n");
     struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
-    ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements/m12->ne[0], m12->ne[0], nullptr);
+    ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements/m12->ne[0], m12->ne[0], nullptr, nullptr);
 
     // printf("Creating new tensor q32\n");
     struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp
index 7e680ef3..d801bc15 100644
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -1337,7 +1337,7 @@ static void analyze_iq4ks(const char * name, int nrows, int n_per_row, const flo
             int last = std::min(first + chunk, nrows);
             for (int row = first; row < last; ++row) {
                 auto xr = values + row*n_per_row;
-                ggml_quantize_chunk(GGML_TYPE_IQ4_KS, xr, (void *)Q.data(), 0, 1, n_per_row, nullptr);
+                ggml_quantize_chunk(GGML_TYPE_IQ4_KS, xr, (void *)Q.data(), 0, 1, n_per_row, nullptr, nullptr);
                 const float * dptr = (const float *)Q.data();
                 const float d = *dptr;
                 const block_iq4_ks * iq4 = (const block_iq4_ks *)(dptr + 1);
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 51a66a8d..83f79722 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -167,6 +167,8 @@ static void usage(const char * executable) {
     printf("  --custom-q regex1=type1,regex2=type2...: use this to specify custom quantization type rules.\n\n");
     printf("  --repack Repack all tensors to the corresponding _r4/8 variant if available.\n\n");
     printf("  --repack-pattern Comma separated list of regexs to use for matching tensor names to be repacked.\n\n");
+    printf("  --symmetric-q40  Use [-7:7] range for Q4_0 quantization (turns off imatrix)\n\n");
+    printf("  --slow-iq2ks Use the original very slow IQ2_KS quantization method.\n\n");
     printf("Additional specific tensor quantization types used in the custom quant scheme 'CQS (default is Q2_K):\n");
     printf("      --attn-q-type ggml_type: use this ggml_type for the attn_q.weight tensor.\n");
     printf("      --attn-k-type ggml_type: use this ggml_type for the attn_k.weight tensor.\n");
@@ -348,6 +350,8 @@ int main(int argc, char ** argv) {
     std::vector<std::string> included_weights, excluded_weights;
     std::vector<llama_model_kv_override> kv_overrides;
     std::vector<CustomQ> custom_quants;
+    quantize_user_data user_data = { false, false };
+    params.user_data = &user_data;
 
     std::vector<std::string> repack_patterns;
 
@@ -360,6 +364,10 @@ int main(int argc, char ** argv) {
             params.ignore_imatrix_rules = true;
         } else if (strcmp(argv[arg_idx], "--dry-run") == 0) {
             params.dry_run = true;
+        } else if (strcmp(argv[arg_idx], "--symmetric-q40") == 0) {
+            user_data.symmetric_q4_0 = true;
+        } else if (strcmp(argv[arg_idx], "--slow-iq2ks") == 0) {
+            user_data.slow_iq2_ks = true;
         } else if (strcmp(argv[arg_idx], "--repack") == 0) {
             params.only_repack = true;
         } else if (strcmp(argv[arg_idx], "--repack-pattern") == 0) {
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 2854614e..0d164166 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -2934,6 +2934,11 @@ extern "C" {
     // some quantization type cannot be used without an importance matrix
     GGML_API bool ggml_quantize_requires_imatrix(enum ggml_type type);
 
+    struct quantize_user_data {
+        bool  symmetric_q4_0; // Q4_0: quantize to the symmetric [-7, 7] range; the imatrix is not used on this path
+        bool  slow_iq2_ks;    // IQ2_KS: use the original, very slow method (per the --slow-iq2ks help text)
+    };
+
     // calls ggml_quantize_init internally (i.e. can allocate memory)
     GGML_API size_t ggml_quantize_chunk(
             enum ggml_type   type,
@@ -2942,7 +2947,8 @@ extern "C" {
                    int64_t   start,
                    int64_t   nrows,
                    int64_t   n_per_row,
-               const float * imatrix);
+               const float * imatrix,
+               const struct quantize_user_data * user_data);
 
     //
     // gguf
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 0d4f4023..078876dd 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -300,9 +300,6 @@ if (GGML_IQK_MUL_MAT)
         message(STATUS "Disabling IQK Flash Attention kernels")
     endif()
 endif()
-if (IQK_SLOW_IQ2KS_QUANTIZE)
-    set_source_files_properties(iqk/iqk_quantize.cpp PROPERTIES COMPILE_DEFINITIONS IQK_SLOW_IQ2KS_QUANTIZE)
-endif()
 
 if (GGML_CUDA)
     cmake_minimum_required(VERSION 3.18)  # for CMAKE_CUDA_ARCHITECTURES
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 6f9007e0..8209ea93 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -2464,7 +2464,9 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
     }
 }
 
-size_t quantize_q2_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+size_t quantize_q2_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights,
+        const struct quantize_user_data * user_data) {
+    GGML_UNUSED(user_data);
     size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
     if (!quant_weights) {
         quantize_row_q2_K_ref(src, dst, (int64_t)nrow*n_per_row);
@@ -2696,7 +2698,9 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
     }
 }
 
-size_t quantize_q3_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+size_t quantize_q3_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights,
+        const struct quantize_user_data * user_data) {
+    GGML_UNUSED(user_data);
     size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
     if (!quant_weights) {
         quantize_row_q3_K_ref(src, dst, (int64_t)nrow*n_per_row);
@@ -2901,7 +2905,9 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
     }
 }
 
-size_t quantize_q4_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+size_t quantize_q4_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights,
+        const struct quantize_user_data * user_data) {
+    GGML_UNUSED(user_data);
     size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
     if (!quant_weights) {
         quantize_row_q4_K_ref(src, dst, (int64_t)nrow*n_per_row);
@@ -3132,7 +3138,9 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
     }
 }
 
-size_t quantize_q5_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+size_t quantize_q5_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights,
+        const struct quantize_user_data * user_data) {
+    GGML_UNUSED(user_data);
     size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
     if (!quant_weights) {
         quantize_row_q5_K_ref(src, dst, (int64_t)nrow*n_per_row);
@@ -3341,7 +3349,9 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
     }
 }
 
-size_t quantize_q6_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+size_t quantize_q6_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights,
+        const struct quantize_user_data * user_data) {
+    GGML_UNUSED(user_data);
     size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
     if (!quant_weights) {
         quantize_row_q6_K_ref(src, dst, (int64_t)nrow*n_per_row);
@@ -3389,7 +3399,45 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri
     }
 }
 
-size_t quantize_q4_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+// Symmetric Q4_0: block scale d = amax/7, so x/d is in [-7, 7] and the 4-bit codes are in [1, 15]; k % QK4_0 must be 0.
+static void quantize_row_q4_0_symmetric(const float * restrict x, block_q4_0 * restrict y, int64_t k) {
+    static const int qk = QK4_0;
+
+    assert(k % qk == 0);
+
+    const int64_t nb = k / qk; // 64-bit so that i*qk below cannot overflow int for large k
+
+    for (int64_t i = 0; i < nb; i++) {
+        float amax = 0.0f; // absolute max
+        for (int j = 0; j < qk; j++) {
+            const float v = fabsf(x[i*qk + j]);
+            amax = MAX(amax, v);
+        }
+
+        const float d  = amax / 7;
+        const float id = d ? 1.0f/d : 0.0f; // all-zero block: every code becomes 8
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+
+        for (int j = 0; j < qk/2; ++j) {
+            const float x0 = x[i*qk + 0    + j]*id;
+            const float x1 = x[i*qk + qk/2 + j]*id;
+            // x*id + 8.5 is positive here, so the truncating cast acts as round-to-nearest
+            const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f));
+            const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f));
+            // low nibble holds element j, high nibble holds element j + qk/2
+            y[i].qs[j]  = xi0;
+            y[i].qs[j] |= xi1 << 4;
+        }
+    }
+}
+
+size_t quantize_q4_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights,
+        const struct quantize_user_data * user_data) {
+    if (user_data && user_data->symmetric_q4_0) { // checked first: this path ignores quant_weights
+        quantize_row_q4_0_symmetric(src, dst, (int64_t)nrow*n_per_row);
+        return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
+    }
     if (!quant_weights) {
         quantize_row_q4_0_ref(src, dst, (int64_t)nrow*n_per_row);
         return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
@@ -3434,7 +3482,9 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri
     }
 }
 
-size_t quantize_q4_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+size_t quantize_q4_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights,
+        const struct quantize_user_data * user_data) {
+    GGML_UNUSED(user_data);
     if (!quant_weights) {
         quantize_row_q4_1_ref(src, dst, (int64_t)nrow*n_per_row);
         return nrow * ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
@@ -3488,7 +3538,9 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri
     }
 }
 
-size_t quantize_q5_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+size_t quantize_q5_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights,
+        const struct quantize_user_data * user_data) {
+    GGML_UNUSED(user_data);
     if (!quant_weights) {
         quantize_row_q5_0_ref(src, dst, (int64_t)nrow*n_per_row);
         return nrow * ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
@@ -3541,7 +3593,9 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri
     }
 }
 
-size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights,
+        const struct quantize_user_data * user_data) {
+    GGML_UNUSED(user_data);
     if (!quant_weights) {
         quantize_row_q5_1_ref(src, dst, (int64_t)nrow*n_per_row);
         return nrow * ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
@@ -3593,7 +3647,9 @@ static void quantize_row_q6_0_impl(const float * restrict x, block_q6_0 * restri
     }
 }
 
-size_t quantize_q6_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+size_t quantize_q6_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights,
+        const struct quantize_user_data * user_data) {
+    GGML_UNUSED(user_data);
     size_t row_size = ggml_row_size(GGML_TYPE_Q6_0, n_per_row);
     char * qrow = (char *)dst;
     for (int64_t row = 0; row < nrow; ++row) {
@@ -3604,7 +3660,9 @@ size_t quantize_q6_0(const float * restrict src, void * restrict dst, int64_t nr
     return nrow * row_size;
 }
 
-size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights,
+        const struct quantize_user_data * user_data) {
+    GGML_UNUSED(user_data);
     (void)quant_weights; // not used
     const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
     quantize_row_q8_0_ref(src, dst, (int64_t)nrow*n_per_row);
@@ -13342,7 +13400,9 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
     }
 }
 
-size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights,
+        const struct quantize_user_data * user_data) {
+    GGML_UNUSED(user_data);
     GGML_ASSERT(n_per_row%QK_K == 0);
     int64_t nblock = n_per_row/QK_K;
     char * qrow = (char *)dst;
@@ -13362,10 +13422,12 @@ void quantize_row_iq2_xxs(const float * restrict x, void * restrict vy, int64_t
 
 void quantize_row_iq2_xxs_ref(const float * restrict x, block_iq2_xxs * restrict y, int64_t k) {
     assert(k % QK_K == 0);
-    quantize_iq2_xxs(x, y, 1, k, NULL);
+    quantize_iq2_xxs(x, y, 1, k, NULL, NULL);
 }
 
-size_t quantize_iq2_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+size_t quantize_iq2_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights,
+        const struct quantize_user_data * user_data) {
+    GGML_UNUSED(user_data);
     GGML_ASSERT(n_per_row%QK_K == 0);
     int64_t nblock = n_per_row/QK_K;
     char * qrow = (char *)dst;
@@ -13385,7 +13447,7 @@ void quantize_row_iq2_xs(const float * restrict x, void * restrict vy, int64_t k
 
 void quantize_row_iq2_xs_ref(const float * restrict x, block_iq2_xs * restrict y, int64_t k) {
     assert(k % QK_K == 0);
-    quantize_iq2_xs(x, y, 1, k, NULL);
+    quantize_iq2_xs(x, y, 1, k, NULL, NULL);
 }
 
 //
@@ -13818,7 +13880,9 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v
     }
 }
 
-size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights,
+        const struct quantize_user_data * user_data) {
+    GGML_UNUSED(user_data);
     GGML_ASSERT(n_per_row%QK_K == 0);
     int64_t nblock = n_per_row/QK_K;
     char * qrow = (char *)dst;
@@ -14024,7 +14088,9 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
 }
 
 #define IQ3S_BLOCK_SIZE 32
-size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights,
+        const struct quantize_user_data * user_data) {
+    GGML_UNUSED(user_data);
     GGML_ASSERT(n_per_row%QK_K == 0);
     int64_t nblock = n_per_row/QK_K;
     float scales[QK_K/IQ3S_BLOCK_SIZE];
@@ -14054,7 +14120,7 @@ void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int64_t k)
 
 void quantize_row_iq3_s_ref(const float * restrict x, block_iq3_s * restrict y, int64_t k) {
     assert(k % QK_K == 0);
-    quantize_iq3_s(x, y, 1, k, NULL);
+    quantize_iq3_s(x, y, 1, k, NULL, NULL);
 }
 
 
@@ -14358,7 +14424,9 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
     }
 }
 
-size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights,
+        const struct quantize_user_data * user_data) {
+    GGML_UNUSED(user_data);
     GGML_ASSERT(n_per_row%QK_K == 0);
     float  scales[QK_K/IQ1S_BLOCK_SIZE];
     float  weight[IQ1S_BLOCK_SIZE];
@@ -14383,7 +14451,7 @@ void quantize_row_iq1_s_ref  (const float * GGML_RESTRICT x, block_iq1_s   * GGM
     float qw[QK_K];
     for (int j = 0; j < QK_K; ++j) qw[j] = 1;
     for (int ibl = 0; ibl < nblock; ++ibl) {
-        quantize_iq1_s(x + ibl*QK_K, &y[ibl], 1, QK_K, qw);
+        quantize_iq1_s(x + ibl*QK_K, &y[ibl], 1, QK_K, qw, NULL);
     }
 }
 
@@ -14660,7 +14728,9 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
     }
 }
 
-size_t quantize_iq1_m(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+size_t quantize_iq1_m(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights,
+        const struct quantize_user_data * user_data) {
+    GGML_UNUSED(user_data);
     GGML_ASSERT(n_per_row%QK_K == 0);
     float  scales[QK_K/IQ1M_BLOCK_SIZE];
     float  weight[IQ1M_BLOCK_SIZE];
@@ -14683,7 +14753,7 @@ void quantize_row_iq1_m_ref  (const float * GGML_RESTRICT x, block_iq1_m   * GGM
     float qw[QK_K];
     for (int j = 0; j < QK_K; ++j) qw[j] = 1;
     for (int ibl = 0; ibl < nblock; ++ibl) {
-        quantize_iq1_m(x + ibl*QK_K, &y[ibl], 1, QK_K, qw);
+        quantize_iq1_m(x + ibl*QK_K, &y[ibl], 1, QK_K, qw, NULL);
     }
 }
 
@@ -14881,7 +14951,9 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
     }
 }
 
-size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights,
+        const struct quantize_user_data * user_data) {
+    GGML_UNUSED(user_data);
     GGML_ASSERT(n_per_row%QK4_NL == 0);
     int64_t nblock = n_per_row/QK4_NL;
     char * qrow = (char *)dst;
@@ -14923,7 +14995,9 @@ void quantize_row_iq4_nl_ref(const float * restrict x, block_iq4_nl * restrict y
     quantize_row_iq4_nl(x, y, k);
 }
 
-size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights,
+        const struct quantize_user_data * user_data) {
+    GGML_UNUSED(user_data);
     GGML_ASSERT(n_per_row%QK_K == 0);
     int64_t nblock = n_per_row/QK_K;
     char * qrow = (char *)dst;
@@ -14951,7 +15025,7 @@ void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int64_t k
 
 void quantize_row_iq4_xs_ref(const float * restrict x, block_iq4_xs * restrict y, int64_t k) {
     assert(k % QK_K == 0);
-    quantize_iq4_xs(x, y, 1, k, NULL);
+    quantize_iq4_xs(x, y, 1, k, NULL, NULL);
 }
 
 // =============================== 2.5625 bpw
@@ -15124,7 +15198,9 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
     }
 }
 
-size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights,
+        const struct quantize_user_data * user_data) {
+    GGML_UNUSED(user_data);
     GGML_ASSERT(n_per_row%QK_K == 0);
     int64_t nblock = n_per_row/QK_K;
     char * qrow = (char *)dst;
@@ -15138,7 +15214,7 @@ size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int64_t n
 
 void quantize_row_iq2_s_ref(const float * restrict x, block_iq2_s * restrict y, int64_t k) {
     assert(k % QK_K == 0);
-    quantize_iq2_s(x, y, 1, k, NULL);
+    quantize_iq2_s(x, y, 1, k, NULL, NULL);
 }
 
 void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int64_t k) {
diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h
index 7c8e2110..3e9d6d2b 100644
--- a/ggml/src/ggml-quants.h
+++ b/ggml/src/ggml-quants.h
@@ -123,29 +123,31 @@ void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
 void ggml_vec_dot_iq3_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_iq1_bn_q8_K64(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
-// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
-size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_iq2_s  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_iq1_s  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_iq1_m  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_iq3_s  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_iq1_bn (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+struct quantize_user_data;
 
-size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q6_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
+size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
+size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
+size_t quantize_iq2_s  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
+size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
+size_t quantize_iq1_s  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
+size_t quantize_iq1_m  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
+size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
+size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
+size_t quantize_iq3_s  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
+size_t quantize_iq1_bn (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
+
+size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
+size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
+size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
+size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
+size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
+size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
+size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
+size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
+size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
+size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
+size_t quantize_q6_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 
 void iq2xs_init_impl(enum ggml_type type);
 void iq2xs_free_impl(enum ggml_type type);
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 23d0cc60..59842a2c 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -28455,7 +28455,8 @@ size_t ggml_quantize_chunk(
                int64_t   start,
                int64_t   nrows,
                int64_t   n_per_row,
-           const float * imatrix) {
+           const float * imatrix,
+           const struct quantize_user_data * user_data) {
     const int64_t n = (int64_t) nrows * n_per_row;
 
     if (ggml_quantize_requires_imatrix(type)) {
@@ -28473,74 +28474,74 @@ size_t ggml_quantize_chunk(
     size_t result = 0;
 
     switch (type) {
-        case GGML_TYPE_Q4_0:    result = quantize_q4_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q4_1:    result = quantize_q4_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q5_0:    result = quantize_q5_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q5_1:    result = quantize_q5_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q6_0:    result = quantize_q6_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q8_0:    result = quantize_q8_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q8_KV:   result = quantize_q8_KV(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q2_K:    result = quantize_q2_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q2_K_R4: result = quantize_q2_k_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q3_K:    result = quantize_q3_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q3_K_R4: result = quantize_q3_k_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q4_K:    result = quantize_q4_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q4_K_R4: result = quantize_q4_k_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q5_K:    result = quantize_q5_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q5_K_R4: result = quantize_q5_k_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q6_K:    result = quantize_q6_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q6_K_R4: result = quantize_q6_k_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q8_K_R8: result = quantize_q8_k_r8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q8_K_R16:result = quantize_q8_k_r16(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q8_KV_R8:result = quantize_q8_KV_r8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_IQ2_XXS: result = quantize_iq2_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_IQ2_XXS_R4:result = quantize_iq2_xxs_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_IQ2_XS:  result = quantize_iq2_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_IQ2_XS_R4:result = quantize_iq2_xs_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_IQ3_XXS: result = quantize_iq3_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_IQ3_XXS_R4:result = quantize_iq3_xxs_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_IQ3_S:   result = quantize_iq3_s  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_IQ3_S_R4:result = quantize_iq3_s_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_IQ2_S:   result = quantize_iq2_s  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_IQ2_S_R4:result = quantize_iq2_s_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_IQ1_S_R4:result = quantize_iq1_s_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_IQ1_M_R4:result = quantize_iq1_m_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_IQ1_S:   result = quantize_iq1_s  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_IQ1_M:   result = quantize_iq1_m  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_IQ1_BN:  result = quantize_iq1_bn (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_IQ2_BN:  result = quantize_iq2_bn (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_IQ2_BN_R4:result = quantize_iq2_bn_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_IQ4_NL:  result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_IQ4_NL_R4: result = quantize_iq4_nl_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_IQ4_XS_R8: result = quantize_iq4_xs_r8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q4_0_R8: result = quantize_q4_0_r8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q5_0_R4: result = quantize_q5_0_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q6_0_R4: result = quantize_q6_0_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q8_0_R8: result = quantize_q8_0_r8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_MXFP4:   result = quantize_mxfp4  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_IQ4_XS:  result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_IQ4_KS:  result = quantize_iq4_ks (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_IQ4_KS_R4:result = quantize_iq4_ks_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_IQ5_KS_R4:result = quantize_iq5_ks_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_IQ4_KSS: result = quantize_iq4_kss(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_IQ5_KS:  result = quantize_iq5_ks (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_IQ2_K:   result = quantize_iq2_k  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_IQ2_K_R4:result = quantize_iq2_k_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_IQ2_KS:  result = quantize_iq2_ks (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_IQ1_KT:  result = quantize_iq1_kt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_IQ2_KT:  result = quantize_iq2_kt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_IQ3_KT:  result = quantize_iq3_kt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_IQ4_KT:  result = quantize_iq4_kt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q1_0_G128: result = quantize_q1_0_g128(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_IQ3_K:   result = quantize_iq3_k  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_IQ3_KS:  result = quantize_iq3_ks (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_IQ2_KL:  result = quantize_iq2_kl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_IQ4_K:   result = quantize_iq4_k  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_IQ3_K_R4:result = quantize_iq3_k_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_IQ4_K_R4:result = quantize_iq4_k_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_IQ5_K:   result = quantize_iq5_k  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_IQ5_K_R4:result = quantize_iq5_k_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_IQ6_K:   result = quantize_iq6_k  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_Q4_0:    result = quantize_q4_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_Q4_1:    result = quantize_q4_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_Q5_0:    result = quantize_q5_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_Q5_1:    result = quantize_q5_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_Q6_0:    result = quantize_q6_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_Q8_0:    result = quantize_q8_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_Q8_KV:   result = quantize_q8_KV(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_Q2_K:    result = quantize_q2_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_Q2_K_R4: result = quantize_q2_k_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_Q3_K:    result = quantize_q3_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_Q3_K_R4: result = quantize_q3_k_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_Q4_K:    result = quantize_q4_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_Q4_K_R4: result = quantize_q4_k_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_Q5_K:    result = quantize_q5_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_Q5_K_R4: result = quantize_q5_k_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_Q6_K:    result = quantize_q6_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_Q6_K_R4: result = quantize_q6_k_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_Q8_K_R8: result = quantize_q8_k_r8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_Q8_K_R16:result = quantize_q8_k_r16(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_Q8_KV_R8:result = quantize_q8_KV_r8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_IQ2_XXS: result = quantize_iq2_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_IQ2_XXS_R4:result = quantize_iq2_xxs_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_IQ2_XS:  result = quantize_iq2_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_IQ2_XS_R4:result = quantize_iq2_xs_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_IQ3_XXS: result = quantize_iq3_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_IQ3_XXS_R4:result = quantize_iq3_xxs_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_IQ3_S:   result = quantize_iq3_s  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_IQ3_S_R4:result = quantize_iq3_s_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_IQ2_S:   result = quantize_iq2_s  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_IQ2_S_R4:result = quantize_iq2_s_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_IQ1_S_R4:result = quantize_iq1_s_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_IQ1_M_R4:result = quantize_iq1_m_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_IQ1_S:   result = quantize_iq1_s  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_IQ1_M:   result = quantize_iq1_m  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_IQ1_BN:  result = quantize_iq1_bn (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_IQ2_BN:  result = quantize_iq2_bn (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_IQ2_BN_R4:result = quantize_iq2_bn_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_IQ4_NL:  result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_IQ4_NL_R4: result = quantize_iq4_nl_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_IQ4_XS_R8: result = quantize_iq4_xs_r8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_Q4_0_R8: result = quantize_q4_0_r8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_Q5_0_R4: result = quantize_q5_0_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_Q6_0_R4: result = quantize_q6_0_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_Q8_0_R8: result = quantize_q8_0_r8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_MXFP4:   result = quantize_mxfp4  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_IQ4_XS:  result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_IQ4_KS:  result = quantize_iq4_ks (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_IQ4_KS_R4:result = quantize_iq4_ks_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_IQ5_KS_R4:result = quantize_iq5_ks_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_IQ4_KSS: result = quantize_iq4_kss(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_IQ5_KS:  result = quantize_iq5_ks (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_IQ2_K:   result = quantize_iq2_k  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_IQ2_K_R4:result = quantize_iq2_k_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_IQ2_KS:  result = quantize_iq2_ks (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_IQ1_KT:  result = quantize_iq1_kt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_IQ2_KT:  result = quantize_iq2_kt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_IQ3_KT:  result = quantize_iq3_kt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_IQ4_KT:  result = quantize_iq4_kt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_Q1_0_G128: result = quantize_q1_0_g128(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_IQ3_K:   result = quantize_iq3_k  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_IQ3_KS:  result = quantize_iq3_ks (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_IQ2_KL:  result = quantize_iq2_kl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_IQ4_K:   result = quantize_iq4_k  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_IQ3_K_R4:result = quantize_iq3_k_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_IQ4_K_R4:result = quantize_iq4_k_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_IQ5_K:   result = quantize_iq5_k  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_IQ5_K_R4:result = quantize_iq5_k_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
+        case GGML_TYPE_IQ6_K:   result = quantize_iq6_k  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix, user_data); break;
         case GGML_TYPE_Q4_0_4_4: result = quantize_q4_0_4x4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_Q4_0_4_8: result = quantize_q4_0_4x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_Q4_0_8_8: result = quantize_q4_0_8x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp
index 1d718049..7f183468 100644
--- a/ggml/src/iqk/iqk_quantize.cpp
+++ b/ggml/src/iqk/iqk_quantize.cpp
@@ -41,11 +41,11 @@ inline int nearest_int(float fval) {
     return (i & 0x007fffff) - 0x00400000;
 }
 
-typedef void (*quantize_func_t)(const float * src, void * qdata, int n_per_row, const float * imatrix);
+typedef void (*quantize_func_t)(const float * src, void * qdata, int n_per_row, const float * imatrix, const quantize_user_data * user_data);
 
 struct QHelper {
-    QHelper(const float * imatrix, int n_per_row, int block_size) : m_imatrix(imatrix),
-        m_n_per_row(n_per_row), m_block_size(block_size) {
+    QHelper(const float * imatrix, const quantize_user_data * user_data, int n_per_row, int block_size) :
+        m_imatrix(imatrix), m_user_data(user_data), m_n_per_row(n_per_row), m_block_size(block_size) {
         if (m_imatrix) {
             m_weight.resize(m_n_per_row);
         }
@@ -78,13 +78,14 @@ struct QHelper {
         auto cdst = (char *)dst;
         for (int row = 0; row < nrows; ++row) {
             auto weights = row_weights(src);
-            qfunc(src, cdst, m_n_per_row, weights);
+            qfunc(src, cdst, m_n_per_row, weights, m_user_data);
             src  += m_n_per_row;
             cdst += row_size;
         }
     }
 private:
     const float * m_imatrix;
+    const quantize_user_data * m_user_data;
     const int     m_n_per_row;
     const int     m_block_size;
     std::vector<float> m_weight;
@@ -92,12 +93,12 @@ private:
 
 template <int block_size, typename Block, typename Block_repacked, int n_repack, typename Func, typename RepackFunc>
 size_t quantize_repack(ggml_type type, const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix,
-        const Func& q_func, const RepackFunc& repack) {
+        const quantize_user_data * user_data, const Func& q_func, const RepackFunc& repack) {
     GGML_ASSERT(nrows%n_repack == 0);
     GGML_ASSERT(n_per_row%QK_K == 0);
     auto row_size = ggml_row_size(type, n_per_row);
     std::vector<char> qtmp(n_repack*row_size);
-    QHelper helper(imatrix, n_per_row, block_size);
+    QHelper helper(imatrix, user_data, n_per_row, block_size);
     char * qrow = (char *)dst;
     for (int row = 0; row < nrows; row += n_repack) {
         helper.quantize(n_repack, src, qtmp.data(), row_size, q_func);
@@ -269,7 +270,8 @@ void iqk_quantize_any(int from_type, int to_type,
 }
 
 
-size_t quantize_iq1_bn(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
+size_t quantize_iq1_bn(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix,
+        const quantize_user_data *) {
     IQ1BNQuantizer iq1bn;
     auto row_size = ggml_row_size(GGML_TYPE_IQ1_BN, n_per_row);
     auto qrow = (char *)dst;
@@ -281,11 +283,11 @@ size_t quantize_iq1_bn(const float * src, void * dst, int64_t nrows, int64_t n_p
 }
 
 void quantize_row_iq1_bn_ref(const float * x, block_iq1_bn * y, int64_t k) {
-    quantize_iq1_bn(x, y, 1, k, nullptr);
+    quantize_iq1_bn(x, y, 1, k, nullptr, nullptr); // one row of k values; imatrix and user_data are both null
 }
 
 void quantize_row_iq1_bn(const float * x, void * y, int64_t k) {
-    quantize_iq1_bn(x, y, 1, k, nullptr);
+    quantize_iq1_bn(x, y, 1, k, nullptr, nullptr); // one row of k values; imatrix and user_data are both null
 }
 
 void dequantize_row_iq1_bn(const block_iq1_bn * x, float * y, int64_t k) {
@@ -311,7 +313,8 @@ void dequantize_row_iq1_bn(const block_iq1_bn * x, float * y, int64_t k) {
     }
 }
 
-size_t quantize_iq2_bn(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
+size_t quantize_iq2_bn(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix,
+        [[maybe_unused]] const quantize_user_data * user_data) {
     IQ1BNQuantizer iq1bn;
     auto row_size = ggml_row_size(GGML_TYPE_IQ2_BN, n_per_row);
     auto qrow = (char *)dst;
@@ -323,11 +326,11 @@ size_t quantize_iq2_bn(const float * src, void * dst, int64_t nrows, int64_t n_p
 }
 
 void quantize_row_iq2_bn_ref(const float * x, block_iq2_bn * y, int64_t k) {
-    quantize_iq2_bn(x, y, 1, k, nullptr);
+    quantize_iq2_bn(x, y, 1, k, nullptr, nullptr); // one row of k values; imatrix and user_data are both null
 }
 
 void quantize_row_iq2_bn(const float * x, void * y, int64_t k) {
-    quantize_iq2_bn(x, y, 1, k, nullptr);
+    quantize_iq2_bn(x, y, 1, k, nullptr, nullptr); // one row of k values; imatrix and user_data are both null
 }
 
 void dequantize_row_iq2_bn(const block_iq2_bn * x, float * y, int64_t k) {
@@ -1102,7 +1105,8 @@ inline int best_index_iq2nl(const int8_t * values, float x) {
     return x - values[idx] < values[idx+1] - x ? idx : idx + 1;
 }
 
-void quantize_row_iq2_k_impl(const float * x, void * vy, int n_per_row, const float * quant_weights) {
+void quantize_row_iq2_k_impl(const float * x, void * vy, int n_per_row, const float * quant_weights,
+        [[maybe_unused]] const quantize_user_data * user_data) {
 
     constexpr int kBlockSize = 16;
 
@@ -1249,7 +1253,7 @@ void quantize_row_iq2_k_impl(const float * x, void * vy, int n_per_row, const fl
 
 void quantize_row_iq2_k_ref(const float * x, block_iq2_k  * y, int64_t k) {
     assert(k % QK_K == 0);
-    quantize_iq2_k(x, (void *)y, 1, k, nullptr);
+    quantize_iq2_k(x, (void *)y, 1, k, nullptr, nullptr); // one row of k values; imatrix and user_data are both null
 }
 
 void quantize_row_iq2_k(const float * x, void * vy, int64_t k) {
@@ -1258,9 +1262,10 @@ void quantize_row_iq2_k(const float * x, void * vy, int64_t k) {
     quantize_row_iq2_k_ref(x, y, k);
 }
 
-size_t quantize_iq2_k(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
+size_t quantize_iq2_k(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix,
+        const quantize_user_data * user_data) { // user_data is handed to QHelper, which passes it to the row quantizer
     GGML_ASSERT(n_per_row%QK_K == 0);
-    QHelper helper(imatrix, n_per_row, 16);
+    QHelper helper(imatrix, user_data, n_per_row, 16); // last argument is QHelper's block_size
     auto row_size = ggml_row_size(GGML_TYPE_IQ2_K, n_per_row);
     helper.quantize(nrows, src, dst, row_size, quantize_row_iq2_k_impl);
     return nrows * row_size;
 }
@@ -1316,7 +1321,7 @@ void vec_dot_iq2_k_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx,
 }
 
 namespace {
-#if defined(__AVX2__) && !defined(IQK_SLOW_IQ2KS_QUANTIZE)
+#if defined(__AVX2__)
 inline void to_values_i32(__m256i idx, __m256i ivalues, __m256i * iv) {
     auto ival = _mm256_shuffle_epi8(ivalues, idx);
     auto ival_1 = _mm256_srli_si256(ival, 8);
@@ -1446,7 +1451,7 @@ float compute_1block_iq2ks_rmse(float d, const __m256 * vx, const __m256 * vw, c
     this_idx = idx;
     return hsum_float_8(vrmse);
 }
-void quantize_row_iq2_ks_impl(const float * x, void * vy, int n_per_row, const float * quant_weights, float * all_scales, float * all_sw, int8_t * all_Ls) {
+void quantize_row_iq2_ks_fast_impl(const float * x, void * vy, int n_per_row, const float * quant_weights, float * all_scales, float * all_sw, int8_t * all_Ls) {
 
     constexpr int kBlockSize = 32;
 
@@ -1601,7 +1606,7 @@ void quantize_row_iq2_ks_impl(const float * x, void * vy, int n_per_row, const f
     float sumq2 = hsum_float_8(vsumq2);
     *dptr = GGML_FP32_TO_FP16(1.000f*(sumq2 > 0 ? sumqx/sumq2 : d));
 }
-#else
+#endif
 void quantize_row_iq2_ks_impl(const float * x, void * vy, int n_per_row, const float * quant_weights, float * all_scales, float * all_sw, int8_t * all_Ls) {
 
     constexpr int kBlockSize = 32;
@@ -1751,12 +1756,11 @@ void quantize_row_iq2_ks_impl(const float * x, void * vy, int n_per_row, const f
     }
     *dptr = GGML_FP32_TO_FP16(1.030f*(sumq2 > 0 ? sumqx/sumq2 : d));
 }
-#endif
 }
 
 void quantize_row_iq2_ks_ref(const float * x, block_iq2_ks * y, int64_t k) {
     assert(k % QK_K == 0);
-    quantize_iq2_ks(x, (void *)y, 1, k, nullptr);
+    quantize_iq2_ks(x, (void *)y, 1, k, nullptr, nullptr); // one row of k values; imatrix and user_data are both null
 }
 
 void quantize_row_iq2_ks(const float * x, void * vy, int64_t k) {
@@ -1765,17 +1769,25 @@ void quantize_row_iq2_ks(const float * x, void * vy, int64_t k) {
     quantize_row_iq2_ks_ref(x, y, k);
 }
 
-size_t quantize_iq2_ks(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
+size_t quantize_iq2_ks(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix,
+        const quantize_user_data * user_data) { // user_data may be null; only its slow_iq2_ks flag is read here
     constexpr int kBlockSize = 32;
     GGML_ASSERT(n_per_row%QK_K == 0);
     auto row_size = ggml_row_size(GGML_TYPE_IQ2_KS, n_per_row);
     int nblock = n_per_row/QK_K;
     std::vector<float> all_scales(nblock*(QK_K/kBlockSize)), all_sw(nblock*(QK_K/kBlockSize));
     std::vector<int8_t> all_Ls(nblock*(QK_K/kBlockSize));
-    auto q_func = [&all_scales, &all_sw, &all_Ls] (const float * x, void * vy, int n_per_row, const float * imatrix) {
+    auto q_func = [&all_scales, &all_sw, &all_Ls] (const float * x, void * vy, int n_per_row, const float * imatrix,
+            [[maybe_unused]] const quantize_user_data * user_data) {
+#ifdef __AVX2__
+        if (!user_data || !user_data->slow_iq2_ks) { // no options => fast AVX2 path stays the default; slow path is opt-in
+            quantize_row_iq2_ks_fast_impl(x, vy, n_per_row, imatrix, all_scales.data(), all_sw.data(), all_Ls.data());
+            return;
+        }
+#endif
         quantize_row_iq2_ks_impl(x, vy, n_per_row, imatrix, all_scales.data(), all_sw.data(), all_Ls.data());
     };
-    QHelper helper(imatrix, n_per_row, kBlockSize);
+    QHelper helper(imatrix, user_data, n_per_row, kBlockSize); // QHelper hands user_data to q_func for every row
     helper.quantize(nrows, src, dst, row_size, q_func);
     return nrows * row_size;
 }
@@ -2121,7 +2133,7 @@ void quantize_row_iq2_kl_impl(const float * x, void * vy, int n_per_row, const f
 
 void quantize_row_iq2_kl_ref(const float * x, block_iq2_kl * y, int64_t k) {
     assert(k % QK_K == 0);
-    quantize_iq2_kl(x, (void *)y, 1, k, nullptr);
+    quantize_iq2_kl(x, (void *)y, 1, k, nullptr, nullptr); // one row of k values; imatrix and user_data are both null
 }
 
 void quantize_row_iq2_kl(const float * x, void * vy, int64_t k) {
@@ -2130,16 +2142,18 @@ void quantize_row_iq2_kl(const float * x, void * vy, int64_t k) {
     quantize_row_iq2_kl_ref(x, y, k);
 }
 
-size_t quantize_iq2_kl(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
+size_t quantize_iq2_kl(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix,
+        const quantize_user_data * user_data) { // user_data is only forwarded to QHelper below
     constexpr int kBlockSize = 32;
     GGML_ASSERT(n_per_row%QK_K == 0);
     auto row_size = ggml_row_size(GGML_TYPE_IQ2_KL, n_per_row);
     int nblock = n_per_row/QK_K;
     std::vector<float> all_scales(nblock*(QK_K/kBlockSize));
-    auto q_func = [&all_scales] (const float * x, void * vy, int n_per_row, const float * imatrix) {
+    auto q_func = [&all_scales] (const float * x, void * vy, int n_per_row, const float * imatrix,
+            [[maybe_unused]] const quantize_user_data * user_data) { // accepted to match the callback shape; not passed on
         quantize_row_iq2_kl_impl(x, vy, n_per_row, imatrix, all_scales.data());
     };
-    QHelper helper(imatrix, n_per_row, kBlockSize);
+    QHelper helper(imatrix, user_data, n_per_row, kBlockSize); // QHelper hands user_data to q_func for every row
     helper.quantize(nrows, src, dst, row_size, q_func);
     return nrows * row_size;
 }
@@ -2196,7 +2210,8 @@ void vec_dot_iq2_kl_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx
 //
 namespace {
 
-static void quantize_row_iq3_k_impl(const float * x, void * vy, int n_per_row, const float * quant_weights) {
+static void quantize_row_iq3_k_impl(const float * x, void * vy, int n_per_row, const float * quant_weights,
+        [[maybe_unused]] const quantize_user_data * user_data) {
 
     constexpr int ntry = 3;
 
@@ -2416,7 +2431,7 @@ static void quantize_row_iq3_k_impl(const float * x, void * vy, int n_per_row, c
 
 void quantize_row_iq3_k_ref(const float * x, block_iq3_k * y, int64_t k) {
     assert(k % QK_K == 0);
-    quantize_iq3_k(x, (void *)y, 1, k, nullptr);
+    quantize_iq3_k(x, (void *)y, 1, k, nullptr, nullptr); // one row of k values; no imatrix, no user_data
 }
 
 void quantize_row_iq3_k(const float * x, void * vy, int64_t k) {
@@ -2425,9 +2440,10 @@ void quantize_row_iq3_k(const float * x, void * vy, int64_t k) {
     quantize_row_iq3_k_ref(x, y, k);
 }
 
-size_t quantize_iq3_k(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
+size_t quantize_iq3_k(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix,
+        const quantize_user_data * user_data) {
     GGML_ASSERT(n_per_row%QK_K == 0);
-    QHelper helper(imatrix, n_per_row, 16);
+    QHelper helper(imatrix, user_data, n_per_row, 16);
     auto row_size = ggml_row_size(GGML_TYPE_IQ3_K, n_per_row);
     helper.quantize(nrows, src, dst, row_size, quantize_row_iq3_k_impl);
     return nrows * row_size;
@@ -2650,21 +2666,23 @@ static void quantize_row_iq3_ks_impl(const int super_block_size, const int block
 }
 
 void quantize_row_iq3_ks_ref(const float * x, block_iq3_ks * y, int64_t k) {
-    quantize_iq3_ks(x, (void *)y, 1, k, nullptr);
+    quantize_iq3_ks(x, (void *)y, 1, k, nullptr, nullptr); // one row of k values; no imatrix, no user_data
 }
 
 void quantize_row_iq3_ks(const float * x, void * y, int64_t k) {
-    quantize_iq3_ks(x, (void *)y, 1, k, nullptr);
+    quantize_iq3_ks(x, (void *)y, 1, k, nullptr, nullptr); // same call as the _ref variant: one row, no imatrix, no user_data
 }
 
-size_t quantize_iq3_ks(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
+size_t quantize_iq3_ks(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix,
+        const quantize_user_data * user_data) {
     constexpr int kBlockSize = 32;
     GGML_ASSERT(n_per_row%QK_K == 0);
     float weight[kBlockSize];
     std::vector<float> all_scales(n_per_row/kBlockSize);
     auto row_size = ggml_row_size(GGML_TYPE_IQ3_KS, n_per_row);
-    QHelper helper(imatrix, n_per_row, kBlockSize);
-    auto q_func = [&all_scales, &weight, block_size = kBlockSize] (const float * x, void * vy, int n_per_row, const float * imatrix) {
+    QHelper helper(imatrix, user_data, n_per_row, kBlockSize);
+    auto q_func = [&all_scales, &weight, block_size = kBlockSize] (const float * x, void * vy, int n_per_row, const float * imatrix,
+            [[maybe_unused]] const quantize_user_data * user_data) {
         quantize_row_iq3_ks_impl(QK_K, block_size, n_per_row, x, (char *)vy, all_scales.data(), weight, iq3nl_values, imatrix, 5);
     };
     helper.quantize(nrows, src, dst, row_size, q_func);
@@ -2975,7 +2993,7 @@ static void quantize_row_iq4_k_impl_bs16(const int super_block_size, const int b
 
 void quantize_row_iq4_k_ref(const float * x, block_iq4_k * y, int64_t k) {
     assert(k % QK_K == 0);
-    quantize_iq4_k(x, (void *)y, 1, k, nullptr);
+    quantize_iq4_k(x, (void *)y, 1, k, nullptr, nullptr); // one row of k values; no imatrix, no user_data
 }
 
 void quantize_row_iq4_k(const float * x, void * vy, int64_t k) {
@@ -2984,12 +3002,14 @@ void quantize_row_iq4_k(const float * x, void * vy, int64_t k) {
     quantize_row_iq4_k_ref(x, y, k);
 }
 
-size_t quantize_iq4_k(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
+size_t quantize_iq4_k(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix,
+        const quantize_user_data * user_data) {
     GGML_ASSERT(n_per_row%QK_K == 0);
     uint8_t L[QK_K];
     float weight[16];
     float scales[QK_K/16];
-    auto q_func = [&L, &weight, &scales] (const float * x, void * vy, int n_per_row, const float * imatrix) {
+    auto q_func = [&L, &weight, &scales] (const float * x, void * vy, int n_per_row, const float * imatrix,
+            [[maybe_unused]] const quantize_user_data * user_data) {
         block_iq4_k * iq4 = (block_iq4_k *)vy;
         int nblock = n_per_row/QK_K;
         for (int ibl = 0; ibl < nblock; ++ibl) {
@@ -2999,7 +3019,7 @@ size_t quantize_iq4_k(const float * src, void * dst, int64_t nrows, int64_t n_pe
         }
     };
     auto row_size = ggml_row_size(GGML_TYPE_IQ4_K, n_per_row);
-    QHelper helper(imatrix, n_per_row, 16);
+    QHelper helper(imatrix, user_data, n_per_row, 16);
     helper.quantize(nrows, src, dst, row_size, q_func);
     return nrows * row_size;
 }
@@ -3131,7 +3151,8 @@ inline int best_index_iq5nl(const int8_t * values, float x) {
     return ix < 32 ? ix : x - values[ix-32] < values[ix-31] - x ? ix-32 : ix-31;
 }
 
-void quantize_row_iq5_k_impl(const float * x, void * vy, int n_per_row, const float * quant_weights) {
+void quantize_row_iq5_k_impl(const float * x, void * vy, int n_per_row, const float * quant_weights,
+        [[maybe_unused]] const quantize_user_data * user_data) {
     const int ntry = 5;
     const float step = 1.f;
 
@@ -3315,7 +3336,7 @@ void quantize_row_iq5_k_impl(const float * x, void * vy, int n_per_row, const fl
 
 void quantize_row_iq5_k_ref(const float * x, block_iq5_k * y, int64_t k) {
     assert(k % QK_K == 0);
-    quantize_iq5_k(x, (void *)y, 1, k, nullptr);
+    quantize_iq5_k(x, (void *)y, 1, k, nullptr, nullptr); // one row of k values; no imatrix, no user_data
 }
 
 void quantize_row_iq5_k(const float * x, void * vy, int64_t k) {
@@ -3324,9 +3345,10 @@ void quantize_row_iq5_k(const float * x, void * vy, int64_t k) {
     quantize_row_iq5_k_ref(x, y, k);
 }
 
-size_t quantize_iq5_k(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
+size_t quantize_iq5_k(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix,
+        const quantize_user_data * user_data) {
     GGML_ASSERT(n_per_row%QK_K == 0);
-    QHelper helper(imatrix, n_per_row, 16);
+    QHelper helper(imatrix, user_data, n_per_row, 16);
     auto row_size = ggml_row_size(GGML_TYPE_IQ5_K, n_per_row);
     helper.quantize(nrows, src, dst, row_size, quantize_row_iq5_k_impl);
     return nrows * row_size;
@@ -3674,7 +3696,7 @@ void quantize_row_iq6_k_impl(const float * x, void * vy, int n_per_row, const fl
 
 void quantize_row_iq6_k_ref(const float * x, block_iq6_k * y, int64_t k) {
     assert(k % QK_K == 0);
-    quantize_iq6_k(x, (void *)y, 1, k, nullptr);
+    quantize_iq6_k(x, (void *)y, 1, k, nullptr, nullptr); // one row of k values; no imatrix, no user_data
 }
 
 void quantize_row_iq6_k(const float * x, void * vy, int64_t k) {
@@ -3683,18 +3705,20 @@ void quantize_row_iq6_k(const float * x, void * vy, int64_t k) {
     quantize_row_iq6_k_ref(x, y, k);
 }
 
-size_t quantize_iq6_k(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
+size_t quantize_iq6_k(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix,
+        const quantize_user_data * user_data) { // quantizes nrows rows to IQ6_K and returns nrows * row_size; user_data is given to QHelper
     GGML_ASSERT(n_per_row%QK_K == 0);
     float values[128];
     for (int i = 0; i < 64; ++i) {
         values[i] = iq6nl_values[i];
         values[i+64] = values[i] + S_IQ6K;
     }
-    auto q_func = [values] (const float * x, void * vy, int n_per_row, const float * imatrix) {
+    auto q_func = [values] (const float * x, void * vy, int n_per_row, const float * imatrix, // captures the 128-entry table by value
+             [[maybe_unused]] const quantize_user_data * user_data) { // not used: the row kernel only takes the two 64-entry halves of values
         quantize_row_iq6_k_impl(x, vy, n_per_row, imatrix, values, values + 64);
     };
     auto row_size = ggml_row_size(GGML_TYPE_IQ6_K, n_per_row);
-    QHelper helper(imatrix, n_per_row, 16);
+    QHelper helper(imatrix, user_data, n_per_row, 16); // NOTE(review): 16 is presumably the block size expected by QHelper - confirm
     helper.quantize(nrows, src, dst, row_size, q_func);
     return nrows * row_size;
 }
@@ -4113,14 +4137,15 @@ static void quantize_row_mxfp4_impl(int n_per_row, const float * x, char * cy,
 }
 
 void quantize_row_mxfp4_ref(const float * x, block_mxfp4 * y, int64_t k) {
-    quantize_mxfp4(x, (void *)y, 1, k, nullptr);
+    quantize_mxfp4(x, (void *)y, 1, k, nullptr, nullptr); // one row of k values; no imatrix, no user_data
 }
 
 void quantize_row_mxfp4(const float * x, void * y, int64_t k) {
-    quantize_mxfp4(x, (void *)y, 1, k, nullptr);
+    quantize_mxfp4(x, (void *)y, 1, k, nullptr, nullptr); // same call as the _ref variant: one row, no imatrix, no user_data
 }
 
-size_t quantize_mxfp4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
+size_t quantize_mxfp4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix,
+         [[maybe_unused]] const quantize_user_data * user_data) {
     constexpr int kBlockSize = QK_MXFP4;
     GGML_ASSERT(n_per_row%kBlockSize == 0);
     auto row_size = ggml_row_size(GGML_TYPE_MXFP4, n_per_row);
@@ -4351,21 +4376,23 @@ static void quantize_row_iq4_k_impl_bs128(const int super_block_size, const int
 }
 
 void quantize_row_iq4_ks_ref(const float * x, block_iq4_ks * y, int64_t k) {
-    quantize_iq4_ks(x, (void *)y, 1, k, nullptr);
+    quantize_iq4_ks(x, (void *)y, 1, k, nullptr, nullptr); // one row of k values; no imatrix, no user_data
 }
 
 void quantize_row_iq4_ks(const float * x, void * y, int64_t k) {
-    quantize_iq4_ks(x, (void *)y, 1, k, nullptr);
+    quantize_iq4_ks(x, (void *)y, 1, k, nullptr, nullptr); // same call as the _ref variant: one row, no imatrix, no user_data
 }
 
-size_t quantize_iq4_ks(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
+size_t quantize_iq4_ks(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix,
+        const quantize_user_data * user_data) {
     constexpr int kBlockSize = 32;
     GGML_ASSERT(n_per_row%QK_K == 0);
     auto row_size = ggml_row_size(GGML_TYPE_IQ4_KS, n_per_row);
     float weight[kBlockSize];
     std::vector<float> all_scales(n_per_row/kBlockSize);
-    QHelper helper(imatrix, n_per_row, kBlockSize);
-    auto q_func = [&all_scales, &weight, block_size = kBlockSize] (const float * x, void * vy, int n_per_row, const float * imatrix) {
+    QHelper helper(imatrix, user_data, n_per_row, kBlockSize);
+    auto q_func = [&all_scales, &weight, block_size = kBlockSize] (const float * x, void * vy, int n_per_row, const float * imatrix,
+             [[maybe_unused]] const quantize_user_data * user_data) {
         quantize_row_iq4_k_impl_bs128(QK_K, block_size, n_per_row, x, (char *)vy, all_scales.data(), weight, iq4k_values, imatrix, 7);
     };
     helper.quantize(nrows, src, dst, row_size, q_func);
@@ -4592,21 +4619,23 @@ static void quantize_row_iq5_ks_impl(const int super_block_size, const int block
 }
 
 void quantize_row_iq5_ks_ref(const float * x, block_iq5_ks * y, int64_t k) {
-    quantize_iq5_ks(x, (void *)y, 1, k, nullptr);
+    quantize_iq5_ks(x, (void *)y, 1, k, nullptr, nullptr); // one row of k values; no imatrix, no user_data
 }
 
 void quantize_row_iq5_ks(const float * x, void * y, int64_t k) {
-    quantize_iq5_ks(x, (void *)y, 1, k, nullptr);
+    quantize_iq5_ks(x, (void *)y, 1, k, nullptr, nullptr); // same call as the _ref variant: one row, no imatrix, no user_data
 }
 
-size_t quantize_iq5_ks(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
+size_t quantize_iq5_ks(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix,
+        const quantize_user_data * user_data) {
     constexpr int kBlockSize = 32;
     GGML_ASSERT(n_per_row%QK_K == 0);
     auto row_size = ggml_row_size(GGML_TYPE_IQ5_KS, n_per_row);
     float weight[kBlockSize];
     std::vector<float> all_scales(n_per_row/kBlockSize);
-    QHelper helper(imatrix, n_per_row, kBlockSize);
-    auto q_func = [&all_scales, &weight, block_size = kBlockSize] (const float * x, void * vy, int n_per_row, const float * imatrix) {
+    QHelper helper(imatrix, user_data, n_per_row, kBlockSize);
+    auto q_func = [&all_scales, &weight, block_size = kBlockSize] (const float * x, void * vy, int n_per_row, const float * imatrix,
+             [[maybe_unused]] const quantize_user_data * user_data) {
         quantize_row_iq5_ks_impl(QK_K, block_size, n_per_row, x, (char *)vy, all_scales.data(), weight, iq5nl_values, imatrix, 5);
     };
     helper.quantize(nrows, src, dst, row_size, q_func);
@@ -4951,15 +4980,17 @@ static void quantize_row_iq4_kss_impl(int n_per_row, const float * x, char * cy,
 }
 }
 
-size_t quantize_iq4_kss(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
+size_t quantize_iq4_kss(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix,
+        const quantize_user_data * user_data) {
     constexpr int kBlockSize = 32;
     GGML_ASSERT(n_per_row%QK_K == 0);
     auto row_size    = ggml_row_size(GGML_TYPE_IQ4_KSS, n_per_row);
     std::vector<float> all_scales(n_per_row/kBlockSize);
     float weight[kBlockSize];
     auto table = scramble_table();
-    QHelper helper(imatrix, n_per_row, kBlockSize);
-    auto q_func = [&all_scales, &weight, table] (const float * x, void * vy, int n_per_row, const float * imatrix) {
+    QHelper helper(imatrix, user_data, n_per_row, kBlockSize);
+    auto q_func = [&all_scales, &weight, table] (const float * x, void * vy, int n_per_row, const float * imatrix,
+             [[maybe_unused]] const quantize_user_data * user_data) {
         quantize_row_iq4_kss_impl(n_per_row, x, (char *)vy, all_scales.data(), weight, iq4k_values, imatrix, table, 7);
     };
     helper.quantize(nrows, src, dst, row_size, q_func);
@@ -4967,11 +4998,11 @@ size_t quantize_iq4_kss(const float * src, void * dst, int64_t nrows, int64_t n_
 }
 
 void quantize_row_iq4_kss_ref(const float * x, block_iq4_kss * y, int64_t k) {
-    quantize_iq4_kss(x, y, 1, k, nullptr);
+    quantize_iq4_kss(x, y, 1, k, nullptr, nullptr); // one row of k values; no imatrix, no user_data
 }
 
 void quantize_row_iq4_kss(const float * x, void * y, int64_t k) {
-    quantize_iq4_kss(x, (block_iq4_kss *)y, 1, k, nullptr);
+    quantize_iq4_kss(x, (block_iq4_kss *)y, 1, k, nullptr, nullptr); // same call as the _ref variant: one row, no imatrix, no user_data
 }
 
 void dequantize_row_iq4_kss(const block_iq4_kss * x, float * y, int64_t k) {
@@ -5019,12 +5050,12 @@ void vec_dot_iq4_kss_q8_k(int n, float * s, size_t bs, const void * vx, size_t b
 //
 void quantize_row_iq4_nl_r4_ref(const float * x, block_iq4_nl_r4  * y, int64_t k) {
     // we assume we are called with 4 rows
-    quantize_iq4_nl_r4(x, (void *)y, 4, k/4, nullptr);
+    quantize_iq4_nl_r4(x, (void *)y, 4, k/4, nullptr, nullptr); // k is split into 4 rows of k/4 values; no imatrix, no user_data
 }
 
 void quantize_row_iq4_nl_r4(const float * x, void * y, int64_t k) {
     // we assume we are called with 4 rows
-    quantize_iq4_nl_r4(x, y, 4, k/4, nullptr);
+    quantize_iq4_nl_r4(x, y, 4, k/4, nullptr, nullptr); // k is split into 4 rows of k/4 values; no imatrix, no user_data
 }
 
 static void repack_iq4_nl(int nrows, int n_per_row, const block_iq4_nl * x, block_iq4_nl_r4 * y, [[maybe_unused]] bool online) {
@@ -5048,13 +5079,15 @@ static void repack_iq4_nl(int nrows, int n_per_row, const block_iq4_nl * x, bloc
     }
 }
 
-size_t quantize_iq4_nl_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
+size_t quantize_iq4_nl_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix,
+        const quantize_user_data * user_data) {
     GGML_ASSERT(nrows%4 == 0);
     auto row_size_nl = ggml_row_size(GGML_TYPE_IQ4_NL, n_per_row);
     std::vector<char> qtmp(4*row_size_nl);
-    QHelper helper(imatrix, n_per_row, 32);
-    auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix) {
-        quantize_iq4_nl(x, (char *)vy, 1, n_per_row, imatrix);
+    QHelper helper(imatrix, user_data, n_per_row, 32);
+    auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix,
+            [[maybe_unused]] const quantize_user_data * user_data) {
+        quantize_iq4_nl(x, (char *)vy, 1, n_per_row, imatrix, nullptr);
     };
     char * qrow = (char *)dst;
     for (int row = 0; row < nrows; row += 4) {
@@ -5107,12 +5140,12 @@ void vec_dot_iq4_nl_r4_q8_0(int n, float * s, size_t bs, const void * vx, size_t
 //
 void quantize_row_q4_0_r8_ref(const float * x, block_iq4_nl_r8  * y, int64_t k) {
     // we assume we are called with 8 rows
-    quantize_q4_0_r8(x, (void *)y, 8, k/8, nullptr);
+    quantize_q4_0_r8(x, (void *)y, 8, k/8, nullptr, nullptr); // k is split into 8 rows of k/8 values; no imatrix, no user_data
 }
 
 void quantize_row_q4_0_r8(const float * x, void * y, int64_t k) {
     // we assume we are called with 8 rows
-    quantize_q4_0_r8(x, y, 8, k/8, nullptr);
+    quantize_q4_0_r8(x, y, 8, k/8, nullptr, nullptr); // k is split into 8 rows of k/8 values; no imatrix, no user_data
 }
 
 static void repack_q4_0(int nrows, int n_per_row, const block_q4_0 * x, block_iq4_nl_r8 * y, [[maybe_unused]] bool online) {
@@ -5161,13 +5194,15 @@ static void modify_q4_0_r8(int64_t k, char * cy) {
 }
 #endif
 
-size_t quantize_q4_0_r8(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
+size_t quantize_q4_0_r8(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix,
+        const quantize_user_data * user_data) {
     GGML_ASSERT(nrows%8 == 0);
     auto row_size_nl = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
     std::vector<char> qtmp(8*row_size_nl);
-    QHelper helper(imatrix, n_per_row, 32);
-    auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix) {
-        quantize_q4_0(x, (char *)vy, 1, n_per_row, imatrix);
+    QHelper helper(imatrix, user_data, n_per_row, 32);
+    auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix,
+            [[maybe_unused]] const quantize_user_data * user_data) {
+        quantize_q4_0(x, (char *)vy, 1, n_per_row, imatrix, nullptr);
     };
     char * qrow = (char *)dst;
     for (int row = 0; row < nrows; row += 8) {
@@ -5217,12 +5252,12 @@ void vec_dot_q4_0_r8_q8_0(int n, float * s, size_t bs, const void * vx, size_t b
 //
 void quantize_row_q8_0_r8_ref(const float * x, block_q8_0_r8  * y, int64_t k) {
     // we assume we are called with 4 rows
-    quantize_q8_0_r8(x, (void *)y, 8, k/8, nullptr);
+    quantize_q8_0_r8(x, (void *)y, 8, k/8, nullptr, nullptr); // actually 8 rows of k/8 values (the "4 rows" note above is stale); no imatrix, no user_data
 }
 
 void quantize_row_q8_0_r8(const float * x, void * y, int64_t k) {
     // we assume we are called with 4 rows
-    quantize_q8_0_r8(x, y, 8, k/8, nullptr);
+    quantize_q8_0_r8(x, y, 8, k/8, nullptr, nullptr); // actually 8 rows of k/8 values (the "4 rows" note above is stale); no imatrix, no user_data
 }
 
 static void repack_q8_0(int nrows, int n_per_row, const block_q8_0 * x, block_q8_0_r8 * y, [[maybe_unused]] bool online) {
@@ -5267,13 +5302,14 @@ static void modify_q8_0_r8(int64_t k, char * cy) {
 }
 #endif
 
-size_t quantize_q8_0_r8(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
+size_t quantize_q8_0_r8(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix,
+        [[maybe_unused]] const quantize_user_data * user_data) {
     GGML_ASSERT(nrows%8 == 0);
     auto row_size_0 = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
     std::vector<char> qtmp(8*row_size_0);
     char * qrow = (char *)dst;
     for (int row = 0; row < nrows; row += 8) {
-        quantize_q8_0(src, qtmp.data(), 8, n_per_row, imatrix);
+        quantize_q8_0(src, qtmp.data(), 8, n_per_row, imatrix, nullptr);
         repack_q8_0(8, n_per_row, (const block_q8_0 *)qtmp.data(), (block_q8_0_r8 *)qrow, false);
         src += 8*n_per_row;
         qrow += 8*row_size_0;
@@ -5316,12 +5352,12 @@ void vec_dot_q8_0_r8_q8_0(int n, float * s, size_t bs, const void * vx, size_t b
 //
 void quantize_row_q5_0_r4_ref(const float * x, block_q5_0_r4  * y, int64_t k) {
     // we assume we are called with 4 rows
-    quantize_q5_0_r4(x, (void *)y, 4, k/4, nullptr);
+    quantize_q5_0_r4(x, (void *)y, 4, k/4, nullptr, nullptr); // k is split into 4 rows of k/4 values; no imatrix, no user_data
 }
 
 void quantize_row_q5_0_r4(const float * x, void * y, int64_t k) {
     // we assume we are called with 4 rows
-    quantize_q5_0_r4(x, y, 4, k/4, nullptr);
+    quantize_q5_0_r4(x, y, 4, k/4, nullptr, nullptr); // k is split into 4 rows of k/4 values; no imatrix, no user_data
 }
 
 static inline void convert_q5_0(const block_q5_0& x, uint8_t * L) {
@@ -5364,13 +5400,14 @@ static void repack_q5_0(int nrows, int n_per_row, const block_q5_0 * x, block_q5
     }
 }
 
-size_t quantize_q5_0_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
+size_t quantize_q5_0_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix,
+        [[maybe_unused]] const quantize_user_data * user_data) {
     GGML_ASSERT(nrows%4 == 0);
     auto row_size_0 = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
     std::vector<char> qtmp(4*row_size_0);
     char * qrow = (char *)dst;
     for (int row = 0; row < nrows; row += 4) {
-        quantize_q5_0(src, qtmp.data(), 4, n_per_row, imatrix);
+        quantize_q5_0(src, qtmp.data(), 4, n_per_row, imatrix, user_data);
         repack_q5_0(4, n_per_row, (const block_q5_0 *)qtmp.data(), (block_q5_0_r4 *)qrow, false);
         src += 4*n_per_row;
         qrow += 4*row_size_0;
@@ -5417,12 +5454,12 @@ void vec_dot_q5_0_r4_q8_0(int n, float * s, size_t bs, const void * vx, size_t b
 //
 void quantize_row_q6_0_r4_ref(const float * x, block_q6_0_r4  * y, int64_t k) {
     // we assume we are called with 4 rows
-    quantize_q6_0_r4(x, (void *)y, 4, k/4, nullptr);
+    quantize_q6_0_r4(x, (void *)y, 4, k/4, nullptr, nullptr); // k is split into 4 rows of k/4 values; no imatrix, no user_data
 }
 
 void quantize_row_q6_0_r4(const float * x, void * y, int64_t k) {
     // we assume we are called with 4 rows
-    quantize_q6_0_r4(x, y, 4, k/4, nullptr);
+    quantize_q6_0_r4(x, y, 4, k/4, nullptr, nullptr); // k is split into 4 rows of k/4 values; no imatrix, no user_data
 }
 
 static inline void convert_q6_0(const block_q6_0& x, uint8_t * L) {
@@ -5461,14 +5498,16 @@ static void repack_q6_0(int nrows, int n_per_row, const block_q6_0 * x, block_q6
     }
 }
 
-size_t quantize_q6_0_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
+size_t quantize_q6_0_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix,
+        const quantize_user_data * user_data) {
     GGML_ASSERT(nrows%4 == 0);
     auto row_size_0 = ggml_row_size(GGML_TYPE_Q6_0, n_per_row);
     std::vector<char> qtmp(4*row_size_0);
     char * qrow = (char *)dst;
-    QHelper helper(imatrix, n_per_row, 32);
-    auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix) {
-        quantize_q6_0(x, (char *)vy, 1, n_per_row, imatrix);
+    QHelper helper(imatrix, user_data, n_per_row, 32);
+    auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix,
+            [[maybe_unused]] const quantize_user_data * user_data) {
+        quantize_q6_0(x, (char *)vy, 1, n_per_row, imatrix, user_data);
     };
     for (int row = 0; row < nrows; row += 4) {
         helper.quantize(4, src, qtmp.data(), row_size_0, q_func);
@@ -5518,11 +5557,11 @@ void vec_dot_q6_0_r4_q8_0(int n, float * s, size_t bs, const void * vx, size_t b
 //
 
 void quantize_row_iq4_xs_r8_ref(const float * x, block_iq4_xs_r8 * y, int64_t k) {
-    quantize_iq4_xs_r8(x, (void *)y, 8, k/8, nullptr);
+    quantize_iq4_xs_r8(x, (void *)y, 8, k/8, nullptr, nullptr); // k is split into 8 rows of k/8 values; no imatrix, no user_data
 }
 
 void quantize_row_iq4_xs_r8(const float * x, void * y, int64_t k) {
-    quantize_iq4_xs_r8(x, y, 8, k/8, nullptr);
+    quantize_iq4_xs_r8(x, y, 8, k/8, nullptr, nullptr); // k is split into 8 rows of k/8 values; no imatrix, no user_data
 }
 
 static void repack_iq4_xs(int nrows, int n_per_row, const block_iq4_xs * x, block_iq4_xs_r8 * y, [[maybe_unused]] bool online) {
@@ -5557,11 +5596,13 @@ static void repack_iq4_xs(int nrows, int n_per_row, const block_iq4_xs * x, bloc
     }
 }
 
-size_t quantize_iq4_xs_r8(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
-    auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix) {
-        quantize_iq4_xs(x, (char *)vy, 1, n_per_row, imatrix);
+size_t quantize_iq4_xs_r8(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix,
+        [[maybe_unused]] const quantize_user_data * user_data) { // despite the attribute, user_data is forwarded to quantize_repack below
+    auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix, // per-row callback handed to quantize_repack
+            [[maybe_unused]] const quantize_user_data * user_data) { // genuinely unused inside the lambda
+        quantize_iq4_xs(x, (char *)vy, 1, n_per_row, imatrix, nullptr); // NOTE(review): passes nullptr, not the lambda's user_data - confirm this is intended
     };
-    return quantize_repack<32, block_iq4_xs, block_iq4_xs_r8, 8>(GGML_TYPE_IQ4_XS, src, dst, nrows, n_per_row, imatrix,
+    return quantize_repack<32, block_iq4_xs, block_iq4_xs_r8, 8>(GGML_TYPE_IQ4_XS, src, dst, nrows, n_per_row, imatrix, user_data, // rows are quantized with q_func, then repacked with repack_iq4_xs
             q_func, repack_iq4_xs);
 }
 
@@ -5603,11 +5644,11 @@ void vec_dot_iq4_xs_r8_q8_k(int n, float * s, size_t bs, const void * vx, size_t
 //
 
 void quantize_row_iq4_ks_r4_ref(const float * x, block_iq4_ks_r4 * y, int64_t k) {
-    quantize_iq4_ks_r4(x, (void *)y, 4, k/4, nullptr);
+    quantize_iq4_ks_r4(x, (void *)y, 4, k/4, nullptr, nullptr); // k is split into 4 rows of k/4 values; no imatrix, no user_data
 }
 
 void quantize_row_iq4_ks_r4(const float * x, void * y, int64_t k) {
-    quantize_iq4_ks_r4(x, y, 4, k/4, nullptr);
+    quantize_iq4_ks_r4(x, y, 4, k/4, nullptr, nullptr); // k is split into 4 rows of k/4 values; no imatrix, no user_data
 }
 
 static void repack_iq4_ks(int nrows, int n_per_row, const block_iq4_ks * x, block_iq4_ks_r4 * y, [[maybe_unused]] bool online) {
@@ -5644,14 +5685,15 @@ static void repack_iq4_ks(int nrows, int n_per_row, const block_iq4_ks * x, bloc
     }
 }
 
-size_t quantize_iq4_ks_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
+size_t quantize_iq4_ks_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix,
+        [[maybe_unused]] const quantize_user_data * user_data) {
     GGML_ASSERT(nrows%4 == 0);
     GGML_ASSERT(n_per_row%QK_K == 0);
     char * qcur = (char *)dst;
     auto row_size = ggml_row_size(GGML_TYPE_IQ4_KS, n_per_row);
     std::vector<char> qtmp(4*row_size);
     for (int row = 0; row < nrows; row += 4) {
-        quantize_iq4_ks(src, (void *)qtmp.data(), 4, n_per_row, imatrix);
+        quantize_iq4_ks(src, (void *)qtmp.data(), 4, n_per_row, imatrix, user_data);
         repack_iq4_ks(4, n_per_row, (const block_iq4_ks *)qtmp.data(), (block_iq4_ks_r4 *)qcur, false);
         qcur += 4*row_size;
         src += 4*n_per_row;
@@ -5703,11 +5745,11 @@ void vec_dot_iq4_ks_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t
 // ========================================= iq2_bn_r4
 //
 void quantize_row_iq2_bn_r4_ref(const float * x, block_iq2_bn  * y, int64_t k) {
-    quantize_iq2_bn_r4(x, (void *)y, 4, k/4, nullptr);
+    quantize_iq2_bn_r4(x, (void *)y, 4, k/4, nullptr, nullptr); // k is split into 4 rows of k/4 values; no imatrix, no user_data
 }
 
 void quantize_row_iq2_bn_r4(const float * x, void * y, int64_t k) {
-    quantize_iq2_bn_r4(x, y, 4, k/4, nullptr);
+    quantize_iq2_bn_r4(x, y, 4, k/4, nullptr, nullptr); // k is split into 4 rows of k/4 values; no imatrix, no user_data
 }
 
 namespace {
@@ -5772,14 +5814,15 @@ void repack_iq2_bn(int nrows, int n_per_row, const char * x, char * y, [[maybe_u
 }
 }
 
-size_t quantize_iq2_bn_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
+size_t quantize_iq2_bn_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix,
+        [[maybe_unused]] const quantize_user_data * user_data) {
     GGML_ASSERT(nrows%4 == 0);
     GGML_ASSERT(n_per_row%QK_IQ1BN == 0);
     char * qcur = (char *)dst;
     auto row_size = ggml_row_size(GGML_TYPE_IQ2_BN, n_per_row);
     std::vector<char> qtmp(4*row_size);
     for (int row = 0; row < nrows; row += 4) {
-        quantize_iq2_bn(src, (void *)qtmp.data(), 4, n_per_row, imatrix);
+        quantize_iq2_bn(src, (void *)qtmp.data(), 4, n_per_row, imatrix, user_data);
         repack_iq2_bn(4, n_per_row, qtmp.data(), qcur, false);
         qcur += 4*row_size;
         src += 4*n_per_row;
@@ -5826,11 +5869,11 @@ void vec_dot_iq2_bn_r4_q8_K64(int n, float * s, size_t bs, const void * vx, size
 //
 
 void quantize_row_q4_k_r4_ref(const float * x, block_q4_k_r4 * y, int64_t k) {
-    quantize_q4_k_r4(x, (void *)y, 4, k/4, nullptr);
+    quantize_q4_k_r4(x, (void *)y, 4, k/4, nullptr, nullptr); // k is split into 4 rows of k/4 values; no imatrix, no user_data
 }
 
 void quantize_row_q4_k_r4(const float * x, void * y, int64_t k) {
-    quantize_q4_k_r4(x, y, 4, k/4, nullptr);
+    quantize_q4_k_r4(x, y, 4, k/4, nullptr, nullptr); // k is split into 4 rows of k/4 values; no imatrix, no user_data
 }
 
 namespace {
@@ -5887,11 +5930,13 @@ static void repack_q4_k(int nrows, int n_per_row, const block_q4_K * x, block_q4
     }
 }
 
-size_t quantize_q4_k_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
-    auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix) {
-        quantize_q4_K(x, (char *)vy, 1, n_per_row, imatrix);
+size_t quantize_q4_k_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix,
+        [[maybe_unused]] const quantize_user_data * user_data) { // despite the attribute, user_data is forwarded to quantize_repack below
+    auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix, // per-row callback handed to quantize_repack
+            [[maybe_unused]] const quantize_user_data * user_data) { // forwarded to the Q4_K row quantizer on the next line
+        quantize_q4_K(x, (char *)vy, 1, n_per_row, imatrix, user_data); // quantizes a single row in the plain Q4_K layout
     };
-    return quantize_repack<32, block_q4_K, block_q4_k_r4, 4>(GGML_TYPE_Q4_K, src, dst, nrows, n_per_row, imatrix,
+    return quantize_repack<32, block_q4_K, block_q4_k_r4, 4>(GGML_TYPE_Q4_K, src, dst, nrows, n_per_row, imatrix, user_data, // rows are quantized with q_func, then repacked with repack_q4_k
             q_func, repack_q4_k);
 }
 
@@ -5940,11 +5985,11 @@ void vec_dot_q4_k_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t b
 //
 
 void quantize_row_q6_k_r4_ref(const float * x, block_q6_k_r4 * y, int64_t k) {
-    quantize_q6_k_r4(x, (void *)y, 4, k/4, nullptr);
+    quantize_q6_k_r4(x, (void *)y, 4, k/4, nullptr, nullptr); // k is split into 4 rows of k/4 values; no imatrix, no user_data
 }
 
 void quantize_row_q6_k_r4(const float * x, void * y, int64_t k) {
-    quantize_q6_k_r4(x, y, 4, k/4, nullptr);
+    quantize_q6_k_r4(x, y, 4, k/4, nullptr, nullptr); // k is split into 4 rows of k/4 values; no imatrix, no user_data
 }
 
 namespace {
@@ -5996,11 +6041,13 @@ static void repack_q6_k(int nrows, int n_per_row, const block_q6_K * x, block_q6
     }
 }
 
-size_t quantize_q6_k_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
-    auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix) {
-        quantize_q6_K(x, (char *)vy, 1, n_per_row, imatrix);
+size_t quantize_q6_k_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix,
+        [[maybe_unused]] const quantize_user_data * user_data) { // despite the attribute, user_data is forwarded to quantize_repack below
+    auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix, // per-row callback handed to quantize_repack
+            [[maybe_unused]] const quantize_user_data * user_data) { // forwarded to the Q6_K row quantizer on the next line
+        quantize_q6_K(x, (char *)vy, 1, n_per_row, imatrix, user_data); // quantizes a single row in the plain Q6_K layout
     };
-    return quantize_repack<16, block_q6_K, block_q6_k_r4, 4>(GGML_TYPE_Q6_K, src, dst, nrows, n_per_row, imatrix,
+    return quantize_repack<16, block_q6_K, block_q6_k_r4, 4>(GGML_TYPE_Q6_K, src, dst, nrows, n_per_row, imatrix, user_data, // rows are quantized with q_func, then repacked with repack_q6_k
             q_func, repack_q6_k);
 }
 
@@ -6052,11 +6099,11 @@ void vec_dot_q6_k_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t b
 //
 
 void quantize_row_q5_k_r4_ref(const float * x, block_q5_k_r4 * y, int64_t k) {
-    quantize_q5_k_r4(x, (void *)y, 4, k/4, nullptr);
+    quantize_q5_k_r4(x, (void *)y, 4, k/4, nullptr, nullptr); // k is split into 4 rows of k/4 values; no imatrix, no user_data
 }
 
 void quantize_row_q5_k_r4(const float * x, void * y, int64_t k) {
-    quantize_q5_k_r4(x, y, 4, k/4, nullptr);
+    quantize_q5_k_r4(x, y, 4, k/4, nullptr, nullptr); // k is split into 4 rows of k/4 values; no imatrix, no user_data
 }
 
 namespace {
@@ -6107,11 +6154,13 @@ static void repack_q5_k(int nrows, int n_per_row, const block_q5_K * x, block_q5
     }
 }
 
-size_t quantize_q5_k_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
-    auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix) {
-        quantize_q5_K(x, (char *)vy, 1, n_per_row, imatrix);
+size_t quantize_q5_k_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix,
+        [[maybe_unused]] const quantize_user_data * user_data) { // despite the attribute, user_data is forwarded to quantize_repack below
+    auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix, // per-row callback handed to quantize_repack
+            [[maybe_unused]] const quantize_user_data * user_data) { // forwarded to the Q5_K row quantizer on the next line
+        quantize_q5_K(x, (char *)vy, 1, n_per_row, imatrix, user_data); // quantizes a single row in the plain Q5_K layout
     };
-    return quantize_repack<32, block_q5_K, block_q5_k_r4, 4>(GGML_TYPE_Q5_K, src, dst, nrows, n_per_row, imatrix,
+    return quantize_repack<32, block_q5_K, block_q5_k_r4, 4>(GGML_TYPE_Q5_K, src, dst, nrows, n_per_row, imatrix, user_data, // rows are quantized with q_func, then repacked with repack_q5_k
             q_func, repack_q5_k);
 }
 
@@ -6164,11 +6213,11 @@ void vec_dot_q5_k_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t b
 //
 
 void quantize_row_q3_k_r4_ref(const float * x, block_q3_k_r4 * y, int64_t k) {
-    quantize_q3_k_r4(x, (void *)y, 4, k/4, nullptr);
+    quantize_q3_k_r4(x, (void *)y, 4, k/4, nullptr, nullptr);
 }
 
 void quantize_row_q3_k_r4(const float * x, void * y, int64_t k) {
-    quantize_q3_k_r4(x, y, 4, k/4, nullptr);
+    quantize_q3_k_r4(x, y, 4, k/4, nullptr, nullptr);
 }
 
 namespace {
@@ -6236,11 +6285,13 @@ static void repack_q3_k(int nrows, int n_per_row, const block_q3_K * x, block_q3
     }
 }
 
-size_t quantize_q3_k_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
-    auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix) {
-        quantize_q3_K(x, (char *)vy, 1, n_per_row, imatrix);
+size_t quantize_q3_k_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix,
+        [[maybe_unused]] const quantize_user_data * user_data) {
+    auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix,
+            [[maybe_unused]] const quantize_user_data * user_data) {
+        quantize_q3_K(x, (char *)vy, 1, n_per_row, imatrix, user_data);
     };
-    return quantize_repack<16, block_q3_K, block_q3_k_r4, 4>(GGML_TYPE_Q3_K, src, dst, nrows, n_per_row, imatrix,
+    return quantize_repack<16, block_q3_K, block_q3_k_r4, 4>(GGML_TYPE_Q3_K, src, dst, nrows, n_per_row, imatrix, user_data,
             q_func, repack_q3_k);
 }
 
@@ -6293,11 +6344,11 @@ void vec_dot_q3_k_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t b
 //
 
 void quantize_row_q2_k_r4_ref(const float * x, block_q2_k_r4 * y, int64_t k) {
-    quantize_q3_k_r4(x, (void *)y, 4, k/4, nullptr);
+    quantize_q2_k_r4(x, (void *)y, 4, k/4, nullptr, nullptr); // must use the Q2 (not Q3) quantizer: y is block_q2_k_r4, same call as quantize_row_q2_k_r4
 }
 
 void quantize_row_q2_k_r4(const float * x, void * y, int64_t k) {
-    quantize_q2_k_r4(x, y, 4, k/4, nullptr);
+    quantize_q2_k_r4(x, y, 4, k/4, nullptr, nullptr);
 }
 
 namespace {
@@ -6345,11 +6396,13 @@ static void repack_q2_k(int nrows, int n_per_row, const block_q2_K * x, block_q2
     }
 }
 
-size_t quantize_q2_k_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
-    auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix) {
-        quantize_q2_K(x, (char *)vy, 1, n_per_row, imatrix);
+size_t quantize_q2_k_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix,
+        [[maybe_unused]] const quantize_user_data * user_data) {
+    auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix,
+            [[maybe_unused]] const quantize_user_data * user_data) {
+        quantize_q2_K(x, (char *)vy, 1, n_per_row, imatrix, user_data);
     };
-    return quantize_repack<16, block_q2_K, block_q2_k_r4, 4>(GGML_TYPE_Q2_K, src, dst, nrows, n_per_row, imatrix,
+    return quantize_repack<16, block_q2_K, block_q2_k_r4, 4>(GGML_TYPE_Q2_K, src, dst, nrows, n_per_row, imatrix, user_data,
             q_func, repack_q2_k);
 }
 
@@ -6401,11 +6454,11 @@ void vec_dot_q2_k_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t b
 //
 
 void quantize_row_iq4_k_r4_ref(const float * x, block_iq4_k_r4 * y, int64_t k) {
-    quantize_iq4_k_r4(x, (void *)y, 4, k/4, nullptr);
+    quantize_iq4_k_r4(x, (void *)y, 4, k/4, nullptr, nullptr);
 }
 
 void quantize_row_iq4_k_r4(const float * x, void * y, int64_t k) {
-    quantize_iq4_k_r4(x, y, 4, k/4, nullptr);
+    quantize_iq4_k_r4(x, y, 4, k/4, nullptr, nullptr);
 }
 
 static void repack_iq4_k(int nrows, int n_per_row, const block_iq4_k * x, block_iq4_k_r4 * y, [[maybe_unused]] bool online) {
@@ -6453,14 +6506,15 @@ static void repack_iq4_k(int nrows, int n_per_row, const block_iq4_k * x, block_
     }
 }
 
-size_t quantize_iq4_k_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
+size_t quantize_iq4_k_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix,
+        [[maybe_unused]] const quantize_user_data * user_data) {
     GGML_ASSERT(nrows%4 == 0);
     GGML_ASSERT(n_per_row%QK_K == 0);
     char * qcur = (char *)dst;
     auto row_size = ggml_row_size(GGML_TYPE_IQ4_K, n_per_row);
     std::vector<char> qtmp(4*row_size);
     for (int row = 0; row < nrows; row += 4) {
-        quantize_iq4_k(src, (void *)qtmp.data(), 4, n_per_row, imatrix);
+        quantize_iq4_k(src, (void *)qtmp.data(), 4, n_per_row, imatrix, user_data);
         repack_iq4_k(4, n_per_row, (const block_iq4_k *)qtmp.data(), (block_iq4_k_r4 *)qcur, false);
         qcur += 4*row_size;
         src += 4*n_per_row;
@@ -6515,11 +6569,11 @@ void vec_dot_iq4_k_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t
 //
 
 void quantize_row_iq5_k_r4_ref(const float * x, block_iq5_k_r4 * y, int64_t k) {
-    quantize_iq5_k_r4(x, (void *)y, 4, k/4, nullptr);
+    quantize_iq5_k_r4(x, (void *)y, 4, k/4, nullptr, nullptr);
 }
 
 void quantize_row_iq5_k_r4(const float * x, void * y, int64_t k) {
-    quantize_iq5_k_r4(x, y, 4, k/4, nullptr);
+    quantize_iq5_k_r4(x, y, 4, k/4, nullptr, nullptr);
 }
 
 namespace {
@@ -6590,14 +6644,15 @@ static void repack_iq5_k(int nrows, int n_per_row, const block_iq5_k * x, block_
     }
 }
 
-size_t quantize_iq5_k_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
+size_t quantize_iq5_k_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix,
+        [[maybe_unused]] const quantize_user_data * user_data) {
     GGML_ASSERT(nrows%4 == 0);
     GGML_ASSERT(n_per_row%QK_K == 0);
     char * qcur = (char *)dst;
     auto row_size = ggml_row_size(GGML_TYPE_IQ5_K, n_per_row);
     std::vector<char> qtmp(4*row_size);
     for (int row = 0; row < nrows; row += 4) {
-        quantize_iq5_k(src, (void *)qtmp.data(), 4, n_per_row, imatrix);
+        quantize_iq5_k(src, (void *)qtmp.data(), 4, n_per_row, imatrix, user_data);
         repack_iq5_k(4, n_per_row, (const block_iq5_k *)qtmp.data(), (block_iq5_k_r4 *)qcur, false);
         qcur += 4*row_size;
         src += 4*n_per_row;
@@ -6652,11 +6707,11 @@ void vec_dot_iq5_k_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t
 //
 
 void quantize_row_iq5_ks_r4_ref(const float * x, block_iq5_ks_r4 * y, int64_t k) {
-    quantize_iq5_ks_r4(x, (void *)y, 4, k/4, nullptr);
+    quantize_iq5_ks_r4(x, (void *)y, 4, k/4, nullptr, nullptr);
 }
 
 void quantize_row_iq5_ks_r4(const float * x, void * y, int64_t k) {
-    quantize_iq5_ks_r4(x, y, 4, k/4, nullptr);
+    quantize_iq5_ks_r4(x, y, 4, k/4, nullptr, nullptr);
 }
 
 static void repack_iq5_ks(int nrows, int n_per_row, const block_iq5_ks * x, block_iq5_ks_r4 * y, [[maybe_unused]] bool online) {
@@ -6697,14 +6752,15 @@ static void repack_iq5_ks(int nrows, int n_per_row, const block_iq5_ks * x, bloc
     }
 }
 
-size_t quantize_iq5_ks_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
+size_t quantize_iq5_ks_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix,
+        [[maybe_unused]] const quantize_user_data * user_data) {
     GGML_ASSERT(nrows%4 == 0);
     GGML_ASSERT(n_per_row%QK_K == 0);
     char * qcur = (char *)dst;
     auto row_size = ggml_row_size(GGML_TYPE_IQ5_KS, n_per_row);
     std::vector<char> qtmp(4*row_size);
     for (int row = 0; row < nrows; row += 4) {
-        quantize_iq5_ks(src, (void *)qtmp.data(), 4, n_per_row, imatrix);
+        quantize_iq5_ks(src, (void *)qtmp.data(), 4, n_per_row, imatrix, user_data);
         repack_iq5_ks(4, n_per_row, (const block_iq5_ks *)qtmp.data(), (block_iq5_ks_r4 *)qcur, false);
         qcur += 4*row_size;
         src += 4*n_per_row;
@@ -6772,11 +6828,11 @@ void vec_dot_iq5_ks_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t
 //
 
 void quantize_row_q8_k_r8_ref(const float * x, block_q8_k_r8 * y, int64_t k) {
-    quantize_q8_k_r8(x, (void *)y, 8, k/8, nullptr);
+    quantize_q8_k_r8(x, (void *)y, 8, k/8, nullptr, nullptr);
 }
 
 void quantize_row_q8_k_r8(const float * x, void * y, int64_t k) {
-    quantize_q8_k_r8(x, y, 8, k/8, nullptr);
+    quantize_q8_k_r8(x, y, 8, k/8, nullptr, nullptr);
 }
 
 static void repack_q8_k(int nrows, int n_per_row, const block_q8_K * x, block_q8_k_r8 * y, [[maybe_unused]] bool online) {
@@ -6819,7 +6875,8 @@ static void modify_q8_k_r8(int64_t k, char * cy) {
 }
 #endif
 
-size_t quantize_q8_k_r8(const float * src, void * dst, int64_t nrows, int64_t n_per_row, [[maybe_unused]] const float * imatrix) {
+size_t quantize_q8_k_r8(const float * src, void * dst, int64_t nrows, int64_t n_per_row, [[maybe_unused]] const float * imatrix,
+        [[maybe_unused]] const quantize_user_data * user_data) {
     GGML_ASSERT(nrows%8 == 0);
     GGML_ASSERT(n_per_row%QK_K == 0);
     char * qcur = (char *)dst;
@@ -6870,11 +6927,11 @@ void vec_dot_q8_k_r8_q8_k(int n, float * s, size_t bs, const void * vx, size_t b
 //
 
 void quantize_row_q8_k_r16_ref(const float * x, block_q8_k_r16 * y, int64_t k) {
-    quantize_q8_k_r16(x, (void *)y, 16, k/16, nullptr);
+    quantize_q8_k_r16(x, (void *)y, 16, k/16, nullptr, nullptr);
 }
 
 void quantize_row_q8_k_r16(const float * x, void * y, int64_t k) {
-    quantize_q8_k_r16(x, y, 16, k/16, nullptr);
+    quantize_q8_k_r16(x, y, 16, k/16, nullptr, nullptr);
 }
 
 static void repack_q16_k(int nrows, int n_per_row, const block_q8_K * x, block_q8_k_r16 * y, [[maybe_unused]] bool online) {
@@ -6903,7 +6960,8 @@ static void repack_q16_k(int nrows, int n_per_row, const block_q8_K * x, block_q
     }
 }
 
-size_t quantize_q8_k_r16(const float * src, void * dst, int64_t nrows, int64_t n_per_row, [[maybe_unused]] const float * imatrix) {
+size_t quantize_q8_k_r16(const float * src, void * dst, int64_t nrows, int64_t n_per_row, [[maybe_unused]] const float * imatrix,
+        [[maybe_unused]] const quantize_user_data * user_data) {
     GGML_ASSERT(nrows%16 == 0);
     GGML_ASSERT(n_per_row%QK_K == 0);
     char * qcur = (char *)dst;
@@ -6956,11 +7014,11 @@ void vec_dot_q8_k_r16_q8_k(int n, float * s, size_t bs, const void * vx, size_t
 //
 
 void quantize_row_q8_KV_r8_ref(const float * x, void * y, int64_t k) {
-    quantize_q8_KV_r8(x, y, 8, k/8, nullptr);
+    quantize_q8_KV_r8(x, y, 8, k/8, nullptr, nullptr);
 }
 
 void quantize_row_q8_KV_r8(const float * x, void * y, int64_t k) {
-    quantize_q8_KV_r8(x, y, 8, k/8, nullptr);
+    quantize_q8_KV_r8(x, y, 8, k/8, nullptr, nullptr);
 }
 
 static void repack_q8_KV(int nrows, int n_per_row, const char * cx, char * cy, [[maybe_unused]] bool online) {
@@ -7051,7 +7109,8 @@ static void modify_q8_KV_r8(int64_t k, char * cy) {
 }
 #endif
 
-size_t quantize_q8_KV_r8(const float * src, void * dst, int64_t nrows, int64_t n_per_row, [[maybe_unused]] const float * imatrix) {
+size_t quantize_q8_KV_r8(const float * src, void * dst, int64_t nrows, int64_t n_per_row, [[maybe_unused]] const float * imatrix,
+        [[maybe_unused]] const quantize_user_data * user_data) {
     GGML_ASSERT(nrows%8 == 0);
     GGML_ASSERT(n_per_row%16 == 0);
     char * qcur = (char *)dst;
@@ -7059,7 +7118,7 @@ size_t quantize_q8_KV_r8(const float * src, void * dst, int64_t nrows, int64_t n
     auto row_size_1 = ggml_row_size(GGML_TYPE_Q8_KV_R8, n_per_row);
     std::vector<char> qtmp(8*row_size_0);
     for (int row = 0; row < nrows; row += 8) {
-        quantize_q8_KV(src, (void *)qtmp.data(), 8, n_per_row, imatrix);
+        quantize_q8_KV(src, (void *)qtmp.data(), 8, n_per_row, imatrix, user_data);
         repack_q8_KV(8, n_per_row, qtmp.data(), qcur, false);
         qcur += 8*row_size_1;
         src += 8*n_per_row;
@@ -7137,11 +7196,11 @@ void repack_bf16_bf16_r16(const void * GGML_RESTRICT src, void * GGML_RESTRICT d
 //
 
 void quantize_row_iq3_k_r4_ref(const float * x, block_iq3_k_r4 * y, int64_t k) {
-    quantize_iq3_k_r4(x, (void *)y, 4, k/4, nullptr);
+    quantize_iq3_k_r4(x, (void *)y, 4, k/4, nullptr, nullptr);
 }
 
 void quantize_row_iq3_k_r4(const float * x, void * y, int64_t k) {
-    quantize_iq3_k_r4(x, y, 4, k/4, nullptr);
+    quantize_iq3_k_r4(x, y, 4, k/4, nullptr, nullptr);
 }
 
 namespace {
@@ -7207,14 +7266,15 @@ static void repack_iq3_k(int nrows, int n_per_row, const block_iq3_k * x, block_
     }
 }
 
-size_t quantize_iq3_k_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
+size_t quantize_iq3_k_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix,
+        [[maybe_unused]] const quantize_user_data * user_data) {
     GGML_ASSERT(nrows%4 == 0);
     GGML_ASSERT(n_per_row%QK_K == 0);
     char * qcur = (char *)dst;
     auto row_size = ggml_row_size(GGML_TYPE_IQ3_K, n_per_row);
     std::vector<char> qtmp(4*row_size);
     for (int row = 0; row < nrows; row += 4) {
-        quantize_iq3_k(src, (void *)qtmp.data(), 4, n_per_row, imatrix);
+        quantize_iq3_k(src, (void *)qtmp.data(), 4, n_per_row, imatrix, user_data);
         repack_iq3_k(4, n_per_row, (const block_iq3_k *)qtmp.data(), (block_iq3_k_r4 *)qcur, false);
         qcur += 4*row_size;
         src += 4*n_per_row;
@@ -7273,11 +7333,11 @@ void vec_dot_iq3_k_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t
 //
 
 void quantize_row_iq2_k_r4_ref(const float * x, block_iq2_k_r4 * y, int64_t k) {
-    quantize_iq2_k_r4(x, (void *)y, 4, k/4, nullptr);
+    quantize_iq2_k_r4(x, (void *)y, 4, k/4, nullptr, nullptr);
 }
 
 void quantize_row_iq2_k_r4(const float * x, void * y, int64_t k) {
-    quantize_iq2_k_r4(x, y, 4, k/4, nullptr);
+    quantize_iq2_k_r4(x, y, 4, k/4, nullptr, nullptr);
 }
 
 namespace {
@@ -7332,14 +7392,15 @@ static void repack_iq2_k(int nrows, int n_per_row, const block_iq2_k * x, block_
     }
 }
 
-size_t quantize_iq2_k_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
+size_t quantize_iq2_k_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix,
+        [[maybe_unused]] const quantize_user_data * user_data) {
     GGML_ASSERT(nrows%4 == 0);
     GGML_ASSERT(n_per_row%QK_K == 0);
     char * qcur = (char *)dst;
     auto row_size = ggml_row_size(GGML_TYPE_IQ2_K, n_per_row);
     std::vector<char> qtmp(4*row_size);
     for (int row = 0; row < nrows; row += 4) {
-        quantize_iq2_k(src, (void *)qtmp.data(), 4, n_per_row, imatrix);
+        quantize_iq2_k(src, (void *)qtmp.data(), 4, n_per_row, imatrix, user_data);
         repack_iq2_k(4, n_per_row, (const block_iq2_k *)qtmp.data(), (block_iq2_k_r4 *)qcur, false);
         qcur += 4*row_size;
         src += 4*n_per_row;
@@ -7412,11 +7473,11 @@ inline uint8_t scrambled_sign(uint8_t s) {
 //
 
 void quantize_row_iq2_xxs_r4_ref(const float * x, block_iq2_xxs_r4 * y, int64_t k) {
-    quantize_iq2_xxs_r4(x, (void *)y, 4, k/4, nullptr);
+    quantize_iq2_xxs_r4(x, (void *)y, 4, k/4, nullptr, nullptr);
 }
 
 void quantize_row_iq2_xxs_r4(const float * x, void * y, int64_t k) {
-    quantize_iq2_xxs_r4(x, y, 4, k/4, nullptr);
+    quantize_iq2_xxs_r4(x, y, 4, k/4, nullptr, nullptr);
 }
 
 static void repack_iq2_xxs(int nrows, int n_per_row, const block_iq2_xxs * x, block_iq2_xxs_r4 * y, [[maybe_unused]] bool online) {
@@ -7452,11 +7513,13 @@ static void repack_iq2_xxs(int nrows, int n_per_row, const block_iq2_xxs * x, bl
     }
 }
 
-size_t quantize_iq2_xxs_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
-    auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix) {
-        quantize_iq2_xxs(x, (char *)vy, 1, n_per_row, imatrix);
+size_t quantize_iq2_xxs_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix,
+        [[maybe_unused]] const quantize_user_data * user_data) {
+    auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix,
+            [[maybe_unused]] const quantize_user_data * user_data) {
+        quantize_iq2_xxs(x, (char *)vy, 1, n_per_row, imatrix, user_data);
     };
-    return quantize_repack<32, block_iq2_xxs, block_iq2_xxs_r4, 4>(GGML_TYPE_IQ2_XXS, src, dst, nrows, n_per_row, imatrix,
+    return quantize_repack<32, block_iq2_xxs, block_iq2_xxs_r4, 4>(GGML_TYPE_IQ2_XXS, src, dst, nrows, n_per_row, imatrix, user_data,
             q_func, repack_iq2_xxs);
 }
 
@@ -7505,11 +7568,11 @@ void vec_dot_iq2_xxs_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_
 //
 
 void quantize_row_iq2_xs_r4_ref(const float * x, block_iq2_xs_r4 * y, int64_t k) {
-    quantize_iq2_xs_r4(x, (void *)y, 4, k/4, nullptr);
+    quantize_iq2_xs_r4(x, (void *)y, 4, k/4, nullptr, nullptr);
 }
 
 void quantize_row_iq2_xs_r4(const float * x, void * y, int64_t k) {
-    quantize_iq2_xs_r4(x, y, 4, k/4, nullptr);
+    quantize_iq2_xs_r4(x, y, 4, k/4, nullptr, nullptr);
 }
 
 static void repack_iq2_xs(int nrows, int n_per_row, const block_iq2_xs * x, block_iq2_xs_r4 * y, [[maybe_unused]] bool online) {
@@ -7537,11 +7600,13 @@ static void repack_iq2_xs(int nrows, int n_per_row, const block_iq2_xs * x, bloc
     }
 }
 
-size_t quantize_iq2_xs_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
-    auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix) {
-        quantize_iq2_xs(x, (char *)vy, 1, n_per_row, imatrix);
+size_t quantize_iq2_xs_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix,
+        [[maybe_unused]] const quantize_user_data * user_data) {
+    auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix,
+            [[maybe_unused]] const quantize_user_data * user_data) {
+        quantize_iq2_xs(x, (char *)vy, 1, n_per_row, imatrix, user_data);
     };
-    return quantize_repack<16, block_iq2_xs, block_iq2_xs_r4, 4>(GGML_TYPE_IQ2_XS, src, dst, nrows, n_per_row, imatrix,
+    return quantize_repack<16, block_iq2_xs, block_iq2_xs_r4, 4>(GGML_TYPE_IQ2_XS, src, dst, nrows, n_per_row, imatrix, user_data,
             q_func, repack_iq2_xs);
 }
 
@@ -7585,11 +7650,11 @@ void vec_dot_iq2_xs_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t
 //
 
 void quantize_row_iq2_s_r4_ref(const float * x, block_iq2_s_r4 * y, int64_t k) {
-    quantize_iq2_s_r4(x, (void *)y, 4, k/4, nullptr);
+    quantize_iq2_s_r4(x, (void *)y, 4, k/4, nullptr, nullptr);
 }
 
 void quantize_row_iq2_s_r4(const float * x, void * y, int64_t k) {
-    quantize_iq2_s_r4(x, y, 4, k/4, nullptr);
+    quantize_iq2_s_r4(x, y, 4, k/4, nullptr, nullptr);
 }
 
 static void repack_iq2_s(int nrows, int n_per_row, const block_iq2_s * x, block_iq2_s_r4 * y, [[maybe_unused]] bool online) {
@@ -7618,11 +7683,13 @@ static void repack_iq2_s(int nrows, int n_per_row, const block_iq2_s * x, block_
     }
 }
 
-size_t quantize_iq2_s_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
-    auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix) {
-        quantize_iq2_s(x, (char *)vy, 1, n_per_row, imatrix);
+size_t quantize_iq2_s_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix,
+        [[maybe_unused]] const quantize_user_data * user_data) {
+    auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix,
+            [[maybe_unused]] const quantize_user_data * user_data) {
+        quantize_iq2_s(x, (char *)vy, 1, n_per_row, imatrix, user_data);
     };
-    return quantize_repack<16, block_iq2_s, block_iq2_s_r4, 4>(GGML_TYPE_IQ2_S, src, dst, nrows, n_per_row, imatrix,
+    return quantize_repack<16, block_iq2_s, block_iq2_s_r4, 4>(GGML_TYPE_IQ2_S, src, dst, nrows, n_per_row, imatrix, user_data,
             q_func, repack_iq2_s);
 }
 
@@ -7665,11 +7732,11 @@ void vec_dot_iq2_s_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t
 //
 
 void quantize_row_iq3_xxs_r4_ref(const float * x, block_iq3_xxs_r4 * y, int64_t k) {
-    quantize_iq3_xxs_r4(x, (void *)y, 4, k/4, nullptr);
+    quantize_iq3_xxs_r4(x, (void *)y, 4, k/4, nullptr, nullptr);
 }
 
 void quantize_row_iq3_xxs_r4(const float * x, void * y, int64_t k) {
-    quantize_iq3_xxs_r4(x, y, 4, k/4, nullptr);
+    quantize_iq3_xxs_r4(x, y, 4, k/4, nullptr, nullptr);
 }
 
 namespace {
@@ -7708,11 +7775,13 @@ static void repack_iq3_xxs(int nrows, int n_per_row, const block_iq3_xxs * x, bl
     }
 }
 
-size_t quantize_iq3_xxs_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
-    auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix) {
-        quantize_iq3_xxs(x, (char *)vy, 1, n_per_row, imatrix);
+size_t quantize_iq3_xxs_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix,
+        [[maybe_unused]] const quantize_user_data * user_data) {
+    auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix,
+            [[maybe_unused]] const quantize_user_data * user_data) {
+        quantize_iq3_xxs(x, (char *)vy, 1, n_per_row, imatrix, user_data);
     };
-    return quantize_repack<32, block_iq3_xxs, block_iq3_xxs_r4, 4>(GGML_TYPE_IQ3_XXS, src, dst, nrows, n_per_row, imatrix,
+    return quantize_repack<32, block_iq3_xxs, block_iq3_xxs_r4, 4>(GGML_TYPE_IQ3_XXS, src, dst, nrows, n_per_row, imatrix, user_data,
             q_func, repack_iq3_xxs);
 }
 
@@ -7761,11 +7830,11 @@ void vec_dot_iq3_xxs_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_
 //
 
 void quantize_row_iq3_s_r4_ref(const float * x, block_iq3_s_r4 * y, int64_t k) {
-    quantize_iq3_s_r4(x, (void *)y, 4, k/4, nullptr);
+    quantize_iq3_s_r4(x, (void *)y, 4, k/4, nullptr, nullptr);
 }
 
 void quantize_row_iq3_s_r4(const float * x, void * y, int64_t k) {
-    quantize_iq3_s_r4(x, y, 4, k/4, nullptr);
+    quantize_iq3_s_r4(x, y, 4, k/4, nullptr, nullptr);
 }
 
 static void repack_iq3_s(int nrows, int n_per_row, const block_iq3_s * x, block_iq3_s_r4 * y, [[maybe_unused]] bool online) {
@@ -7806,11 +7875,13 @@ static void repack_iq3_s(int nrows, int n_per_row, const block_iq3_s * x, block_
     }
 }
 
-size_t quantize_iq3_s_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
-    auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix) {
-        quantize_iq3_s(x, (char *)vy, 1, n_per_row, imatrix);
+size_t quantize_iq3_s_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix,
+        [[maybe_unused]] const quantize_user_data * user_data) {
+    auto q_func = [] (const float * x, void * vy, int n_per_row, const float * imatrix,
+            [[maybe_unused]] const quantize_user_data * user_data) {
+        quantize_iq3_s(x, (char *)vy, 1, n_per_row, imatrix, user_data);
     };
-    return quantize_repack<16, block_iq3_s, block_iq3_s_r4, 4>(GGML_TYPE_IQ3_S, src, dst, nrows, n_per_row, imatrix,
+    return quantize_repack<16, block_iq3_s, block_iq3_s_r4, 4>(GGML_TYPE_IQ3_S, src, dst, nrows, n_per_row, imatrix, user_data,
             q_func, repack_iq3_s);
 }
 
@@ -7851,14 +7922,15 @@ void vec_dot_iq3_s_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t
 }
 
 void quantize_row_iq1_s_r4_ref(const float * x, block_iq1_s_r4  * y, int64_t k) {
-    quantize_iq1_s_r4(x, y, 4, k/4, nullptr);
+    quantize_iq1_s_r4(x, y, 4, k/4, nullptr, nullptr);
 }
 
 void quantize_row_iq1_s_r4(const float * x, void * y, int64_t k) {
-    quantize_iq1_s_r4(x, y, 4, k/4, nullptr);
+    quantize_iq1_s_r4(x, y, 4, k/4, nullptr, nullptr);
 }
 
-size_t quantize_iq1_s_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
+size_t quantize_iq1_s_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix,
+        [[maybe_unused]] const quantize_user_data * user_data) {
     constexpr int kBlockSize = 32;
     GGML_ASSERT(nrows%4 == 0);
     GGML_ASSERT(n_per_row%kBlockSize == 0);
@@ -7982,14 +8054,15 @@ void vec_dot_iq1_s_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t
 }
 
 void quantize_row_iq1_m_r4_ref(const float * x, block_iq1_m_r4  * y, int64_t k) {
-    quantize_iq1_m_r4(x, y, 4, k/4, nullptr);
+    quantize_iq1_m_r4(x, y, 4, k/4, nullptr, nullptr);
 }
 
 void quantize_row_iq1_m_r4(const float * x, void * y, int64_t k) {
-    quantize_iq1_m_r4(x, y, 4, k/4, nullptr);
+    quantize_iq1_m_r4(x, y, 4, k/4, nullptr, nullptr);
 }
 
-size_t quantize_iq1_m_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
+size_t quantize_iq1_m_r4(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix,
+        [[maybe_unused]] const quantize_user_data * user_data) {
     constexpr int kBlockSize = 32;
     GGML_ASSERT(nrows%4 == 0);
     GGML_ASSERT(n_per_row%kBlockSize == 0);
@@ -8135,7 +8208,8 @@ void quantize_row_q8_KV_ref(const float * x, void * y, int64_t k) {
     quantize_row_q8_KV(x, y, k);
 }
 
-size_t quantize_q8_KV(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
+size_t quantize_q8_KV(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix,
+        [[maybe_unused]] const quantize_user_data * user_data) {
     (void)imatrix;
     auto row_size = ggml_row_size(GGML_TYPE_Q8_KV, n_per_row);
     auto q = (char *)dst;
@@ -9195,7 +9269,7 @@ void quantize_row_iq1_kt_impl(const float * x, void * vy, int n_per_row, const f
 
 void quantize_row_iq1_kt_ref(const float * GGML_RESTRICT x, block_iq1_kt * GGML_RESTRICT y, int64_t k) {
     assert(k % QK_K == 0);
-    quantize_iq1_kt(x, (void *)y, 1, k, nullptr);
+    quantize_iq1_kt(x, (void *)y, 1, k, nullptr, nullptr);
 }
 
 void quantize_row_iq1_kt(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
@@ -9204,7 +9278,8 @@ void quantize_row_iq1_kt(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy,
     quantize_row_iq1_kt_ref(x, y, k);
 }
 
-size_t quantize_iq1_kt(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
+size_t quantize_iq1_kt(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix,
+        [[maybe_unused]] const quantize_user_data * user_data) {
     GGML_ASSERT(n_per_row%QK_K == 0);
     auto row_size = ggml_row_size(GGML_TYPE_IQ1_KT, n_per_row);
     std::vector<float> scales(n_per_row/QuantizerIQ1KT::kBlockSize);
@@ -9475,7 +9550,7 @@ void quantize_row_iq2_kt_impl(const float * x, void * vy, int n_per_row, const f
 
 void quantize_row_iq2_kt_ref(const float * GGML_RESTRICT x, block_iq2_kt * GGML_RESTRICT y, int64_t k) {
     assert(k % QK_K == 0);
-    quantize_iq2_kt(x, (void *)y, 1, k, nullptr);
+    quantize_iq2_kt(x, (void *)y, 1, k, nullptr, nullptr);
 }
 
 void quantize_row_iq2_kt(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
@@ -9484,7 +9559,8 @@ void quantize_row_iq2_kt(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy,
     quantize_row_iq2_kt_ref(x, y, k);
 }
 
-size_t quantize_iq2_kt(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
+size_t quantize_iq2_kt(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix,
+        [[maybe_unused]] const quantize_user_data * user_data) {
     GGML_ASSERT(n_per_row%QK_K == 0);
     auto row_size = ggml_row_size(GGML_TYPE_IQ2_KT, n_per_row);
     std::vector<float> scales(n_per_row/QuantizerIQ2KT::kBlockSize);
@@ -9745,7 +9821,7 @@ void quantize_row_iq3_kt_impl(const float * x, void * vy, int n_per_row, const f
 
 void quantize_row_iq3_kt_ref(const float * x, block_iq3_kt * y, int64_t k) {
     assert(k % QK_K == 0);
-    quantize_iq3_kt(x, (void *)y, 1, k, nullptr);
+    quantize_iq3_kt(x, (void *)y, 1, k, nullptr, nullptr);
 }
 
 void quantize_row_iq3_kt(const float * x, void * vy, int64_t k) {
@@ -9754,7 +9830,8 @@ void quantize_row_iq3_kt(const float * x, void * vy, int64_t k) {
     quantize_row_iq3_kt_ref(x, y, k);
 }
 
-size_t quantize_iq3_kt(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
+size_t quantize_iq3_kt(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix,
+        [[maybe_unused]] const quantize_user_data * user_data) {
     GGML_ASSERT(n_per_row%QK_K == 0);
     auto row_size = ggml_row_size(GGML_TYPE_IQ3_KT, n_per_row);
     std::vector<float> scales(n_per_row/QuantizerIQ3KT::kBlockSize);
@@ -10009,7 +10086,7 @@ void quantize_row_iq4_kt_impl(const float * x, void * vy, int n_per_row, const f
 
 void quantize_row_iq4_kt_ref(const float * GGML_RESTRICT x, block_iq4_kt * GGML_RESTRICT y, int64_t k) {
     assert(k % QK_K == 0);
-    quantize_iq4_kt(x, (void *)y, 1, k, nullptr);
+    quantize_iq4_kt(x, (void *)y, 1, k, nullptr, nullptr);
 }
 
 void quantize_row_iq4_kt(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
@@ -10018,7 +10095,8 @@ void quantize_row_iq4_kt(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy,
     quantize_row_iq4_kt_ref(x, y, k);
 }
 
-size_t quantize_iq4_kt(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix) {
+size_t quantize_iq4_kt(const float * src, void * dst, int64_t nrows, int64_t n_per_row, const float * imatrix,
+        [[maybe_unused]] const quantize_user_data * user_data) {
     GGML_ASSERT(n_per_row%QK_K == 0);
     auto row_size = ggml_row_size(GGML_TYPE_IQ4_KT, n_per_row);
     std::vector<float> scales(n_per_row/QuantizerIQ4KT::kBlockSize);
@@ -10101,7 +10179,8 @@ void quantize_row_q1_0_g128(const float * x, void * vy, int64_t k) {
     }
 }
 
-size_t quantize_q1_0_g128(const float * src, void * dst, int64_t nrows, int64_t n_per_row, [[maybe_unused]] const float * imatrix) {
+size_t quantize_q1_0_g128(const float * src, void * dst, int64_t nrows, int64_t n_per_row, [[maybe_unused]] const float * imatrix,
+        [[maybe_unused]] const quantize_user_data * user_data) {
     GGML_ASSERT(n_per_row % QK1_0_G128 == 0);
     int64_t ntot = nrows * n_per_row;
     quantize_row_q1_0_g128(src, dst, ntot);
diff --git a/ggml/src/iqk/iqk_quantize.h b/ggml/src/iqk/iqk_quantize.h
index d85c7599..b5fb8a31 100644
--- a/ggml/src/iqk/iqk_quantize.h
+++ b/ggml/src/iqk/iqk_quantize.h
@@ -19,285 +19,287 @@ extern "C" {
 #define GGML_RESTRICT restrict
 #endif
 
+struct quantize_user_data;
+
 void   quantize_row_iq2_k_ref(const float * GGML_RESTRICT x, block_iq2_k  * GGML_RESTRICT y, int64_t k);
 void   quantize_row_iq2_k(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-size_t quantize_iq2_k(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq2_k(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   dequantize_row_iq2_k(const block_iq2_k  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_iq2_k_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_iq3_k_ref(const float * GGML_RESTRICT x, block_iq3_k  * GGML_RESTRICT y, int64_t k);
 void   quantize_row_iq3_k(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-size_t quantize_iq3_k(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq3_k(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   dequantize_row_iq3_k(const block_iq3_k  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_iq3_k_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_iq3_ks_ref(const float * GGML_RESTRICT x, block_iq3_ks  * GGML_RESTRICT y, int64_t k);
 void   quantize_row_iq3_ks(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-size_t quantize_iq3_ks(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq3_ks(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   dequantize_row_iq3_ks(const block_iq3_ks  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_iq3_ks_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_iq4_k_ref(const float * GGML_RESTRICT x, block_iq4_k  * GGML_RESTRICT y, int64_t k);
 void   quantize_row_iq4_k(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-size_t quantize_iq4_k(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq4_k(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   dequantize_row_iq4_k(const block_iq4_k  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_iq4_k_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_iq5_k_ref(const float * GGML_RESTRICT x, block_iq5_k  * GGML_RESTRICT y, int64_t k);
 void   quantize_row_iq5_k(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-size_t quantize_iq5_k(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq5_k(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   dequantize_row_iq5_k(const block_iq5_k  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_iq5_k_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_iq6_k_ref(const float * GGML_RESTRICT x, block_iq6_k  * GGML_RESTRICT y, int64_t k);
 void   quantize_row_iq6_k(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-size_t quantize_iq6_k(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq6_k(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   dequantize_row_iq6_k(const block_iq6_k  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_iq6_k_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_iq4_ks_ref(const float * GGML_RESTRICT x, block_iq4_ks  * GGML_RESTRICT y, int64_t k);
 void   quantize_row_iq4_ks(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-size_t quantize_iq4_ks(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq4_ks(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   dequantize_row_iq4_ks(const block_iq4_ks  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_iq4_ks_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_iq4_kss_ref(const float * GGML_RESTRICT x, block_iq4_kss  * GGML_RESTRICT y, int64_t k);
 void   quantize_row_iq4_kss(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-size_t quantize_iq4_kss(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq4_kss(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   dequantize_row_iq4_kss(const block_iq4_kss  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_iq4_kss_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_mxfp4_ref(const float * GGML_RESTRICT x, block_mxfp4  * GGML_RESTRICT y, int64_t k);
 void   quantize_row_mxfp4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-size_t quantize_mxfp4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_mxfp4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   dequantize_row_mxfp4(const block_mxfp4 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_mxfp4_q8_0_x4(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_iq2_ks_ref(const float * GGML_RESTRICT x, block_iq2_ks  * GGML_RESTRICT y, int64_t k);
 void   quantize_row_iq2_ks(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-size_t quantize_iq2_ks(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq2_ks(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   dequantize_row_iq2_ks(const block_iq2_ks  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_iq2_ks_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_iq2_kl_ref(const float * GGML_RESTRICT x, block_iq2_kl  * GGML_RESTRICT y, int64_t k);
 void   quantize_row_iq2_kl(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-size_t quantize_iq2_kl(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq2_kl(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   dequantize_row_iq2_kl(const block_iq2_kl  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_iq2_kl_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_iq1_kt_ref(const float * GGML_RESTRICT x, block_iq1_kt  * GGML_RESTRICT y, int64_t k);
 void   quantize_row_iq1_kt(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-size_t quantize_iq1_kt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq1_kt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   dequantize_row_iq1_kt(const block_iq1_kt  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_iq1_kt_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_iq2_kt_ref(const float * GGML_RESTRICT x, block_iq2_kt  * GGML_RESTRICT y, int64_t k);
 void   quantize_row_iq2_kt(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-size_t quantize_iq2_kt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq2_kt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   dequantize_row_iq2_kt(const block_iq2_kt  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_iq2_kt_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_iq3_kt_ref(const float * GGML_RESTRICT x, block_iq3_kt  * GGML_RESTRICT y, int64_t k);
 void   quantize_row_iq3_kt(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-size_t quantize_iq3_kt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq3_kt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   dequantize_row_iq3_kt(const block_iq3_kt  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_iq3_kt_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_iq4_kt_ref(const float * GGML_RESTRICT x, block_iq4_kt  * GGML_RESTRICT y, int64_t k);
 void   quantize_row_iq4_kt(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-size_t quantize_iq4_kt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq4_kt(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   dequantize_row_iq4_kt(const block_iq4_kt  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_iq4_kt_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_iq5_ks_ref(const float * GGML_RESTRICT x, block_iq5_ks  * GGML_RESTRICT y, int64_t k);
 void   quantize_row_iq5_ks(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-size_t quantize_iq5_ks(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq5_ks(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   dequantize_row_iq5_ks(const block_iq5_ks  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_iq5_ks_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_iq4_nl_r4_ref(const float * GGML_RESTRICT x, block_iq4_nl_r4  * GGML_RESTRICT y, int64_t k);
 void   quantize_row_iq4_nl_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-size_t quantize_iq4_nl_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq4_nl_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   dequantize_row_iq4_nl_r4(const block_iq4_nl_r4  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_iq4_nl_r4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_q4_0_r8_ref(const float * GGML_RESTRICT x, block_iq4_nl_r8  * GGML_RESTRICT y, int64_t k);
 void   quantize_row_q4_0_r8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-size_t quantize_q4_0_r8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q4_0_r8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   dequantize_row_q4_0_r8(const block_iq4_nl_r8  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_q4_0_r8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_q8_0_r8_ref(const float * GGML_RESTRICT x, block_q8_0_r8  * GGML_RESTRICT y, int64_t k);
 void   quantize_row_q8_0_r8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-size_t quantize_q8_0_r8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q8_0_r8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   dequantize_row_q8_0_r8(const block_q8_0_r8  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_q8_0_r8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_q5_0_r4_ref(const float * GGML_RESTRICT x, block_q5_0_r4  * GGML_RESTRICT y, int64_t k);
 void   quantize_row_q5_0_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-size_t quantize_q5_0_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q5_0_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   dequantize_row_q5_0_r4(const block_q5_0_r4  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_q5_0_r4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_q6_0_r4_ref(const float * GGML_RESTRICT x, block_q6_0_r4  * GGML_RESTRICT y, int64_t k);
 void   quantize_row_q6_0_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-size_t quantize_q6_0_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q6_0_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   dequantize_row_q6_0_r4(const block_q6_0_r4  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_q6_0_r4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_iq4_xs_r8_ref(const float * GGML_RESTRICT x, block_iq4_xs_r8 * GGML_RESTRICT y, int64_t k);
 void   quantize_row_iq4_xs_r8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-size_t quantize_iq4_xs_r8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq4_xs_r8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   dequantize_row_iq4_xs_r8(const block_iq4_xs_r8 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_iq4_xs_r8_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_iq2_bn_ref (const float * GGML_RESTRICT x, block_iq2_bn  * GGML_RESTRICT y, int64_t k);
 void   quantize_row_iq2_bn (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void   dequantize_row_iq2_bn (const block_iq2_bn  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-size_t quantize_iq2_bn (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq2_bn (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   vec_dot_iq2_bn_q8_K64(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_iq2_bn_r4_ref (const float * GGML_RESTRICT x, block_iq2_bn  * GGML_RESTRICT y, int64_t k);
 void   quantize_row_iq2_bn_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void   dequantize_row_iq2_bn_r4(const block_iq2_bn * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-size_t quantize_iq2_bn_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq2_bn_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   vec_dot_iq2_bn_r4_q8_K64(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_q3_k_r4_ref(const float * GGML_RESTRICT x, block_q3_k_r4  * GGML_RESTRICT y, int64_t k);
 void   quantize_row_q3_k_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-size_t quantize_q3_k_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q3_k_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   dequantize_row_q3_k_r4(const block_q3_k_r4  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_q3_k_r4_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_q2_k_r4_ref(const float * GGML_RESTRICT x, block_q2_k_r4  * GGML_RESTRICT y, int64_t k);
 void   quantize_row_q2_k_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-size_t quantize_q2_k_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q2_k_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   dequantize_row_q2_k_r4(const block_q2_k_r4  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_q2_k_r4_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_q4_k_r4_ref(const float * GGML_RESTRICT x, block_q4_k_r4  * GGML_RESTRICT y, int64_t k);
 void   quantize_row_q4_k_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-size_t quantize_q4_k_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q4_k_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   dequantize_row_q4_k_r4(const block_q4_k_r4  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_q4_k_r4_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_q5_k_r4_ref(const float * GGML_RESTRICT x, block_q5_k_r4  * GGML_RESTRICT y, int64_t k);
 void   quantize_row_q5_k_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-size_t quantize_q5_k_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q5_k_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   dequantize_row_q5_k_r4(const block_q5_k_r4  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_q5_k_r4_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_q6_k_r4_ref(const float * GGML_RESTRICT x, block_q6_k_r4  * GGML_RESTRICT y, int64_t k);
 void   quantize_row_q6_k_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-size_t quantize_q6_k_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q6_k_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   dequantize_row_q6_k_r4(const block_q6_k_r4  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_q6_k_r4_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_iq5_k_r4_ref(const float * GGML_RESTRICT x, block_iq5_k_r4  * GGML_RESTRICT y, int64_t k);
 void   quantize_row_iq5_k_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-size_t quantize_iq5_k_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq5_k_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   dequantize_row_iq5_k_r4(const block_iq5_k_r4  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_iq5_k_r4_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_iq4_k_r4_ref(const float * GGML_RESTRICT x, block_iq4_k_r4  * GGML_RESTRICT y, int64_t k);
 void   quantize_row_iq4_k_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-size_t quantize_iq4_k_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq4_k_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   dequantize_row_iq4_k_r4(const block_iq4_k_r4  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_iq4_k_r4_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_iq3_k_r4_ref(const float * GGML_RESTRICT x, block_iq3_k_r4  * GGML_RESTRICT y, int64_t k);
 void   quantize_row_iq3_k_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-size_t quantize_iq3_k_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq3_k_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   dequantize_row_iq3_k_r4(const block_iq3_k_r4  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_iq3_k_r4_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_iq2_k_r4_ref(const float * GGML_RESTRICT x, block_iq2_k_r4  * GGML_RESTRICT y, int64_t k);
 void   quantize_row_iq2_k_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-size_t quantize_iq2_k_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq2_k_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   dequantize_row_iq2_k_r4(const block_iq2_k_r4  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_iq2_k_r4_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_iq4_ks_r4_ref(const float * GGML_RESTRICT x, block_iq4_ks_r4  * GGML_RESTRICT y, int64_t k);
 void   quantize_row_iq4_ks_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-size_t quantize_iq4_ks_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq4_ks_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   dequantize_row_iq4_ks_r4(const block_iq4_ks_r4  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_iq4_ks_r4_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_iq5_ks_r4_ref(const float * GGML_RESTRICT x, block_iq5_ks_r4  * GGML_RESTRICT y, int64_t k);
 void   quantize_row_iq5_ks_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-size_t quantize_iq5_ks_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq5_ks_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   dequantize_row_iq5_ks_r4(const block_iq5_ks_r4  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_iq5_ks_r4_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_iq2_xxs_r4_ref(const float * GGML_RESTRICT x, block_iq2_xxs_r4  * GGML_RESTRICT y, int64_t k);
 void   quantize_row_iq2_xxs_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-size_t quantize_iq2_xxs_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq2_xxs_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   dequantize_row_iq2_xxs_r4(const block_iq2_xxs_r4  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_iq2_xxs_r4_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_iq2_xs_r4_ref(const float * GGML_RESTRICT x, block_iq2_xs_r4  * GGML_RESTRICT y, int64_t k);
 void   quantize_row_iq2_xs_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-size_t quantize_iq2_xs_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq2_xs_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   dequantize_row_iq2_xs_r4(const block_iq2_xs_r4  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_iq2_xs_r4_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_iq2_s_r4_ref(const float * GGML_RESTRICT x, block_iq2_s_r4  * GGML_RESTRICT y, int64_t k);
 void   quantize_row_iq2_s_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-size_t quantize_iq2_s_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq2_s_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   dequantize_row_iq2_s_r4(const block_iq2_s_r4  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_iq2_s_r4_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_iq3_xxs_r4_ref(const float * GGML_RESTRICT x, block_iq3_xxs_r4  * GGML_RESTRICT y, int64_t k);
 void   quantize_row_iq3_xxs_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-size_t quantize_iq3_xxs_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq3_xxs_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   dequantize_row_iq3_xxs_r4(const block_iq3_xxs_r4  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_iq3_xxs_r4_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_iq3_s_r4_ref(const float * GGML_RESTRICT x, block_iq3_s_r4  * GGML_RESTRICT y, int64_t k);
 void   quantize_row_iq3_s_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-size_t quantize_iq3_s_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq3_s_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   dequantize_row_iq3_s_r4(const block_iq3_s_r4  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_iq3_s_r4_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_iq1_s_r4_ref(const float * GGML_RESTRICT x, block_iq1_s_r4  * GGML_RESTRICT y, int64_t k);
 void   quantize_row_iq1_s_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-size_t quantize_iq1_s_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq1_s_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   dequantize_row_iq1_s_r4(const block_iq1_s_r4  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_iq1_s_r4_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_iq1_m_r4_ref(const float * GGML_RESTRICT x, block_iq1_m_r4  * GGML_RESTRICT y, int64_t k);
 void   quantize_row_iq1_m_r4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-size_t quantize_iq1_m_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq1_m_r4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   dequantize_row_iq1_m_r4(const block_iq1_m_r4  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_iq1_m_r4_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_q8_k_r8_ref(const float * GGML_RESTRICT x, block_q8_k_r8  * GGML_RESTRICT y, int64_t k);
 void   quantize_row_q8_k_r8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-size_t quantize_q8_k_r8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q8_k_r8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   dequantize_row_q8_k_r8(const block_q8_k_r8  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_q8_k_r8_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_q8_k_r16_ref(const float * GGML_RESTRICT x, block_q8_k_r16  * GGML_RESTRICT y, int64_t k);
 void   quantize_row_q8_k_r16(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-size_t quantize_q8_k_r16(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q8_k_r16(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   dequantize_row_q8_k_r16(const block_q8_k_r16  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_q8_k_r16_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_q8_KV_ref(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void   quantize_row_q8_KV(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-size_t quantize_q8_KV(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q8_KV(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   dequantize_row_q8_KV(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_q8_KV_q8_KV(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_q8_KV_r8_ref(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void   quantize_row_q8_KV_r8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-size_t quantize_q8_KV_r8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q8_KV_r8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   dequantize_row_q8_KV_r8(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_q8_KV_r8_q8_KV(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 void   quantize_row_q1_0_g128_ref(const float * GGML_RESTRICT x, block_q1_0_g128  * GGML_RESTRICT y, int64_t k);
 void   quantize_row_q1_0_g128(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-size_t quantize_q1_0_g128(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q1_0_g128(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix, const struct quantize_user_data * user_data);
 void   dequantize_row_q1_0_g128(const block_q1_0_g128  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
 void   vec_dot_q1_0_g128_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
diff --git a/include/llama.h b/include/llama.h
index 7cc9d73b..732bdde7 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -505,6 +505,7 @@ extern "C" {
     };
 
     // model quantization parameters
+    struct quantize_user_data;
     typedef struct llama_model_quantize_params {
         int32_t nthread;                     // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
         enum llama_ftype ftype;              // quantize to this llama_ftype
@@ -532,6 +533,7 @@ extern "C" {
         void * kv_overrides;                 // pointer to vector containing overrides
         void * custom_quants;                // pointer to vector containing custom quantization rules
         void * repack_pattern;               // pointer to a vector containing regexes to be used for matching tensor names. Can be null
+        struct quantize_user_data * user_data; // so we can pass extra data to the quantization functions
     } llama_model_quantize_params;
 
     // grammar types
diff --git a/src/llama-quantize.cpp b/src/llama-quantize.cpp
index ca5853e2..00d2796a 100644
--- a/src/llama-quantize.cpp
+++ b/src/llama-quantize.cpp
@@ -831,10 +831,11 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     return new_type;
 }
 
-static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
+static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row,
+        const float * imatrix, const quantize_user_data * user_data, std::vector<std::thread> & workers, const int nthread) {
     if (nthread < 2) {
         // single-thread
-        size_t new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
+        size_t new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix, user_data);
         if (!ggml_validate_row_data(new_type, new_data, new_size)) {
             throw std::runtime_error("quantized data validation failed");
         }
@@ -846,7 +847,7 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
     size_t new_size = 0;
     bool valid = true;
     auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, chunk_size,
-            nrows, n_per_row, imatrix]() {
+            nrows, n_per_row, imatrix, user_data]() {
         const int64_t nrows_per_chunk = chunk_size / n_per_row;
         size_t local_size = 0;
         while (true) {
@@ -860,7 +861,7 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
             }
             lock.unlock();
             const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
-            size_t this_size = ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
+            size_t this_size = ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix, user_data);
             local_size += this_size;
 
             // validate the quantized data
@@ -1574,7 +1575,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                     std::mutex mutex;
                     int counter = 0;
                     bool valid = true;
-                    auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, tensor, imatrix] () {
+                    auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, tensor, imatrix, user_data = params->user_data] () {
                         int ne2 = tensor->ne[2];
                         auto row_size = ggml_row_size(new_type, tensor->ne[0]);
                         auto matrix_size = row_size * tensor->ne[1];
@@ -1591,7 +1592,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                             lock.unlock();
                             auto this_imatrix = imatrix ? imatrix + i02 * tensor->ne[0] : nullptr;
                             auto this_data = (char *)new_data + i02*matrix_size;
-                            auto this_size = ggml_quantize_chunk(new_type, f32_data + i02*tensor->ne[0]*tensor->ne[1], this_data, 0, tensor->ne[1], tensor->ne[0], this_imatrix);
+                            auto this_size = ggml_quantize_chunk(new_type, f32_data + i02*tensor->ne[0]*tensor->ne[1], this_data, 0, tensor->ne[1], tensor->ne[0],
+                                    this_imatrix, user_data);
                             local_size += this_size;
 
                             // validate the quantized data
@@ -1624,7 +1626,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                     void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
                     const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
 
-                    new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
+                    new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, params->user_data, workers, nthread_use);
                 }
                 }
             }
diff --git a/src/llama.cpp b/src/llama.cpp
index be435db9..b9e62e0d 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -5165,6 +5165,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.kv_overrides                =*/ nullptr,
         /*.custom_quants               =*/ nullptr,
         /*.repack_pattern              =*/ nullptr,
+        /*.user_data                   =*/ nullptr,
     };
 
     return result;