From 37a77fb0579be9d71e2c73da0553cfd42b7b103a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?=
Date: Sat, 20 Jun 2026 12:43:06 +0200
Subject: [PATCH] ggml : optimize AMX (#24806)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Flatten the partition over n_batch * M so every thread participates in the quantization

| CPU | Model | Test | t/s OLD | t/s NEW | Speedup |
|:--------------------------------|:------------------------------|:-------|----------:|----------:|----------:|
| Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B IQ4_NL - 4.5 bpw | pp512 | 730.71 | 779.86 | 1.07 |
| Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B IQ4_NL - 4.5 bpw | tg128 | 87.88 | 86.79 | 0.99 |
| Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B IQ4_XS - 4.25 bpw | pp512 | 725.09 | 1023.31 | 1.41 |
| Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B IQ4_XS - 4.25 bpw | tg128 | 83.64 | 83.62 | 1.00 |
| Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B Q4_0 | pp512 | 820.51 | 924.05 | 1.13 |
| Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B Q4_0 | tg128 | 90.59 | 92.46 | 1.02 |
| Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B Q4_1 | pp512 | 776.88 | 872.79 | 1.12 |
| Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B Q4_1 | tg128 | 89.39 | 90.94 | 1.02 |
| Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B Q4_K_M | pp512 | 719.28 | 1009.27 | 1.40 |
| Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B Q4_K_M | tg128 | 80.62 | 80.86 | 1.00 |
| Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B Q4_K_S | pp512 | 732.29 | 1077.29 | 1.47 |
| Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B Q4_K_S | tg128 | 86.42 | 83.53 | 0.97 |

Signed-off-by: Adrien Gallouët
---
 ggml/src/ggml-cpu/amx/mmq.cpp | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/ggml/src/ggml-cpu/amx/mmq.cpp b/ggml/src/ggml-cpu/amx/mmq.cpp
index d9383a04be..9f3a744b5d 100644
--- a/ggml/src/ggml-cpu/amx/mmq.cpp
+++ b/ggml/src/ggml-cpu/amx/mmq.cpp
@@ -2417,15 +2417,14 @@ void
ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_te // Q4_K, Q5_K, Q6_K, IQ4_XS handles 8 TILE_K per blck_size GGML_ASSERT(TILE_K == blck_size || TILE_K * 8 == blck_size); - parallel_for_ggml(params, n_batch, [&](int begin, int end) { - for (int batch_idx = begin; batch_idx < end; ++batch_idx) { + parallel_for_ggml(params, n_batch * M, [&](int begin, int end) { + for (int idx = begin; idx < end; ++idx) { + int batch_idx = idx / M; + int m = idx % M; int64_t src1_offset = ggml_batch_offset(src1, batch_idx, ne2); const float * A_data = (const float *)((const char *)src1->data + src1_offset); char * wdata_batch = (char *)wdata + batch_idx * M * row_size_A; - - for (int m = 0; m < M; ++m) { - from_float(A_data + m * K, wdata_batch + m * row_size_A, K); - } + from_float(A_data + m * K, wdata_batch + m * row_size_A, K); } }); });