mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-27 23:50:20 -05:00
ggml : optimize AMX (#24806)
Flatten the partition over n_batch * M (previously over n_batch only) so that
every thread participates in the quantization.
| CPU | Model | Test | t/s OLD | t/s NEW | Speedup |
|:--------------------------------|:------------------------------|:-------|----------:|----------:|----------:|
| Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B IQ4_NL - 4.5 bpw | pp512 | 730.71 | 779.86 | 1.07 |
| Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B IQ4_NL - 4.5 bpw | tg128 | 87.88 | 86.79 | 0.99 |
| Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B IQ4_XS - 4.25 bpw | pp512 | 725.09 | 1023.31 | 1.41 |
| Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B IQ4_XS - 4.25 bpw | tg128 | 83.64 | 83.62 | 1.00 |
| Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B Q4_0 | pp512 | 820.51 | 924.05 | 1.13 |
| Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B Q4_0 | tg128 | 90.59 | 92.46 | 1.02 |
| Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B Q4_1 | pp512 | 776.88 | 872.79 | 1.12 |
| Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B Q4_1 | tg128 | 89.39 | 90.94 | 1.02 |
| Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B Q4_K_M | pp512 | 719.28 | 1009.27 | 1.40 |
| Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B Q4_K_M | tg128 | 80.62 | 80.86 | 1.00 |
| Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B Q4_K_S | pp512 | 732.29 | 1077.29 | 1.47 |
| Intel(R) Xeon(R) Platinum 8488C | qwen35 0.8B Q4_K_S | tg128 | 86.42 | 83.53 | 0.97 |
Signed-off-by: Adrien Gallouët <angt@huggingface.co>
This commit is contained in:
parent
f4043fec01
commit
37a77fb057
@@ -2417,15 +2417,14 @@ void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_te
|
||||
// Q4_K, Q5_K, Q6_K, IQ4_XS handles 8 TILE_K per blck_size
|
||||
GGML_ASSERT(TILE_K == blck_size || TILE_K * 8 == blck_size);
|
||||
|
||||
parallel_for_ggml(params, n_batch, [&](int begin, int end) {
|
||||
for (int batch_idx = begin; batch_idx < end; ++batch_idx) {
|
||||
parallel_for_ggml(params, n_batch * M, [&](int begin, int end) {
|
||||
for (int idx = begin; idx < end; ++idx) {
|
||||
int batch_idx = idx / M;
|
||||
int m = idx % M;
|
||||
int64_t src1_offset = ggml_batch_offset(src1, batch_idx, ne2);
|
||||
const float * A_data = (const float *)((const char *)src1->data + src1_offset);
|
||||
char * wdata_batch = (char *)wdata + batch_idx * M * row_size_A;
|
||||
|
||||
for (int m = 0; m < M; ++m) {
|
||||
from_float<vec_dot_type>(A_data + m * K, wdata_batch + m * row_size_A, K);
|
||||
}
|
||||
from_float<vec_dot_type>(A_data + m * K, wdata_batch + m * row_size_A, K);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user