iqk_mul_mat for llama.cpp

2026-06-28 04:30:15 -05:00 · 2024-05-27 09:51:08 +02:00 · 2024-05-27 09:51:08 +02:00 · d434b4751a
commit d434b4751a
parent 9fa7946997
7 changed files with 2586 additions and 31 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -154,11 +154,12 @@ include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
 # Compile flags
 #

-if (LLAMA_SYCL)
-    set(CMAKE_CXX_STANDARD 17)
-else()
-    set(CMAKE_CXX_STANDARD 11)
-endif()
+set(CMAKE_CXX_STANDARD 17)
+#if (LLAMA_SYCL)
+#    set(CMAKE_CXX_STANDARD 17)
+#else()
+#    set(CMAKE_CXX_STANDARD 11)
+#endif()

 set(CMAKE_CXX_STANDARD_REQUIRED true)
 set(CMAKE_C_STANDARD 11)
@ -402,7 +403,7 @@ if (LLAMA_LLAMAFILE)
    add_compile_definitions(GGML_USE_LLAMAFILE)

    set(GGML_HEADERS_LLAMAFILE sgemm.h)
-    set(GGML_SOURCES_LLAMAFILE sgemm.cpp)
+    set(GGML_SOURCES_LLAMAFILE sgemm.cpp iqk_mul_mat.cpp)
 endif()

 if (LLAMA_CUBLAS)
--- a/Makefile
+++ b/Makefile
@ -170,8 +170,8 @@ endif

 # keep standard at C11 and C++11
 MK_CPPFLAGS  = -I. -Icommon
-MK_CFLAGS    = -std=c11   -fPIC
-MK_CXXFLAGS  = -std=c++11 -fPIC
+MK_CFLAGS    = -std=c11   -fPIC -v
+MK_CXXFLAGS  = -std=c++11 -fPIC -v
 MK_NVCCFLAGS = -std=c++11

 # -Ofast tends to produce faster code, but may not be available for some compilers.
--- a/ggml-common.h
+++ b/ggml-common.h
@ -199,6 +199,18 @@ typedef struct {
 } block_q8_1;
 static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding");

+typedef struct { // four q8_1 blocks stored as one unit, as written by the NEON path of quantize_row_q8_1
+    ggml_half d[8]; // d[ir] holds the scale of block ir (0..3); d[ir+4] holds d * sum(quants) of block ir
+    int8_t qs[4*QK8_1]; // quants of block ir start at offset QK8_1*ir
+} block_q8_1_x4;
+static_assert(sizeof(block_q8_1_x4) == 4*sizeof(block_q8_1), "wrong q8_1_x4 block size/padding");
+typedef struct { // four q8_0 blocks stored as one unit, as written by the NEON path of quantize_row_q8_0
+    ggml_half d[4]; // d[ir] holds the scale of block ir (0..3)
+    int8_t qs[4*QK8_0]; // quants of block ir start at offset 32*ir in the writer; NOTE(review): relies on QK8_0 == 32 — confirm
+} block_q8_0_x4;
+static_assert(sizeof(block_q8_0_x4) == 4*sizeof(block_q8_0), "wrong q8_0_x4 block size/padding");
+
+
 //
 // Super-block quantization structures
 //
--- a/ggml-quants.c
+++ b/ggml-quants.c
@ -871,7 +871,10 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
    block_q8_0 * restrict y = vy;

 #if defined(__ARM_NEON)
+    block_q8_0_x4 * y4 = (block_q8_0_x4 *)vy;
+    int nb4 = 4*(nb/4);
    for (int i = 0; i < nb; i++) {
+        int i4 = i/4, ir = i%4;
        float32x4_t srcv [8];
        float32x4_t asrcv[8];
        float32x4_t amaxv[8];
@ -888,16 +891,27 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
        const float d = amax / ((1 << 7) - 1);
        const float id = d ? 1.0f/d : 0.0f;

-        y[i].d = GGML_FP32_TO_FP16(d);
+        if (i < nb4) {
+            y4[i4].d[ir] = GGML_FP32_TO_FP16(d);
+        } else {
+            y[i].d = GGML_FP32_TO_FP16(d);
+        }

        for (int j = 0; j < 8; j++) {
            const float32x4_t v  = vmulq_n_f32(srcv[j], id);
            const int32x4_t   vi = vcvtnq_s32_f32(v);

-            y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
-            y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
-            y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
-            y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
+            if (i < nb4) {
+                y4[i4].qs[32*ir + 4*j + 0] = vgetq_lane_s32(vi, 0);
+                y4[i4].qs[32*ir + 4*j + 1] = vgetq_lane_s32(vi, 1);
+                y4[i4].qs[32*ir + 4*j + 2] = vgetq_lane_s32(vi, 2);
+                y4[i4].qs[32*ir + 4*j + 3] = vgetq_lane_s32(vi, 3);
+            } else {
+                y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
+                y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
+                y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
+                y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
+            }
        }
    }
 #elif defined(__wasm_simd128__)
@ -1191,7 +1205,10 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
    block_q8_1 * restrict y = vy;

 #if defined(__ARM_NEON)
+    block_q8_1_x4 * restrict y4 = vy;
+    int nb4 = 4*(nb/4);
    for (int i = 0; i < nb; i++) {
+        int i4 = i/4, ir = i%4;
        float32x4_t srcv [8];
        float32x4_t asrcv[8];
        float32x4_t amaxv[8];
@ -1208,7 +1225,11 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
        const float d = amax / ((1 << 7) - 1);
        const float id = d ? 1.0f/d : 0.0f;

-        y[i].d = GGML_FP32_TO_FP16(d);
+        if (i < nb4) {
+            y4[i4].d[ir] = GGML_FP32_TO_FP16(d);
+        } else {
+            y[i].d = GGML_FP32_TO_FP16(d);
+        }

        int32x4_t accv = vdupq_n_s32(0);

@ -1216,15 +1237,26 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
            const float32x4_t v  = vmulq_n_f32(srcv[j], id);
            const int32x4_t   vi = vcvtnq_s32_f32(v);

-            y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
-            y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
-            y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
-            y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
+            if (i < nb4) {
+                y4[i4].qs[QK8_1*ir + 4*j + 0] = vgetq_lane_s32(vi, 0);
+                y4[i4].qs[QK8_1*ir + 4*j + 1] = vgetq_lane_s32(vi, 1);
+                y4[i4].qs[QK8_1*ir + 4*j + 2] = vgetq_lane_s32(vi, 2);
+                y4[i4].qs[QK8_1*ir + 4*j + 3] = vgetq_lane_s32(vi, 3);
+            } else {
+                y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
+                y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
+                y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
+                y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
+            }

            accv = vaddq_s32(accv, vi);
        }

-        y[i].s = GGML_FP32_TO_FP16(d * vaddvq_s32(accv));
+        if (i < nb4) {
+            y4[i4].d[ir+4] = GGML_FP32_TO_FP16(d * vaddvq_s32(accv));
+        } else {
+            y[i].s = GGML_FP32_TO_FP16(d * vaddvq_s32(accv));
+        }
    }
 #elif defined(__wasm_simd128__)
    for (int i = 0; i < nb; i++) {
--- a/ggml.c
+++ b/ggml.c
@ -12334,11 +12334,7 @@ UseGgmlGemm1:;
 #endif

    if (params->type == GGML_TASK_TYPE_INIT) {
-        if (ith != 0) {
-            return;
-        }
-        // Every thread starts at ith, so the first unprocessed chunk is nth.  This save a bit of coordination right at the start.
-        atomic_store(&state->shared->current_chunk, nth);
+
        if (src1->type != vec_dot_type) {
            char * wdata = params->wdata;
            const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@ -12346,16 +12342,45 @@ UseGgmlGemm1:;
            assert(params->wsize >= ne11*ne12*ne13*row_size);
            GGML_ASSERT(src1->type == GGML_TYPE_F32);

-            for (int64_t i13 = 0; i13 < ne13; ++i13) {
-                for (int64_t i12 = 0; i12 < ne12; ++i12) {
-                    for (int64_t i11 = 0; i11 < ne11; ++i11) {
-                        from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);
-                        wdata += row_size;
-                    }
-                }
+            int64_t work_size = ne13*ne12*ne11;
+            int64_t work_per_thread = (work_size + nth - 1)/nth;
+            int64_t work_start = work_per_thread * ith;
+            if (work_start >= work_size) {
+                return;
+            }
+            int64_t work_end = MIN(work_size, work_start + work_per_thread);
+            for (int64_t i_work = work_start; i_work < work_end; ++i_work) {
+                int64_t i13 = i_work / (ne11*ne12);
+                int64_t i12 = (i_work - i13*ne11*ne12)/ne11;
+                int64_t i11 = i_work - i13*ne11*ne12 - i12*ne11;
+                from_float_to_vec_dot((const float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
+                        (void *)(wdata + i_work*row_size), ne10);
            }
        }

+        if (ith == 0) {
+            atomic_store(&state->shared->current_chunk, nth);
+        }
+
+        //// Every thread starts at ith, so the first unprocessed chunk is nth.  This save a bit of coordination right at the start.
+        //atomic_store(&state->shared->current_chunk, nth);
+        //if (src1->type != vec_dot_type) {
+        //    char * wdata = params->wdata;
+        //    const size_t row_size = ggml_row_size(vec_dot_type, ne10);
+
+        //    assert(params->wsize >= ne11*ne12*ne13*row_size);
+        //    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+        //    for (int64_t i13 = 0; i13 < ne13; ++i13) {
+        //        for (int64_t i12 = 0; i12 < ne12; ++i12) {
+        //            for (int64_t i11 = 0; i11 < ne11; ++i11) {
+        //                from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);
+        //                wdata += row_size;
+        //            }
+        //        }
+        //    }
+        //}
+
        return;
    }

--- a/iqk_mul_mat.cpp
+++ b/iqk_mul_mat.cpp
--- a/sgemm.cpp
+++ b/sgemm.cpp
@ -849,6 +849,11 @@ class tinyBLAS_Q0_AVX {
 * @param Ctype is GGML data type of `C`
 * @return true if this function was able to service the matmul request
 */
+// Forward declaration, called from llamafile_sgemm below. NOTE(review): presumably defined in iqk_mul_mat.cpp — confirm.
+bool iqk_mul_mat(long Nx, long Ny, long ne00, int typeA, const void * A, const void * B,
+        float * C, long stride_C, int ith, int nth);
+
+
 bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda, const void *B, int64_t ldb, void *C,
                     int64_t ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype) {

@ -861,6 +866,18 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
    assert(nth > 0);
    assert(ith < nth);

+    // Q8_K right-hand side with F32 output: offer the product to iqk_mul_mat first (k * QK_K is passed as its ne00); if it returns false, continue below.
+    if (Btype == GGML_TYPE_Q8_K && Ctype == GGML_TYPE_F32 &&
+        iqk_mul_mat(m, n, k * QK_K, Atype, A, B, (float *)C, ldc, ith, nth)) {
+        return true;
+    }
+    // Q8_0/Q8_1 right-hand side: same, with k * QK8_0. Each constant is compared to 32 on its own: a chained `a == b == 32` would compare the 0/1 result of `a == b` with 32.
+    static_assert(QK8_0 == 32 && QK8_1 == 32 && QK4_0 == 32 && QK4_1 == 32 && QK5_0 == 32 && QK5_1 == 32, "iqk_mul_mat call assumes 32-element blocks");
+    if ((Btype == GGML_TYPE_Q8_0 || Btype == GGML_TYPE_Q8_1) && Ctype == GGML_TYPE_F32 &&
+        iqk_mul_mat(m, n, k * QK8_0, Atype, A, B, (float *)C, ldc, ith, nth)) {
+        return true;
+    }
+
    if (Ctype != GGML_TYPE_F32)
        return false;