iqk_mul_mat for llama.cpp

This commit is contained in:
Kawrakow 2024-05-27 09:51:08 +02:00
parent 9fa7946997
commit d434b4751a
7 changed files with 2586 additions and 31 deletions

View File

@ -154,11 +154,12 @@ include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
# Compile flags
#
if (LLAMA_SYCL)
set(CMAKE_CXX_STANDARD 17)
else()
set(CMAKE_CXX_STANDARD 11)
endif()
set(CMAKE_CXX_STANDARD 17)
#if (LLAMA_SYCL)
# set(CMAKE_CXX_STANDARD 17)
#else()
# set(CMAKE_CXX_STANDARD 11)
#endif()
set(CMAKE_CXX_STANDARD_REQUIRED true)
set(CMAKE_C_STANDARD 11)
@ -402,7 +403,7 @@ if (LLAMA_LLAMAFILE)
add_compile_definitions(GGML_USE_LLAMAFILE)
set(GGML_HEADERS_LLAMAFILE sgemm.h)
set(GGML_SOURCES_LLAMAFILE sgemm.cpp)
set(GGML_SOURCES_LLAMAFILE sgemm.cpp iqk_mul_mat.cpp)
endif()
if (LLAMA_CUBLAS)

View File

@ -170,8 +170,8 @@ endif
# keep standard at C11 and C++11
MK_CPPFLAGS = -I. -Icommon
MK_CFLAGS = -std=c11 -fPIC
MK_CXXFLAGS = -std=c++11 -fPIC
MK_CFLAGS = -std=c11 -fPIC -v
MK_CXXFLAGS = -std=c++11 -fPIC -v
MK_NVCCFLAGS = -std=c++11
# -Ofast tends to produce faster code, but may not be available for some compilers.

View File

@ -199,6 +199,18 @@ typedef struct {
} block_q8_1;
static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding");
// Four q8_1 blocks stored together as one unit.
// As written by the ARM NEON path of quantize_row_q8_1, for block ir (0..3) of a group:
//   d[ir]                 - the block scale d
//   d[ir + 4]             - d * (sum of the block's quantized values), i.e. what block_q8_1 keeps in `s`
//   qs[QK8_1*ir + 0..31]  - the block's quantized values
typedef struct {
ggml_half d[8];
int8_t qs[4*QK8_1];
} block_q8_1_x4;
// Must occupy exactly the space of 4 consecutive block_q8_1, because the same output
// buffer is addressed both as block_q8_1[] and as block_q8_1_x4[].
static_assert(sizeof(block_q8_1_x4) == 4*sizeof(block_q8_1), "wrong q8_1_x4 block size/padding");
// Four q8_0 blocks stored together as one unit.
// As written by the ARM NEON path of quantize_row_q8_0, for block ir (0..3) of a group:
//   d[ir]              - the block scale d
//   qs[32*ir + 0..31]  - the block's quantized values
typedef struct {
ggml_half d[4];
int8_t qs[4*QK8_0];
} block_q8_0_x4;
// Must occupy exactly the space of 4 consecutive block_q8_0, because the same output
// buffer is addressed both as block_q8_0[] and as block_q8_0_x4[].
static_assert(sizeof(block_q8_0_x4) == 4*sizeof(block_q8_0), "wrong q8_0_x4 block size/padding");
//
// Super-block quantization structures
//

View File

@ -871,7 +871,10 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
block_q8_0 * restrict y = vy;
#if defined(__ARM_NEON)
block_q8_0_x4 * y4 = (block_q8_0_x4 *)vy;
int nb4 = 4*(nb/4);
for (int i = 0; i < nb; i++) {
int i4 = i/4, ir = i%4;
float32x4_t srcv [8];
float32x4_t asrcv[8];
float32x4_t amaxv[8];
@ -888,16 +891,27 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
const float d = amax / ((1 << 7) - 1);
const float id = d ? 1.0f/d : 0.0f;
y[i].d = GGML_FP32_TO_FP16(d);
if (i < nb4) {
y4[i4].d[ir] = GGML_FP32_TO_FP16(d);
} else {
y[i].d = GGML_FP32_TO_FP16(d);
}
for (int j = 0; j < 8; j++) {
const float32x4_t v = vmulq_n_f32(srcv[j], id);
const int32x4_t vi = vcvtnq_s32_f32(v);
y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
if (i < nb4) {
y4[i4].qs[32*ir + 4*j + 0] = vgetq_lane_s32(vi, 0);
y4[i4].qs[32*ir + 4*j + 1] = vgetq_lane_s32(vi, 1);
y4[i4].qs[32*ir + 4*j + 2] = vgetq_lane_s32(vi, 2);
y4[i4].qs[32*ir + 4*j + 3] = vgetq_lane_s32(vi, 3);
} else {
y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
}
}
}
#elif defined(__wasm_simd128__)
@ -1191,7 +1205,10 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
block_q8_1 * restrict y = vy;
#if defined(__ARM_NEON)
block_q8_1_x4 * restrict y4 = vy;
int nb4 = 4*(nb/4);
for (int i = 0; i < nb; i++) {
int i4 = i/4, ir = i%4;
float32x4_t srcv [8];
float32x4_t asrcv[8];
float32x4_t amaxv[8];
@ -1208,7 +1225,11 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
const float d = amax / ((1 << 7) - 1);
const float id = d ? 1.0f/d : 0.0f;
y[i].d = GGML_FP32_TO_FP16(d);
if (i < nb4) {
y4[i4].d[ir] = GGML_FP32_TO_FP16(d);
} else {
y[i].d = GGML_FP32_TO_FP16(d);
}
int32x4_t accv = vdupq_n_s32(0);
@ -1216,15 +1237,26 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
const float32x4_t v = vmulq_n_f32(srcv[j], id);
const int32x4_t vi = vcvtnq_s32_f32(v);
y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
if (i < nb4) {
y4[i4].qs[QK8_1*ir + 4*j + 0] = vgetq_lane_s32(vi, 0);
y4[i4].qs[QK8_1*ir + 4*j + 1] = vgetq_lane_s32(vi, 1);
y4[i4].qs[QK8_1*ir + 4*j + 2] = vgetq_lane_s32(vi, 2);
y4[i4].qs[QK8_1*ir + 4*j + 3] = vgetq_lane_s32(vi, 3);
} else {
y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
}
accv = vaddq_s32(accv, vi);
}
y[i].s = GGML_FP32_TO_FP16(d * vaddvq_s32(accv));
if (i < nb4) {
y4[i4].d[ir+4] = GGML_FP32_TO_FP16(d * vaddvq_s32(accv));
} else {
y[i].s = GGML_FP32_TO_FP16(d * vaddvq_s32(accv));
}
}
#elif defined(__wasm_simd128__)
for (int i = 0; i < nb; i++) {

49
ggml.c
View File

@ -12334,11 +12334,7 @@ UseGgmlGemm1:;
#endif
if (params->type == GGML_TASK_TYPE_INIT) {
if (ith != 0) {
return;
}
// Every thread starts at ith, so the first unprocessed chunk is nth. This saves a bit of coordination right at the start.
atomic_store(&state->shared->current_chunk, nth);
if (src1->type != vec_dot_type) {
char * wdata = params->wdata;
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@ -12346,16 +12342,45 @@ UseGgmlGemm1:;
assert(params->wsize >= ne11*ne12*ne13*row_size);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
for (int64_t i13 = 0; i13 < ne13; ++i13) {
for (int64_t i12 = 0; i12 < ne12; ++i12) {
for (int64_t i11 = 0; i11 < ne11; ++i11) {
from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);
wdata += row_size;
}
}
int64_t work_size = ne13*ne12*ne11;
int64_t work_per_thread = (work_size + nth - 1)/nth;
int64_t work_start = work_per_thread * ith;
if (work_start >= work_size) {
return;
}
int64_t work_end = MIN(work_size, work_start + work_per_thread);
for (int64_t i_work = work_start; i_work < work_end; ++i_work) {
int64_t i13 = i_work / (ne11*ne12);
int64_t i12 = (i_work - i13*ne11*ne12)/ne11;
int64_t i11 = i_work - i13*ne11*ne12 - i12*ne11;
from_float_to_vec_dot((const float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
(void *)(wdata + i_work*row_size), ne10);
}
}
if (ith == 0) {
atomic_store(&state->shared->current_chunk, nth);
}
//// Every thread starts at ith, so the first unprocessed chunk is nth. This saves a bit of coordination right at the start.
//atomic_store(&state->shared->current_chunk, nth);
//if (src1->type != vec_dot_type) {
// char * wdata = params->wdata;
// const size_t row_size = ggml_row_size(vec_dot_type, ne10);
// assert(params->wsize >= ne11*ne12*ne13*row_size);
// GGML_ASSERT(src1->type == GGML_TYPE_F32);
// for (int64_t i13 = 0; i13 < ne13; ++i13) {
// for (int64_t i12 = 0; i12 < ne12; ++i12) {
// for (int64_t i11 = 0; i11 < ne11; ++i11) {
// from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);
// wdata += row_size;
// }
// }
// }
//}
return;
}

2468
iqk_mul_mat.cpp Normal file

File diff suppressed because it is too large Load Diff

View File

@ -849,6 +849,11 @@ class tinyBLAS_Q0_AVX {
* @param Ctype is GGML data type of `C`
* @return true if this function was able to service the matmul request
*/
// Forward declaration of the iqk matrix-multiplication entry point.
// llamafile_sgemm calls it as
//   iqk_mul_mat(m, n, k * <block size>, Atype, A, B, (float *)C, ldc, ith, nth)
// and treats a `true` return as "request handled"; on `false` it falls through to
// its own code paths.
// NOTE(review): presumably defined in iqk_mul_mat.cpp - confirm.
// NOTE(review): this declaration sits between the Doxygen comment above and
// llamafile_sgemm, the function that comment appears to describe - consider moving it.
bool iqk_mul_mat(long Nx, long Ny, long ne00, int typeA, const void * A, const void * B,
float * C, long stride_C, int ith, int nth);
bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda, const void *B, int64_t ldb, void *C,
int64_t ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype) {
@ -861,6 +866,18 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
assert(nth > 0);
assert(ith < nth);
if (Btype == GGML_TYPE_Q8_K && Ctype == GGML_TYPE_F32) {
if (iqk_mul_mat(m, n, k * QK_K, Atype, A, B, (float *)C, ldc, ith, nth)) {
return true;
}
}
// Q8_0 / Q8_1 right-hand side with float output: give iqk_mul_mat the first chance
// and report success if it handled the request; otherwise fall through.
if ((Btype == GGML_TYPE_Q8_0 || Btype == GGML_TYPE_Q8_1) && Ctype == GGML_TYPE_F32) {
    // The row length handed to iqk_mul_mat is k * QK8_0 regardless of Atype, so the
    // block sizes listed here are expected to all be 32.
    // A chained `a == b == c` evaluates as `(a == b) == c`, i.e. it compares a 0/1
    // result with the next operand, which made the original expression always false.
    // Compare every constant against 32 explicitly instead.
    assert(QK8_0 == 32 && QK8_1 == 32 && QK4_0 == 32 &&
           QK4_1 == 32 && QK5_0 == 32 && QK5_1 == 32);
    if (iqk_mul_mat(m, n, k * QK8_0, Atype, A, B, (float *)C, ldc, ith, nth)) {
        return true;
    }
}
if (Ctype != GGML_TYPE_F32)
return false;