AVX512+AVXVNNI GEMM implementation for quants using Q8_K for activations (#710)

* q8_k_r16: basics

* q8_k_r16: iq4_xs now uses q8_k_r16 on Zen4+

PP performance is about the same as using q8_k_r8 on the Ryzen-7950X,
so we expect nice gains on Zen5, and we don't need to wory about
using 2 different q8_k_r8 implementations for fancy SIMD.

* q8_k_r16: iq2_xxs now uses q8_k_r16 on Zen4+

* q8_k_r16: iq2_xs now uses q8_k_r16 on Zen4+

* q8_k_r16: iq2_s now uses q8_k_r16 on Zen4+

* q8_k_r16: iq3_xxs now uses q8_k_r16 on Zen4+

* q8_k_r16: iq3_s now uses q8_k_r16 on Zen4+

* q8_k_r16: iq1_s and iq1_m now uses q8_k_r16 on Zen4+

* q8_k_r16: q2_K and q3_K now uses q8_k_r16 on Zen4+

* q8_k_r16: iq2_ks and iq2_k now uses q8_k_r16 on Zen4+

* q8_k_r16: iq2_kl now uses q8_k_r16 on Zen4+

* q8_k_r16: iq3_ks and iq3_k now uses q8_k_r16 on Zen4+

* q8_k_r16: iq4_kss, iq4_ks, and iq4_k now use q8_k_r16 on Zen4+

* q8_k_r16: iq5_ks, iq5_k, and iq6_k now use q8_k_r16 on Zen4+

* Fix AVX2

* Just always set num_rows to 16

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
Kawrakow 2025-08-22 06:27:07 +03:00 committed by GitHub
parent 0b448997ec
commit ca8c72ff1a
12 changed files with 827 additions and 490 deletions

View File

@ -475,6 +475,7 @@ extern "C" {
GGML_TYPE_IQ5_K_R4 = 340,
GGML_TYPE_IQ4_KS_R4 = 344,
GGML_TYPE_IQ5_KS_R4 = 352,
GGML_TYPE_Q8_K_R16 = 397,
GGML_TYPE_Q8_KV_R8 = 398,
GGML_TYPE_Q8_K_R8 = 399,
GGML_TYPE_COUNT,
@ -571,6 +572,7 @@ extern "C" {
GGML_FTYPE_MOSTLY_IQ5_K_R4 = 333, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ4_KS_R4 = 337, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ5_KS_R4 = 341, // except 1d tensors
GGML_FTYPE_MOSTLY_Q8_K_R16 = 397, // except 1d tensors
GGML_FTYPE_MOSTLY_Q8_KV_R8 = 398, // except 1d tensors
GGML_FTYPE_MOSTLY_Q8_K_R8 = 399, // except 1d tensors
};

View File

@ -421,6 +421,12 @@ typedef struct {
} block_q8_k_r8;
static_assert(sizeof(block_q8_k_r8) == 8*sizeof(ggml_half) + 8*QK_K, "wrong q8_k_r8 block size/padding");
typedef struct {
ggml_half d[16]; // delta
int8_t qs[16*QK_K]; // quants, stored as unsigned ints
} block_q8_k_r16;
static_assert(sizeof(block_q8_k_r16) == 16*sizeof(ggml_half) + 16*QK_K, "wrong q8_k_r16 block size/padding");
// (Almost) "true" 2-bit quantization.
// Due to the need to use blocks as per ggml design, it ends up using
// 2.0625 bpw because of the 16-bit scale for each block of 256.

View File

@ -15461,6 +15461,7 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
case GGML_TYPE_IQ5_KS_R4:break;
case GGML_TYPE_Q8_KV_R8: break;
case GGML_TYPE_Q8_K_R8: break;
case GGML_TYPE_Q8_K_R16: break;
case GGML_TYPE_Q8_KV: break;
case GGML_TYPE_BF16_R16: break;
case GGML_TYPE_Q4_0_4_4:

View File

@ -1071,6 +1071,19 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.nrows = 1,
.row_meta_size = 0,
},
[GGML_TYPE_Q8_K_R16] = {
.type_name = "q8_k_r16",
.blck_size = QK_K,
.type_size = sizeof(block_q8_k_r16)/16,
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q8_k_r16,
.from_float = quantize_row_q8_k_r16,
.from_float_ref = (ggml_from_float_t) quantize_row_q8_k_r16_ref,
.vec_dot = vec_dot_q8_k_r16_q8_k,
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
.row_meta_size = 0,
},
[GGML_TYPE_IQ2_XXS] = {
.type_name = "iq2_xxs",
.blck_size = QK_K,
@ -1934,7 +1947,7 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
}
static inline int ggml_packed_rows(enum ggml_type type) {
return type == GGML_TYPE_BF16_R16 ? 16
return type == GGML_TYPE_BF16_R16 || type == GGML_TYPE_Q8_K_R16 ? 16
: type == GGML_TYPE_Q8_K_R8 || type == GGML_TYPE_Q8_KV_R8 ||
type == GGML_TYPE_Q8_0_R8 || type == GGML_TYPE_Q4_0_R8 ||
type == GGML_TYPE_IQ4_XS_R8 ? 8
@ -4617,6 +4630,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
case GGML_FTYPE_MOSTLY_Q6_K: wtype = GGML_TYPE_Q6_K; break;
case GGML_FTYPE_MOSTLY_Q6_K_R4: wtype = GGML_TYPE_Q6_K_R4; break;
case GGML_FTYPE_MOSTLY_Q8_K_R8: wtype = GGML_TYPE_Q8_K_R8; break;
case GGML_FTYPE_MOSTLY_Q8_K_R16: wtype = GGML_TYPE_Q8_K_R16; break;
case GGML_FTYPE_MOSTLY_Q8_KV_R8: wtype = GGML_TYPE_Q8_KV_R8; break;
case GGML_FTYPE_MOSTLY_IQ2_XXS: wtype = GGML_TYPE_IQ2_XXS; break;
case GGML_FTYPE_MOSTLY_IQ2_XXS_R4: wtype = GGML_TYPE_IQ2_XXS_R4;break;
@ -11542,6 +11556,7 @@ static void ggml_compute_forward_add(
case GGML_TYPE_Q6_K:
case GGML_TYPE_Q6_K_R4:
case GGML_TYPE_Q8_K_R8:
case GGML_TYPE_Q8_K_R16:
case GGML_TYPE_Q8_KV_R8:
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XXS_R4:
@ -12094,6 +12109,7 @@ static void ggml_compute_forward_add1(
case GGML_TYPE_Q6_K:
case GGML_TYPE_Q6_K_R4:
case GGML_TYPE_Q8_K_R8:
case GGML_TYPE_Q8_K_R16:
case GGML_TYPE_Q8_KV_R8:
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XXS_R4:
@ -12272,6 +12288,7 @@ static void ggml_compute_forward_acc(
case GGML_TYPE_Q6_K:
case GGML_TYPE_Q6_K_R4:
case GGML_TYPE_Q8_K_R8:
case GGML_TYPE_Q8_K_R16:
case GGML_TYPE_Q8_KV_R8:
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XXS_R4:
@ -14966,6 +14983,17 @@ static void ggml_compute_forward_mul_mat(
#endif
#if GGML_USE_IQK_MULMAT
if (ith == 0) {
static bool first_time = true;
if (first_time) {
first_time = false;
#ifdef HAVE_FANCY_SIMD
printf("======================================= HAVE_FANCY_SIMD is defined\n");
#else
printf("======================================= HAVE_FANCY_SIMD is NOT defined\n");
#endif
}
}
if (dst->type == GGML_TYPE_F32) {
if (iqk_mul_mat_4d(ne01, ne11, ne00,
ne02, ne03, ne12, ne13, nb02, nb03, nb12, nb13, nb2/sizeof(float), nb3/sizeof(float),
@ -15872,6 +15900,7 @@ static void ggml_compute_forward_out_prod(
case GGML_TYPE_Q6_K:
case GGML_TYPE_Q6_K_R4:
case GGML_TYPE_Q8_K_R8:
case GGML_TYPE_Q8_K_R16:
case GGML_TYPE_Q8_KV_R8:
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XXS_R4:
@ -16290,6 +16319,7 @@ static void ggml_compute_forward_set(
case GGML_TYPE_Q6_K:
case GGML_TYPE_Q6_K_R4:
case GGML_TYPE_Q8_K_R8:
case GGML_TYPE_Q8_K_R16:
case GGML_TYPE_Q8_KV_R8:
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XXS_R4:
@ -16614,6 +16644,7 @@ static void ggml_compute_forward_get_rows(
case GGML_TYPE_Q6_K:
case GGML_TYPE_Q6_K_R4:
case GGML_TYPE_Q8_K_R8:
case GGML_TYPE_Q8_K_R16:
case GGML_TYPE_Q8_KV_R8:
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XXS_R4:
@ -17274,6 +17305,7 @@ static void ggml_compute_forward_clamp(
case GGML_TYPE_Q6_K:
case GGML_TYPE_Q6_K_R4:
case GGML_TYPE_Q8_K_R8:
case GGML_TYPE_Q8_K_R16:
case GGML_TYPE_Q8_KV_R8:
case GGML_TYPE_Q8_KR8:
case GGML_TYPE_IQ2_XXS:
@ -24380,6 +24412,7 @@ size_t ggml_quantize_chunk(
case GGML_TYPE_Q6_K: result = quantize_q6_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q6_K_R4: result = quantize_q6_k_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q8_K_R8: result = quantize_q8_k_r8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q8_K_R16:result = quantize_q8_k_r16(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q8_KV_R8:result = quantize_q8_KV_r8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ2_XXS: result = quantize_iq2_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ2_XXS_R4:result = quantize_iq2_xxs_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;

View File

@ -556,6 +556,60 @@ inline void iqk_transpose_8x8(__m256 * m) {
}
}
template <int nr = 8>
static inline float convert_to_q8_k_r8(int k, float d0, const __m256i * qx, const int16_t * scales, uint32_t * block, int8_t * q8_k) {
auto max_i16 = _mm256_setzero_si256();
__m256i qs[16];
for (int ib32 = 0; ib32 < 8; ++ib32) {
qs[2*ib32+0] = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(qx[ib32]));
qs[2*ib32+1] = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(qx[ib32], 1));
qs[2*ib32+0] = _mm256_mullo_epi16(qs[2*ib32+0], _mm256_set1_epi16(scales[2*ib32+0]));
qs[2*ib32+1] = _mm256_mullo_epi16(qs[2*ib32+1], _mm256_set1_epi16(scales[2*ib32+1]));
max_i16 = _mm256_max_epi16(max_i16, _mm256_sign_epi16(qs[2*ib32+0], qs[2*ib32+0]));
max_i16 = _mm256_max_epi16(max_i16, _mm256_sign_epi16(qs[2*ib32+1], qs[2*ib32+1]));
}
auto max_q32 = _mm256_cvtepi16_epi32(_mm_max_epi16(_mm256_castsi256_si128(max_i16), _mm256_extracti128_si256(max_i16, 1)));
auto imax4 = _mm_max_epi32(_mm256_castsi256_si128(max_q32), _mm256_extracti128_si256(max_q32, 1));
auto max4 = _mm_cvtepi32_ps(imax4);
max4 = _mm_max_ps(max4, _mm_movehl_ps(max4, max4));
max4 = _mm_max_ss(max4, _mm_movehdup_ps(max4));
bool needs_scaling = true;
float dnew = _mm_cvtss_f32(max4) * d0;
if (dnew < 1.f) {
dnew = 1.f; needs_scaling = false;
}
auto scale = _mm256_set1_ps(std::abs(dnew) > 1e-9f ? 1/dnew : 0.f);
for (int ib32 = 0; ib32 < 8; ++ib32) {
if (needs_scaling) {
auto i0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(qs[2*ib32+0]));
auto i1 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(qs[2*ib32+0], 1));
auto i2 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(qs[2*ib32+1]));
auto i3 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(qs[2*ib32+1], 1));
i0 = _mm256_cvtps_epi32(_mm256_round_ps(_mm256_mul_ps(scale, _mm256_cvtepi32_ps(i0)), _MM_ROUND_NEAREST));
i1 = _mm256_cvtps_epi32(_mm256_round_ps(_mm256_mul_ps(scale, _mm256_cvtepi32_ps(i1)), _MM_ROUND_NEAREST));
i2 = _mm256_cvtps_epi32(_mm256_round_ps(_mm256_mul_ps(scale, _mm256_cvtepi32_ps(i2)), _MM_ROUND_NEAREST));
i3 = _mm256_cvtps_epi32(_mm256_round_ps(_mm256_mul_ps(scale, _mm256_cvtepi32_ps(i3)), _MM_ROUND_NEAREST));
i0 = _mm256_packs_epi32(i0, i1);
i2 = _mm256_packs_epi32(i2, i3);
i0 = _mm256_packs_epi16(i0, i2);
i0 = _mm256_permutevar8x32_epi32(i0, _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7));
_mm256_storeu_si256((__m256i *)block, i0);
} else {
// 0, 1, 2, 3, 4, 5, 6, 7, 8, 16, 17, 18, 19, 20, 21, 22, 23, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
auto i0 = _mm256_packs_epi16(qs[2*ib32+0], qs[2*ib32+1]);
auto i0_l = _mm256_castsi256_si128(i0);
auto i0_h = _mm256_extracti128_si256(i0, 1);
_mm_storeu_si128((__m128i *)block+0, _mm_unpacklo_epi64(i0_l, i0_h));
_mm_storeu_si128((__m128i *)block+1, _mm_unpackhi_epi64(i0_l, i0_h));
}
auto qs = (uint32_t *)q8_k + 8*nr*ib32;
for (int l = 0; l < 8; ++l) {
qs[nr*l + k] = block[l];
}
}
return dnew;
}
#else
// ------------------------------------ __aarch64__ --------------------------------------------------

View File

@ -1668,71 +1668,23 @@ static void mul_mat_iq2_bn_r4_q8_k16(int n, const void * vx, size_t bx, const Da
}
#endif
inline float convert_to_q8_k_r8(int k, int d0, const __m256i * qx, const int16_t * scales, uint32_t * block, int8_t * q8_k) {
auto max_i16 = _mm256_setzero_si256();
for (int ib32 = 0; ib32 < 8; ++ib32) {
auto q16_l = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(qx[ib32]));
auto q16_h = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(qx[ib32], 1));
q16_l = _mm256_mullo_epi16(q16_l, _mm256_set1_epi16(scales[2*ib32+0]));
q16_h = _mm256_mullo_epi16(q16_h, _mm256_set1_epi16(scales[2*ib32+1]));
max_i16 = _mm256_max_epi16(max_i16, _mm256_sign_epi16(q16_l, q16_l));
max_i16 = _mm256_max_epi16(max_i16, _mm256_sign_epi16(q16_h, q16_h));
}
auto max_q32 = _mm256_cvtepi16_epi32(_mm_max_epi16(_mm256_castsi256_si128(max_i16), _mm256_extracti128_si256(max_i16, 1)));
auto imax4 = _mm_max_epi32(_mm256_castsi256_si128(max_q32), _mm256_extracti128_si256(max_q32, 1));
auto max4 = _mm_cvtepi32_ps(imax4);
max4 = _mm_max_ps(max4, _mm_movehl_ps(max4, max4));
max4 = _mm_max_ss(max4, _mm_movehdup_ps(max4));
bool needs_scaling = true;
float dnew = _mm_cvtss_f32(max4) / d0;
if (dnew < 1.f) {
dnew = 1.f; needs_scaling = false;
}
auto scale = _mm256_set1_ps(std::abs(dnew) > 1e-9f ? 1/dnew : 0.f);
for (int ib32 = 0; ib32 < 8; ++ib32) {
auto q16_l = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(qx[ib32]));
auto q16_h = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(qx[ib32], 1));
q16_l = _mm256_mullo_epi16(q16_l, _mm256_set1_epi16(scales[2*ib32+0]));
q16_h = _mm256_mullo_epi16(q16_h, _mm256_set1_epi16(scales[2*ib32+1]));
if (needs_scaling) {
auto i0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(q16_l));
auto i1 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(q16_l, 1));
auto i2 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(q16_h));
auto i3 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(q16_h, 1));
i0 = _mm256_cvtps_epi32(_mm256_round_ps(_mm256_mul_ps(scale, _mm256_cvtepi32_ps(i0)), _MM_ROUND_NEAREST));
i1 = _mm256_cvtps_epi32(_mm256_round_ps(_mm256_mul_ps(scale, _mm256_cvtepi32_ps(i1)), _MM_ROUND_NEAREST));
i2 = _mm256_cvtps_epi32(_mm256_round_ps(_mm256_mul_ps(scale, _mm256_cvtepi32_ps(i2)), _MM_ROUND_NEAREST));
i3 = _mm256_cvtps_epi32(_mm256_round_ps(_mm256_mul_ps(scale, _mm256_cvtepi32_ps(i3)), _MM_ROUND_NEAREST));
i0 = _mm256_packs_epi32(i0, i1);
i2 = _mm256_packs_epi32(i2, i3);
i0 = _mm256_packs_epi16(i0, i2);
i0 = _mm256_permutevar8x32_epi32(i0, _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7));
_mm256_storeu_si256((__m256i *)block, i0);
} else {
// 0, 1, 2, 3, 4, 5, 6, 7, 8, 16, 17, 18, 19, 20, 21, 22, 23, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
auto i0 = _mm256_packs_epi16(q16_l, q16_h);
auto i0_l = _mm256_castsi256_si128(i0);
auto i0_h = _mm256_extracti128_si256(i0, 1);
_mm_storeu_si128((__m128i *)block+0, _mm_unpacklo_epi64(i0_l, i0_h));
_mm_storeu_si128((__m128i *)block+1, _mm_unpackhi_epi64(i0_l, i0_h));
}
auto qs = (uint32_t *)q8_k + 64*ib32;
for (int l = 0; l < 8; ++l) {
qs[8*l + k] = block[l];
}
}
return dnew;
}
void iqk_convert_iq1_s_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
#ifdef HAVE_FANCY_SIMD
constexpr int k_nr = 16;
using block_q8_k_r = block_q8_k_r16;
#else
constexpr int k_nr = 8;
using block_q8_k_r = block_q8_k_r8;
#endif
GGML_ASSERT(n%QK_K == 0);
GGML_ASSERT(nrc_x%8 == 0);
GGML_ASSERT(nrc_x%k_nr == 0);
int nb = n/QK_K;
const block_iq1_s * x8[8];
const block_iq1_s * x8[k_nr];
block_q8_k_r8 * y = (block_q8_k_r8 *)vy;
block_q8_k_r * y = (block_q8_k_r *)vy;
int16_t ls[16];
@ -1740,10 +1692,10 @@ void iqk_convert_iq1_s_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int
__m256i qx[8];
for (int ix = 0; ix < nrc_x; ix += 8) {
for (int k = 0; k < 8; ++k) x8[k] = (const block_iq1_s *)((const char *)vx + (ix + k)*bx);
for (int ix = 0; ix < nrc_x; ix += k_nr) {
for (int k = 0; k < k_nr; ++k) x8[k] = (const block_iq1_s *)((const char *)vx + (ix + k)*bx);
for (int i = 0; i < nb; ++i) {
for (int k = 0; k < 8; ++k) {
for (int k = 0; k < k_nr; ++k) {
float d = 0.125f * GGML_FP16_TO_FP32(x8[k][i].d);
auto qs = x8[k][i].qs;
auto qh = x8[k][i].qh;
@ -1759,23 +1711,36 @@ void iqk_convert_iq1_s_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int
qx[ib32] = value;
qs += 4;
}
float dnew = convert_to_q8_k_r8(k, 126, qx, ls, block, y[i].qs);
float dnew = convert_to_q8_k_r8<k_nr>(k, 1.f/126, qx, ls, block, y[i].qs);
y[i].d[k] = GGML_FP32_TO_FP16(d*dnew);
}
#ifdef HAVE_FANCY_SIMD
for (int l = 0; l < 64; ++l) {
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[i].qs + l), _mm512_set1_epi8(-128));
_mm512_storeu_si512((__m512i *)y[i].qs + l, v);
}
#endif
}
y += nb;
}
}
void iqk_convert_iq1_m_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
#ifdef HAVE_FANCY_SIMD
constexpr int k_nr = 16;
using block_q8_k_r = block_q8_k_r16;
#else
constexpr int k_nr = 8;
using block_q8_k_r = block_q8_k_r8;
#endif
GGML_ASSERT(n%QK_K == 0);
GGML_ASSERT(nrc_x%8 == 0);
GGML_ASSERT(nrc_x%k_nr == 0);
int nb = n/QK_K;
const block_iq1_m * x8[8];
const block_iq1_m * x8[k_nr];
block_q8_k_r8 * y = (block_q8_k_r8 *)vy;
block_q8_k_r * y = (block_q8_k_r *)vy;
int16_t ls[16];
@ -1785,10 +1750,10 @@ void iqk_convert_iq1_m_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int
auto mask = _mm256_setr_epi32(0x00000008, 0x00000008, 0x00000080, 0x00000080, 0x00080000, 0x00080000, 0x00800000, 0x00800000);
for (int ix = 0; ix < nrc_x; ix += 8) {
for (int k = 0; k < 8; ++k) x8[k] = (const block_iq1_m *)((const char *)vx + (ix + k)*bx);
for (int ix = 0; ix < nrc_x; ix += k_nr) {
for (int k = 0; k < k_nr; ++k) x8[k] = (const block_iq1_m *)((const char *)vx + (ix + k)*bx);
for (int i = 0; i < nb; ++i) {
for (int k = 0; k < 8; ++k) {
for (int k = 0; k < k_nr; ++k) {
const uint16_t * sc = (const uint16_t *)x8[k][i].scales;
iq1m_scale_t scale;
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
@ -1816,9 +1781,15 @@ void iqk_convert_iq1_m_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int
qs += 4;
qh += 2;
}
float dnew = convert_to_q8_k_r8(k, 126, qx, ls, block, y[i].qs);
float dnew = convert_to_q8_k_r8<k_nr>(k, 1.f/126, qx, ls, block, y[i].qs);
y[i].d[k] = GGML_FP32_TO_FP16(d*dnew);
}
#ifdef HAVE_FANCY_SIMD
for (int l = 0; l < 64; ++l) {
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[i].qs + l), _mm512_set1_epi8(-128));
_mm512_storeu_si512((__m512i *)y[i].qs + l, v);
}
#endif
}
y += nb;
}

View File

@ -2305,68 +2305,22 @@ template <typename Dequantizer> void set_functions(std::array<mul_mat_t, IQK_MAX
#endif
}
inline float convert_to_q8_k_r8(int k, float d0, const __m256i * qx, const int16_t * scales, uint32_t * block, int8_t * q8_k) {
auto max_i16 = _mm256_setzero_si256();
__m256i qs[16];
for (int ib32 = 0; ib32 < 8; ++ib32) {
qs[2*ib32+0] = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(qx[ib32]));
qs[2*ib32+1] = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(qx[ib32], 1));
qs[2*ib32+0] = _mm256_mullo_epi16(qs[2*ib32+0], _mm256_set1_epi16(scales[2*ib32+0]));
qs[2*ib32+1] = _mm256_mullo_epi16(qs[2*ib32+1], _mm256_set1_epi16(scales[2*ib32+1]));
max_i16 = _mm256_max_epi16(max_i16, _mm256_sign_epi16(qs[2*ib32+0], qs[2*ib32+0]));
max_i16 = _mm256_max_epi16(max_i16, _mm256_sign_epi16(qs[2*ib32+1], qs[2*ib32+1]));
}
auto max_q32 = _mm256_cvtepi16_epi32(_mm_max_epi16(_mm256_castsi256_si128(max_i16), _mm256_extracti128_si256(max_i16, 1)));
auto imax4 = _mm_max_epi32(_mm256_castsi256_si128(max_q32), _mm256_extracti128_si256(max_q32, 1));
auto max4 = _mm_cvtepi32_ps(imax4);
max4 = _mm_max_ps(max4, _mm_movehl_ps(max4, max4));
max4 = _mm_max_ss(max4, _mm_movehdup_ps(max4));
bool needs_scaling = true;
float dnew = _mm_cvtss_f32(max4) * d0;
if (dnew < 1.f) {
dnew = 1.f; needs_scaling = false;
}
auto scale = _mm256_set1_ps(std::abs(dnew) > 1e-9f ? 1/dnew : 0.f);
for (int ib32 = 0; ib32 < 8; ++ib32) {
if (needs_scaling) {
auto i0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(qs[2*ib32+0]));
auto i1 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(qs[2*ib32+0], 1));
auto i2 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(qs[2*ib32+1]));
auto i3 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(qs[2*ib32+1], 1));
i0 = _mm256_cvtps_epi32(_mm256_round_ps(_mm256_mul_ps(scale, _mm256_cvtepi32_ps(i0)), _MM_ROUND_NEAREST));
i1 = _mm256_cvtps_epi32(_mm256_round_ps(_mm256_mul_ps(scale, _mm256_cvtepi32_ps(i1)), _MM_ROUND_NEAREST));
i2 = _mm256_cvtps_epi32(_mm256_round_ps(_mm256_mul_ps(scale, _mm256_cvtepi32_ps(i2)), _MM_ROUND_NEAREST));
i3 = _mm256_cvtps_epi32(_mm256_round_ps(_mm256_mul_ps(scale, _mm256_cvtepi32_ps(i3)), _MM_ROUND_NEAREST));
i0 = _mm256_packs_epi32(i0, i1);
i2 = _mm256_packs_epi32(i2, i3);
i0 = _mm256_packs_epi16(i0, i2);
i0 = _mm256_permutevar8x32_epi32(i0, _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7));
_mm256_storeu_si256((__m256i *)block, i0);
} else {
// 0, 1, 2, 3, 4, 5, 6, 7, 8, 16, 17, 18, 19, 20, 21, 22, 23, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
auto i0 = _mm256_packs_epi16(qs[2*ib32+0], qs[2*ib32+1]);
auto i0_l = _mm256_castsi256_si128(i0);
auto i0_h = _mm256_extracti128_si256(i0, 1);
_mm_storeu_si128((__m128i *)block+0, _mm_unpacklo_epi64(i0_l, i0_h));
_mm_storeu_si128((__m128i *)block+1, _mm_unpackhi_epi64(i0_l, i0_h));
}
auto qs = (uint32_t *)q8_k + 64*ib32;
for (int l = 0; l < 8; ++l) {
qs[8*l + k] = block[l];
}
}
return dnew;
}
void iqk_convert_iq2_ks_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
#ifdef HAVE_FANCY_SIMD
constexpr int k_nr = 16;
using block_q8_k_r = block_q8_k_r16;
#else
constexpr int k_nr = 8;
using block_q8_k_r = block_q8_k_r8;
#endif
GGML_ASSERT(n%QK_K == 0);
GGML_ASSERT(nrc_x%8 == 0);
GGML_ASSERT(nrc_x%k_nr == 0);
int nb = n/QK_K;
const block_iq2_ks * x8[8];
const block_iq2_ks * x8[k_nr];
block_q8_k_r8 * y = (block_q8_k_r8 *)vy;
block_q8_k_r * y = (block_q8_k_r *)vy;
__m256i values;
{
@ -2374,8 +2328,8 @@ void iqk_convert_iq2_ks_q8_k_r8(int n, const void * vx, size_t bx, void * vy, in
values = MM256_SET_M128I(v, v);
}
ggml_half dh[8];
float dnew[8];
ggml_half dh[k_nr];
float dnew[k_nr];
uint32_t block[8];
int16_t ls[16];
@ -2383,14 +2337,14 @@ void iqk_convert_iq2_ks_q8_k_r8(int n, const void * vx, size_t bx, void * vy, in
auto ml = _mm256_set1_epi8(0x03);
for (int ix = 0; ix < nrc_x; ix += 8) {
for (int k = 0; k < 8; ++k) {
for (int ix = 0; ix < nrc_x; ix += k_nr) {
for (int k = 0; k < k_nr; ++k) {
const ggml_half * dptr = (const ggml_half *)((const char *)vx + (ix+k)*bx);
dh[k] = dptr[0];
x8[k] = (const block_iq2_ks *)(dptr + 1);
}
for (int i = 0; i < nb; ++i) {
for (int k = 0; k < 8; ++k) {
for (int k = 0; k < k_nr; ++k) {
auto extra = x8[k][i].extra;
for (int i128 = 0; i128 < 2; ++i128) {
ls[8*i128+0] = ls[8*i128+1] = ((x8[k][i].scales[2*i128+0] & 0xf) | ((extra >> 4) & 0x10)) - 16;
@ -2412,24 +2366,116 @@ void iqk_convert_iq2_ks_q8_k_r8(int n, const void * vx, size_t bx, void * vy, in
xv[4*i128+3] = _mm256_shuffle_epi8(values, xv[4*i128+3]);
extra >>= 4;
}
dnew[k] = convert_to_q8_k_r8(k, 1.f/125, xv, ls, block, y[i].qs);
dnew[k] = convert_to_q8_k_r8<k_nr>(k, 1.f/125, xv, ls, block, y[i].qs);
}
#ifdef HAVE_FANCY_SIMD
auto vd = _mm512_mul_ps(_mm512_loadu_ps(dnew), _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)dh)));
_mm256_storeu_si256((__m256i *)y[i].d, _mm512_cvtps_ph(vd, _MM_ROUND_NEAREST));
for (int l = 0; l < 64; ++l) {
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[i].qs + l), _mm512_set1_epi8(-128));
_mm512_storeu_si512((__m512i *)y[i].qs + l, v);
}
#else
auto vd = _mm256_mul_ps(_mm256_loadu_ps(dnew), _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)dh)));
_mm_storeu_si128((__m128i *)y[i].d, _mm256_cvtps_ph(vd, _MM_ROUND_NEAREST));
#endif
}
y += nb;
}
}
void iqk_convert_iq2_k_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
#ifdef HAVE_FANCY_SIMD
constexpr int k_nr = 16;
using block_q8_k_r = block_q8_k_r16;
#else
constexpr int k_nr = 8;
using block_q8_k_r = block_q8_k_r8;
#endif
GGML_ASSERT(n%QK_K == 0);
GGML_ASSERT(nrc_x%k_nr == 0);
int nb = n/QK_K;
const block_iq2_k * x8[k_nr];
block_q8_k_r * y = (block_q8_k_r *)vy;
__m256i values;
{
auto v = _mm_loadl_epi64((const __m128i *)iq2nl_values);
values = MM256_SET_M128I(v, v);
}
__m256i xv[8];
uint32_t block[8];
const __m128i scale_shuffle = _mm_set_epi32(0x0f070e06, 0x0d050c04, 0x0b030a02, 0x09010800);
union { __m256i vec; int16_t val[16]; } helper;
auto ml = _mm256_set1_epi8(0x03);
for (int ix = 0; ix < nrc_x; ix += k_nr) {
for (int k = 0; k < k_nr; ++k) x8[k] = (const block_iq2_k *)((const char *)vx + (ix+k)*bx);
for (int i = 0; i < nb; ++i) {
for (int k = 0; k < k_nr; ++k) {
float d = GGML_FP16_TO_FP32(x8[k][i].d);
uint64_t aux64; std::memcpy(&aux64, x8[k][i].scales, 8);
auto scl = _mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), _mm_set1_epi8(0xf));
scl = _mm_add_epi8(scl, _mm_set1_epi8(-8));
helper.vec = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scl, scale_shuffle));
auto extra = x8[k][i].extra;
for (int i128 = 0; i128 < 2; ++i128) {
auto bits = _mm256_loadu_si256((const __m256i *)x8[k][i].qs+i128);
xv[4*i128+0] = _mm256_and_si256(bits, ml);
xv[4*i128+1] = _mm256_and_si256(_mm256_srli_epi16(bits, 2), ml);
xv[4*i128+2] = _mm256_and_si256(_mm256_srli_epi16(bits, 4), ml);
xv[4*i128+3] = _mm256_and_si256(_mm256_srli_epi16(bits, 6), ml);
auto shift1 = MM256_SET_M128I(_mm_set1_epi8((extra & 0x02) << 1), _mm_set1_epi8((extra & 0x01) << 2));
auto shift2 = MM256_SET_M128I(_mm_set1_epi8((extra & 0x08) >> 1), _mm_set1_epi8((extra & 0x04) >> 0));
auto shift3 = MM256_SET_M128I(_mm_set1_epi8((extra & 0x20) >> 3), _mm_set1_epi8((extra & 0x10) >> 2));
auto shift4 = MM256_SET_M128I(_mm_set1_epi8((extra & 0x80) >> 5), _mm_set1_epi8((extra & 0x40) >> 4));
xv[4*i128+0] = _mm256_add_epi8(xv[4*i128+0], shift1);
xv[4*i128+1] = _mm256_add_epi8(xv[4*i128+1], shift2);
xv[4*i128+2] = _mm256_add_epi8(xv[4*i128+2], shift3);
xv[4*i128+3] = _mm256_add_epi8(xv[4*i128+3], shift4);
xv[4*i128+0] = _mm256_shuffle_epi8(values, xv[4*i128+0]);
xv[4*i128+1] = _mm256_shuffle_epi8(values, xv[4*i128+1]);
xv[4*i128+2] = _mm256_shuffle_epi8(values, xv[4*i128+2]);
xv[4*i128+3] = _mm256_shuffle_epi8(values, xv[4*i128+3]);
extra >>= 8;
}
float dnew = convert_to_q8_k_r8<k_nr>(k, 1.f/120, xv, helper.val, block, y[i].qs);
y[i].d[k] = GGML_FP32_TO_FP16(d*dnew);
}
#ifdef HAVE_FANCY_SIMD
for (int l = 0; l < 64; ++l) {
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[i].qs + l), _mm512_set1_epi8(-128));
_mm512_storeu_si512((__m512i *)y[i].qs + l, v);
}
#endif
}
y += nb;
}
}
void iqk_convert_iq2_kl_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
#ifdef HAVE_FANCY_SIMD
constexpr int k_nr = 16;
using block_q8_k_r = block_q8_k_r16;
#else
constexpr int k_nr = 8;
using block_q8_k_r = block_q8_k_r8;
#endif
GGML_ASSERT(n%QK_K == 0);
GGML_ASSERT(nrc_x%8 == 0);
GGML_ASSERT(nrc_x%k_nr == 0);
int nb = n/QK_K;
const block_iq2_kl * x8[8];
const block_iq2_kl * x8[k_nr];
block_q8_k_r8 * y = (block_q8_k_r8 *)vy;
block_q8_k_r * y = (block_q8_k_r *)vy;
__m256i values[4];
{
@ -2443,8 +2489,8 @@ void iqk_convert_iq2_kl_q8_k_r8(int n, const void * vx, size_t bx, void * vy, in
}
}
ggml_half dh[8];
float dnew[8];
ggml_half dh[k_nr];
float dnew[k_nr];
uint32_t block[8];
int16_t ls[16];
@ -2455,14 +2501,14 @@ void iqk_convert_iq2_kl_q8_k_r8(int n, const void * vx, size_t bx, void * vy, in
uint32_t sl32;
const auto sl8 = (const int8_t *)&sl32;
for (int ix = 0; ix < nrc_x; ix += 8) {
for (int k = 0; k < 8; ++k) {
for (int ix = 0; ix < nrc_x; ix += k_nr) {
for (int k = 0; k < k_nr; ++k) {
const ggml_half * dptr = (const ggml_half *)((const char *)vx + (ix+k)*bx);
dh[k] = dptr[0];
x8[k] = (const block_iq2_kl *)(dptr + 1);
}
for (int i = 0; i < nb; ++i) {
for (int k = 0; k < 8; ++k) {
for (int k = 0; k < k_nr; ++k) {
uint32_t aux32;
std::memcpy(&aux32, x8[k][i].scales_l, 4);
auto sh = x8[k][i].scales_h;
@ -2500,87 +2546,118 @@ void iqk_convert_iq2_kl_q8_k_r8(int n, const void * vx, size_t bx, void * vy, in
}
hbits = _mm256_srli_epi16(hbits, 4);
}
dnew[k] = convert_to_q8_k_r8(k, 1.f/125, xv, ls, block, y[i].qs);
dnew[k] = convert_to_q8_k_r8<k_nr>(k, 1.f/125, xv, ls, block, y[i].qs);
}
#ifdef HAVE_FANCY_SIMD
auto vd = _mm512_mul_ps(_mm512_loadu_ps(dnew), _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)dh)));
_mm256_storeu_si256((__m256i *)y[i].d, _mm512_cvtps_ph(vd, _MM_ROUND_NEAREST));
for (int l = 0; l < 64; ++l) {
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[i].qs + l), _mm512_set1_epi8(-128));
_mm512_storeu_si512((__m512i *)y[i].qs + l, v);
}
#else
auto vd = _mm256_mul_ps(_mm256_loadu_ps(dnew), _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)dh)));
_mm_storeu_si128((__m128i *)y[i].d, _mm256_cvtps_ph(vd, _MM_ROUND_NEAREST));
#endif
}
y += nb;
}
}
void iqk_convert_iq2_k_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
void iqk_convert_iq3_ks_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
#ifdef HAVE_FANCY_SIMD
constexpr int k_nr = 16;
using block_q8_k_r = block_q8_k_r16;
#else
constexpr int k_nr = 8;
using block_q8_k_r = block_q8_k_r8;
#endif
GGML_ASSERT(n%QK_K == 0);
GGML_ASSERT(nrc_x%8 == 0);
GGML_ASSERT(nrc_x%k_nr == 0);
int nb = n/QK_K;
const block_iq2_k * x8[8];
const block_iq3_ks * x8[k_nr];
block_q8_k_r8 * y = (block_q8_k_r8 *)vy;
block_q8_k_r * y = (block_q8_k_r *)vy;
__m256i values;
{
auto v = _mm_loadl_epi64((const __m128i *)iq2nl_values);
auto v = _mm_loadu_si128((const __m128i *)iq3nl_values);
values = MM256_SET_M128I(v, v);
}
__m256i xv[8];
ggml_half drow[k_nr];
float dnew[k_nr];
int16_t ls[16];
__m256i xv[8];
uint32_t block[8];
const __m128i scale_shuffle = _mm_set_epi32(0x0f070e06, 0x0d050c04, 0x0b030a02, 0x09010800);
union { __m256i vec; int16_t val[16]; } helper;
auto ml = _mm256_set1_epi8(0x03);
for (int ix = 0; ix < nrc_x; ix += 8) {
for (int k = 0; k < 8; ++k) x8[k] = (const block_iq2_k *)((const char *)vx + (ix+k)*bx);
for (int ix = 0; ix < nrc_x; ix += k_nr) {
for (int k = 0; k < k_nr; ++k) {
const ggml_half * dptr = (const ggml_half *)((const char *)vx + (ix + k)*bx);
drow[k] = dptr[0];
x8[k] = (const block_iq3_ks *)(dptr + 1);
}
#ifdef HAVE_FANCY_SIMD
auto vd = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)drow));
#else
auto vd = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)drow));
#endif
for (int i = 0; i < nb; ++i) {
for (int k = 0; k < 8; ++k) {
float d = GGML_FP16_TO_FP32(x8[k][i].d);
uint64_t aux64; std::memcpy(&aux64, x8[k][i].scales, 8);
auto scl = _mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), _mm_set1_epi8(0xf));
scl = _mm_add_epi8(scl, _mm_set1_epi8(-8));
helper.vec = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scl, scale_shuffle));
for (int k = 0; k < k_nr; ++k) {
auto hbits = _mm256_loadu_si256((const __m256i *)x8[k][i].qh);
auto extra = x8[k][i].extra;
for (int i128 = 0; i128 < 2; ++i128) {
auto bits = _mm256_loadu_si256((const __m256i *)x8[k][i].qs+i128);
xv[4*i128+0] = _mm256_and_si256(bits, ml);
xv[4*i128+1] = _mm256_and_si256(_mm256_srli_epi16(bits, 2), ml);
xv[4*i128+2] = _mm256_and_si256(_mm256_srli_epi16(bits, 4), ml);
xv[4*i128+3] = _mm256_and_si256(_mm256_srli_epi16(bits, 6), ml);
auto shift1 = MM256_SET_M128I(_mm_set1_epi8((extra & 0x02) << 1), _mm_set1_epi8((extra & 0x01) << 2));
auto shift2 = MM256_SET_M128I(_mm_set1_epi8((extra & 0x08) >> 1), _mm_set1_epi8((extra & 0x04) >> 0));
auto shift3 = MM256_SET_M128I(_mm_set1_epi8((extra & 0x20) >> 3), _mm_set1_epi8((extra & 0x10) >> 2));
auto shift4 = MM256_SET_M128I(_mm_set1_epi8((extra & 0x80) >> 5), _mm_set1_epi8((extra & 0x40) >> 4));
xv[4*i128+0] = _mm256_add_epi8(xv[4*i128+0], shift1);
xv[4*i128+1] = _mm256_add_epi8(xv[4*i128+1], shift2);
xv[4*i128+2] = _mm256_add_epi8(xv[4*i128+2], shift3);
xv[4*i128+3] = _mm256_add_epi8(xv[4*i128+3], shift4);
xv[4*i128+0] = _mm256_shuffle_epi8(values, xv[4*i128+0]);
xv[4*i128+1] = _mm256_shuffle_epi8(values, xv[4*i128+1]);
xv[4*i128+2] = _mm256_shuffle_epi8(values, xv[4*i128+2]);
xv[4*i128+3] = _mm256_shuffle_epi8(values, xv[4*i128+3]);
extra >>= 8;
uint8_t extra_v = extra >> 8;
for (int j = 0; j < 4; ++j) {
ls[2*j+0] = ls[2*j+1] = ((x8[k][i].scales[j] & 0xf) | ((extra << 4) & 0x10)) - 16;
ls[2*j+8] = ls[2*j+9] = ((x8[k][i].scales[j] >> 4) | ((extra << 0) & 0x10)) - 16;
extra >>= 1;
}
float dnew = convert_to_q8_k_r8(k, 1.f/120, xv, helper.val, block, y[i].qs);
y[i].d[k] = GGML_FP32_TO_FP16(d*dnew);
for (int i128 = 0; i128 < QK_K/128; ++i128) {
auto lbits = _mm256_loadu_si256((const __m256i *)x8[k][i].qs + i128);
for (int j = 0; j < 4; ++j) {
xv[4*i128+j] = _mm256_or_si256(_mm256_and_si256(lbits, _mm256_set1_epi8(3)), _mm256_and_si256(_mm256_slli_epi16(hbits, 2), _mm256_set1_epi8(4)));
xv[4*i128+j] = _mm256_add_epi8(xv[4*i128+j], _mm256_set1_epi8((extra_v & 1) << 3));
xv[4*i128+j] = _mm256_shuffle_epi8(values, xv[4*i128+j]);
extra_v >>= 1;
lbits = _mm256_srli_epi16(lbits, 2);
hbits = _mm256_srli_epi16(hbits, 1);
}
}
dnew[k] = convert_to_q8_k_r8<k_nr>(k, 1.f/127, xv, ls, block, y[i].qs);
}
#ifdef HAVE_FANCY_SIMD
_mm256_storeu_si256((__m256i *)y[i].d, _mm512_cvtps_ph(_mm512_mul_ps(vd, _mm512_loadu_ps(dnew)), _MM_ROUND_NEAREST));
for (int l = 0; l < 64; ++l) {
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[i].qs + l), _mm512_set1_epi8(-128));
_mm512_storeu_si512((__m512i *)y[i].qs + l, v);
}
#else
_mm_storeu_si128((__m128i *)y[i].d, _mm256_cvtps_ph(_mm256_mul_ps(vd, _mm256_loadu_ps(dnew)), _MM_ROUND_NEAREST));
#endif
}
y += nb;
}
}
void iqk_convert_iq3_k_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
#ifdef HAVE_FANCY_SIMD
constexpr int k_nr = 16;
using block_q8_k_r = block_q8_k_r16;
#else
constexpr int k_nr = 8;
using block_q8_k_r = block_q8_k_r8;
#endif
GGML_ASSERT(n%QK_K == 0);
GGML_ASSERT(nrc_x%8 == 0);
GGML_ASSERT(nrc_x%k_nr == 0);
int nb = n/QK_K;
const block_iq3_k * x8[8];
const block_iq3_k * x8[k_nr];
block_q8_k_r8 * y = (block_q8_k_r8 *)vy;
block_q8_k_r * y = (block_q8_k_r *)vy;
__m256i values;
{
@ -2601,10 +2678,10 @@ void iqk_convert_iq3_k_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int
auto ml = _mm256_set1_epi8(0x03);
auto hmask = _mm256_set1_epi8(4);
for (int ix = 0; ix < nrc_x; ix += 8) {
for (int k = 0; k < 8; ++k) x8[k] = (const block_iq3_k *)((const char *)vx + (ix+k)*bx);
for (int ix = 0; ix < nrc_x; ix += k_nr) {
for (int k = 0; k < k_nr; ++k) x8[k] = (const block_iq3_k *)((const char *)vx + (ix+k)*bx);
for (int i = 0; i < nb; ++i) {
for (int k = 0; k < 8; ++k) {
for (int k = 0; k < k_nr; ++k) {
float d = GGML_FP16_TO_FP32(x8[k][i].d);
uint64_t aux64; std::memcpy(&aux64, x8[k][i].scales_l, 8);
auto scl = _mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), _mm_set1_epi8(0xf));
@ -2639,82 +2716,36 @@ void iqk_convert_iq3_k_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int
hbits = _mm256_srli_epi16(hbits, 4);
extra >>= 8;
}
float dnew = convert_to_q8_k_r8(k, 1.f/127, xv, helper.val, block, y[i].qs);
float dnew = convert_to_q8_k_r8<k_nr>(k, 1.f/127, xv, helper.val, block, y[i].qs);
y[i].d[k] = GGML_FP32_TO_FP16(d*dnew);
}
}
y += nb;
}
}
void iqk_convert_iq3_ks_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
GGML_ASSERT(n%QK_K == 0);
GGML_ASSERT(nrc_x%8 == 0);
int nb = n/QK_K;
const block_iq3_ks * x8[8];
block_q8_k_r8 * y = (block_q8_k_r8 *)vy;
__m256i values;
{
auto v = _mm_loadu_si128((const __m128i *)iq3nl_values);
values = MM256_SET_M128I(v, v);
}
ggml_half drow[8];
float dnew[8];
int16_t ls[16];
__m256i xv[8];
uint32_t block[8];
for (int ix = 0; ix < nrc_x; ix += 8) {
for (int k = 0; k < 8; ++k) {
const ggml_half * dptr = (const ggml_half *)((const char *)vx + (ix + k)*bx);
drow[k] = dptr[0];
x8[k] = (const block_iq3_ks *)(dptr + 1);
}
auto vd = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)drow));
for (int i = 0; i < nb; ++i) {
for (int k = 0; k < 8; ++k) {
auto hbits = _mm256_loadu_si256((const __m256i *)x8[k][i].qh);
auto extra = x8[k][i].extra;
uint8_t extra_v = extra >> 8;
for (int j = 0; j < 4; ++j) {
ls[2*j+0] = ls[2*j+1] = ((x8[k][i].scales[j] & 0xf) | ((extra << 4) & 0x10)) - 16;
ls[2*j+8] = ls[2*j+9] = ((x8[k][i].scales[j] >> 4) | ((extra << 0) & 0x10)) - 16;
extra >>= 1;
}
for (int i128 = 0; i128 < QK_K/128; ++i128) {
auto lbits = _mm256_loadu_si256((const __m256i *)x8[k][i].qs + i128);
for (int j = 0; j < 4; ++j) {
xv[4*i128+j] = _mm256_or_si256(_mm256_and_si256(lbits, _mm256_set1_epi8(3)), _mm256_and_si256(_mm256_slli_epi16(hbits, 2), _mm256_set1_epi8(4)));
xv[4*i128+j] = _mm256_add_epi8(xv[4*i128+j], _mm256_set1_epi8((extra_v & 1) << 3));
xv[4*i128+j] = _mm256_shuffle_epi8(values, xv[4*i128+j]);
extra_v >>= 1;
lbits = _mm256_srli_epi16(lbits, 2);
hbits = _mm256_srli_epi16(hbits, 1);
}
}
dnew[k] = convert_to_q8_k_r8(k, 1.f/127, xv, ls, block, y[i].qs);
#ifdef HAVE_FANCY_SIMD
for (int l = 0; l < 64; ++l) {
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[i].qs + l), _mm512_set1_epi8(-128));
_mm512_storeu_si512((__m512i *)y[i].qs + l, v);
}
_mm_storeu_si128((__m128i *)y[i].d, _mm256_cvtps_ph(_mm256_mul_ps(vd, _mm256_loadu_ps(dnew)), _MM_ROUND_NEAREST));
#endif
}
y += nb;
}
}
void iqk_convert_iq4_kss_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
#ifdef HAVE_FANCY_SIMD
constexpr int k_nr = 16;
using block_q8_k_r = block_q8_k_r16;
#else
constexpr int k_nr = 8;
using block_q8_k_r = block_q8_k_r8;
#endif
GGML_ASSERT(n%QK_K == 0);
GGML_ASSERT(nrc_x%8 == 0);
GGML_ASSERT(nrc_x%k_nr == 0);
int nb = n/QK_K;
const block_iq4_kss * x8[8];
const block_iq4_kss * x8[k_nr];
block_q8_k_r8 * y = (block_q8_k_r8 *)vy;
block_q8_k_r * y = (block_q8_k_r *)vy;
__m256i values[2];
{
@ -2724,22 +2755,26 @@ void iqk_convert_iq4_kss_q8_k_r8(int n, const void * vx, size_t bx, void * vy, i
values[1] = MM256_SET_M128I(v2, v2);
}
float drow[8];
float dnew[8];
float drow[k_nr];
float dnew[k_nr];
int16_t ls[16];
__m256i xv[8];
uint32_t block[8];
for (int ix = 0; ix < nrc_x; ix += 8) {
for (int k = 0; k < 8; ++k) {
for (int ix = 0; ix < nrc_x; ix += k_nr) {
for (int k = 0; k < k_nr; ++k) {
const float * dptr = (const float *)((const char *)vx + (ix + k)*bx);
drow[k] = dptr[0];
x8[k] = (const block_iq4_kss *)(dptr + 1);
}
#ifdef HAVE_FANCY_SIMD
auto vd = _mm512_loadu_ps(drow);
#else
auto vd = _mm256_loadu_ps(drow);
#endif
for (int i = 0; i < nb; ++i) {
for (int k = 0; k < 8; ++k) {
for (int k = 0; k < k_nr; ++k) {
for (int ib32 = 0; ib32 < 8; ++ib32) {
auto val = _mm_loadu_si128((const __m128i *)x8[k][i].qs+ib32);
auto val_q = _mm_and_si128(val, _mm_set1_epi32(0xfffefffe));
@ -2752,23 +2787,38 @@ void iqk_convert_iq4_kss_q8_k_r8(int n, const void * vx, size_t bx, void * vy, i
ls[2*ib32+0] = ls[2*ib32+1] = ((s8 & 254) - 127);
xv[ib32] = _mm256_shuffle_epi8(values[s8 & 1], xv[ib32]);
}
dnew[k] = convert_to_q8_k_r8(k, 1.f/127, xv, ls, block, y[i].qs);
dnew[k] = convert_to_q8_k_r8<k_nr>(k, 1.f/127, xv, ls, block, y[i].qs);
}
#ifdef HAVE_FANCY_SIMD
_mm256_storeu_si256((__m256i *)y[i].d, _mm512_cvtps_ph(_mm512_mul_ps(vd, _mm512_loadu_ps(dnew)), _MM_ROUND_NEAREST));
for (int l = 0; l < 64; ++l) {
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[i].qs + l), _mm512_set1_epi8(-128));
_mm512_storeu_si512((__m512i *)y[i].qs + l, v);
}
#else
_mm_storeu_si128((__m128i *)y[i].d, _mm256_cvtps_ph(_mm256_mul_ps(vd, _mm256_loadu_ps(dnew)), _MM_ROUND_NEAREST));
#endif
}
y += nb;
}
}
void iqk_convert_iq4_ks_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
#ifdef HAVE_FANCY_SIMD
constexpr int k_nr = 16;
using block_q8_k_r = block_q8_k_r16;
#else
constexpr int k_nr = 8;
using block_q8_k_r = block_q8_k_r8;
#endif
GGML_ASSERT(n%QK_K == 0);
GGML_ASSERT(nrc_x%8 == 0);
GGML_ASSERT(nrc_x%k_nr == 0);
int nb = n/QK_K;
const block_iq4_ks * x8[8];
const block_iq4_ks * x8[k_nr];
block_q8_k_r8 * y = (block_q8_k_r8 *)vy;
block_q8_k_r * y = (block_q8_k_r *)vy;
__m256i values[2];
{
@ -2778,22 +2828,26 @@ void iqk_convert_iq4_ks_q8_k_r8(int n, const void * vx, size_t bx, void * vy, in
values[1] = MM256_SET_M128I(v2, v2);
}
float drow[8];
float dnew[8];
float drow[k_nr];
float dnew[k_nr];
int16_t ls[16];
__m256i xv[8];
uint32_t block[8];
for (int ix = 0; ix < nrc_x; ix += 8) {
for (int k = 0; k < 8; ++k) {
for (int ix = 0; ix < nrc_x; ix += k_nr) {
for (int k = 0; k < k_nr; ++k) {
const float * dptr = (const float *)((const char *)vx + (ix + k)*bx);
drow[k] = dptr[0];
x8[k] = (const block_iq4_ks *)(dptr + 1);
}
#ifdef HAVE_FANCY_SIMD
auto vd = _mm512_loadu_ps(drow);
#else
auto vd = _mm256_loadu_ps(drow);
#endif
for (int i = 0; i < nb; ++i) {
for (int k = 0; k < 8; ++k) {
for (int k = 0; k < k_nr; ++k) {
for (int ib32 = 0; ib32 < 8; ++ib32) {
ls[2*ib32+0] = (x8[k][i].scales[ib32] & 254) - 127;
ls[2*ib32+1] = ls[2*ib32+0];
@ -2801,23 +2855,38 @@ void iqk_convert_iq4_ks_q8_k_r8(int n, const void * vx, size_t bx, void * vy, in
xv[ib32] = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(aux128, 4), aux128), _mm256_set1_epi8(0xf));
xv[ib32] = _mm256_shuffle_epi8(values[x8[k][i].scales[ib32] & 1], xv[ib32]);
}
dnew[k] = convert_to_q8_k_r8(k, 1.f/127, xv, ls, block, y[i].qs);
dnew[k] = convert_to_q8_k_r8<k_nr>(k, 1.f/127, xv, ls, block, y[i].qs);
}
#ifdef HAVE_FANCY_SIMD
_mm256_storeu_si256((__m256i *)y[i].d, _mm512_cvtps_ph(_mm512_mul_ps(vd, _mm512_loadu_ps(dnew)), _MM_ROUND_NEAREST));
for (int l = 0; l < 64; ++l) {
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[i].qs + l), _mm512_set1_epi8(-128));
_mm512_storeu_si512((__m512i *)y[i].qs + l, v);
}
#else
_mm_storeu_si128((__m128i *)y[i].d, _mm256_cvtps_ph(_mm256_mul_ps(vd, _mm256_loadu_ps(dnew)), _MM_ROUND_NEAREST));
#endif
}
y += nb;
}
}
void iqk_convert_iq4_k_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
#ifdef HAVE_FANCY_SIMD
constexpr int k_nr = 16;
using block_q8_k_r = block_q8_k_r16;
#else
constexpr int k_nr = 8;
using block_q8_k_r = block_q8_k_r8;
#endif
GGML_ASSERT(n%QK_K == 0);
GGML_ASSERT(nrc_x%8 == 0);
GGML_ASSERT(nrc_x%k_nr == 0);
int nb = n/QK_K;
const block_iq4_k * x8[8];
const block_iq4_k * x8[k_nr];
block_q8_k_r8 * y = (block_q8_k_r8 *)vy;
block_q8_k_r * y = (block_q8_k_r *)vy;
__m256i values[4];
{
@ -2837,10 +2906,10 @@ void iqk_convert_iq4_k_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int
//union { __m256i vec; int16_t val[16]; } helper;
for (int ix = 0; ix < nrc_x; ix += 8) {
for (int k = 0; k < 8; ++k) x8[k] = (const block_iq4_k *)((const char *)vx + (ix+k)*bx);
for (int ix = 0; ix < nrc_x; ix += k_nr) {
for (int k = 0; k < k_nr; ++k) x8[k] = (const block_iq4_k *)((const char *)vx + (ix+k)*bx);
for (int i = 0; i < nb; ++i) {
for (int k = 0; k < 8; ++k) {
for (int k = 0; k < k_nr; ++k) {
float d = GGML_FP16_TO_FP32(x8[k][i].d);
auto extra = x8[k][i].extra;
//uint64_t aux64;
@ -2860,23 +2929,36 @@ void iqk_convert_iq4_k_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int
xv[ib32] = _mm256_shuffle_epi8(values[extra & 3], xv[ib32]); extra >>= 2;
}
//float dnew = convert_to_q8_k_r8(k, 1.f/127, xv, helper.val, block, y[i].qs);
float dnew = convert_to_q8_k_r8(k, 1.f/127, xv, ls, block, y[i].qs);
float dnew = convert_to_q8_k_r8<k_nr>(k, 1.f/127, xv, ls, block, y[i].qs);
y[i].d[k] = GGML_FP32_TO_FP16(d*dnew);
}
#ifdef HAVE_FANCY_SIMD
for (int l = 0; l < 64; ++l) {
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[i].qs + l), _mm512_set1_epi8(-128));
_mm512_storeu_si512((__m512i *)y[i].qs + l, v);
}
#endif
}
y += nb;
}
}
void iqk_convert_iq5_ks_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
#ifdef HAVE_FANCY_SIMD
constexpr int k_nr = 16;
using block_q8_k_r = block_q8_k_r16;
#else
constexpr int k_nr = 8;
using block_q8_k_r = block_q8_k_r8;
#endif
GGML_ASSERT(n%QK_K == 0);
GGML_ASSERT(nrc_x%8 == 0);
GGML_ASSERT(nrc_x%k_nr == 0);
int nb = n/QK_K;
const block_iq5_ks * x8[8];
const block_iq5_ks * x8[k_nr];
block_q8_k_r8 * y = (block_q8_k_r8 *)vy;
block_q8_k_r * y = (block_q8_k_r *)vy;
__m256i values[2];
{
@ -2886,8 +2968,8 @@ void iqk_convert_iq5_ks_q8_k_r8(int n, const void * vx, size_t bx, void * vy, in
values[1] = MM256_SET_M128I(v2, v2);
}
float drow[8];
float dnew[8];
float drow[k_nr];
float dnew[k_nr];
int16_t ls[16];
__m256i xv[8];
@ -2895,15 +2977,19 @@ void iqk_convert_iq5_ks_q8_k_r8(int n, const void * vx, size_t bx, void * vy, in
auto mh = _mm256_set1_epi8(-128); // to avoid stupid warning about 0x80 overflowing
for (int ix = 0; ix < nrc_x; ix += 8) {
for (int k = 0; k < 8; ++k) {
for (int ix = 0; ix < nrc_x; ix += k_nr) {
for (int k = 0; k < k_nr; ++k) {
const float * dptr = (const float *)((const char *)vx + (ix + k)*bx);
drow[k] = dptr[0];
x8[k] = (const block_iq5_ks *)(dptr + 1);
}
#ifdef HAVE_FANCY_SIMD
auto vd = _mm512_loadu_ps(drow);
#else
auto vd = _mm256_loadu_ps(drow);
#endif
for (int i = 0; i < nb; ++i) {
for (int k = 0; k < 8; ++k) {
for (int k = 0; k < k_nr; ++k) {
auto hbits = _mm256_loadu_si256((const __m256i *)x8[k][i].qh);
for (int ib64 = 0; ib64 < 4; ++ib64) {
ls[4*ib64+0] = (x8[k][i].scales[2*ib64+0] & 254) - 127;
@ -2927,23 +3013,38 @@ void iqk_convert_iq5_ks_q8_k_r8(int n, const void * vx, size_t bx, void * vy, in
xv[2*ib64+1] = _mm256_add_epi8(xv[2*ib64+1], shift2);
hbits = _mm256_srli_epi16(hbits, 2);
}
dnew[k] = convert_to_q8_k_r8(k, 1.f/127, xv, ls, block, y[i].qs);
dnew[k] = convert_to_q8_k_r8<k_nr>(k, 1.f/127, xv, ls, block, y[i].qs);
}
#ifdef HAVE_FANCY_SIMD
_mm256_storeu_si256((__m256i *)y[i].d, _mm512_cvtps_ph(_mm512_mul_ps(vd, _mm512_loadu_ps(dnew)), _MM_ROUND_NEAREST));
for (int l = 0; l < 64; ++l) {
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[i].qs + l), _mm512_set1_epi8(-128));
_mm512_storeu_si512((__m512i *)y[i].qs + l, v);
}
#else
_mm_storeu_si128((__m128i *)y[i].d, _mm256_cvtps_ph(_mm256_mul_ps(vd, _mm256_loadu_ps(dnew)), _MM_ROUND_NEAREST));
#endif
}
y += nb;
}
}
void iqk_convert_iq5_k_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
#ifdef HAVE_FANCY_SIMD
constexpr int k_nr = 16;
using block_q8_k_r = block_q8_k_r16;
#else
constexpr int k_nr = 8;
using block_q8_k_r = block_q8_k_r8;
#endif
GGML_ASSERT(n%QK_K == 0);
GGML_ASSERT(nrc_x%8 == 0);
GGML_ASSERT(nrc_x%k_nr == 0);
int nb = n/QK_K;
const block_iq5_k * x8[8];
const block_iq5_k * x8[k_nr];
block_q8_k_r8 * y = (block_q8_k_r8 *)vy;
block_q8_k_r * y = (block_q8_k_r *)vy;
__m256i values[2];
{
@ -2959,10 +3060,10 @@ void iqk_convert_iq5_k_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int
auto mh = _mm256_set1_epi8(-128); // to avoid stupid warning about 0x80 overflowing
for (int ix = 0; ix < nrc_x; ix += 8) {
for (int k = 0; k < 8; ++k) x8[k] = (const block_iq5_k *)((const char *)vx + (ix+k)*bx);
for (int ix = 0; ix < nrc_x; ix += k_nr) {
for (int k = 0; k < k_nr; ++k) x8[k] = (const block_iq5_k *)((const char *)vx + (ix+k)*bx);
for (int i = 0; i < nb; ++i) {
for (int k = 0; k < 8; ++k) {
for (int k = 0; k < k_nr; ++k) {
float d = GGML_FP16_TO_FP32(x8[k][i].d);
auto extra = x8[k][i].extra;
auto hbits = _mm256_loadu_si256((const __m256i *)x8[k][i].qh);
@ -2989,9 +3090,15 @@ void iqk_convert_iq5_k_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int
hbits = _mm256_srli_epi16(hbits, 2);
extra >>= 4;
}
float dnew = convert_to_q8_k_r8(k, 1.f/127, xv, ls, block, y[i].qs);
float dnew = convert_to_q8_k_r8<k_nr>(k, 1.f/127, xv, ls, block, y[i].qs);
y[i].d[k] = GGML_FP32_TO_FP16(d*dnew);
}
#ifdef HAVE_FANCY_SIMD
for (int l = 0; l < 64; ++l) {
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[i].qs + l), _mm512_set1_epi8(-128));
_mm512_storeu_si512((__m512i *)y[i].qs + l, v);
}
#endif
}
y += nb;
}
@ -3103,14 +3210,21 @@ void iqk_convert_iq5_k_q8_0_r8(int n, const void * vx, size_t bx, void * vy, int
}
void iqk_convert_iq6_k_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
#ifdef HAVE_FANCY_SIMD
constexpr int k_nr = 16;
using block_q8_k_r = block_q8_k_r16;
#else
constexpr int k_nr = 8;
using block_q8_k_r = block_q8_k_r8;
#endif
GGML_ASSERT(n%QK_K == 0);
GGML_ASSERT(nrc_x%8 == 0);
GGML_ASSERT(nrc_x%k_nr == 0);
int nb = n/QK_K;
const block_iq6_k * x8[8];
const block_iq6_k * x8[k_nr];
block_q8_k_r8 * y = (block_q8_k_r8 *)vy;
block_q8_k_r * y = (block_q8_k_r *)vy;
__m256i values[4];
for (int k = 0; k < 4; ++k) {
@ -3139,10 +3253,10 @@ void iqk_convert_iq6_k_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int
_mm256_and_si256(mask4, _mm256_shuffle_epi8(values[3], l))));
};
for (int ix = 0; ix < nrc_x; ix += 8) {
for (int k = 0; k < 8; ++k) x8[k] = (const block_iq6_k *)((const char *)vx + (ix+k)*bx);
for (int ix = 0; ix < nrc_x; ix += k_nr) {
for (int k = 0; k < k_nr; ++k) x8[k] = (const block_iq6_k *)((const char *)vx + (ix+k)*bx);
for (int i = 0; i < nb; ++i) {
for (int k = 0; k < 8; ++k) {
for (int k = 0; k < k_nr; ++k) {
float d = GGML_FP16_TO_FP32(x8[k][i].d);
helper.vec = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*)x8[k][i].scales));
auto extra = x8[k][i].extra;
@ -3168,9 +3282,15 @@ void iqk_convert_iq6_k_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int
xv[4*i128+3] = _mm256_add_epi8(xv[4*i128+3], shift4);
extra >>= 8;
}
float dnew = convert_to_q8_k_r8(k, 1.f/127, xv, helper.val, block, y[i].qs);
float dnew = convert_to_q8_k_r8<k_nr>(k, 1.f/127, xv, helper.val, block, y[i].qs);
y[i].d[k] = GGML_FP32_TO_FP16(d*dnew);
}
#ifdef HAVE_FANCY_SIMD
for (int l = 0; l < 64; ++l) {
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[i].qs + l), _mm512_set1_epi8(-128));
_mm512_storeu_si512((__m512i *)y[i].qs + l, v);
}
#endif
}
y += nb;
}

View File

@ -1839,59 +1839,6 @@ static void mul_mat_iq3_s_r4_q8_k(int n, const void * vx, size_t bx, const DataI
}
}
inline float convert_to_q8_k_r8(int k, float d0, const __m256i * qx, const int16_t * scales, uint32_t * block, int8_t * q8_k) {
auto max_i16 = _mm256_setzero_si256();
__m256i qs[16];
for (int ib32 = 0; ib32 < 8; ++ib32) {
qs[2*ib32+0] = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(qx[ib32]));
qs[2*ib32+1] = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(qx[ib32], 1));
qs[2*ib32+0] = _mm256_mullo_epi16(qs[2*ib32+0], _mm256_set1_epi16(scales[2*ib32+0]));
qs[2*ib32+1] = _mm256_mullo_epi16(qs[2*ib32+1], _mm256_set1_epi16(scales[2*ib32+1]));
max_i16 = _mm256_max_epi16(max_i16, _mm256_sign_epi16(qs[2*ib32+0], qs[2*ib32+0]));
max_i16 = _mm256_max_epi16(max_i16, _mm256_sign_epi16(qs[2*ib32+1], qs[2*ib32+1]));
}
auto max_q32 = _mm256_cvtepi16_epi32(_mm_max_epi16(_mm256_castsi256_si128(max_i16), _mm256_extracti128_si256(max_i16, 1)));
auto imax4 = _mm_max_epi32(_mm256_castsi256_si128(max_q32), _mm256_extracti128_si256(max_q32, 1));
auto max4 = _mm_cvtepi32_ps(imax4);
max4 = _mm_max_ps(max4, _mm_movehl_ps(max4, max4));
max4 = _mm_max_ss(max4, _mm_movehdup_ps(max4));
bool needs_scaling = true;
float dnew = _mm_cvtss_f32(max4) * d0;
if (dnew < 1.f) {
dnew = 1.f; needs_scaling = false;
}
auto scale = _mm256_set1_ps(std::abs(dnew) > 1e-9f ? 1/dnew : 0.f);
for (int ib32 = 0; ib32 < 8; ++ib32) {
if (needs_scaling) {
auto i0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(qs[2*ib32+0]));
auto i1 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(qs[2*ib32+0], 1));
auto i2 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(qs[2*ib32+1]));
auto i3 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(qs[2*ib32+1], 1));
i0 = _mm256_cvtps_epi32(_mm256_round_ps(_mm256_mul_ps(scale, _mm256_cvtepi32_ps(i0)), _MM_ROUND_NEAREST));
i1 = _mm256_cvtps_epi32(_mm256_round_ps(_mm256_mul_ps(scale, _mm256_cvtepi32_ps(i1)), _MM_ROUND_NEAREST));
i2 = _mm256_cvtps_epi32(_mm256_round_ps(_mm256_mul_ps(scale, _mm256_cvtepi32_ps(i2)), _MM_ROUND_NEAREST));
i3 = _mm256_cvtps_epi32(_mm256_round_ps(_mm256_mul_ps(scale, _mm256_cvtepi32_ps(i3)), _MM_ROUND_NEAREST));
i0 = _mm256_packs_epi32(i0, i1);
i2 = _mm256_packs_epi32(i2, i3);
i0 = _mm256_packs_epi16(i0, i2);
i0 = _mm256_permutevar8x32_epi32(i0, _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7));
_mm256_storeu_si256((__m256i *)block, i0);
} else {
// 0, 1, 2, 3, 4, 5, 6, 7, 8, 16, 17, 18, 19, 20, 21, 22, 23, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
auto i0 = _mm256_packs_epi16(qs[2*ib32+0], qs[2*ib32+1]);
auto i0_l = _mm256_castsi256_si128(i0);
auto i0_h = _mm256_extracti128_si256(i0, 1);
_mm_storeu_si128((__m128i *)block+0, _mm_unpacklo_epi64(i0_l, i0_h));
_mm_storeu_si128((__m128i *)block+1, _mm_unpackhi_epi64(i0_l, i0_h));
}
auto qs = (uint32_t *)q8_k + 64*ib32;
for (int l = 0; l < 8; ++l) {
qs[8*l + k] = block[l];
}
}
return dnew;
}
void iqk_convert_iq2_xxs_q8_0_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
GGML_ASSERT(n%QK_K == 0);
GGML_ASSERT(nrc_x%8 == 0);
@ -1942,14 +1889,21 @@ void iqk_convert_iq2_xxs_q8_0_r8(int n, const void * vx, size_t bx, void * vy, i
}
void iqk_convert_iq2_xxs_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
#ifdef HAVE_FANCY_SIMD
constexpr int k_nr = 16;
using block_q8_k_r = block_q8_k_r16;
#else
constexpr int k_nr = 8;
using block_q8_k_r = block_q8_k_r8;
#endif
GGML_ASSERT(n%QK_K == 0);
GGML_ASSERT(nrc_x%8 == 0);
GGML_ASSERT(nrc_x%k_nr == 0);
int nb = n/QK_K;
const block_iq2_xxs * x8[8];
const block_iq2_xxs * x8[k_nr];
block_q8_k_r8 * y = (block_q8_k_r8 *)vy;
block_q8_k_r * y = (block_q8_k_r *)vy;
int16_t ls[16];
EvenSignHelper esh;
@ -1960,11 +1914,10 @@ void iqk_convert_iq2_xxs_q8_k_r8(int n, const void * vx, size_t bx, void * vy, i
__m256i values[8];
for (int ix = 0; ix < nrc_x; ix += 8) {
for (int k = 0; k < 8; ++k) x8[k] = (const block_iq2_xxs *)((const char *)vx + (ix + k)*bx);
for (int ix = 0; ix < nrc_x; ix += k_nr) {
for (int k = 0; k < k_nr; ++k) x8[k] = (const block_iq2_xxs *)((const char *)vx + (ix + k)*bx);
for (int i = 0; i < nb; ++i) {
// TODO: simdify
for (int k = 0; k < 8; ++k) {
for (int k = 0; k < k_nr; ++k) {
float d = 0.125f * GGML_FP16_TO_FP32(x8[k][i].d);
for (int ib32 = 0; ib32 < 8; ++ib32) {
std::memcpy(aux32, x8[k][i].qs + 4*ib32, 2*sizeof(uint32_t));
@ -1973,23 +1926,36 @@ void iqk_convert_iq2_xxs_q8_k_r8(int n, const void * vx, size_t bx, void * vy, i
values[ib32] = _mm256_set_epi64x(iq2xxs_grid[aux8[3]], iq2xxs_grid[aux8[2]], iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]);
esh.sign_value(aux32[1], values[ib32]);
}
float dnew = convert_to_q8_k_r8(k, 1.f/124, values, ls, block, y[i].qs);
float dnew = convert_to_q8_k_r8<k_nr>(k, 1.f/124, values, ls, block, y[i].qs);
y[i].d[k] = GGML_FP32_TO_FP16(d*dnew);
}
#ifdef HAVE_FANCY_SIMD
for (int l = 0; l < 64; ++l) {
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[i].qs + l), _mm512_set1_epi8(-128));
_mm512_storeu_si512((__m512i *)y[i].qs + l, v);
}
#endif
}
y += nb;
}
}
void iqk_convert_iq2_xs_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
#ifdef HAVE_FANCY_SIMD
constexpr int k_nr = 16;
using block_q8_k_r = block_q8_k_r16;
#else
constexpr int k_nr = 8;
using block_q8_k_r = block_q8_k_r8;
#endif
GGML_ASSERT(n%QK_K == 0);
GGML_ASSERT(nrc_x%8 == 0);
GGML_ASSERT(nrc_x%k_nr == 0);
int nb = n/QK_K;
const block_iq2_xs * x8[8];
const block_iq2_xs * x8[k_nr];
block_q8_k_r8 * y = (block_q8_k_r8 *)vy;
block_q8_k_r * y = (block_q8_k_r *)vy;
uint32_t block[8];
@ -2000,10 +1966,10 @@ void iqk_convert_iq2_xs_q8_k_r8(int n, const void * vx, size_t bx, void * vy, in
DequantizerIQ2XS::Helper sign_helper;
#endif
for (int ix = 0; ix < nrc_x; ix += 8) {
for (int k = 0; k < 8; ++k) x8[k] = (const block_iq2_xs *)((const char *)vx + (ix + k)*bx);
for (int ix = 0; ix < nrc_x; ix += k_nr) {
for (int k = 0; k < k_nr; ++k) x8[k] = (const block_iq2_xs *)((const char *)vx + (ix + k)*bx);
for (int i = 0; i < nb; ++i) {
for (int k = 0; k < 8; ++k) {
for (int k = 0; k < k_nr; ++k) {
float d = 0.125f * GGML_FP16_TO_FP32(x8[k][i].d);
helper.vec = DequantizerIQ2XS::make_scales(x8[k][i].scales);
auto q2l = _mm256_loadu_si256((const __m256i *)x8[k][i].qs+0);
@ -2017,9 +1983,16 @@ void iqk_convert_iq2_xs_q8_k_r8(int n, const void * vx, size_t bx, void * vy, in
DequantizerIQ2XS::sign_values_helper(q2l, sign_helper, qx+0);
DequantizerIQ2XS::sign_values_helper(q2h, sign_helper, qx+4);
#endif
float dnew = convert_to_q8_k_r8(k, 1.f/124, qx, helper.val, block, y[i].qs);
float dnew = convert_to_q8_k_r8<k_nr>(k, 1.f/124, qx, helper.val, block, y[i].qs);
y[i].d[k] = GGML_FP32_TO_FP16(d*dnew);
}
#ifdef HAVE_FANCY_SIMD
for (int l = 0; l < 64; ++l) {
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[i].qs + l), _mm512_set1_epi8(-128));
_mm512_storeu_si512((__m512i *)y[i].qs + l, v);
}
#endif
}
y += nb;
}
@ -2306,14 +2279,21 @@ void iqk_convert_iq2_s_q8_0_r8(int n, const void * vx, size_t bx, void * vy, int
}
void iqk_convert_iq2_s_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
#ifdef HAVE_FANCY_SIMD
constexpr int k_nr = 16;
using block_q8_k_r = block_q8_k_r16;
#else
constexpr int k_nr = 8;
using block_q8_k_r = block_q8_k_r8;
#endif
GGML_ASSERT(n%QK_K == 0);
GGML_ASSERT(nrc_x%8 == 0);
GGML_ASSERT(nrc_x%k_nr == 0);
int nb = n/QK_K;
const block_iq2_s * x8[8];
const block_iq2_s * x8[k_nr];
block_q8_k_r8 * y = (block_q8_k_r8 *)vy;
block_q8_k_r * y = (block_q8_k_r *)vy;
uint32_t block[8];
@ -2322,17 +2302,23 @@ void iqk_convert_iq2_s_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int
SignHelper sh;
for (int ix = 0; ix < nrc_x; ix += 8) {
for (int k = 0; k < 8; ++k) x8[k] = (const block_iq2_s *)((const char *)vx + (ix + k)*bx);
for (int ix = 0; ix < nrc_x; ix += k_nr) {
for (int k = 0; k < k_nr; ++k) x8[k] = (const block_iq2_s *)((const char *)vx + (ix + k)*bx);
for (int i = 0; i < nb; ++i) {
for (int k = 0; k < 8; ++k) {
for (int k = 0; k < k_nr; ++k) {
float d = 0.125f * GGML_FP16_TO_FP32(x8[k][i].d);
helper.vec = DequantizerIQ2S::make_scales(x8[k][i].scales);
DequantizerIQ2S::prepare(x8[k][i].qs+ 0, x8[k][i].qh+0, (const uint16_t *)(x8[k][i].qs + QK_K/8) + 0, sh, qx+0);
DequantizerIQ2S::prepare(x8[k][i].qs+16, x8[k][i].qh+4, (const uint16_t *)(x8[k][i].qs + QK_K/8) + 8, sh, qx+4);
float dnew = convert_to_q8_k_r8(k, 1.f/124, qx, helper.val, block, y[i].qs);
float dnew = convert_to_q8_k_r8<k_nr>(k, 1.f/124, qx, helper.val, block, y[i].qs);
y[i].d[k] = GGML_FP32_TO_FP16(d*dnew);
}
#ifdef HAVE_FANCY_SIMD
for (int l = 0; l < 64; ++l) {
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[i].qs + l), _mm512_set1_epi8(-128));
_mm512_storeu_si512((__m512i *)y[i].qs + l, v);
}
#endif
}
y += nb;
}
@ -2458,14 +2444,21 @@ static void mul_mat_iq2_s_q8_2_X4(int n, const void * vx, size_t bx, const DataI
}
void iqk_convert_iq3_xxs_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
#ifdef HAVE_FANCY_SIMD
constexpr int k_nr = 16;
using block_q8_k_r = block_q8_k_r16;
#else
constexpr int k_nr = 8;
using block_q8_k_r = block_q8_k_r8;
#endif
GGML_ASSERT(n%QK_K == 0);
GGML_ASSERT(nrc_x%8 == 0);
GGML_ASSERT(nrc_x%k_nr == 0);
int nb = n/QK_K;
const block_iq3_xxs * x8[8];
const block_iq3_xxs * x8[k_nr];
block_q8_k_r8 * y = (block_q8_k_r8 *)vy;
block_q8_k_r * y = (block_q8_k_r *)vy;
int16_t ls[16];
EvenSignHelper esh;
@ -2474,10 +2467,10 @@ void iqk_convert_iq3_xxs_q8_k_r8(int n, const void * vx, size_t bx, void * vy, i
uint32_t block[8];
uint32_t aux32;
for (int ix = 0; ix < nrc_x; ix += 8) {
for (int k = 0; k < 8; ++k) x8[k] = (const block_iq3_xxs *)((const char *)vx + (ix + k)*bx);
for (int ix = 0; ix < nrc_x; ix += k_nr) {
for (int k = 0; k < k_nr; ++k) x8[k] = (const block_iq3_xxs *)((const char *)vx + (ix + k)*bx);
for (int i = 0; i < nb; ++i) {
for (int k = 0; k < 8; ++k) {
for (int k = 0; k < k_nr; ++k) {
float d = 0.25f * GGML_FP16_TO_FP32(x8[k][i].d);
auto qs = x8[k][i].qs;
auto sas = qs + QK_K/4;
@ -2490,9 +2483,15 @@ void iqk_convert_iq3_xxs_q8_k_r8(int n, const void * vx, size_t bx, void * vy, i
esh.sign_value(aux32, values[ib32]);
qs += 8;
}
float dnew = convert_to_q8_k_r8(k, 1.f/124, values, ls, block, y[i].qs);
float dnew = convert_to_q8_k_r8<k_nr>(k, 1.f/124, values, ls, block, y[i].qs);
y[i].d[k] = GGML_FP32_TO_FP16(d*dnew);
}
#ifdef HAVE_FANCY_SIMD
for (int l = 0; l < 64; ++l) {
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[i].qs + l), _mm512_set1_epi8(-128));
_mm512_storeu_si512((__m512i *)y[i].qs + l, v);
}
#endif
}
y += nb;
}
@ -2551,14 +2550,21 @@ void iqk_convert_iq3_xxs_q8_0_r8(int n, const void * vx, size_t bx, void * vy, i
}
void iqk_convert_iq3_s_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
#ifdef HAVE_FANCY_SIMD
constexpr int k_nr = 16;
using block_q8_k_r = block_q8_k_r16;
#else
constexpr int k_nr = 8;
using block_q8_k_r = block_q8_k_r8;
#endif
GGML_ASSERT(n%QK_K == 0);
GGML_ASSERT(nrc_x%8 == 0);
GGML_ASSERT(nrc_x%k_nr == 0);
int nb = n/QK_K;
const block_iq3_s * x8[8];
const block_iq3_s * x8[k_nr];
block_q8_k_r8 * y = (block_q8_k_r8 *)vy;
block_q8_k_r * y = (block_q8_k_r *)vy;
int16_t ls[16];
SignHelper sh;
@ -2567,10 +2573,10 @@ void iqk_convert_iq3_s_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int
uint32_t block[8];
__m256i values[8];
for (int ix = 0; ix < nrc_x; ix += 8) {
for (int k = 0; k < 8; ++k) x8[k] = (const block_iq3_s *)((const char *)vx + (ix + k)*bx);
for (int ix = 0; ix < nrc_x; ix += k_nr) {
for (int k = 0; k < k_nr; ++k) x8[k] = (const block_iq3_s *)((const char *)vx + (ix + k)*bx);
for (int i = 0; i < nb; ++i) {
for (int k = 0; k < 8; ++k) {
for (int k = 0; k < k_nr; ++k) {
float d = GGML_FP16_TO_FP32(x8[k][i].d);
auto qs = x8[k][i].qs;
auto qh = x8[k][i].qh;
@ -2585,9 +2591,15 @@ void iqk_convert_iq3_s_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int
ls[2*ib32 + 0] = (2*((x8[k][i].scales[ib32/2] >> 4*(ib32%2)) & 0xf) + 1);
ls[2*ib32 + 1] = ls[2*ib32 + 0];
}
float dnew = convert_to_q8_k_r8(k, 1.f/127, values, ls, block, y[i].qs);
float dnew = convert_to_q8_k_r8<k_nr>(k, 1.f/127, values, ls, block, y[i].qs);
y[i].d[k] = GGML_FP32_TO_FP16(d*dnew);
}
#ifdef HAVE_FANCY_SIMD
for (int l = 0; l < 64; ++l) {
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[i].qs + l), _mm512_set1_epi8(-128));
_mm512_storeu_si512((__m512i *)y[i].qs + l, v);
}
#endif
}
y += nb;
}

View File

@ -1859,6 +1859,50 @@ static void mul_mat_q8_k_r8_q8_k(int n, const void * vx, size_t bx, const DataIn
}
}
#ifdef HAVE_FANCY_SIMD
template <int nrc_y>
static void mul_mat_q8_k_r16_q8_k(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
GGML_ASSERT(nrc_x%16 == 0);
Q8<nrc_y, block_q8_K> q8(info);
int nbl = n / QK_K;
__m512 acc[nrc_y] = {};
__m512i isum[nrc_y] = {};
__m512i qx[4];
for (int ix = 0; ix < nrc_x; ix += 16) {
const block_q8_k_r16 * iq16 = (const block_q8_k_r16 *)((const char *)vx + ix*bx);
for (int ibl = 0; ibl < nbl; ++ibl) { // Block of 256
auto d4 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)iq16[ibl].d));
for (int ib = 0; ib < QK_K/16; ++ib) {
qx[0] = _mm512_loadu_si512((const __m512i *)iq16[ibl].qs+4*ib+0);
qx[1] = _mm512_loadu_si512((const __m512i *)iq16[ibl].qs+4*ib+1);
qx[2] = _mm512_loadu_si512((const __m512i *)iq16[ibl].qs+4*ib+2);
qx[3] = _mm512_loadu_si512((const __m512i *)iq16[ibl].qs+4*ib+3);
for (int iy = 0; iy < nrc_y; ++iy) {
auto y128 = _mm_loadu_si128((const __m128i*)q8.y[iy][ibl].qs+ib);
auto y256 = MM256_SET_M128I(y128, y128);
auto y = _mm512_inserti32x8(_mm512_castsi256_si512(y256), y256, 1);
isum[iy] = _mm512_dpbusd_epi32(isum[iy], qx[0], _mm512_shuffle_epi32(y, _MM_PERM_ENUM(0x00)));
isum[iy] = _mm512_dpbusd_epi32(isum[iy], qx[1], _mm512_shuffle_epi32(y, _MM_PERM_ENUM(0x55)));
isum[iy] = _mm512_dpbusd_epi32(isum[iy], qx[2], _mm512_shuffle_epi32(y, _MM_PERM_ENUM(0xaa)));
isum[iy] = _mm512_dpbusd_epi32(isum[iy], qx[3], _mm512_shuffle_epi32(y, _MM_PERM_ENUM(0xff)));
}
}
auto m4 = _mm512_mul_ps(d4, _mm512_set1_ps(-128.f));
for (int iy = 0; iy < nrc_y; ++iy) {
auto d4y = _mm512_mul_ps(d4, _mm512_set1_ps(q8.scale(iy, ibl)));
acc[iy] = _mm512_fmadd_ps(d4y, _mm512_cvtepi32_ps(isum[iy]), acc[iy]);
acc[iy] = _mm512_fmadd_ps(m4, _mm512_set1_ps(q8.y[iy][ibl].sum), acc[iy]);
isum[iy] = _mm512_setzero_si512();
}
}
for (int iy = 0; iy < nrc_y; ++iy) {
info.store(ix, iy, acc[iy]);
acc[iy] = _mm512_setzero_ps();
}
}
}
#endif
template <int nrc_y>
static void mul_mat_q8_KV_q8_KV(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
GGML_ASSERT(nrc_x%4 == 0);
@ -2020,14 +2064,21 @@ typedef struct {
} block_q8_1_r8;
void iqk_convert_q2_k_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
#ifdef HAVE_FANCY_SIMD
constexpr int k_nr = 16;
using block_q8_k_r = block_q8_k_r16;
#else
constexpr int k_nr = 8;
using block_q8_k_r = block_q8_k_r8;
#endif
GGML_ASSERT(n%QK_K == 0);
GGML_ASSERT(nrc_x%8 == 0);
GGML_ASSERT(nrc_x%k_nr == 0);
int nb = n/QK_K;
const block_q2_K * x8[8];
const block_q2_K * x8[k_nr];
block_q8_k_r8 * y = (block_q8_k_r8 *)vy;
block_q8_k_r * y = (block_q8_k_r *)vy;
float f_values[QK_K];
uint32_t block[8];
@ -2038,10 +2089,10 @@ void iqk_convert_q2_k_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int
auto sign_bit = _mm256_set1_ps(-0.0f);
auto perm = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
for (int ix = 0; ix < nrc_x; ix += 8) {
for (int k = 0; k < 8; ++k) x8[k] = (const block_q2_K *)((const char *)vx + (ix + k)*bx);
for (int ix = 0; ix < nrc_x; ix += k_nr) {
for (int k = 0; k < k_nr; ++k) x8[k] = (const block_q2_K *)((const char *)vx + (ix + k)*bx);
for (int i = 0; i < nb; ++i) {
for (int k = 0; k < 8; ++k) {
for (int k = 0; k < k_nr; ++k) {
auto vd = _mm256_set1_ps(GGML_FP16_TO_FP32(x8[k][i].d));
auto vm = _mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x8[k][i].dmin)), _mm256_set1_ps(-1.f));
auto block_max = _mm256_setzero_ps();
@ -2092,13 +2143,18 @@ void iqk_convert_q2_k_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int
i0 = _mm256_permutevar8x32_epi32(i0, perm);
_mm256_storeu_si256((__m256i *)block, i0);
auto q8 = (uint32_t *)y[i].qs + 64*ib32;
for (int l = 0; l < 4; ++l) {
q8[8*l + k + 0] = block[l + 0];
q8[8*l + k + 32] = block[l + 4];
auto q8 = (uint32_t *)y[i].qs + 8*k_nr*ib32;
for (int l = 0; l < 8; ++l) {
q8[k_nr*l + k] = block[l];
}
}
}
#ifdef HAVE_FANCY_SIMD
for (int l = 0; l < 64; ++l) {
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[i].qs + l), _mm512_set1_epi8(-128));
_mm512_storeu_si512((__m512i *)y[i].qs + l, v);
}
#endif
}
y += nb;
}
@ -2414,14 +2470,21 @@ void iqk_convert_q3_k_q8_0_r8(int n, const void * vx, size_t bx, void * vy, int
}
void iqk_convert_q3_k_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
#ifdef HAVE_FANCY_SIMD
constexpr int k_nr = 16;
using block_q8_k_r = block_q8_k_r16;
#else
constexpr int k_nr = 8;
using block_q8_k_r = block_q8_k_r8;
#endif
GGML_ASSERT(n%QK_K == 0);
GGML_ASSERT(nrc_x%8 == 0);
GGML_ASSERT(nrc_x%k_nr == 0);
int nb = n/QK_K;
const block_q3_K * x8[8];
const block_q3_K * x8[k_nr];
block_q8_k_r8 * y = (block_q8_k_r8 *)vy;
block_q8_k_r * y = (block_q8_k_r *)vy;
uint32_t block[8];
__m256i values[8];
@ -2432,10 +2495,10 @@ void iqk_convert_q3_k_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int
union { __m256i vec; int16_t val[16]; } helper;
for (int ix = 0; ix < nrc_x; ix += 8) {
for (int k = 0; k < 8; ++k) x8[k] = (const block_q3_K *)((const char *)vx + (ix + k)*bx);
for (int ix = 0; ix < nrc_x; ix += k_nr) {
for (int k = 0; k < k_nr; ++k) x8[k] = (const block_q3_K *)((const char *)vx + (ix + k)*bx);
for (int i = 0; i < nb; ++i) {
for (int k = 0; k < 8; ++k) {
for (int k = 0; k < k_nr; ++k) {
float d = GGML_FP16_TO_FP32(x8[k][i].d);
auto hbits = _mm256_loadu_si256((const __m256i *)x8[k][i].hmask);
helper.vec = _mm256_cvtepi8_epi16(sc3.make_scales((const uint16_t *)x8[k][i].scales));
@ -2505,93 +2568,54 @@ void iqk_convert_q3_k_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int
_mm_storeu_si128((__m128i *)block+0, _mm_unpacklo_epi64(i0_l, i0_h));
_mm_storeu_si128((__m128i *)block+1, _mm_unpackhi_epi64(i0_l, i0_h));
}
auto qs = (uint32_t *)y[i].qs + 64*ib32;
auto qs = (uint32_t *)y[i].qs + 8*k_nr*ib32;
for (int l = 0; l < 8; ++l) {
qs[8*l + k] = block[l];
qs[k_nr*l + k] = block[l];
}
}
}
#ifdef HAVE_FANCY_SIMD
for (int l = 0; l < 64; ++l) {
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[i].qs + l), _mm512_set1_epi8(-128));
_mm512_storeu_si512((__m512i *)y[i].qs + l, v);
}
#endif
}
y += nb;
}
}
inline float convert_to_q8_k_r8(int k, float d0, const __m256i * qx, const int16_t * scales, uint32_t * block, int8_t * q8_k) {
auto max_i16 = _mm256_setzero_si256();
__m256i qs[16];
for (int ib32 = 0; ib32 < 8; ++ib32) {
qs[2*ib32+0] = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(qx[ib32]));
qs[2*ib32+1] = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(qx[ib32], 1));
qs[2*ib32+0] = _mm256_mullo_epi16(qs[2*ib32+0], _mm256_set1_epi16(scales[2*ib32+0]));
qs[2*ib32+1] = _mm256_mullo_epi16(qs[2*ib32+1], _mm256_set1_epi16(scales[2*ib32+1]));
max_i16 = _mm256_max_epi16(max_i16, _mm256_sign_epi16(qs[2*ib32+0], qs[2*ib32+0]));
max_i16 = _mm256_max_epi16(max_i16, _mm256_sign_epi16(qs[2*ib32+1], qs[2*ib32+1]));
}
auto max_q32 = _mm256_cvtepi16_epi32(_mm_max_epi16(_mm256_castsi256_si128(max_i16), _mm256_extracti128_si256(max_i16, 1)));
auto imax4 = _mm_max_epi32(_mm256_castsi256_si128(max_q32), _mm256_extracti128_si256(max_q32, 1));
auto max4 = _mm_cvtepi32_ps(imax4);
max4 = _mm_max_ps(max4, _mm_movehl_ps(max4, max4));
max4 = _mm_max_ss(max4, _mm_movehdup_ps(max4));
bool needs_scaling = true;
float dnew = _mm_cvtss_f32(max4) * d0;
if (dnew < 1.f) {
dnew = 1.f; needs_scaling = false;
}
auto scale = _mm256_set1_ps(std::abs(dnew) > 1e-9f ? 1/dnew : 0.f);
for (int ib32 = 0; ib32 < 8; ++ib32) {
if (needs_scaling) {
auto i0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(qs[2*ib32+0]));
auto i1 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(qs[2*ib32+0], 1));
auto i2 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(qs[2*ib32+1]));
auto i3 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(qs[2*ib32+1], 1));
i0 = _mm256_cvtps_epi32(_mm256_round_ps(_mm256_mul_ps(scale, _mm256_cvtepi32_ps(i0)), _MM_ROUND_NEAREST));
i1 = _mm256_cvtps_epi32(_mm256_round_ps(_mm256_mul_ps(scale, _mm256_cvtepi32_ps(i1)), _MM_ROUND_NEAREST));
i2 = _mm256_cvtps_epi32(_mm256_round_ps(_mm256_mul_ps(scale, _mm256_cvtepi32_ps(i2)), _MM_ROUND_NEAREST));
i3 = _mm256_cvtps_epi32(_mm256_round_ps(_mm256_mul_ps(scale, _mm256_cvtepi32_ps(i3)), _MM_ROUND_NEAREST));
i0 = _mm256_packs_epi32(i0, i1);
i2 = _mm256_packs_epi32(i2, i3);
i0 = _mm256_packs_epi16(i0, i2);
i0 = _mm256_permutevar8x32_epi32(i0, _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7));
_mm256_storeu_si256((__m256i *)block, i0);
} else {
// 0, 1, 2, 3, 4, 5, 6, 7, 8, 16, 17, 18, 19, 20, 21, 22, 23, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
auto i0 = _mm256_packs_epi16(qs[2*ib32+0], qs[2*ib32+1]);
auto i0_l = _mm256_castsi256_si128(i0);
auto i0_h = _mm256_extracti128_si256(i0, 1);
_mm_storeu_si128((__m128i *)block+0, _mm_unpacklo_epi64(i0_l, i0_h));
_mm_storeu_si128((__m128i *)block+1, _mm_unpackhi_epi64(i0_l, i0_h));
}
auto qs = (uint32_t *)q8_k + 64*ib32;
for (int l = 0; l < 8; ++l) {
qs[8*l + k] = block[l];
}
}
return dnew;
}
// TODO: move this to iqk_gemm_iquants
void iqk_convert_iq4_xs_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
#ifdef HAVE_FANCY_SIMD
constexpr int k_nr = 16;
using block_q8_k_r = block_q8_k_r16;
#else
constexpr int k_nr = 8;
using block_q8_k_r = block_q8_k_r8;
#endif
GGML_ASSERT(n%QK_K == 0);
GGML_ASSERT(nrc_x%8 == 0);
GGML_ASSERT(nrc_x%k_nr == 0);
int nb = n/QK_K;
const block_iq4_xs * x8[8];
const block_iq4_xs * x8[k_nr];
block_q8_k_r8 * y = (block_q8_k_r8 *)vy;
block_q8_k_r * y = (block_q8_k_r *)vy;
auto values128 = _mm_loadu_si128((const __m128i *)iq4k_values);
auto values = MM256_SET_M128I(values128, values128);
int16_t ls[16];
float dnew[8];
float dnew[k_nr];
uint32_t block[8];
__m256i xv[8];
for (int ix = 0; ix < nrc_x; ix += 8) {
for (int k = 0; k < 8; ++k) x8[k] = (const block_iq4_xs *)((const char *)vx + (ix + k)*bx);
for (int ix = 0; ix < nrc_x; ix += k_nr) {
for (int k = 0; k < k_nr; ++k) x8[k] = (const block_iq4_xs *)((const char *)vx + (ix + k)*bx);
for (int i = 0; i < nb; ++i) {
for (int k = 0; k < 8; ++k) {
for (int k = 0; k < k_nr; ++k) {
float d = GGML_FP16_TO_FP32(x8[k][i].d);
for (int ib32 = 0; ib32 < 8; ++ib32) {
ls[2*ib32+0] = ls[2*ib32+1] = (((x8[k][i].scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((x8[k][i].scales_h >> 2*ib32) & 3) << 4)) - 32;
@ -2599,9 +2623,17 @@ void iqk_convert_iq4_xs_q8_k_r8(int n, const void * vx, size_t bx, void * vy, in
xv[ib32] = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(bits, 4), bits), _mm256_set1_epi8(0xf));
xv[ib32] = _mm256_shuffle_epi8(values, xv[ib32]);
}
dnew[k] = d * convert_to_q8_k_r8(k, 1.f/127, xv, ls, block, y[i].qs);
dnew[k] = d * convert_to_q8_k_r8<k_nr>(k, 1.f/127, xv, ls, block, y[i].qs);
}
#ifdef HAVE_FANCY_SIMD
_mm256_storeu_si256((__m256i *)y[i].d, _mm512_cvtps_ph(_mm512_loadu_ps(dnew), _MM_ROUND_NEAREST));
for (int l = 0; l < 64; ++l) {
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[i].qs + l), _mm512_set1_epi8(-128));
_mm512_storeu_si512((__m512i *)y[i].qs + l, v);
}
#else
_mm_storeu_si128((__m128i *)y[i].d, _mm256_cvtps_ph(_mm256_loadu_ps(dnew), _MM_ROUND_NEAREST));
#endif
}
y += nb;
}
@ -2671,10 +2703,15 @@ bool iqk_set_kernels_kquants(int ne00, int typeA, int typeB, std::array<mul_mat_
break;
case GGML_TYPE_Q8_K_R8:
IQK_SET_MUL_MAT_FUNCTIONS(mul_mat_q8_k_r8_q8_k, kernels)
#ifdef HAVE_FANCY_SIMD
func16 = mul_mat_q8_k_r8_q8_k<16>;
#endif
//#ifdef HAVE_FANCY_SIMD
// func16 = mul_mat_q8_k_r8_q8_k<16>;
//#endif
break;
#ifdef HAVE_FANCY_SIMD
case GGML_TYPE_Q8_K_R16:
IQK_SET_MUL_MAT_FUNCTIONS(mul_mat_q8_k_r16_q8_k, kernels)
break;
#endif
case GGML_TYPE_Q8_KV:
IQK_SET_MUL_MAT_FUNCTIONS(mul_mat_q8_KV_q8_KV, kernels)
#ifdef HAVE_FANCY_SIMD

View File

@ -231,31 +231,36 @@ struct MulMat {
static bool prepare(int typeA, int typeB, int ne00, MulMat& mm, int Ny);
static inline ggml_type is_dequant_better(ggml_type type, int nrc_y) {
#ifdef __AVX2__
#ifdef HAVE_FANCY_SIMD
auto q8_k_type = GGML_TYPE_Q8_K_R16;
#else
auto q8_k_type = GGML_TYPE_Q8_K_R8;
#endif
switch (type) {
case GGML_TYPE_IQ2_XXS: return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
case GGML_TYPE_IQ2_XS : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
case GGML_TYPE_IQ2_S : return nrc_y >= 16 ? GGML_TYPE_Q8_K_R8 : type;
case GGML_TYPE_IQ3_XXS: return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
case GGML_TYPE_IQ4_XS : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
case GGML_TYPE_IQ3_S : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
case GGML_TYPE_IQ1_S : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
case GGML_TYPE_IQ1_M : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
case GGML_TYPE_Q2_K : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
case GGML_TYPE_Q3_K : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
case GGML_TYPE_IQ2_XXS: return nrc_y >= 32 ? q8_k_type : type;
case GGML_TYPE_IQ2_XS : return nrc_y >= 32 ? q8_k_type : type;
case GGML_TYPE_IQ2_S : return nrc_y >= 16 ? q8_k_type : type;
case GGML_TYPE_IQ3_XXS: return nrc_y >= 32 ? q8_k_type : type;
case GGML_TYPE_IQ4_XS : return nrc_y >= 32 ? q8_k_type : type;
case GGML_TYPE_IQ3_S : return nrc_y >= 32 ? q8_k_type : type;
case GGML_TYPE_IQ1_S : return nrc_y >= 32 ? q8_k_type : type;
case GGML_TYPE_IQ1_M : return nrc_y >= 32 ? q8_k_type : type;
case GGML_TYPE_Q2_K : return nrc_y >= 32 ? q8_k_type : type;
case GGML_TYPE_Q3_K : return nrc_y >= 32 ? q8_k_type : type;
case GGML_TYPE_Q4_K : return nrc_y >= 32 ? GGML_TYPE_Q8_1 : type;
case GGML_TYPE_Q5_K : return nrc_y >= 32 ? GGML_TYPE_Q8_1 : type;
case GGML_TYPE_Q6_K : return nrc_y >= 64 ? GGML_TYPE_Q8_0_R8 : type;
case GGML_TYPE_IQ2_KS : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
case GGML_TYPE_IQ2_K : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
case GGML_TYPE_IQ2_KL : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
case GGML_TYPE_IQ3_KS : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
case GGML_TYPE_IQ3_K : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
case GGML_TYPE_IQ4_KS : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
case GGML_TYPE_IQ4_KSS: return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
case GGML_TYPE_IQ4_K : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
case GGML_TYPE_IQ5_KS : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
case GGML_TYPE_IQ5_K : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
case GGML_TYPE_IQ6_K : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
case GGML_TYPE_IQ2_KS : return nrc_y >= 32 ? q8_k_type : type;
case GGML_TYPE_IQ2_K : return nrc_y >= 32 ? q8_k_type : type;
case GGML_TYPE_IQ2_KL : return nrc_y >= 32 ? q8_k_type : type;
case GGML_TYPE_IQ3_KS : return nrc_y >= 32 ? q8_k_type : type;
case GGML_TYPE_IQ3_K : return nrc_y >= 32 ? q8_k_type : type;
case GGML_TYPE_IQ4_KS : return nrc_y >= 32 ? q8_k_type : type;
case GGML_TYPE_IQ4_KSS: return nrc_y >= 32 ? q8_k_type : type;
case GGML_TYPE_IQ4_K : return nrc_y >= 32 ? q8_k_type : type;
case GGML_TYPE_IQ5_KS : return nrc_y >= 32 ? q8_k_type : type;
case GGML_TYPE_IQ5_K : return nrc_y >= 32 ? q8_k_type : type;
case GGML_TYPE_IQ6_K : return nrc_y >= 32 ? q8_k_type : type;
case GGML_TYPE_Q4_0 : return nrc_y >= 32 ? GGML_TYPE_Q8_0_R8 : type;
case GGML_TYPE_Q4_1 : return nrc_y >= 32 ? GGML_TYPE_Q8_1 : type;
case GGML_TYPE_Q5_0 : return nrc_y >= 32 ? GGML_TYPE_Q8_0_R8 : type;
@ -315,7 +320,8 @@ struct MulMat {
#endif
return type;
}
static inline int num_rows(ggml_type type) {
static inline int num_rows([[maybe_unused]] ggml_type type) {
return 16;
#ifdef HAVE_FANCY_SIMD
switch (type) {
case GGML_TYPE_Q2_K_R4:
@ -346,7 +352,7 @@ struct MulMat {
case GGML_TYPE_Q8_1:
case GGML_TYPE_Q8_K_R8: return 8;
case GGML_TYPE_Q4_0_R8:
case GGML_TYPE_Q8_0_R8:
case GGML_TYPE_Q8_K_R16:
case GGML_TYPE_BF16_R16: return 16;
default: return 1;
}
@ -381,6 +387,7 @@ struct MulMat {
case GGML_TYPE_Q8_KV_R8:
case GGML_TYPE_Q8_1:
case GGML_TYPE_Q8_K_R8: return 8;
case GGML_TYPE_Q8_K_R16:
case GGML_TYPE_BF16_R16: return 16;
default: return 1;
}
@ -829,6 +836,7 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& mm, int Ny) {
case GGML_TYPE_Q8_K_R8:
case GGML_TYPE_Q8_KV:
case GGML_TYPE_Q8_KV_R8:
case GGML_TYPE_Q8_K_R16:
return iqk_set_kernels_kquants(ne00, typeA, typeB, mm.funcs, mm.func16);
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
@ -924,6 +932,7 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& m, int /*Ny*/) {
case GGML_TYPE_Q8_K_R8:
case GGML_TYPE_Q8_KV:
case GGML_TYPE_Q8_KV_R8:
case GGML_TYPE_Q8_K_R16:
return iqk_set_kernels_kquants(ne00, typeA, typeB, m.funcs, m.func16);
case GGML_TYPE_IQ2_KS:
case GGML_TYPE_IQ2_K:

View File

@ -6634,6 +6634,92 @@ void vec_dot_q8_k_r8_q8_k(int n, float * s, size_t bs, const void * vx, size_t b
GGML_UNUSED(by);
}
//
// ========================================= q8_k_r16
//
void quantize_row_q8_k_r16_ref(const float * x, block_q8_k_r16 * y, int64_t k) {
quantize_q8_k_r16(x, (void *)y, 16, k/16, nullptr);
}
void quantize_row_q8_k_r16(const float * x, void * y, int64_t k) {
quantize_q8_k_r16(x, y, 16, k/16, nullptr);
}
static void repack_q16_k(int nrows, int n_per_row, const block_q8_K * x, block_q8_k_r16 * y, [[maybe_unused]] bool online) {
GGML_ASSERT(nrows%16 == 0);
GGML_ASSERT(n_per_row%QK_K == 0);
int nblock = n_per_row/QK_K;
const block_q8_K * x16[16];
for (int row = 0; row < nrows; row += 16) {
for (int k = 0; k < 16; ++k) x16[k] = x + nblock*k;
for (int ibl = 0; ibl < nblock; ++ibl) {
for (int k = 0; k < 16; ++k) {
y[ibl].d[k] = GGML_FP32_TO_FP16(x16[k][ibl].d);
for (int ib = 0; ib < QK_K/4; ++ib) {
for (int i = 0; i < 4; ++i) y[ibl].qs[64*ib + 4*k + i] = x16[k][ibl].qs[4*ib+i];
}
}
#ifdef HAVE_FANCY_SIMD
for (int l = 0; l < 64; ++l) {
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[ibl].qs + l), _mm512_set1_epi8(-128));
_mm512_storeu_si512((__m512i *)y[ibl].qs + l, v);
}
#endif
}
x += 16*nblock;
y += nblock;
}
}
size_t quantize_q8_k_r16(const float * src, void * dst, int64_t nrows, int64_t n_per_row, [[maybe_unused]] const float * imatrix) {
GGML_ASSERT(nrows%16 == 0);
GGML_ASSERT(n_per_row%QK_K == 0);
char * qcur = (char *)dst;
auto row_size_0 = ggml_row_size(GGML_TYPE_Q8_K, n_per_row);
auto row_size_1 = ggml_row_size(GGML_TYPE_Q8_K_R16, n_per_row);
std::vector<char> qtmp(16*row_size_0);
for (int row = 0; row < nrows; row += 16) {
quantize_row_q8_K32(src, (void *)qtmp.data(), 16*n_per_row);
repack_q16_k(16, n_per_row, (const block_q8_K *)qtmp.data(), (block_q8_k_r16 *)qcur, false);
qcur += 16*row_size_1;
src += 16*n_per_row;
}
return nrows*row_size_1;
}
void dequantize_row_q8_k_r16(const block_q8_k_r16 * x, float * y, int64_t k) {
auto n_per_row = k/16;
float * y16[16];
for (int k = 0; k < 16; ++k) y16[k] = y + n_per_row*k;
int nblock = n_per_row/QK_K;
for (int ibl = 0; ibl < nblock; ++ibl) {
auto qs = (const uint8_t *)x[ibl].qs;
for (int k = 0; k < 16; ++k) {
const float d = GGML_FP16_TO_FP32(x[ibl].d[k]);
const float m = -128.f*d;
for (int ib = 0; ib < QK_K/4; ++ib) {
for (int i = 0; i < 4; ++i) {
y16[k][QK_K*ibl+4*ib+i] = d * qs[64*ib+4*k+i] + m;
}
}
}
}
}
void vec_dot_q8_k_r16_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) {
#if GGML_USE_IQK_MULMAT
if (iqk_mul_mat(1, 1, n, GGML_TYPE_Q8_K_R16, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) {
return;
}
#endif
GGML_ASSERT(n%QK4_NL == 0);
GGML_ASSERT(nrc == 1);
GGML_UNUSED(bs);
GGML_UNUSED(bx);
GGML_UNUSED(by);
}
//
// ========================================= q8_KV_r8
//

View File

@ -277,6 +277,12 @@ size_t quantize_q8_k_r8(const float * GGML_RESTRICT src, void * GGML_RESTRICT ds
void dequantize_row_q8_k_r8(const block_q8_k_r8 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void vec_dot_q8_k_r8_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void quantize_row_q8_k_r16_ref(const float * GGML_RESTRICT x, block_q8_k_r16 * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_k_r16(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
size_t quantize_q8_k_r16(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
void dequantize_row_q8_k_r16(const block_q8_k_r16 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void vec_dot_q8_k_r16_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void quantize_row_q8_KV_ref(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_KV(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
size_t quantize_q8_KV(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);