mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-06-28 04:30:15 -05:00
AVX512+AVXVNNI GEMM implementation for quants using Q8_K for activations (#710)
* q8_k_r16: basics * q8_k_r16: iq4_xs now uses q8_k_r16 on Zen4+ PP performance is about the same as using q8_k_r8 on the Ryzen-7950X, so we expect nice gains on Zen5, and we don't need to wory about using 2 different q8_k_r8 implementations for fancy SIMD. * q8_k_r16: iq2_xxs now uses q8_k_r16 on Zen4+ * q8_k_r16: iq2_xs now uses q8_k_r16 on Zen4+ * q8_k_r16: iq2_s now uses q8_k_r16 on Zen4+ * q8_k_r16: iq3_xxs now uses q8_k_r16 on Zen4+ * q8_k_r16: iq3_s now uses q8_k_r16 on Zen4+ * q8_k_r16: iq1_s and iq1_m now uses q8_k_r16 on Zen4+ * q8_k_r16: q2_K and q3_K now uses q8_k_r16 on Zen4+ * q8_k_r16: iq2_ks and iq2_k now uses q8_k_r16 on Zen4+ * q8_k_r16: iq2_kl now uses q8_k_r16 on Zen4+ * q8_k_r16: iq3_ks and iq3_k now uses q8_k_r16 on Zen4+ * q8_k_r16: iq4_kss, iq4_ks, and iq4_k now use q8_k_r16 on Zen4+ * q8_k_r16: iq5_ks, iq5_k, and iq6_k now use q8_k_r16 on Zen4+ * Fix AVX2 * Just always set num_rows to 16 --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
parent
0b448997ec
commit
ca8c72ff1a
@ -475,6 +475,7 @@ extern "C" {
|
||||
GGML_TYPE_IQ5_K_R4 = 340,
|
||||
GGML_TYPE_IQ4_KS_R4 = 344,
|
||||
GGML_TYPE_IQ5_KS_R4 = 352,
|
||||
GGML_TYPE_Q8_K_R16 = 397,
|
||||
GGML_TYPE_Q8_KV_R8 = 398,
|
||||
GGML_TYPE_Q8_K_R8 = 399,
|
||||
GGML_TYPE_COUNT,
|
||||
@ -571,6 +572,7 @@ extern "C" {
|
||||
GGML_FTYPE_MOSTLY_IQ5_K_R4 = 333, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_IQ4_KS_R4 = 337, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_IQ5_KS_R4 = 341, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_Q8_K_R16 = 397, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_Q8_KV_R8 = 398, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_Q8_K_R8 = 399, // except 1d tensors
|
||||
};
|
||||
|
||||
@ -421,6 +421,12 @@ typedef struct {
|
||||
} block_q8_k_r8;
|
||||
static_assert(sizeof(block_q8_k_r8) == 8*sizeof(ggml_half) + 8*QK_K, "wrong q8_k_r8 block size/padding");
|
||||
|
||||
typedef struct {
|
||||
ggml_half d[16]; // delta
|
||||
int8_t qs[16*QK_K]; // quants, stored as unsigned ints
|
||||
} block_q8_k_r16;
|
||||
static_assert(sizeof(block_q8_k_r16) == 16*sizeof(ggml_half) + 16*QK_K, "wrong q8_k_r16 block size/padding");
|
||||
|
||||
// (Almost) "true" 2-bit quantization.
|
||||
// Due to the need to use blocks as per ggml design, it ends up using
|
||||
// 2.0625 bpw because of the 16-bit scale for each block of 256.
|
||||
|
||||
@ -15461,6 +15461,7 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
|
||||
case GGML_TYPE_IQ5_KS_R4:break;
|
||||
case GGML_TYPE_Q8_KV_R8: break;
|
||||
case GGML_TYPE_Q8_K_R8: break;
|
||||
case GGML_TYPE_Q8_K_R16: break;
|
||||
case GGML_TYPE_Q8_KV: break;
|
||||
case GGML_TYPE_BF16_R16: break;
|
||||
case GGML_TYPE_Q4_0_4_4:
|
||||
|
||||
@ -1071,6 +1071,19 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||
.nrows = 1,
|
||||
.row_meta_size = 0,
|
||||
},
|
||||
[GGML_TYPE_Q8_K_R16] = {
|
||||
.type_name = "q8_k_r16",
|
||||
.blck_size = QK_K,
|
||||
.type_size = sizeof(block_q8_k_r16)/16,
|
||||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_q8_k_r16,
|
||||
.from_float = quantize_row_q8_k_r16,
|
||||
.from_float_ref = (ggml_from_float_t) quantize_row_q8_k_r16_ref,
|
||||
.vec_dot = vec_dot_q8_k_r16_q8_k,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
.row_meta_size = 0,
|
||||
},
|
||||
[GGML_TYPE_IQ2_XXS] = {
|
||||
.type_name = "iq2_xxs",
|
||||
.blck_size = QK_K,
|
||||
@ -1934,7 +1947,7 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
|
||||
}
|
||||
|
||||
static inline int ggml_packed_rows(enum ggml_type type) {
|
||||
return type == GGML_TYPE_BF16_R16 ? 16
|
||||
return type == GGML_TYPE_BF16_R16 || type == GGML_TYPE_Q8_K_R16 ? 16
|
||||
: type == GGML_TYPE_Q8_K_R8 || type == GGML_TYPE_Q8_KV_R8 ||
|
||||
type == GGML_TYPE_Q8_0_R8 || type == GGML_TYPE_Q4_0_R8 ||
|
||||
type == GGML_TYPE_IQ4_XS_R8 ? 8
|
||||
@ -4617,6 +4630,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
|
||||
case GGML_FTYPE_MOSTLY_Q6_K: wtype = GGML_TYPE_Q6_K; break;
|
||||
case GGML_FTYPE_MOSTLY_Q6_K_R4: wtype = GGML_TYPE_Q6_K_R4; break;
|
||||
case GGML_FTYPE_MOSTLY_Q8_K_R8: wtype = GGML_TYPE_Q8_K_R8; break;
|
||||
case GGML_FTYPE_MOSTLY_Q8_K_R16: wtype = GGML_TYPE_Q8_K_R16; break;
|
||||
case GGML_FTYPE_MOSTLY_Q8_KV_R8: wtype = GGML_TYPE_Q8_KV_R8; break;
|
||||
case GGML_FTYPE_MOSTLY_IQ2_XXS: wtype = GGML_TYPE_IQ2_XXS; break;
|
||||
case GGML_FTYPE_MOSTLY_IQ2_XXS_R4: wtype = GGML_TYPE_IQ2_XXS_R4;break;
|
||||
@ -11542,6 +11556,7 @@ static void ggml_compute_forward_add(
|
||||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_Q6_K_R4:
|
||||
case GGML_TYPE_Q8_K_R8:
|
||||
case GGML_TYPE_Q8_K_R16:
|
||||
case GGML_TYPE_Q8_KV_R8:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XXS_R4:
|
||||
@ -12094,6 +12109,7 @@ static void ggml_compute_forward_add1(
|
||||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_Q6_K_R4:
|
||||
case GGML_TYPE_Q8_K_R8:
|
||||
case GGML_TYPE_Q8_K_R16:
|
||||
case GGML_TYPE_Q8_KV_R8:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XXS_R4:
|
||||
@ -12272,6 +12288,7 @@ static void ggml_compute_forward_acc(
|
||||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_Q6_K_R4:
|
||||
case GGML_TYPE_Q8_K_R8:
|
||||
case GGML_TYPE_Q8_K_R16:
|
||||
case GGML_TYPE_Q8_KV_R8:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XXS_R4:
|
||||
@ -14966,6 +14983,17 @@ static void ggml_compute_forward_mul_mat(
|
||||
#endif
|
||||
|
||||
#if GGML_USE_IQK_MULMAT
|
||||
if (ith == 0) {
|
||||
static bool first_time = true;
|
||||
if (first_time) {
|
||||
first_time = false;
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
printf("======================================= HAVE_FANCY_SIMD is defined\n");
|
||||
#else
|
||||
printf("======================================= HAVE_FANCY_SIMD is NOT defined\n");
|
||||
#endif
|
||||
}
|
||||
}
|
||||
if (dst->type == GGML_TYPE_F32) {
|
||||
if (iqk_mul_mat_4d(ne01, ne11, ne00,
|
||||
ne02, ne03, ne12, ne13, nb02, nb03, nb12, nb13, nb2/sizeof(float), nb3/sizeof(float),
|
||||
@ -15872,6 +15900,7 @@ static void ggml_compute_forward_out_prod(
|
||||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_Q6_K_R4:
|
||||
case GGML_TYPE_Q8_K_R8:
|
||||
case GGML_TYPE_Q8_K_R16:
|
||||
case GGML_TYPE_Q8_KV_R8:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XXS_R4:
|
||||
@ -16290,6 +16319,7 @@ static void ggml_compute_forward_set(
|
||||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_Q6_K_R4:
|
||||
case GGML_TYPE_Q8_K_R8:
|
||||
case GGML_TYPE_Q8_K_R16:
|
||||
case GGML_TYPE_Q8_KV_R8:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XXS_R4:
|
||||
@ -16614,6 +16644,7 @@ static void ggml_compute_forward_get_rows(
|
||||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_Q6_K_R4:
|
||||
case GGML_TYPE_Q8_K_R8:
|
||||
case GGML_TYPE_Q8_K_R16:
|
||||
case GGML_TYPE_Q8_KV_R8:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XXS_R4:
|
||||
@ -17274,6 +17305,7 @@ static void ggml_compute_forward_clamp(
|
||||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_Q6_K_R4:
|
||||
case GGML_TYPE_Q8_K_R8:
|
||||
case GGML_TYPE_Q8_K_R16:
|
||||
case GGML_TYPE_Q8_KV_R8:
|
||||
case GGML_TYPE_Q8_KR8:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
@ -24380,6 +24412,7 @@ size_t ggml_quantize_chunk(
|
||||
case GGML_TYPE_Q6_K: result = quantize_q6_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_Q6_K_R4: result = quantize_q6_k_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_Q8_K_R8: result = quantize_q8_k_r8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_Q8_K_R16:result = quantize_q8_k_r16(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_Q8_KV_R8:result = quantize_q8_KV_r8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_IQ2_XXS: result = quantize_iq2_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_IQ2_XXS_R4:result = quantize_iq2_xxs_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
|
||||
@ -556,6 +556,60 @@ inline void iqk_transpose_8x8(__m256 * m) {
|
||||
}
|
||||
}
|
||||
|
||||
template <int nr = 8>
|
||||
static inline float convert_to_q8_k_r8(int k, float d0, const __m256i * qx, const int16_t * scales, uint32_t * block, int8_t * q8_k) {
|
||||
auto max_i16 = _mm256_setzero_si256();
|
||||
__m256i qs[16];
|
||||
for (int ib32 = 0; ib32 < 8; ++ib32) {
|
||||
qs[2*ib32+0] = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(qx[ib32]));
|
||||
qs[2*ib32+1] = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(qx[ib32], 1));
|
||||
qs[2*ib32+0] = _mm256_mullo_epi16(qs[2*ib32+0], _mm256_set1_epi16(scales[2*ib32+0]));
|
||||
qs[2*ib32+1] = _mm256_mullo_epi16(qs[2*ib32+1], _mm256_set1_epi16(scales[2*ib32+1]));
|
||||
max_i16 = _mm256_max_epi16(max_i16, _mm256_sign_epi16(qs[2*ib32+0], qs[2*ib32+0]));
|
||||
max_i16 = _mm256_max_epi16(max_i16, _mm256_sign_epi16(qs[2*ib32+1], qs[2*ib32+1]));
|
||||
}
|
||||
auto max_q32 = _mm256_cvtepi16_epi32(_mm_max_epi16(_mm256_castsi256_si128(max_i16), _mm256_extracti128_si256(max_i16, 1)));
|
||||
auto imax4 = _mm_max_epi32(_mm256_castsi256_si128(max_q32), _mm256_extracti128_si256(max_q32, 1));
|
||||
auto max4 = _mm_cvtepi32_ps(imax4);
|
||||
max4 = _mm_max_ps(max4, _mm_movehl_ps(max4, max4));
|
||||
max4 = _mm_max_ss(max4, _mm_movehdup_ps(max4));
|
||||
bool needs_scaling = true;
|
||||
float dnew = _mm_cvtss_f32(max4) * d0;
|
||||
if (dnew < 1.f) {
|
||||
dnew = 1.f; needs_scaling = false;
|
||||
}
|
||||
auto scale = _mm256_set1_ps(std::abs(dnew) > 1e-9f ? 1/dnew : 0.f);
|
||||
for (int ib32 = 0; ib32 < 8; ++ib32) {
|
||||
if (needs_scaling) {
|
||||
auto i0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(qs[2*ib32+0]));
|
||||
auto i1 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(qs[2*ib32+0], 1));
|
||||
auto i2 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(qs[2*ib32+1]));
|
||||
auto i3 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(qs[2*ib32+1], 1));
|
||||
i0 = _mm256_cvtps_epi32(_mm256_round_ps(_mm256_mul_ps(scale, _mm256_cvtepi32_ps(i0)), _MM_ROUND_NEAREST));
|
||||
i1 = _mm256_cvtps_epi32(_mm256_round_ps(_mm256_mul_ps(scale, _mm256_cvtepi32_ps(i1)), _MM_ROUND_NEAREST));
|
||||
i2 = _mm256_cvtps_epi32(_mm256_round_ps(_mm256_mul_ps(scale, _mm256_cvtepi32_ps(i2)), _MM_ROUND_NEAREST));
|
||||
i3 = _mm256_cvtps_epi32(_mm256_round_ps(_mm256_mul_ps(scale, _mm256_cvtepi32_ps(i3)), _MM_ROUND_NEAREST));
|
||||
i0 = _mm256_packs_epi32(i0, i1);
|
||||
i2 = _mm256_packs_epi32(i2, i3);
|
||||
i0 = _mm256_packs_epi16(i0, i2);
|
||||
i0 = _mm256_permutevar8x32_epi32(i0, _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7));
|
||||
_mm256_storeu_si256((__m256i *)block, i0);
|
||||
} else {
|
||||
// 0, 1, 2, 3, 4, 5, 6, 7, 8, 16, 17, 18, 19, 20, 21, 22, 23, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
|
||||
auto i0 = _mm256_packs_epi16(qs[2*ib32+0], qs[2*ib32+1]);
|
||||
auto i0_l = _mm256_castsi256_si128(i0);
|
||||
auto i0_h = _mm256_extracti128_si256(i0, 1);
|
||||
_mm_storeu_si128((__m128i *)block+0, _mm_unpacklo_epi64(i0_l, i0_h));
|
||||
_mm_storeu_si128((__m128i *)block+1, _mm_unpackhi_epi64(i0_l, i0_h));
|
||||
}
|
||||
auto qs = (uint32_t *)q8_k + 8*nr*ib32;
|
||||
for (int l = 0; l < 8; ++l) {
|
||||
qs[nr*l + k] = block[l];
|
||||
}
|
||||
}
|
||||
return dnew;
|
||||
}
|
||||
|
||||
#else
|
||||
// ------------------------------------ __aarch64__ --------------------------------------------------
|
||||
|
||||
|
||||
@ -1668,71 +1668,23 @@ static void mul_mat_iq2_bn_r4_q8_k16(int n, const void * vx, size_t bx, const Da
|
||||
}
|
||||
#endif
|
||||
|
||||
inline float convert_to_q8_k_r8(int k, int d0, const __m256i * qx, const int16_t * scales, uint32_t * block, int8_t * q8_k) {
|
||||
auto max_i16 = _mm256_setzero_si256();
|
||||
for (int ib32 = 0; ib32 < 8; ++ib32) {
|
||||
auto q16_l = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(qx[ib32]));
|
||||
auto q16_h = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(qx[ib32], 1));
|
||||
q16_l = _mm256_mullo_epi16(q16_l, _mm256_set1_epi16(scales[2*ib32+0]));
|
||||
q16_h = _mm256_mullo_epi16(q16_h, _mm256_set1_epi16(scales[2*ib32+1]));
|
||||
max_i16 = _mm256_max_epi16(max_i16, _mm256_sign_epi16(q16_l, q16_l));
|
||||
max_i16 = _mm256_max_epi16(max_i16, _mm256_sign_epi16(q16_h, q16_h));
|
||||
}
|
||||
auto max_q32 = _mm256_cvtepi16_epi32(_mm_max_epi16(_mm256_castsi256_si128(max_i16), _mm256_extracti128_si256(max_i16, 1)));
|
||||
auto imax4 = _mm_max_epi32(_mm256_castsi256_si128(max_q32), _mm256_extracti128_si256(max_q32, 1));
|
||||
auto max4 = _mm_cvtepi32_ps(imax4);
|
||||
max4 = _mm_max_ps(max4, _mm_movehl_ps(max4, max4));
|
||||
max4 = _mm_max_ss(max4, _mm_movehdup_ps(max4));
|
||||
bool needs_scaling = true;
|
||||
float dnew = _mm_cvtss_f32(max4) / d0;
|
||||
if (dnew < 1.f) {
|
||||
dnew = 1.f; needs_scaling = false;
|
||||
}
|
||||
auto scale = _mm256_set1_ps(std::abs(dnew) > 1e-9f ? 1/dnew : 0.f);
|
||||
for (int ib32 = 0; ib32 < 8; ++ib32) {
|
||||
auto q16_l = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(qx[ib32]));
|
||||
auto q16_h = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(qx[ib32], 1));
|
||||
q16_l = _mm256_mullo_epi16(q16_l, _mm256_set1_epi16(scales[2*ib32+0]));
|
||||
q16_h = _mm256_mullo_epi16(q16_h, _mm256_set1_epi16(scales[2*ib32+1]));
|
||||
if (needs_scaling) {
|
||||
auto i0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(q16_l));
|
||||
auto i1 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(q16_l, 1));
|
||||
auto i2 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(q16_h));
|
||||
auto i3 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(q16_h, 1));
|
||||
i0 = _mm256_cvtps_epi32(_mm256_round_ps(_mm256_mul_ps(scale, _mm256_cvtepi32_ps(i0)), _MM_ROUND_NEAREST));
|
||||
i1 = _mm256_cvtps_epi32(_mm256_round_ps(_mm256_mul_ps(scale, _mm256_cvtepi32_ps(i1)), _MM_ROUND_NEAREST));
|
||||
i2 = _mm256_cvtps_epi32(_mm256_round_ps(_mm256_mul_ps(scale, _mm256_cvtepi32_ps(i2)), _MM_ROUND_NEAREST));
|
||||
i3 = _mm256_cvtps_epi32(_mm256_round_ps(_mm256_mul_ps(scale, _mm256_cvtepi32_ps(i3)), _MM_ROUND_NEAREST));
|
||||
i0 = _mm256_packs_epi32(i0, i1);
|
||||
i2 = _mm256_packs_epi32(i2, i3);
|
||||
i0 = _mm256_packs_epi16(i0, i2);
|
||||
i0 = _mm256_permutevar8x32_epi32(i0, _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7));
|
||||
_mm256_storeu_si256((__m256i *)block, i0);
|
||||
} else {
|
||||
// 0, 1, 2, 3, 4, 5, 6, 7, 8, 16, 17, 18, 19, 20, 21, 22, 23, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
|
||||
auto i0 = _mm256_packs_epi16(q16_l, q16_h);
|
||||
auto i0_l = _mm256_castsi256_si128(i0);
|
||||
auto i0_h = _mm256_extracti128_si256(i0, 1);
|
||||
_mm_storeu_si128((__m128i *)block+0, _mm_unpacklo_epi64(i0_l, i0_h));
|
||||
_mm_storeu_si128((__m128i *)block+1, _mm_unpackhi_epi64(i0_l, i0_h));
|
||||
}
|
||||
auto qs = (uint32_t *)q8_k + 64*ib32;
|
||||
for (int l = 0; l < 8; ++l) {
|
||||
qs[8*l + k] = block[l];
|
||||
}
|
||||
}
|
||||
return dnew;
|
||||
}
|
||||
|
||||
void iqk_convert_iq1_s_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
constexpr int k_nr = 16;
|
||||
using block_q8_k_r = block_q8_k_r16;
|
||||
#else
|
||||
constexpr int k_nr = 8;
|
||||
using block_q8_k_r = block_q8_k_r8;
|
||||
#endif
|
||||
|
||||
GGML_ASSERT(n%QK_K == 0);
|
||||
GGML_ASSERT(nrc_x%8 == 0);
|
||||
GGML_ASSERT(nrc_x%k_nr == 0);
|
||||
|
||||
int nb = n/QK_K;
|
||||
|
||||
const block_iq1_s * x8[8];
|
||||
const block_iq1_s * x8[k_nr];
|
||||
|
||||
block_q8_k_r8 * y = (block_q8_k_r8 *)vy;
|
||||
block_q8_k_r * y = (block_q8_k_r *)vy;
|
||||
|
||||
int16_t ls[16];
|
||||
|
||||
@ -1740,10 +1692,10 @@ void iqk_convert_iq1_s_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int
|
||||
|
||||
__m256i qx[8];
|
||||
|
||||
for (int ix = 0; ix < nrc_x; ix += 8) {
|
||||
for (int k = 0; k < 8; ++k) x8[k] = (const block_iq1_s *)((const char *)vx + (ix + k)*bx);
|
||||
for (int ix = 0; ix < nrc_x; ix += k_nr) {
|
||||
for (int k = 0; k < k_nr; ++k) x8[k] = (const block_iq1_s *)((const char *)vx + (ix + k)*bx);
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
for (int k = 0; k < k_nr; ++k) {
|
||||
float d = 0.125f * GGML_FP16_TO_FP32(x8[k][i].d);
|
||||
auto qs = x8[k][i].qs;
|
||||
auto qh = x8[k][i].qh;
|
||||
@ -1759,23 +1711,36 @@ void iqk_convert_iq1_s_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int
|
||||
qx[ib32] = value;
|
||||
qs += 4;
|
||||
}
|
||||
float dnew = convert_to_q8_k_r8(k, 126, qx, ls, block, y[i].qs);
|
||||
float dnew = convert_to_q8_k_r8<k_nr>(k, 1.f/126, qx, ls, block, y[i].qs);
|
||||
y[i].d[k] = GGML_FP32_TO_FP16(d*dnew);
|
||||
}
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
for (int l = 0; l < 64; ++l) {
|
||||
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[i].qs + l), _mm512_set1_epi8(-128));
|
||||
_mm512_storeu_si512((__m512i *)y[i].qs + l, v);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
y += nb;
|
||||
}
|
||||
}
|
||||
|
||||
void iqk_convert_iq1_m_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
constexpr int k_nr = 16;
|
||||
using block_q8_k_r = block_q8_k_r16;
|
||||
#else
|
||||
constexpr int k_nr = 8;
|
||||
using block_q8_k_r = block_q8_k_r8;
|
||||
#endif
|
||||
GGML_ASSERT(n%QK_K == 0);
|
||||
GGML_ASSERT(nrc_x%8 == 0);
|
||||
GGML_ASSERT(nrc_x%k_nr == 0);
|
||||
|
||||
int nb = n/QK_K;
|
||||
|
||||
const block_iq1_m * x8[8];
|
||||
const block_iq1_m * x8[k_nr];
|
||||
|
||||
block_q8_k_r8 * y = (block_q8_k_r8 *)vy;
|
||||
block_q8_k_r * y = (block_q8_k_r *)vy;
|
||||
|
||||
int16_t ls[16];
|
||||
|
||||
@ -1785,10 +1750,10 @@ void iqk_convert_iq1_m_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int
|
||||
|
||||
auto mask = _mm256_setr_epi32(0x00000008, 0x00000008, 0x00000080, 0x00000080, 0x00080000, 0x00080000, 0x00800000, 0x00800000);
|
||||
|
||||
for (int ix = 0; ix < nrc_x; ix += 8) {
|
||||
for (int k = 0; k < 8; ++k) x8[k] = (const block_iq1_m *)((const char *)vx + (ix + k)*bx);
|
||||
for (int ix = 0; ix < nrc_x; ix += k_nr) {
|
||||
for (int k = 0; k < k_nr; ++k) x8[k] = (const block_iq1_m *)((const char *)vx + (ix + k)*bx);
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
for (int k = 0; k < k_nr; ++k) {
|
||||
const uint16_t * sc = (const uint16_t *)x8[k][i].scales;
|
||||
iq1m_scale_t scale;
|
||||
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
||||
@ -1816,9 +1781,15 @@ void iqk_convert_iq1_m_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int
|
||||
qs += 4;
|
||||
qh += 2;
|
||||
}
|
||||
float dnew = convert_to_q8_k_r8(k, 126, qx, ls, block, y[i].qs);
|
||||
float dnew = convert_to_q8_k_r8<k_nr>(k, 1.f/126, qx, ls, block, y[i].qs);
|
||||
y[i].d[k] = GGML_FP32_TO_FP16(d*dnew);
|
||||
}
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
for (int l = 0; l < 64; ++l) {
|
||||
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[i].qs + l), _mm512_set1_epi8(-128));
|
||||
_mm512_storeu_si512((__m512i *)y[i].qs + l, v);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
y += nb;
|
||||
}
|
||||
|
||||
@ -2305,68 +2305,22 @@ template <typename Dequantizer> void set_functions(std::array<mul_mat_t, IQK_MAX
|
||||
#endif
|
||||
}
|
||||
|
||||
inline float convert_to_q8_k_r8(int k, float d0, const __m256i * qx, const int16_t * scales, uint32_t * block, int8_t * q8_k) {
|
||||
auto max_i16 = _mm256_setzero_si256();
|
||||
__m256i qs[16];
|
||||
for (int ib32 = 0; ib32 < 8; ++ib32) {
|
||||
qs[2*ib32+0] = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(qx[ib32]));
|
||||
qs[2*ib32+1] = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(qx[ib32], 1));
|
||||
qs[2*ib32+0] = _mm256_mullo_epi16(qs[2*ib32+0], _mm256_set1_epi16(scales[2*ib32+0]));
|
||||
qs[2*ib32+1] = _mm256_mullo_epi16(qs[2*ib32+1], _mm256_set1_epi16(scales[2*ib32+1]));
|
||||
max_i16 = _mm256_max_epi16(max_i16, _mm256_sign_epi16(qs[2*ib32+0], qs[2*ib32+0]));
|
||||
max_i16 = _mm256_max_epi16(max_i16, _mm256_sign_epi16(qs[2*ib32+1], qs[2*ib32+1]));
|
||||
}
|
||||
auto max_q32 = _mm256_cvtepi16_epi32(_mm_max_epi16(_mm256_castsi256_si128(max_i16), _mm256_extracti128_si256(max_i16, 1)));
|
||||
auto imax4 = _mm_max_epi32(_mm256_castsi256_si128(max_q32), _mm256_extracti128_si256(max_q32, 1));
|
||||
auto max4 = _mm_cvtepi32_ps(imax4);
|
||||
max4 = _mm_max_ps(max4, _mm_movehl_ps(max4, max4));
|
||||
max4 = _mm_max_ss(max4, _mm_movehdup_ps(max4));
|
||||
bool needs_scaling = true;
|
||||
float dnew = _mm_cvtss_f32(max4) * d0;
|
||||
if (dnew < 1.f) {
|
||||
dnew = 1.f; needs_scaling = false;
|
||||
}
|
||||
auto scale = _mm256_set1_ps(std::abs(dnew) > 1e-9f ? 1/dnew : 0.f);
|
||||
for (int ib32 = 0; ib32 < 8; ++ib32) {
|
||||
if (needs_scaling) {
|
||||
auto i0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(qs[2*ib32+0]));
|
||||
auto i1 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(qs[2*ib32+0], 1));
|
||||
auto i2 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(qs[2*ib32+1]));
|
||||
auto i3 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(qs[2*ib32+1], 1));
|
||||
i0 = _mm256_cvtps_epi32(_mm256_round_ps(_mm256_mul_ps(scale, _mm256_cvtepi32_ps(i0)), _MM_ROUND_NEAREST));
|
||||
i1 = _mm256_cvtps_epi32(_mm256_round_ps(_mm256_mul_ps(scale, _mm256_cvtepi32_ps(i1)), _MM_ROUND_NEAREST));
|
||||
i2 = _mm256_cvtps_epi32(_mm256_round_ps(_mm256_mul_ps(scale, _mm256_cvtepi32_ps(i2)), _MM_ROUND_NEAREST));
|
||||
i3 = _mm256_cvtps_epi32(_mm256_round_ps(_mm256_mul_ps(scale, _mm256_cvtepi32_ps(i3)), _MM_ROUND_NEAREST));
|
||||
i0 = _mm256_packs_epi32(i0, i1);
|
||||
i2 = _mm256_packs_epi32(i2, i3);
|
||||
i0 = _mm256_packs_epi16(i0, i2);
|
||||
i0 = _mm256_permutevar8x32_epi32(i0, _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7));
|
||||
_mm256_storeu_si256((__m256i *)block, i0);
|
||||
} else {
|
||||
// 0, 1, 2, 3, 4, 5, 6, 7, 8, 16, 17, 18, 19, 20, 21, 22, 23, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
|
||||
auto i0 = _mm256_packs_epi16(qs[2*ib32+0], qs[2*ib32+1]);
|
||||
auto i0_l = _mm256_castsi256_si128(i0);
|
||||
auto i0_h = _mm256_extracti128_si256(i0, 1);
|
||||
_mm_storeu_si128((__m128i *)block+0, _mm_unpacklo_epi64(i0_l, i0_h));
|
||||
_mm_storeu_si128((__m128i *)block+1, _mm_unpackhi_epi64(i0_l, i0_h));
|
||||
}
|
||||
auto qs = (uint32_t *)q8_k + 64*ib32;
|
||||
for (int l = 0; l < 8; ++l) {
|
||||
qs[8*l + k] = block[l];
|
||||
}
|
||||
}
|
||||
return dnew;
|
||||
}
|
||||
|
||||
void iqk_convert_iq2_ks_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
constexpr int k_nr = 16;
|
||||
using block_q8_k_r = block_q8_k_r16;
|
||||
#else
|
||||
constexpr int k_nr = 8;
|
||||
using block_q8_k_r = block_q8_k_r8;
|
||||
#endif
|
||||
GGML_ASSERT(n%QK_K == 0);
|
||||
GGML_ASSERT(nrc_x%8 == 0);
|
||||
GGML_ASSERT(nrc_x%k_nr == 0);
|
||||
|
||||
int nb = n/QK_K;
|
||||
|
||||
const block_iq2_ks * x8[8];
|
||||
const block_iq2_ks * x8[k_nr];
|
||||
|
||||
block_q8_k_r8 * y = (block_q8_k_r8 *)vy;
|
||||
block_q8_k_r * y = (block_q8_k_r *)vy;
|
||||
|
||||
__m256i values;
|
||||
{
|
||||
@ -2374,8 +2328,8 @@ void iqk_convert_iq2_ks_q8_k_r8(int n, const void * vx, size_t bx, void * vy, in
|
||||
values = MM256_SET_M128I(v, v);
|
||||
}
|
||||
|
||||
ggml_half dh[8];
|
||||
float dnew[8];
|
||||
ggml_half dh[k_nr];
|
||||
float dnew[k_nr];
|
||||
uint32_t block[8];
|
||||
int16_t ls[16];
|
||||
|
||||
@ -2383,14 +2337,14 @@ void iqk_convert_iq2_ks_q8_k_r8(int n, const void * vx, size_t bx, void * vy, in
|
||||
|
||||
auto ml = _mm256_set1_epi8(0x03);
|
||||
|
||||
for (int ix = 0; ix < nrc_x; ix += 8) {
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
for (int ix = 0; ix < nrc_x; ix += k_nr) {
|
||||
for (int k = 0; k < k_nr; ++k) {
|
||||
const ggml_half * dptr = (const ggml_half *)((const char *)vx + (ix+k)*bx);
|
||||
dh[k] = dptr[0];
|
||||
x8[k] = (const block_iq2_ks *)(dptr + 1);
|
||||
}
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
for (int k = 0; k < k_nr; ++k) {
|
||||
auto extra = x8[k][i].extra;
|
||||
for (int i128 = 0; i128 < 2; ++i128) {
|
||||
ls[8*i128+0] = ls[8*i128+1] = ((x8[k][i].scales[2*i128+0] & 0xf) | ((extra >> 4) & 0x10)) - 16;
|
||||
@ -2412,24 +2366,116 @@ void iqk_convert_iq2_ks_q8_k_r8(int n, const void * vx, size_t bx, void * vy, in
|
||||
xv[4*i128+3] = _mm256_shuffle_epi8(values, xv[4*i128+3]);
|
||||
extra >>= 4;
|
||||
}
|
||||
dnew[k] = convert_to_q8_k_r8(k, 1.f/125, xv, ls, block, y[i].qs);
|
||||
dnew[k] = convert_to_q8_k_r8<k_nr>(k, 1.f/125, xv, ls, block, y[i].qs);
|
||||
}
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
auto vd = _mm512_mul_ps(_mm512_loadu_ps(dnew), _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)dh)));
|
||||
_mm256_storeu_si256((__m256i *)y[i].d, _mm512_cvtps_ph(vd, _MM_ROUND_NEAREST));
|
||||
for (int l = 0; l < 64; ++l) {
|
||||
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[i].qs + l), _mm512_set1_epi8(-128));
|
||||
_mm512_storeu_si512((__m512i *)y[i].qs + l, v);
|
||||
}
|
||||
#else
|
||||
auto vd = _mm256_mul_ps(_mm256_loadu_ps(dnew), _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)dh)));
|
||||
_mm_storeu_si128((__m128i *)y[i].d, _mm256_cvtps_ph(vd, _MM_ROUND_NEAREST));
|
||||
#endif
|
||||
}
|
||||
y += nb;
|
||||
}
|
||||
}
|
||||
|
||||
void iqk_convert_iq2_k_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
constexpr int k_nr = 16;
|
||||
using block_q8_k_r = block_q8_k_r16;
|
||||
#else
|
||||
constexpr int k_nr = 8;
|
||||
using block_q8_k_r = block_q8_k_r8;
|
||||
#endif
|
||||
GGML_ASSERT(n%QK_K == 0);
|
||||
GGML_ASSERT(nrc_x%k_nr == 0);
|
||||
|
||||
int nb = n/QK_K;
|
||||
|
||||
const block_iq2_k * x8[k_nr];
|
||||
|
||||
block_q8_k_r * y = (block_q8_k_r *)vy;
|
||||
|
||||
__m256i values;
|
||||
{
|
||||
auto v = _mm_loadl_epi64((const __m128i *)iq2nl_values);
|
||||
values = MM256_SET_M128I(v, v);
|
||||
}
|
||||
|
||||
__m256i xv[8];
|
||||
uint32_t block[8];
|
||||
|
||||
const __m128i scale_shuffle = _mm_set_epi32(0x0f070e06, 0x0d050c04, 0x0b030a02, 0x09010800);
|
||||
|
||||
union { __m256i vec; int16_t val[16]; } helper;
|
||||
|
||||
auto ml = _mm256_set1_epi8(0x03);
|
||||
|
||||
for (int ix = 0; ix < nrc_x; ix += k_nr) {
|
||||
for (int k = 0; k < k_nr; ++k) x8[k] = (const block_iq2_k *)((const char *)vx + (ix+k)*bx);
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
for (int k = 0; k < k_nr; ++k) {
|
||||
float d = GGML_FP16_TO_FP32(x8[k][i].d);
|
||||
uint64_t aux64; std::memcpy(&aux64, x8[k][i].scales, 8);
|
||||
auto scl = _mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), _mm_set1_epi8(0xf));
|
||||
scl = _mm_add_epi8(scl, _mm_set1_epi8(-8));
|
||||
helper.vec = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scl, scale_shuffle));
|
||||
auto extra = x8[k][i].extra;
|
||||
for (int i128 = 0; i128 < 2; ++i128) {
|
||||
auto bits = _mm256_loadu_si256((const __m256i *)x8[k][i].qs+i128);
|
||||
xv[4*i128+0] = _mm256_and_si256(bits, ml);
|
||||
xv[4*i128+1] = _mm256_and_si256(_mm256_srli_epi16(bits, 2), ml);
|
||||
xv[4*i128+2] = _mm256_and_si256(_mm256_srli_epi16(bits, 4), ml);
|
||||
xv[4*i128+3] = _mm256_and_si256(_mm256_srli_epi16(bits, 6), ml);
|
||||
auto shift1 = MM256_SET_M128I(_mm_set1_epi8((extra & 0x02) << 1), _mm_set1_epi8((extra & 0x01) << 2));
|
||||
auto shift2 = MM256_SET_M128I(_mm_set1_epi8((extra & 0x08) >> 1), _mm_set1_epi8((extra & 0x04) >> 0));
|
||||
auto shift3 = MM256_SET_M128I(_mm_set1_epi8((extra & 0x20) >> 3), _mm_set1_epi8((extra & 0x10) >> 2));
|
||||
auto shift4 = MM256_SET_M128I(_mm_set1_epi8((extra & 0x80) >> 5), _mm_set1_epi8((extra & 0x40) >> 4));
|
||||
xv[4*i128+0] = _mm256_add_epi8(xv[4*i128+0], shift1);
|
||||
xv[4*i128+1] = _mm256_add_epi8(xv[4*i128+1], shift2);
|
||||
xv[4*i128+2] = _mm256_add_epi8(xv[4*i128+2], shift3);
|
||||
xv[4*i128+3] = _mm256_add_epi8(xv[4*i128+3], shift4);
|
||||
xv[4*i128+0] = _mm256_shuffle_epi8(values, xv[4*i128+0]);
|
||||
xv[4*i128+1] = _mm256_shuffle_epi8(values, xv[4*i128+1]);
|
||||
xv[4*i128+2] = _mm256_shuffle_epi8(values, xv[4*i128+2]);
|
||||
xv[4*i128+3] = _mm256_shuffle_epi8(values, xv[4*i128+3]);
|
||||
extra >>= 8;
|
||||
}
|
||||
float dnew = convert_to_q8_k_r8<k_nr>(k, 1.f/120, xv, helper.val, block, y[i].qs);
|
||||
y[i].d[k] = GGML_FP32_TO_FP16(d*dnew);
|
||||
}
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
for (int l = 0; l < 64; ++l) {
|
||||
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[i].qs + l), _mm512_set1_epi8(-128));
|
||||
_mm512_storeu_si512((__m512i *)y[i].qs + l, v);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
y += nb;
|
||||
}
|
||||
}
|
||||
|
||||
void iqk_convert_iq2_kl_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
constexpr int k_nr = 16;
|
||||
using block_q8_k_r = block_q8_k_r16;
|
||||
#else
|
||||
constexpr int k_nr = 8;
|
||||
using block_q8_k_r = block_q8_k_r8;
|
||||
#endif
|
||||
GGML_ASSERT(n%QK_K == 0);
|
||||
GGML_ASSERT(nrc_x%8 == 0);
|
||||
GGML_ASSERT(nrc_x%k_nr == 0);
|
||||
|
||||
int nb = n/QK_K;
|
||||
|
||||
const block_iq2_kl * x8[8];
|
||||
const block_iq2_kl * x8[k_nr];
|
||||
|
||||
block_q8_k_r8 * y = (block_q8_k_r8 *)vy;
|
||||
block_q8_k_r * y = (block_q8_k_r *)vy;
|
||||
|
||||
__m256i values[4];
|
||||
{
|
||||
@ -2443,8 +2489,8 @@ void iqk_convert_iq2_kl_q8_k_r8(int n, const void * vx, size_t bx, void * vy, in
|
||||
}
|
||||
}
|
||||
|
||||
ggml_half dh[8];
|
||||
float dnew[8];
|
||||
ggml_half dh[k_nr];
|
||||
float dnew[k_nr];
|
||||
uint32_t block[8];
|
||||
int16_t ls[16];
|
||||
|
||||
@ -2455,14 +2501,14 @@ void iqk_convert_iq2_kl_q8_k_r8(int n, const void * vx, size_t bx, void * vy, in
|
||||
uint32_t sl32;
|
||||
const auto sl8 = (const int8_t *)&sl32;
|
||||
|
||||
for (int ix = 0; ix < nrc_x; ix += 8) {
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
for (int ix = 0; ix < nrc_x; ix += k_nr) {
|
||||
for (int k = 0; k < k_nr; ++k) {
|
||||
const ggml_half * dptr = (const ggml_half *)((const char *)vx + (ix+k)*bx);
|
||||
dh[k] = dptr[0];
|
||||
x8[k] = (const block_iq2_kl *)(dptr + 1);
|
||||
}
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
for (int k = 0; k < k_nr; ++k) {
|
||||
uint32_t aux32;
|
||||
std::memcpy(&aux32, x8[k][i].scales_l, 4);
|
||||
auto sh = x8[k][i].scales_h;
|
||||
@ -2500,87 +2546,118 @@ void iqk_convert_iq2_kl_q8_k_r8(int n, const void * vx, size_t bx, void * vy, in
|
||||
}
|
||||
hbits = _mm256_srli_epi16(hbits, 4);
|
||||
}
|
||||
dnew[k] = convert_to_q8_k_r8(k, 1.f/125, xv, ls, block, y[i].qs);
|
||||
dnew[k] = convert_to_q8_k_r8<k_nr>(k, 1.f/125, xv, ls, block, y[i].qs);
|
||||
}
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
auto vd = _mm512_mul_ps(_mm512_loadu_ps(dnew), _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)dh)));
|
||||
_mm256_storeu_si256((__m256i *)y[i].d, _mm512_cvtps_ph(vd, _MM_ROUND_NEAREST));
|
||||
for (int l = 0; l < 64; ++l) {
|
||||
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[i].qs + l), _mm512_set1_epi8(-128));
|
||||
_mm512_storeu_si512((__m512i *)y[i].qs + l, v);
|
||||
}
|
||||
#else
|
||||
auto vd = _mm256_mul_ps(_mm256_loadu_ps(dnew), _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)dh)));
|
||||
_mm_storeu_si128((__m128i *)y[i].d, _mm256_cvtps_ph(vd, _MM_ROUND_NEAREST));
|
||||
#endif
|
||||
}
|
||||
y += nb;
|
||||
}
|
||||
}
|
||||
|
||||
void iqk_convert_iq2_k_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
|
||||
void iqk_convert_iq3_ks_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
constexpr int k_nr = 16;
|
||||
using block_q8_k_r = block_q8_k_r16;
|
||||
#else
|
||||
constexpr int k_nr = 8;
|
||||
using block_q8_k_r = block_q8_k_r8;
|
||||
#endif
|
||||
GGML_ASSERT(n%QK_K == 0);
|
||||
GGML_ASSERT(nrc_x%8 == 0);
|
||||
GGML_ASSERT(nrc_x%k_nr == 0);
|
||||
|
||||
int nb = n/QK_K;
|
||||
|
||||
const block_iq2_k * x8[8];
|
||||
const block_iq3_ks * x8[k_nr];
|
||||
|
||||
block_q8_k_r8 * y = (block_q8_k_r8 *)vy;
|
||||
block_q8_k_r * y = (block_q8_k_r *)vy;
|
||||
|
||||
__m256i values;
|
||||
{
|
||||
auto v = _mm_loadl_epi64((const __m128i *)iq2nl_values);
|
||||
auto v = _mm_loadu_si128((const __m128i *)iq3nl_values);
|
||||
values = MM256_SET_M128I(v, v);
|
||||
}
|
||||
|
||||
__m256i xv[8];
|
||||
ggml_half drow[k_nr];
|
||||
float dnew[k_nr];
|
||||
int16_t ls[16];
|
||||
|
||||
__m256i xv[8];
|
||||
uint32_t block[8];
|
||||
|
||||
const __m128i scale_shuffle = _mm_set_epi32(0x0f070e06, 0x0d050c04, 0x0b030a02, 0x09010800);
|
||||
|
||||
union { __m256i vec; int16_t val[16]; } helper;
|
||||
|
||||
auto ml = _mm256_set1_epi8(0x03);
|
||||
|
||||
for (int ix = 0; ix < nrc_x; ix += 8) {
|
||||
for (int k = 0; k < 8; ++k) x8[k] = (const block_iq2_k *)((const char *)vx + (ix+k)*bx);
|
||||
for (int ix = 0; ix < nrc_x; ix += k_nr) {
|
||||
for (int k = 0; k < k_nr; ++k) {
|
||||
const ggml_half * dptr = (const ggml_half *)((const char *)vx + (ix + k)*bx);
|
||||
drow[k] = dptr[0];
|
||||
x8[k] = (const block_iq3_ks *)(dptr + 1);
|
||||
}
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
auto vd = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)drow));
|
||||
#else
|
||||
auto vd = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)drow));
|
||||
#endif
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
float d = GGML_FP16_TO_FP32(x8[k][i].d);
|
||||
uint64_t aux64; std::memcpy(&aux64, x8[k][i].scales, 8);
|
||||
auto scl = _mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), _mm_set1_epi8(0xf));
|
||||
scl = _mm_add_epi8(scl, _mm_set1_epi8(-8));
|
||||
helper.vec = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scl, scale_shuffle));
|
||||
for (int k = 0; k < k_nr; ++k) {
|
||||
auto hbits = _mm256_loadu_si256((const __m256i *)x8[k][i].qh);
|
||||
auto extra = x8[k][i].extra;
|
||||
for (int i128 = 0; i128 < 2; ++i128) {
|
||||
auto bits = _mm256_loadu_si256((const __m256i *)x8[k][i].qs+i128);
|
||||
xv[4*i128+0] = _mm256_and_si256(bits, ml);
|
||||
xv[4*i128+1] = _mm256_and_si256(_mm256_srli_epi16(bits, 2), ml);
|
||||
xv[4*i128+2] = _mm256_and_si256(_mm256_srli_epi16(bits, 4), ml);
|
||||
xv[4*i128+3] = _mm256_and_si256(_mm256_srli_epi16(bits, 6), ml);
|
||||
auto shift1 = MM256_SET_M128I(_mm_set1_epi8((extra & 0x02) << 1), _mm_set1_epi8((extra & 0x01) << 2));
|
||||
auto shift2 = MM256_SET_M128I(_mm_set1_epi8((extra & 0x08) >> 1), _mm_set1_epi8((extra & 0x04) >> 0));
|
||||
auto shift3 = MM256_SET_M128I(_mm_set1_epi8((extra & 0x20) >> 3), _mm_set1_epi8((extra & 0x10) >> 2));
|
||||
auto shift4 = MM256_SET_M128I(_mm_set1_epi8((extra & 0x80) >> 5), _mm_set1_epi8((extra & 0x40) >> 4));
|
||||
xv[4*i128+0] = _mm256_add_epi8(xv[4*i128+0], shift1);
|
||||
xv[4*i128+1] = _mm256_add_epi8(xv[4*i128+1], shift2);
|
||||
xv[4*i128+2] = _mm256_add_epi8(xv[4*i128+2], shift3);
|
||||
xv[4*i128+3] = _mm256_add_epi8(xv[4*i128+3], shift4);
|
||||
xv[4*i128+0] = _mm256_shuffle_epi8(values, xv[4*i128+0]);
|
||||
xv[4*i128+1] = _mm256_shuffle_epi8(values, xv[4*i128+1]);
|
||||
xv[4*i128+2] = _mm256_shuffle_epi8(values, xv[4*i128+2]);
|
||||
xv[4*i128+3] = _mm256_shuffle_epi8(values, xv[4*i128+3]);
|
||||
extra >>= 8;
|
||||
uint8_t extra_v = extra >> 8;
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
ls[2*j+0] = ls[2*j+1] = ((x8[k][i].scales[j] & 0xf) | ((extra << 4) & 0x10)) - 16;
|
||||
ls[2*j+8] = ls[2*j+9] = ((x8[k][i].scales[j] >> 4) | ((extra << 0) & 0x10)) - 16;
|
||||
extra >>= 1;
|
||||
}
|
||||
float dnew = convert_to_q8_k_r8(k, 1.f/120, xv, helper.val, block, y[i].qs);
|
||||
y[i].d[k] = GGML_FP32_TO_FP16(d*dnew);
|
||||
for (int i128 = 0; i128 < QK_K/128; ++i128) {
|
||||
auto lbits = _mm256_loadu_si256((const __m256i *)x8[k][i].qs + i128);
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
xv[4*i128+j] = _mm256_or_si256(_mm256_and_si256(lbits, _mm256_set1_epi8(3)), _mm256_and_si256(_mm256_slli_epi16(hbits, 2), _mm256_set1_epi8(4)));
|
||||
xv[4*i128+j] = _mm256_add_epi8(xv[4*i128+j], _mm256_set1_epi8((extra_v & 1) << 3));
|
||||
xv[4*i128+j] = _mm256_shuffle_epi8(values, xv[4*i128+j]);
|
||||
extra_v >>= 1;
|
||||
lbits = _mm256_srli_epi16(lbits, 2);
|
||||
hbits = _mm256_srli_epi16(hbits, 1);
|
||||
}
|
||||
}
|
||||
dnew[k] = convert_to_q8_k_r8<k_nr>(k, 1.f/127, xv, ls, block, y[i].qs);
|
||||
}
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
_mm256_storeu_si256((__m256i *)y[i].d, _mm512_cvtps_ph(_mm512_mul_ps(vd, _mm512_loadu_ps(dnew)), _MM_ROUND_NEAREST));
|
||||
for (int l = 0; l < 64; ++l) {
|
||||
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[i].qs + l), _mm512_set1_epi8(-128));
|
||||
_mm512_storeu_si512((__m512i *)y[i].qs + l, v);
|
||||
}
|
||||
#else
|
||||
_mm_storeu_si128((__m128i *)y[i].d, _mm256_cvtps_ph(_mm256_mul_ps(vd, _mm256_loadu_ps(dnew)), _MM_ROUND_NEAREST));
|
||||
#endif
|
||||
}
|
||||
y += nb;
|
||||
}
|
||||
}
|
||||
|
||||
void iqk_convert_iq3_k_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
constexpr int k_nr = 16;
|
||||
using block_q8_k_r = block_q8_k_r16;
|
||||
#else
|
||||
constexpr int k_nr = 8;
|
||||
using block_q8_k_r = block_q8_k_r8;
|
||||
#endif
|
||||
GGML_ASSERT(n%QK_K == 0);
|
||||
GGML_ASSERT(nrc_x%8 == 0);
|
||||
GGML_ASSERT(nrc_x%k_nr == 0);
|
||||
|
||||
int nb = n/QK_K;
|
||||
|
||||
const block_iq3_k * x8[8];
|
||||
const block_iq3_k * x8[k_nr];
|
||||
|
||||
block_q8_k_r8 * y = (block_q8_k_r8 *)vy;
|
||||
block_q8_k_r * y = (block_q8_k_r *)vy;
|
||||
|
||||
__m256i values;
|
||||
{
|
||||
@ -2601,10 +2678,10 @@ void iqk_convert_iq3_k_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int
|
||||
auto ml = _mm256_set1_epi8(0x03);
|
||||
auto hmask = _mm256_set1_epi8(4);
|
||||
|
||||
for (int ix = 0; ix < nrc_x; ix += 8) {
|
||||
for (int k = 0; k < 8; ++k) x8[k] = (const block_iq3_k *)((const char *)vx + (ix+k)*bx);
|
||||
for (int ix = 0; ix < nrc_x; ix += k_nr) {
|
||||
for (int k = 0; k < k_nr; ++k) x8[k] = (const block_iq3_k *)((const char *)vx + (ix+k)*bx);
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
for (int k = 0; k < k_nr; ++k) {
|
||||
float d = GGML_FP16_TO_FP32(x8[k][i].d);
|
||||
uint64_t aux64; std::memcpy(&aux64, x8[k][i].scales_l, 8);
|
||||
auto scl = _mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), _mm_set1_epi8(0xf));
|
||||
@ -2639,82 +2716,36 @@ void iqk_convert_iq3_k_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int
|
||||
hbits = _mm256_srli_epi16(hbits, 4);
|
||||
extra >>= 8;
|
||||
}
|
||||
float dnew = convert_to_q8_k_r8(k, 1.f/127, xv, helper.val, block, y[i].qs);
|
||||
float dnew = convert_to_q8_k_r8<k_nr>(k, 1.f/127, xv, helper.val, block, y[i].qs);
|
||||
y[i].d[k] = GGML_FP32_TO_FP16(d*dnew);
|
||||
}
|
||||
}
|
||||
y += nb;
|
||||
}
|
||||
}
|
||||
|
||||
void iqk_convert_iq3_ks_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
|
||||
GGML_ASSERT(n%QK_K == 0);
|
||||
GGML_ASSERT(nrc_x%8 == 0);
|
||||
|
||||
int nb = n/QK_K;
|
||||
|
||||
const block_iq3_ks * x8[8];
|
||||
|
||||
block_q8_k_r8 * y = (block_q8_k_r8 *)vy;
|
||||
|
||||
__m256i values;
|
||||
{
|
||||
auto v = _mm_loadu_si128((const __m128i *)iq3nl_values);
|
||||
values = MM256_SET_M128I(v, v);
|
||||
}
|
||||
|
||||
ggml_half drow[8];
|
||||
float dnew[8];
|
||||
int16_t ls[16];
|
||||
|
||||
__m256i xv[8];
|
||||
uint32_t block[8];
|
||||
|
||||
for (int ix = 0; ix < nrc_x; ix += 8) {
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
const ggml_half * dptr = (const ggml_half *)((const char *)vx + (ix + k)*bx);
|
||||
drow[k] = dptr[0];
|
||||
x8[k] = (const block_iq3_ks *)(dptr + 1);
|
||||
}
|
||||
auto vd = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)drow));
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
auto hbits = _mm256_loadu_si256((const __m256i *)x8[k][i].qh);
|
||||
auto extra = x8[k][i].extra;
|
||||
uint8_t extra_v = extra >> 8;
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
ls[2*j+0] = ls[2*j+1] = ((x8[k][i].scales[j] & 0xf) | ((extra << 4) & 0x10)) - 16;
|
||||
ls[2*j+8] = ls[2*j+9] = ((x8[k][i].scales[j] >> 4) | ((extra << 0) & 0x10)) - 16;
|
||||
extra >>= 1;
|
||||
}
|
||||
for (int i128 = 0; i128 < QK_K/128; ++i128) {
|
||||
auto lbits = _mm256_loadu_si256((const __m256i *)x8[k][i].qs + i128);
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
xv[4*i128+j] = _mm256_or_si256(_mm256_and_si256(lbits, _mm256_set1_epi8(3)), _mm256_and_si256(_mm256_slli_epi16(hbits, 2), _mm256_set1_epi8(4)));
|
||||
xv[4*i128+j] = _mm256_add_epi8(xv[4*i128+j], _mm256_set1_epi8((extra_v & 1) << 3));
|
||||
xv[4*i128+j] = _mm256_shuffle_epi8(values, xv[4*i128+j]);
|
||||
extra_v >>= 1;
|
||||
lbits = _mm256_srli_epi16(lbits, 2);
|
||||
hbits = _mm256_srli_epi16(hbits, 1);
|
||||
}
|
||||
}
|
||||
dnew[k] = convert_to_q8_k_r8(k, 1.f/127, xv, ls, block, y[i].qs);
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
for (int l = 0; l < 64; ++l) {
|
||||
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[i].qs + l), _mm512_set1_epi8(-128));
|
||||
_mm512_storeu_si512((__m512i *)y[i].qs + l, v);
|
||||
}
|
||||
_mm_storeu_si128((__m128i *)y[i].d, _mm256_cvtps_ph(_mm256_mul_ps(vd, _mm256_loadu_ps(dnew)), _MM_ROUND_NEAREST));
|
||||
#endif
|
||||
}
|
||||
y += nb;
|
||||
}
|
||||
}
|
||||
|
||||
void iqk_convert_iq4_kss_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
constexpr int k_nr = 16;
|
||||
using block_q8_k_r = block_q8_k_r16;
|
||||
#else
|
||||
constexpr int k_nr = 8;
|
||||
using block_q8_k_r = block_q8_k_r8;
|
||||
#endif
|
||||
GGML_ASSERT(n%QK_K == 0);
|
||||
GGML_ASSERT(nrc_x%8 == 0);
|
||||
GGML_ASSERT(nrc_x%k_nr == 0);
|
||||
|
||||
int nb = n/QK_K;
|
||||
|
||||
const block_iq4_kss * x8[8];
|
||||
const block_iq4_kss * x8[k_nr];
|
||||
|
||||
block_q8_k_r8 * y = (block_q8_k_r8 *)vy;
|
||||
block_q8_k_r * y = (block_q8_k_r *)vy;
|
||||
|
||||
__m256i values[2];
|
||||
{
|
||||
@ -2724,22 +2755,26 @@ void iqk_convert_iq4_kss_q8_k_r8(int n, const void * vx, size_t bx, void * vy, i
|
||||
values[1] = MM256_SET_M128I(v2, v2);
|
||||
}
|
||||
|
||||
float drow[8];
|
||||
float dnew[8];
|
||||
float drow[k_nr];
|
||||
float dnew[k_nr];
|
||||
int16_t ls[16];
|
||||
|
||||
__m256i xv[8];
|
||||
uint32_t block[8];
|
||||
|
||||
for (int ix = 0; ix < nrc_x; ix += 8) {
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
for (int ix = 0; ix < nrc_x; ix += k_nr) {
|
||||
for (int k = 0; k < k_nr; ++k) {
|
||||
const float * dptr = (const float *)((const char *)vx + (ix + k)*bx);
|
||||
drow[k] = dptr[0];
|
||||
x8[k] = (const block_iq4_kss *)(dptr + 1);
|
||||
}
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
auto vd = _mm512_loadu_ps(drow);
|
||||
#else
|
||||
auto vd = _mm256_loadu_ps(drow);
|
||||
#endif
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
for (int k = 0; k < k_nr; ++k) {
|
||||
for (int ib32 = 0; ib32 < 8; ++ib32) {
|
||||
auto val = _mm_loadu_si128((const __m128i *)x8[k][i].qs+ib32);
|
||||
auto val_q = _mm_and_si128(val, _mm_set1_epi32(0xfffefffe));
|
||||
@ -2752,23 +2787,38 @@ void iqk_convert_iq4_kss_q8_k_r8(int n, const void * vx, size_t bx, void * vy, i
|
||||
ls[2*ib32+0] = ls[2*ib32+1] = ((s8 & 254) - 127);
|
||||
xv[ib32] = _mm256_shuffle_epi8(values[s8 & 1], xv[ib32]);
|
||||
}
|
||||
dnew[k] = convert_to_q8_k_r8(k, 1.f/127, xv, ls, block, y[i].qs);
|
||||
dnew[k] = convert_to_q8_k_r8<k_nr>(k, 1.f/127, xv, ls, block, y[i].qs);
|
||||
}
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
_mm256_storeu_si256((__m256i *)y[i].d, _mm512_cvtps_ph(_mm512_mul_ps(vd, _mm512_loadu_ps(dnew)), _MM_ROUND_NEAREST));
|
||||
for (int l = 0; l < 64; ++l) {
|
||||
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[i].qs + l), _mm512_set1_epi8(-128));
|
||||
_mm512_storeu_si512((__m512i *)y[i].qs + l, v);
|
||||
}
|
||||
#else
|
||||
_mm_storeu_si128((__m128i *)y[i].d, _mm256_cvtps_ph(_mm256_mul_ps(vd, _mm256_loadu_ps(dnew)), _MM_ROUND_NEAREST));
|
||||
#endif
|
||||
}
|
||||
y += nb;
|
||||
}
|
||||
}
|
||||
|
||||
void iqk_convert_iq4_ks_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
constexpr int k_nr = 16;
|
||||
using block_q8_k_r = block_q8_k_r16;
|
||||
#else
|
||||
constexpr int k_nr = 8;
|
||||
using block_q8_k_r = block_q8_k_r8;
|
||||
#endif
|
||||
GGML_ASSERT(n%QK_K == 0);
|
||||
GGML_ASSERT(nrc_x%8 == 0);
|
||||
GGML_ASSERT(nrc_x%k_nr == 0);
|
||||
|
||||
int nb = n/QK_K;
|
||||
|
||||
const block_iq4_ks * x8[8];
|
||||
const block_iq4_ks * x8[k_nr];
|
||||
|
||||
block_q8_k_r8 * y = (block_q8_k_r8 *)vy;
|
||||
block_q8_k_r * y = (block_q8_k_r *)vy;
|
||||
|
||||
__m256i values[2];
|
||||
{
|
||||
@ -2778,22 +2828,26 @@ void iqk_convert_iq4_ks_q8_k_r8(int n, const void * vx, size_t bx, void * vy, in
|
||||
values[1] = MM256_SET_M128I(v2, v2);
|
||||
}
|
||||
|
||||
float drow[8];
|
||||
float dnew[8];
|
||||
float drow[k_nr];
|
||||
float dnew[k_nr];
|
||||
int16_t ls[16];
|
||||
|
||||
__m256i xv[8];
|
||||
uint32_t block[8];
|
||||
|
||||
for (int ix = 0; ix < nrc_x; ix += 8) {
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
for (int ix = 0; ix < nrc_x; ix += k_nr) {
|
||||
for (int k = 0; k < k_nr; ++k) {
|
||||
const float * dptr = (const float *)((const char *)vx + (ix + k)*bx);
|
||||
drow[k] = dptr[0];
|
||||
x8[k] = (const block_iq4_ks *)(dptr + 1);
|
||||
}
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
auto vd = _mm512_loadu_ps(drow);
|
||||
#else
|
||||
auto vd = _mm256_loadu_ps(drow);
|
||||
#endif
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
for (int k = 0; k < k_nr; ++k) {
|
||||
for (int ib32 = 0; ib32 < 8; ++ib32) {
|
||||
ls[2*ib32+0] = (x8[k][i].scales[ib32] & 254) - 127;
|
||||
ls[2*ib32+1] = ls[2*ib32+0];
|
||||
@ -2801,23 +2855,38 @@ void iqk_convert_iq4_ks_q8_k_r8(int n, const void * vx, size_t bx, void * vy, in
|
||||
xv[ib32] = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(aux128, 4), aux128), _mm256_set1_epi8(0xf));
|
||||
xv[ib32] = _mm256_shuffle_epi8(values[x8[k][i].scales[ib32] & 1], xv[ib32]);
|
||||
}
|
||||
dnew[k] = convert_to_q8_k_r8(k, 1.f/127, xv, ls, block, y[i].qs);
|
||||
dnew[k] = convert_to_q8_k_r8<k_nr>(k, 1.f/127, xv, ls, block, y[i].qs);
|
||||
}
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
_mm256_storeu_si256((__m256i *)y[i].d, _mm512_cvtps_ph(_mm512_mul_ps(vd, _mm512_loadu_ps(dnew)), _MM_ROUND_NEAREST));
|
||||
for (int l = 0; l < 64; ++l) {
|
||||
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[i].qs + l), _mm512_set1_epi8(-128));
|
||||
_mm512_storeu_si512((__m512i *)y[i].qs + l, v);
|
||||
}
|
||||
#else
|
||||
_mm_storeu_si128((__m128i *)y[i].d, _mm256_cvtps_ph(_mm256_mul_ps(vd, _mm256_loadu_ps(dnew)), _MM_ROUND_NEAREST));
|
||||
#endif
|
||||
}
|
||||
y += nb;
|
||||
}
|
||||
}
|
||||
|
||||
void iqk_convert_iq4_k_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
constexpr int k_nr = 16;
|
||||
using block_q8_k_r = block_q8_k_r16;
|
||||
#else
|
||||
constexpr int k_nr = 8;
|
||||
using block_q8_k_r = block_q8_k_r8;
|
||||
#endif
|
||||
GGML_ASSERT(n%QK_K == 0);
|
||||
GGML_ASSERT(nrc_x%8 == 0);
|
||||
GGML_ASSERT(nrc_x%k_nr == 0);
|
||||
|
||||
int nb = n/QK_K;
|
||||
|
||||
const block_iq4_k * x8[8];
|
||||
const block_iq4_k * x8[k_nr];
|
||||
|
||||
block_q8_k_r8 * y = (block_q8_k_r8 *)vy;
|
||||
block_q8_k_r * y = (block_q8_k_r *)vy;
|
||||
|
||||
__m256i values[4];
|
||||
{
|
||||
@ -2837,10 +2906,10 @@ void iqk_convert_iq4_k_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int
|
||||
|
||||
//union { __m256i vec; int16_t val[16]; } helper;
|
||||
|
||||
for (int ix = 0; ix < nrc_x; ix += 8) {
|
||||
for (int k = 0; k < 8; ++k) x8[k] = (const block_iq4_k *)((const char *)vx + (ix+k)*bx);
|
||||
for (int ix = 0; ix < nrc_x; ix += k_nr) {
|
||||
for (int k = 0; k < k_nr; ++k) x8[k] = (const block_iq4_k *)((const char *)vx + (ix+k)*bx);
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
for (int k = 0; k < k_nr; ++k) {
|
||||
float d = GGML_FP16_TO_FP32(x8[k][i].d);
|
||||
auto extra = x8[k][i].extra;
|
||||
//uint64_t aux64;
|
||||
@ -2860,23 +2929,36 @@ void iqk_convert_iq4_k_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int
|
||||
xv[ib32] = _mm256_shuffle_epi8(values[extra & 3], xv[ib32]); extra >>= 2;
|
||||
}
|
||||
//float dnew = convert_to_q8_k_r8(k, 1.f/127, xv, helper.val, block, y[i].qs);
|
||||
float dnew = convert_to_q8_k_r8(k, 1.f/127, xv, ls, block, y[i].qs);
|
||||
float dnew = convert_to_q8_k_r8<k_nr>(k, 1.f/127, xv, ls, block, y[i].qs);
|
||||
y[i].d[k] = GGML_FP32_TO_FP16(d*dnew);
|
||||
}
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
for (int l = 0; l < 64; ++l) {
|
||||
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[i].qs + l), _mm512_set1_epi8(-128));
|
||||
_mm512_storeu_si512((__m512i *)y[i].qs + l, v);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
y += nb;
|
||||
}
|
||||
}
|
||||
|
||||
void iqk_convert_iq5_ks_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
constexpr int k_nr = 16;
|
||||
using block_q8_k_r = block_q8_k_r16;
|
||||
#else
|
||||
constexpr int k_nr = 8;
|
||||
using block_q8_k_r = block_q8_k_r8;
|
||||
#endif
|
||||
GGML_ASSERT(n%QK_K == 0);
|
||||
GGML_ASSERT(nrc_x%8 == 0);
|
||||
GGML_ASSERT(nrc_x%k_nr == 0);
|
||||
|
||||
int nb = n/QK_K;
|
||||
|
||||
const block_iq5_ks * x8[8];
|
||||
const block_iq5_ks * x8[k_nr];
|
||||
|
||||
block_q8_k_r8 * y = (block_q8_k_r8 *)vy;
|
||||
block_q8_k_r * y = (block_q8_k_r *)vy;
|
||||
|
||||
__m256i values[2];
|
||||
{
|
||||
@ -2886,8 +2968,8 @@ void iqk_convert_iq5_ks_q8_k_r8(int n, const void * vx, size_t bx, void * vy, in
|
||||
values[1] = MM256_SET_M128I(v2, v2);
|
||||
}
|
||||
|
||||
float drow[8];
|
||||
float dnew[8];
|
||||
float drow[k_nr];
|
||||
float dnew[k_nr];
|
||||
int16_t ls[16];
|
||||
|
||||
__m256i xv[8];
|
||||
@ -2895,15 +2977,19 @@ void iqk_convert_iq5_ks_q8_k_r8(int n, const void * vx, size_t bx, void * vy, in
|
||||
|
||||
auto mh = _mm256_set1_epi8(-128); // to avoid stupid warning about 0x80 overflowing
|
||||
|
||||
for (int ix = 0; ix < nrc_x; ix += 8) {
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
for (int ix = 0; ix < nrc_x; ix += k_nr) {
|
||||
for (int k = 0; k < k_nr; ++k) {
|
||||
const float * dptr = (const float *)((const char *)vx + (ix + k)*bx);
|
||||
drow[k] = dptr[0];
|
||||
x8[k] = (const block_iq5_ks *)(dptr + 1);
|
||||
}
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
auto vd = _mm512_loadu_ps(drow);
|
||||
#else
|
||||
auto vd = _mm256_loadu_ps(drow);
|
||||
#endif
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
for (int k = 0; k < k_nr; ++k) {
|
||||
auto hbits = _mm256_loadu_si256((const __m256i *)x8[k][i].qh);
|
||||
for (int ib64 = 0; ib64 < 4; ++ib64) {
|
||||
ls[4*ib64+0] = (x8[k][i].scales[2*ib64+0] & 254) - 127;
|
||||
@ -2927,23 +3013,38 @@ void iqk_convert_iq5_ks_q8_k_r8(int n, const void * vx, size_t bx, void * vy, in
|
||||
xv[2*ib64+1] = _mm256_add_epi8(xv[2*ib64+1], shift2);
|
||||
hbits = _mm256_srli_epi16(hbits, 2);
|
||||
}
|
||||
dnew[k] = convert_to_q8_k_r8(k, 1.f/127, xv, ls, block, y[i].qs);
|
||||
dnew[k] = convert_to_q8_k_r8<k_nr>(k, 1.f/127, xv, ls, block, y[i].qs);
|
||||
}
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
_mm256_storeu_si256((__m256i *)y[i].d, _mm512_cvtps_ph(_mm512_mul_ps(vd, _mm512_loadu_ps(dnew)), _MM_ROUND_NEAREST));
|
||||
for (int l = 0; l < 64; ++l) {
|
||||
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[i].qs + l), _mm512_set1_epi8(-128));
|
||||
_mm512_storeu_si512((__m512i *)y[i].qs + l, v);
|
||||
}
|
||||
#else
|
||||
_mm_storeu_si128((__m128i *)y[i].d, _mm256_cvtps_ph(_mm256_mul_ps(vd, _mm256_loadu_ps(dnew)), _MM_ROUND_NEAREST));
|
||||
#endif
|
||||
}
|
||||
y += nb;
|
||||
}
|
||||
}
|
||||
|
||||
void iqk_convert_iq5_k_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
constexpr int k_nr = 16;
|
||||
using block_q8_k_r = block_q8_k_r16;
|
||||
#else
|
||||
constexpr int k_nr = 8;
|
||||
using block_q8_k_r = block_q8_k_r8;
|
||||
#endif
|
||||
GGML_ASSERT(n%QK_K == 0);
|
||||
GGML_ASSERT(nrc_x%8 == 0);
|
||||
GGML_ASSERT(nrc_x%k_nr == 0);
|
||||
|
||||
int nb = n/QK_K;
|
||||
|
||||
const block_iq5_k * x8[8];
|
||||
const block_iq5_k * x8[k_nr];
|
||||
|
||||
block_q8_k_r8 * y = (block_q8_k_r8 *)vy;
|
||||
block_q8_k_r * y = (block_q8_k_r *)vy;
|
||||
|
||||
__m256i values[2];
|
||||
{
|
||||
@ -2959,10 +3060,10 @@ void iqk_convert_iq5_k_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int
|
||||
|
||||
auto mh = _mm256_set1_epi8(-128); // to avoid stupid warning about 0x80 overflowing
|
||||
|
||||
for (int ix = 0; ix < nrc_x; ix += 8) {
|
||||
for (int k = 0; k < 8; ++k) x8[k] = (const block_iq5_k *)((const char *)vx + (ix+k)*bx);
|
||||
for (int ix = 0; ix < nrc_x; ix += k_nr) {
|
||||
for (int k = 0; k < k_nr; ++k) x8[k] = (const block_iq5_k *)((const char *)vx + (ix+k)*bx);
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
for (int k = 0; k < k_nr; ++k) {
|
||||
float d = GGML_FP16_TO_FP32(x8[k][i].d);
|
||||
auto extra = x8[k][i].extra;
|
||||
auto hbits = _mm256_loadu_si256((const __m256i *)x8[k][i].qh);
|
||||
@ -2989,9 +3090,15 @@ void iqk_convert_iq5_k_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int
|
||||
hbits = _mm256_srli_epi16(hbits, 2);
|
||||
extra >>= 4;
|
||||
}
|
||||
float dnew = convert_to_q8_k_r8(k, 1.f/127, xv, ls, block, y[i].qs);
|
||||
float dnew = convert_to_q8_k_r8<k_nr>(k, 1.f/127, xv, ls, block, y[i].qs);
|
||||
y[i].d[k] = GGML_FP32_TO_FP16(d*dnew);
|
||||
}
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
for (int l = 0; l < 64; ++l) {
|
||||
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[i].qs + l), _mm512_set1_epi8(-128));
|
||||
_mm512_storeu_si512((__m512i *)y[i].qs + l, v);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
y += nb;
|
||||
}
|
||||
@ -3103,14 +3210,21 @@ void iqk_convert_iq5_k_q8_0_r8(int n, const void * vx, size_t bx, void * vy, int
|
||||
}
|
||||
|
||||
void iqk_convert_iq6_k_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
constexpr int k_nr = 16;
|
||||
using block_q8_k_r = block_q8_k_r16;
|
||||
#else
|
||||
constexpr int k_nr = 8;
|
||||
using block_q8_k_r = block_q8_k_r8;
|
||||
#endif
|
||||
GGML_ASSERT(n%QK_K == 0);
|
||||
GGML_ASSERT(nrc_x%8 == 0);
|
||||
GGML_ASSERT(nrc_x%k_nr == 0);
|
||||
|
||||
int nb = n/QK_K;
|
||||
|
||||
const block_iq6_k * x8[8];
|
||||
const block_iq6_k * x8[k_nr];
|
||||
|
||||
block_q8_k_r8 * y = (block_q8_k_r8 *)vy;
|
||||
block_q8_k_r * y = (block_q8_k_r *)vy;
|
||||
|
||||
__m256i values[4];
|
||||
for (int k = 0; k < 4; ++k) {
|
||||
@ -3139,10 +3253,10 @@ void iqk_convert_iq6_k_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int
|
||||
_mm256_and_si256(mask4, _mm256_shuffle_epi8(values[3], l))));
|
||||
};
|
||||
|
||||
for (int ix = 0; ix < nrc_x; ix += 8) {
|
||||
for (int k = 0; k < 8; ++k) x8[k] = (const block_iq6_k *)((const char *)vx + (ix+k)*bx);
|
||||
for (int ix = 0; ix < nrc_x; ix += k_nr) {
|
||||
for (int k = 0; k < k_nr; ++k) x8[k] = (const block_iq6_k *)((const char *)vx + (ix+k)*bx);
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
for (int k = 0; k < k_nr; ++k) {
|
||||
float d = GGML_FP16_TO_FP32(x8[k][i].d);
|
||||
helper.vec = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*)x8[k][i].scales));
|
||||
auto extra = x8[k][i].extra;
|
||||
@ -3168,9 +3282,15 @@ void iqk_convert_iq6_k_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int
|
||||
xv[4*i128+3] = _mm256_add_epi8(xv[4*i128+3], shift4);
|
||||
extra >>= 8;
|
||||
}
|
||||
float dnew = convert_to_q8_k_r8(k, 1.f/127, xv, helper.val, block, y[i].qs);
|
||||
float dnew = convert_to_q8_k_r8<k_nr>(k, 1.f/127, xv, helper.val, block, y[i].qs);
|
||||
y[i].d[k] = GGML_FP32_TO_FP16(d*dnew);
|
||||
}
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
for (int l = 0; l < 64; ++l) {
|
||||
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[i].qs + l), _mm512_set1_epi8(-128));
|
||||
_mm512_storeu_si512((__m512i *)y[i].qs + l, v);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
y += nb;
|
||||
}
|
||||
|
||||
@ -1839,59 +1839,6 @@ static void mul_mat_iq3_s_r4_q8_k(int n, const void * vx, size_t bx, const DataI
|
||||
}
|
||||
}
|
||||
|
||||
inline float convert_to_q8_k_r8(int k, float d0, const __m256i * qx, const int16_t * scales, uint32_t * block, int8_t * q8_k) {
|
||||
auto max_i16 = _mm256_setzero_si256();
|
||||
__m256i qs[16];
|
||||
for (int ib32 = 0; ib32 < 8; ++ib32) {
|
||||
qs[2*ib32+0] = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(qx[ib32]));
|
||||
qs[2*ib32+1] = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(qx[ib32], 1));
|
||||
qs[2*ib32+0] = _mm256_mullo_epi16(qs[2*ib32+0], _mm256_set1_epi16(scales[2*ib32+0]));
|
||||
qs[2*ib32+1] = _mm256_mullo_epi16(qs[2*ib32+1], _mm256_set1_epi16(scales[2*ib32+1]));
|
||||
max_i16 = _mm256_max_epi16(max_i16, _mm256_sign_epi16(qs[2*ib32+0], qs[2*ib32+0]));
|
||||
max_i16 = _mm256_max_epi16(max_i16, _mm256_sign_epi16(qs[2*ib32+1], qs[2*ib32+1]));
|
||||
}
|
||||
auto max_q32 = _mm256_cvtepi16_epi32(_mm_max_epi16(_mm256_castsi256_si128(max_i16), _mm256_extracti128_si256(max_i16, 1)));
|
||||
auto imax4 = _mm_max_epi32(_mm256_castsi256_si128(max_q32), _mm256_extracti128_si256(max_q32, 1));
|
||||
auto max4 = _mm_cvtepi32_ps(imax4);
|
||||
max4 = _mm_max_ps(max4, _mm_movehl_ps(max4, max4));
|
||||
max4 = _mm_max_ss(max4, _mm_movehdup_ps(max4));
|
||||
bool needs_scaling = true;
|
||||
float dnew = _mm_cvtss_f32(max4) * d0;
|
||||
if (dnew < 1.f) {
|
||||
dnew = 1.f; needs_scaling = false;
|
||||
}
|
||||
auto scale = _mm256_set1_ps(std::abs(dnew) > 1e-9f ? 1/dnew : 0.f);
|
||||
for (int ib32 = 0; ib32 < 8; ++ib32) {
|
||||
if (needs_scaling) {
|
||||
auto i0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(qs[2*ib32+0]));
|
||||
auto i1 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(qs[2*ib32+0], 1));
|
||||
auto i2 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(qs[2*ib32+1]));
|
||||
auto i3 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(qs[2*ib32+1], 1));
|
||||
i0 = _mm256_cvtps_epi32(_mm256_round_ps(_mm256_mul_ps(scale, _mm256_cvtepi32_ps(i0)), _MM_ROUND_NEAREST));
|
||||
i1 = _mm256_cvtps_epi32(_mm256_round_ps(_mm256_mul_ps(scale, _mm256_cvtepi32_ps(i1)), _MM_ROUND_NEAREST));
|
||||
i2 = _mm256_cvtps_epi32(_mm256_round_ps(_mm256_mul_ps(scale, _mm256_cvtepi32_ps(i2)), _MM_ROUND_NEAREST));
|
||||
i3 = _mm256_cvtps_epi32(_mm256_round_ps(_mm256_mul_ps(scale, _mm256_cvtepi32_ps(i3)), _MM_ROUND_NEAREST));
|
||||
i0 = _mm256_packs_epi32(i0, i1);
|
||||
i2 = _mm256_packs_epi32(i2, i3);
|
||||
i0 = _mm256_packs_epi16(i0, i2);
|
||||
i0 = _mm256_permutevar8x32_epi32(i0, _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7));
|
||||
_mm256_storeu_si256((__m256i *)block, i0);
|
||||
} else {
|
||||
// 0, 1, 2, 3, 4, 5, 6, 7, 8, 16, 17, 18, 19, 20, 21, 22, 23, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
|
||||
auto i0 = _mm256_packs_epi16(qs[2*ib32+0], qs[2*ib32+1]);
|
||||
auto i0_l = _mm256_castsi256_si128(i0);
|
||||
auto i0_h = _mm256_extracti128_si256(i0, 1);
|
||||
_mm_storeu_si128((__m128i *)block+0, _mm_unpacklo_epi64(i0_l, i0_h));
|
||||
_mm_storeu_si128((__m128i *)block+1, _mm_unpackhi_epi64(i0_l, i0_h));
|
||||
}
|
||||
auto qs = (uint32_t *)q8_k + 64*ib32;
|
||||
for (int l = 0; l < 8; ++l) {
|
||||
qs[8*l + k] = block[l];
|
||||
}
|
||||
}
|
||||
return dnew;
|
||||
}
|
||||
|
||||
void iqk_convert_iq2_xxs_q8_0_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
|
||||
GGML_ASSERT(n%QK_K == 0);
|
||||
GGML_ASSERT(nrc_x%8 == 0);
|
||||
@ -1942,14 +1889,21 @@ void iqk_convert_iq2_xxs_q8_0_r8(int n, const void * vx, size_t bx, void * vy, i
|
||||
}
|
||||
|
||||
void iqk_convert_iq2_xxs_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
constexpr int k_nr = 16;
|
||||
using block_q8_k_r = block_q8_k_r16;
|
||||
#else
|
||||
constexpr int k_nr = 8;
|
||||
using block_q8_k_r = block_q8_k_r8;
|
||||
#endif
|
||||
GGML_ASSERT(n%QK_K == 0);
|
||||
GGML_ASSERT(nrc_x%8 == 0);
|
||||
GGML_ASSERT(nrc_x%k_nr == 0);
|
||||
|
||||
int nb = n/QK_K;
|
||||
|
||||
const block_iq2_xxs * x8[8];
|
||||
const block_iq2_xxs * x8[k_nr];
|
||||
|
||||
block_q8_k_r8 * y = (block_q8_k_r8 *)vy;
|
||||
block_q8_k_r * y = (block_q8_k_r *)vy;
|
||||
|
||||
int16_t ls[16];
|
||||
EvenSignHelper esh;
|
||||
@ -1960,11 +1914,10 @@ void iqk_convert_iq2_xxs_q8_k_r8(int n, const void * vx, size_t bx, void * vy, i
|
||||
|
||||
__m256i values[8];
|
||||
|
||||
for (int ix = 0; ix < nrc_x; ix += 8) {
|
||||
for (int k = 0; k < 8; ++k) x8[k] = (const block_iq2_xxs *)((const char *)vx + (ix + k)*bx);
|
||||
for (int ix = 0; ix < nrc_x; ix += k_nr) {
|
||||
for (int k = 0; k < k_nr; ++k) x8[k] = (const block_iq2_xxs *)((const char *)vx + (ix + k)*bx);
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
// TODO: simdify
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
for (int k = 0; k < k_nr; ++k) {
|
||||
float d = 0.125f * GGML_FP16_TO_FP32(x8[k][i].d);
|
||||
for (int ib32 = 0; ib32 < 8; ++ib32) {
|
||||
std::memcpy(aux32, x8[k][i].qs + 4*ib32, 2*sizeof(uint32_t));
|
||||
@ -1973,23 +1926,36 @@ void iqk_convert_iq2_xxs_q8_k_r8(int n, const void * vx, size_t bx, void * vy, i
|
||||
values[ib32] = _mm256_set_epi64x(iq2xxs_grid[aux8[3]], iq2xxs_grid[aux8[2]], iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]);
|
||||
esh.sign_value(aux32[1], values[ib32]);
|
||||
}
|
||||
float dnew = convert_to_q8_k_r8(k, 1.f/124, values, ls, block, y[i].qs);
|
||||
float dnew = convert_to_q8_k_r8<k_nr>(k, 1.f/124, values, ls, block, y[i].qs);
|
||||
y[i].d[k] = GGML_FP32_TO_FP16(d*dnew);
|
||||
}
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
for (int l = 0; l < 64; ++l) {
|
||||
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[i].qs + l), _mm512_set1_epi8(-128));
|
||||
_mm512_storeu_si512((__m512i *)y[i].qs + l, v);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
y += nb;
|
||||
}
|
||||
}
|
||||
|
||||
void iqk_convert_iq2_xs_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
constexpr int k_nr = 16;
|
||||
using block_q8_k_r = block_q8_k_r16;
|
||||
#else
|
||||
constexpr int k_nr = 8;
|
||||
using block_q8_k_r = block_q8_k_r8;
|
||||
#endif
|
||||
GGML_ASSERT(n%QK_K == 0);
|
||||
GGML_ASSERT(nrc_x%8 == 0);
|
||||
GGML_ASSERT(nrc_x%k_nr == 0);
|
||||
|
||||
int nb = n/QK_K;
|
||||
|
||||
const block_iq2_xs * x8[8];
|
||||
const block_iq2_xs * x8[k_nr];
|
||||
|
||||
block_q8_k_r8 * y = (block_q8_k_r8 *)vy;
|
||||
block_q8_k_r * y = (block_q8_k_r *)vy;
|
||||
|
||||
uint32_t block[8];
|
||||
|
||||
@ -2000,10 +1966,10 @@ void iqk_convert_iq2_xs_q8_k_r8(int n, const void * vx, size_t bx, void * vy, in
|
||||
DequantizerIQ2XS::Helper sign_helper;
|
||||
#endif
|
||||
|
||||
for (int ix = 0; ix < nrc_x; ix += 8) {
|
||||
for (int k = 0; k < 8; ++k) x8[k] = (const block_iq2_xs *)((const char *)vx + (ix + k)*bx);
|
||||
for (int ix = 0; ix < nrc_x; ix += k_nr) {
|
||||
for (int k = 0; k < k_nr; ++k) x8[k] = (const block_iq2_xs *)((const char *)vx + (ix + k)*bx);
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
for (int k = 0; k < k_nr; ++k) {
|
||||
float d = 0.125f * GGML_FP16_TO_FP32(x8[k][i].d);
|
||||
helper.vec = DequantizerIQ2XS::make_scales(x8[k][i].scales);
|
||||
auto q2l = _mm256_loadu_si256((const __m256i *)x8[k][i].qs+0);
|
||||
@ -2017,9 +1983,16 @@ void iqk_convert_iq2_xs_q8_k_r8(int n, const void * vx, size_t bx, void * vy, in
|
||||
DequantizerIQ2XS::sign_values_helper(q2l, sign_helper, qx+0);
|
||||
DequantizerIQ2XS::sign_values_helper(q2h, sign_helper, qx+4);
|
||||
#endif
|
||||
float dnew = convert_to_q8_k_r8(k, 1.f/124, qx, helper.val, block, y[i].qs);
|
||||
float dnew = convert_to_q8_k_r8<k_nr>(k, 1.f/124, qx, helper.val, block, y[i].qs);
|
||||
y[i].d[k] = GGML_FP32_TO_FP16(d*dnew);
|
||||
}
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
for (int l = 0; l < 64; ++l) {
|
||||
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[i].qs + l), _mm512_set1_epi8(-128));
|
||||
_mm512_storeu_si512((__m512i *)y[i].qs + l, v);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
y += nb;
|
||||
}
|
||||
@ -2306,14 +2279,21 @@ void iqk_convert_iq2_s_q8_0_r8(int n, const void * vx, size_t bx, void * vy, int
|
||||
}
|
||||
|
||||
void iqk_convert_iq2_s_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
constexpr int k_nr = 16;
|
||||
using block_q8_k_r = block_q8_k_r16;
|
||||
#else
|
||||
constexpr int k_nr = 8;
|
||||
using block_q8_k_r = block_q8_k_r8;
|
||||
#endif
|
||||
GGML_ASSERT(n%QK_K == 0);
|
||||
GGML_ASSERT(nrc_x%8 == 0);
|
||||
GGML_ASSERT(nrc_x%k_nr == 0);
|
||||
|
||||
int nb = n/QK_K;
|
||||
|
||||
const block_iq2_s * x8[8];
|
||||
const block_iq2_s * x8[k_nr];
|
||||
|
||||
block_q8_k_r8 * y = (block_q8_k_r8 *)vy;
|
||||
block_q8_k_r * y = (block_q8_k_r *)vy;
|
||||
|
||||
uint32_t block[8];
|
||||
|
||||
@ -2322,17 +2302,23 @@ void iqk_convert_iq2_s_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int
|
||||
|
||||
SignHelper sh;
|
||||
|
||||
for (int ix = 0; ix < nrc_x; ix += 8) {
|
||||
for (int k = 0; k < 8; ++k) x8[k] = (const block_iq2_s *)((const char *)vx + (ix + k)*bx);
|
||||
for (int ix = 0; ix < nrc_x; ix += k_nr) {
|
||||
for (int k = 0; k < k_nr; ++k) x8[k] = (const block_iq2_s *)((const char *)vx + (ix + k)*bx);
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
for (int k = 0; k < k_nr; ++k) {
|
||||
float d = 0.125f * GGML_FP16_TO_FP32(x8[k][i].d);
|
||||
helper.vec = DequantizerIQ2S::make_scales(x8[k][i].scales);
|
||||
DequantizerIQ2S::prepare(x8[k][i].qs+ 0, x8[k][i].qh+0, (const uint16_t *)(x8[k][i].qs + QK_K/8) + 0, sh, qx+0);
|
||||
DequantizerIQ2S::prepare(x8[k][i].qs+16, x8[k][i].qh+4, (const uint16_t *)(x8[k][i].qs + QK_K/8) + 8, sh, qx+4);
|
||||
float dnew = convert_to_q8_k_r8(k, 1.f/124, qx, helper.val, block, y[i].qs);
|
||||
float dnew = convert_to_q8_k_r8<k_nr>(k, 1.f/124, qx, helper.val, block, y[i].qs);
|
||||
y[i].d[k] = GGML_FP32_TO_FP16(d*dnew);
|
||||
}
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
for (int l = 0; l < 64; ++l) {
|
||||
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[i].qs + l), _mm512_set1_epi8(-128));
|
||||
_mm512_storeu_si512((__m512i *)y[i].qs + l, v);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
y += nb;
|
||||
}
|
||||
@ -2458,14 +2444,21 @@ static void mul_mat_iq2_s_q8_2_X4(int n, const void * vx, size_t bx, const DataI
|
||||
}
|
||||
|
||||
void iqk_convert_iq3_xxs_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
constexpr int k_nr = 16;
|
||||
using block_q8_k_r = block_q8_k_r16;
|
||||
#else
|
||||
constexpr int k_nr = 8;
|
||||
using block_q8_k_r = block_q8_k_r8;
|
||||
#endif
|
||||
GGML_ASSERT(n%QK_K == 0);
|
||||
GGML_ASSERT(nrc_x%8 == 0);
|
||||
GGML_ASSERT(nrc_x%k_nr == 0);
|
||||
|
||||
int nb = n/QK_K;
|
||||
|
||||
const block_iq3_xxs * x8[8];
|
||||
const block_iq3_xxs * x8[k_nr];
|
||||
|
||||
block_q8_k_r8 * y = (block_q8_k_r8 *)vy;
|
||||
block_q8_k_r * y = (block_q8_k_r *)vy;
|
||||
|
||||
int16_t ls[16];
|
||||
EvenSignHelper esh;
|
||||
@ -2474,10 +2467,10 @@ void iqk_convert_iq3_xxs_q8_k_r8(int n, const void * vx, size_t bx, void * vy, i
|
||||
uint32_t block[8];
|
||||
uint32_t aux32;
|
||||
|
||||
for (int ix = 0; ix < nrc_x; ix += 8) {
|
||||
for (int k = 0; k < 8; ++k) x8[k] = (const block_iq3_xxs *)((const char *)vx + (ix + k)*bx);
|
||||
for (int ix = 0; ix < nrc_x; ix += k_nr) {
|
||||
for (int k = 0; k < k_nr; ++k) x8[k] = (const block_iq3_xxs *)((const char *)vx + (ix + k)*bx);
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
for (int k = 0; k < k_nr; ++k) {
|
||||
float d = 0.25f * GGML_FP16_TO_FP32(x8[k][i].d);
|
||||
auto qs = x8[k][i].qs;
|
||||
auto sas = qs + QK_K/4;
|
||||
@ -2490,9 +2483,15 @@ void iqk_convert_iq3_xxs_q8_k_r8(int n, const void * vx, size_t bx, void * vy, i
|
||||
esh.sign_value(aux32, values[ib32]);
|
||||
qs += 8;
|
||||
}
|
||||
float dnew = convert_to_q8_k_r8(k, 1.f/124, values, ls, block, y[i].qs);
|
||||
float dnew = convert_to_q8_k_r8<k_nr>(k, 1.f/124, values, ls, block, y[i].qs);
|
||||
y[i].d[k] = GGML_FP32_TO_FP16(d*dnew);
|
||||
}
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
for (int l = 0; l < 64; ++l) {
|
||||
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[i].qs + l), _mm512_set1_epi8(-128));
|
||||
_mm512_storeu_si512((__m512i *)y[i].qs + l, v);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
y += nb;
|
||||
}
|
||||
@ -2551,14 +2550,21 @@ void iqk_convert_iq3_xxs_q8_0_r8(int n, const void * vx, size_t bx, void * vy, i
|
||||
}
|
||||
|
||||
void iqk_convert_iq3_s_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
constexpr int k_nr = 16;
|
||||
using block_q8_k_r = block_q8_k_r16;
|
||||
#else
|
||||
constexpr int k_nr = 8;
|
||||
using block_q8_k_r = block_q8_k_r8;
|
||||
#endif
|
||||
GGML_ASSERT(n%QK_K == 0);
|
||||
GGML_ASSERT(nrc_x%8 == 0);
|
||||
GGML_ASSERT(nrc_x%k_nr == 0);
|
||||
|
||||
int nb = n/QK_K;
|
||||
|
||||
const block_iq3_s * x8[8];
|
||||
const block_iq3_s * x8[k_nr];
|
||||
|
||||
block_q8_k_r8 * y = (block_q8_k_r8 *)vy;
|
||||
block_q8_k_r * y = (block_q8_k_r *)vy;
|
||||
|
||||
int16_t ls[16];
|
||||
SignHelper sh;
|
||||
@ -2567,10 +2573,10 @@ void iqk_convert_iq3_s_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int
|
||||
uint32_t block[8];
|
||||
__m256i values[8];
|
||||
|
||||
for (int ix = 0; ix < nrc_x; ix += 8) {
|
||||
for (int k = 0; k < 8; ++k) x8[k] = (const block_iq3_s *)((const char *)vx + (ix + k)*bx);
|
||||
for (int ix = 0; ix < nrc_x; ix += k_nr) {
|
||||
for (int k = 0; k < k_nr; ++k) x8[k] = (const block_iq3_s *)((const char *)vx + (ix + k)*bx);
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
for (int k = 0; k < k_nr; ++k) {
|
||||
float d = GGML_FP16_TO_FP32(x8[k][i].d);
|
||||
auto qs = x8[k][i].qs;
|
||||
auto qh = x8[k][i].qh;
|
||||
@ -2585,9 +2591,15 @@ void iqk_convert_iq3_s_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int
|
||||
ls[2*ib32 + 0] = (2*((x8[k][i].scales[ib32/2] >> 4*(ib32%2)) & 0xf) + 1);
|
||||
ls[2*ib32 + 1] = ls[2*ib32 + 0];
|
||||
}
|
||||
float dnew = convert_to_q8_k_r8(k, 1.f/127, values, ls, block, y[i].qs);
|
||||
float dnew = convert_to_q8_k_r8<k_nr>(k, 1.f/127, values, ls, block, y[i].qs);
|
||||
y[i].d[k] = GGML_FP32_TO_FP16(d*dnew);
|
||||
}
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
for (int l = 0; l < 64; ++l) {
|
||||
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[i].qs + l), _mm512_set1_epi8(-128));
|
||||
_mm512_storeu_si512((__m512i *)y[i].qs + l, v);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
y += nb;
|
||||
}
|
||||
|
||||
@ -1859,6 +1859,50 @@ static void mul_mat_q8_k_r8_q8_k(int n, const void * vx, size_t bx, const DataIn
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
template <int nrc_y>
|
||||
static void mul_mat_q8_k_r16_q8_k(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
|
||||
GGML_ASSERT(nrc_x%16 == 0);
|
||||
Q8<nrc_y, block_q8_K> q8(info);
|
||||
int nbl = n / QK_K;
|
||||
__m512 acc[nrc_y] = {};
|
||||
__m512i isum[nrc_y] = {};
|
||||
__m512i qx[4];
|
||||
for (int ix = 0; ix < nrc_x; ix += 16) {
|
||||
const block_q8_k_r16 * iq16 = (const block_q8_k_r16 *)((const char *)vx + ix*bx);
|
||||
for (int ibl = 0; ibl < nbl; ++ibl) { // Block of 256
|
||||
auto d4 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)iq16[ibl].d));
|
||||
for (int ib = 0; ib < QK_K/16; ++ib) {
|
||||
qx[0] = _mm512_loadu_si512((const __m512i *)iq16[ibl].qs+4*ib+0);
|
||||
qx[1] = _mm512_loadu_si512((const __m512i *)iq16[ibl].qs+4*ib+1);
|
||||
qx[2] = _mm512_loadu_si512((const __m512i *)iq16[ibl].qs+4*ib+2);
|
||||
qx[3] = _mm512_loadu_si512((const __m512i *)iq16[ibl].qs+4*ib+3);
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto y128 = _mm_loadu_si128((const __m128i*)q8.y[iy][ibl].qs+ib);
|
||||
auto y256 = MM256_SET_M128I(y128, y128);
|
||||
auto y = _mm512_inserti32x8(_mm512_castsi256_si512(y256), y256, 1);
|
||||
isum[iy] = _mm512_dpbusd_epi32(isum[iy], qx[0], _mm512_shuffle_epi32(y, _MM_PERM_ENUM(0x00)));
|
||||
isum[iy] = _mm512_dpbusd_epi32(isum[iy], qx[1], _mm512_shuffle_epi32(y, _MM_PERM_ENUM(0x55)));
|
||||
isum[iy] = _mm512_dpbusd_epi32(isum[iy], qx[2], _mm512_shuffle_epi32(y, _MM_PERM_ENUM(0xaa)));
|
||||
isum[iy] = _mm512_dpbusd_epi32(isum[iy], qx[3], _mm512_shuffle_epi32(y, _MM_PERM_ENUM(0xff)));
|
||||
}
|
||||
}
|
||||
auto m4 = _mm512_mul_ps(d4, _mm512_set1_ps(-128.f));
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto d4y = _mm512_mul_ps(d4, _mm512_set1_ps(q8.scale(iy, ibl)));
|
||||
acc[iy] = _mm512_fmadd_ps(d4y, _mm512_cvtepi32_ps(isum[iy]), acc[iy]);
|
||||
acc[iy] = _mm512_fmadd_ps(m4, _mm512_set1_ps(q8.y[iy][ibl].sum), acc[iy]);
|
||||
isum[iy] = _mm512_setzero_si512();
|
||||
}
|
||||
}
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
info.store(ix, iy, acc[iy]);
|
||||
acc[iy] = _mm512_setzero_ps();
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
template <int nrc_y>
|
||||
static void mul_mat_q8_KV_q8_KV(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
|
||||
GGML_ASSERT(nrc_x%4 == 0);
|
||||
@ -2020,14 +2064,21 @@ typedef struct {
|
||||
} block_q8_1_r8;
|
||||
|
||||
void iqk_convert_q2_k_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
constexpr int k_nr = 16;
|
||||
using block_q8_k_r = block_q8_k_r16;
|
||||
#else
|
||||
constexpr int k_nr = 8;
|
||||
using block_q8_k_r = block_q8_k_r8;
|
||||
#endif
|
||||
GGML_ASSERT(n%QK_K == 0);
|
||||
GGML_ASSERT(nrc_x%8 == 0);
|
||||
GGML_ASSERT(nrc_x%k_nr == 0);
|
||||
|
||||
int nb = n/QK_K;
|
||||
|
||||
const block_q2_K * x8[8];
|
||||
const block_q2_K * x8[k_nr];
|
||||
|
||||
block_q8_k_r8 * y = (block_q8_k_r8 *)vy;
|
||||
block_q8_k_r * y = (block_q8_k_r *)vy;
|
||||
|
||||
float f_values[QK_K];
|
||||
uint32_t block[8];
|
||||
@ -2038,10 +2089,10 @@ void iqk_convert_q2_k_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int
|
||||
auto sign_bit = _mm256_set1_ps(-0.0f);
|
||||
auto perm = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
|
||||
|
||||
for (int ix = 0; ix < nrc_x; ix += 8) {
|
||||
for (int k = 0; k < 8; ++k) x8[k] = (const block_q2_K *)((const char *)vx + (ix + k)*bx);
|
||||
for (int ix = 0; ix < nrc_x; ix += k_nr) {
|
||||
for (int k = 0; k < k_nr; ++k) x8[k] = (const block_q2_K *)((const char *)vx + (ix + k)*bx);
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
for (int k = 0; k < k_nr; ++k) {
|
||||
auto vd = _mm256_set1_ps(GGML_FP16_TO_FP32(x8[k][i].d));
|
||||
auto vm = _mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x8[k][i].dmin)), _mm256_set1_ps(-1.f));
|
||||
auto block_max = _mm256_setzero_ps();
|
||||
@ -2092,13 +2143,18 @@ void iqk_convert_q2_k_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int
|
||||
i0 = _mm256_permutevar8x32_epi32(i0, perm);
|
||||
|
||||
_mm256_storeu_si256((__m256i *)block, i0);
|
||||
auto q8 = (uint32_t *)y[i].qs + 64*ib32;
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
q8[8*l + k + 0] = block[l + 0];
|
||||
q8[8*l + k + 32] = block[l + 4];
|
||||
auto q8 = (uint32_t *)y[i].qs + 8*k_nr*ib32;
|
||||
for (int l = 0; l < 8; ++l) {
|
||||
q8[k_nr*l + k] = block[l];
|
||||
}
|
||||
}
|
||||
}
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
for (int l = 0; l < 64; ++l) {
|
||||
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[i].qs + l), _mm512_set1_epi8(-128));
|
||||
_mm512_storeu_si512((__m512i *)y[i].qs + l, v);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
y += nb;
|
||||
}
|
||||
@ -2414,14 +2470,21 @@ void iqk_convert_q3_k_q8_0_r8(int n, const void * vx, size_t bx, void * vy, int
|
||||
}
|
||||
|
||||
void iqk_convert_q3_k_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
constexpr int k_nr = 16;
|
||||
using block_q8_k_r = block_q8_k_r16;
|
||||
#else
|
||||
constexpr int k_nr = 8;
|
||||
using block_q8_k_r = block_q8_k_r8;
|
||||
#endif
|
||||
GGML_ASSERT(n%QK_K == 0);
|
||||
GGML_ASSERT(nrc_x%8 == 0);
|
||||
GGML_ASSERT(nrc_x%k_nr == 0);
|
||||
|
||||
int nb = n/QK_K;
|
||||
|
||||
const block_q3_K * x8[8];
|
||||
const block_q3_K * x8[k_nr];
|
||||
|
||||
block_q8_k_r8 * y = (block_q8_k_r8 *)vy;
|
||||
block_q8_k_r * y = (block_q8_k_r *)vy;
|
||||
|
||||
uint32_t block[8];
|
||||
__m256i values[8];
|
||||
@ -2432,10 +2495,10 @@ void iqk_convert_q3_k_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int
|
||||
|
||||
union { __m256i vec; int16_t val[16]; } helper;
|
||||
|
||||
for (int ix = 0; ix < nrc_x; ix += 8) {
|
||||
for (int k = 0; k < 8; ++k) x8[k] = (const block_q3_K *)((const char *)vx + (ix + k)*bx);
|
||||
for (int ix = 0; ix < nrc_x; ix += k_nr) {
|
||||
for (int k = 0; k < k_nr; ++k) x8[k] = (const block_q3_K *)((const char *)vx + (ix + k)*bx);
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
for (int k = 0; k < k_nr; ++k) {
|
||||
float d = GGML_FP16_TO_FP32(x8[k][i].d);
|
||||
auto hbits = _mm256_loadu_si256((const __m256i *)x8[k][i].hmask);
|
||||
helper.vec = _mm256_cvtepi8_epi16(sc3.make_scales((const uint16_t *)x8[k][i].scales));
|
||||
@ -2505,93 +2568,54 @@ void iqk_convert_q3_k_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int
|
||||
_mm_storeu_si128((__m128i *)block+0, _mm_unpacklo_epi64(i0_l, i0_h));
|
||||
_mm_storeu_si128((__m128i *)block+1, _mm_unpackhi_epi64(i0_l, i0_h));
|
||||
}
|
||||
auto qs = (uint32_t *)y[i].qs + 64*ib32;
|
||||
auto qs = (uint32_t *)y[i].qs + 8*k_nr*ib32;
|
||||
for (int l = 0; l < 8; ++l) {
|
||||
qs[8*l + k] = block[l];
|
||||
qs[k_nr*l + k] = block[l];
|
||||
}
|
||||
}
|
||||
}
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
for (int l = 0; l < 64; ++l) {
|
||||
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[i].qs + l), _mm512_set1_epi8(-128));
|
||||
_mm512_storeu_si512((__m512i *)y[i].qs + l, v);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
y += nb;
|
||||
}
|
||||
}
|
||||
|
||||
inline float convert_to_q8_k_r8(int k, float d0, const __m256i * qx, const int16_t * scales, uint32_t * block, int8_t * q8_k) {
|
||||
auto max_i16 = _mm256_setzero_si256();
|
||||
__m256i qs[16];
|
||||
for (int ib32 = 0; ib32 < 8; ++ib32) {
|
||||
qs[2*ib32+0] = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(qx[ib32]));
|
||||
qs[2*ib32+1] = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(qx[ib32], 1));
|
||||
qs[2*ib32+0] = _mm256_mullo_epi16(qs[2*ib32+0], _mm256_set1_epi16(scales[2*ib32+0]));
|
||||
qs[2*ib32+1] = _mm256_mullo_epi16(qs[2*ib32+1], _mm256_set1_epi16(scales[2*ib32+1]));
|
||||
max_i16 = _mm256_max_epi16(max_i16, _mm256_sign_epi16(qs[2*ib32+0], qs[2*ib32+0]));
|
||||
max_i16 = _mm256_max_epi16(max_i16, _mm256_sign_epi16(qs[2*ib32+1], qs[2*ib32+1]));
|
||||
}
|
||||
auto max_q32 = _mm256_cvtepi16_epi32(_mm_max_epi16(_mm256_castsi256_si128(max_i16), _mm256_extracti128_si256(max_i16, 1)));
|
||||
auto imax4 = _mm_max_epi32(_mm256_castsi256_si128(max_q32), _mm256_extracti128_si256(max_q32, 1));
|
||||
auto max4 = _mm_cvtepi32_ps(imax4);
|
||||
max4 = _mm_max_ps(max4, _mm_movehl_ps(max4, max4));
|
||||
max4 = _mm_max_ss(max4, _mm_movehdup_ps(max4));
|
||||
bool needs_scaling = true;
|
||||
float dnew = _mm_cvtss_f32(max4) * d0;
|
||||
if (dnew < 1.f) {
|
||||
dnew = 1.f; needs_scaling = false;
|
||||
}
|
||||
auto scale = _mm256_set1_ps(std::abs(dnew) > 1e-9f ? 1/dnew : 0.f);
|
||||
for (int ib32 = 0; ib32 < 8; ++ib32) {
|
||||
if (needs_scaling) {
|
||||
auto i0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(qs[2*ib32+0]));
|
||||
auto i1 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(qs[2*ib32+0], 1));
|
||||
auto i2 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(qs[2*ib32+1]));
|
||||
auto i3 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(qs[2*ib32+1], 1));
|
||||
i0 = _mm256_cvtps_epi32(_mm256_round_ps(_mm256_mul_ps(scale, _mm256_cvtepi32_ps(i0)), _MM_ROUND_NEAREST));
|
||||
i1 = _mm256_cvtps_epi32(_mm256_round_ps(_mm256_mul_ps(scale, _mm256_cvtepi32_ps(i1)), _MM_ROUND_NEAREST));
|
||||
i2 = _mm256_cvtps_epi32(_mm256_round_ps(_mm256_mul_ps(scale, _mm256_cvtepi32_ps(i2)), _MM_ROUND_NEAREST));
|
||||
i3 = _mm256_cvtps_epi32(_mm256_round_ps(_mm256_mul_ps(scale, _mm256_cvtepi32_ps(i3)), _MM_ROUND_NEAREST));
|
||||
i0 = _mm256_packs_epi32(i0, i1);
|
||||
i2 = _mm256_packs_epi32(i2, i3);
|
||||
i0 = _mm256_packs_epi16(i0, i2);
|
||||
i0 = _mm256_permutevar8x32_epi32(i0, _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7));
|
||||
_mm256_storeu_si256((__m256i *)block, i0);
|
||||
} else {
|
||||
// 0, 1, 2, 3, 4, 5, 6, 7, 8, 16, 17, 18, 19, 20, 21, 22, 23, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
|
||||
auto i0 = _mm256_packs_epi16(qs[2*ib32+0], qs[2*ib32+1]);
|
||||
auto i0_l = _mm256_castsi256_si128(i0);
|
||||
auto i0_h = _mm256_extracti128_si256(i0, 1);
|
||||
_mm_storeu_si128((__m128i *)block+0, _mm_unpacklo_epi64(i0_l, i0_h));
|
||||
_mm_storeu_si128((__m128i *)block+1, _mm_unpackhi_epi64(i0_l, i0_h));
|
||||
}
|
||||
auto qs = (uint32_t *)q8_k + 64*ib32;
|
||||
for (int l = 0; l < 8; ++l) {
|
||||
qs[8*l + k] = block[l];
|
||||
}
|
||||
}
|
||||
return dnew;
|
||||
}
|
||||
|
||||
// TODO: move this to iqk_gemm_iquants
|
||||
void iqk_convert_iq4_xs_q8_k_r8(int n, const void * vx, size_t bx, void * vy, int nrc_x) {
|
||||
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
constexpr int k_nr = 16;
|
||||
using block_q8_k_r = block_q8_k_r16;
|
||||
#else
|
||||
constexpr int k_nr = 8;
|
||||
using block_q8_k_r = block_q8_k_r8;
|
||||
#endif
|
||||
GGML_ASSERT(n%QK_K == 0);
|
||||
GGML_ASSERT(nrc_x%8 == 0);
|
||||
GGML_ASSERT(nrc_x%k_nr == 0);
|
||||
|
||||
int nb = n/QK_K;
|
||||
|
||||
const block_iq4_xs * x8[8];
|
||||
const block_iq4_xs * x8[k_nr];
|
||||
|
||||
block_q8_k_r8 * y = (block_q8_k_r8 *)vy;
|
||||
block_q8_k_r * y = (block_q8_k_r *)vy;
|
||||
|
||||
auto values128 = _mm_loadu_si128((const __m128i *)iq4k_values);
|
||||
auto values = MM256_SET_M128I(values128, values128);
|
||||
|
||||
int16_t ls[16];
|
||||
float dnew[8];
|
||||
float dnew[k_nr];
|
||||
uint32_t block[8];
|
||||
__m256i xv[8];
|
||||
|
||||
for (int ix = 0; ix < nrc_x; ix += 8) {
|
||||
for (int k = 0; k < 8; ++k) x8[k] = (const block_iq4_xs *)((const char *)vx + (ix + k)*bx);
|
||||
for (int ix = 0; ix < nrc_x; ix += k_nr) {
|
||||
for (int k = 0; k < k_nr; ++k) x8[k] = (const block_iq4_xs *)((const char *)vx + (ix + k)*bx);
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
for (int k = 0; k < k_nr; ++k) {
|
||||
float d = GGML_FP16_TO_FP32(x8[k][i].d);
|
||||
for (int ib32 = 0; ib32 < 8; ++ib32) {
|
||||
ls[2*ib32+0] = ls[2*ib32+1] = (((x8[k][i].scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((x8[k][i].scales_h >> 2*ib32) & 3) << 4)) - 32;
|
||||
@ -2599,9 +2623,17 @@ void iqk_convert_iq4_xs_q8_k_r8(int n, const void * vx, size_t bx, void * vy, in
|
||||
xv[ib32] = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(bits, 4), bits), _mm256_set1_epi8(0xf));
|
||||
xv[ib32] = _mm256_shuffle_epi8(values, xv[ib32]);
|
||||
}
|
||||
dnew[k] = d * convert_to_q8_k_r8(k, 1.f/127, xv, ls, block, y[i].qs);
|
||||
dnew[k] = d * convert_to_q8_k_r8<k_nr>(k, 1.f/127, xv, ls, block, y[i].qs);
|
||||
}
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
_mm256_storeu_si256((__m256i *)y[i].d, _mm512_cvtps_ph(_mm512_loadu_ps(dnew), _MM_ROUND_NEAREST));
|
||||
for (int l = 0; l < 64; ++l) {
|
||||
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[i].qs + l), _mm512_set1_epi8(-128));
|
||||
_mm512_storeu_si512((__m512i *)y[i].qs + l, v);
|
||||
}
|
||||
#else
|
||||
_mm_storeu_si128((__m128i *)y[i].d, _mm256_cvtps_ph(_mm256_loadu_ps(dnew), _MM_ROUND_NEAREST));
|
||||
#endif
|
||||
}
|
||||
y += nb;
|
||||
}
|
||||
@ -2671,10 +2703,15 @@ bool iqk_set_kernels_kquants(int ne00, int typeA, int typeB, std::array<mul_mat_
|
||||
break;
|
||||
case GGML_TYPE_Q8_K_R8:
|
||||
IQK_SET_MUL_MAT_FUNCTIONS(mul_mat_q8_k_r8_q8_k, kernels)
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
func16 = mul_mat_q8_k_r8_q8_k<16>;
|
||||
#endif
|
||||
//#ifdef HAVE_FANCY_SIMD
|
||||
// func16 = mul_mat_q8_k_r8_q8_k<16>;
|
||||
//#endif
|
||||
break;
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
case GGML_TYPE_Q8_K_R16:
|
||||
IQK_SET_MUL_MAT_FUNCTIONS(mul_mat_q8_k_r16_q8_k, kernels)
|
||||
break;
|
||||
#endif
|
||||
case GGML_TYPE_Q8_KV:
|
||||
IQK_SET_MUL_MAT_FUNCTIONS(mul_mat_q8_KV_q8_KV, kernels)
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
|
||||
@ -231,31 +231,36 @@ struct MulMat {
|
||||
static bool prepare(int typeA, int typeB, int ne00, MulMat& mm, int Ny);
|
||||
static inline ggml_type is_dequant_better(ggml_type type, int nrc_y) {
|
||||
#ifdef __AVX2__
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
auto q8_k_type = GGML_TYPE_Q8_K_R16;
|
||||
#else
|
||||
auto q8_k_type = GGML_TYPE_Q8_K_R8;
|
||||
#endif
|
||||
switch (type) {
|
||||
case GGML_TYPE_IQ2_XXS: return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
|
||||
case GGML_TYPE_IQ2_XS : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
|
||||
case GGML_TYPE_IQ2_S : return nrc_y >= 16 ? GGML_TYPE_Q8_K_R8 : type;
|
||||
case GGML_TYPE_IQ3_XXS: return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
|
||||
case GGML_TYPE_IQ4_XS : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
|
||||
case GGML_TYPE_IQ3_S : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
|
||||
case GGML_TYPE_IQ1_S : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
|
||||
case GGML_TYPE_IQ1_M : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
|
||||
case GGML_TYPE_Q2_K : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
|
||||
case GGML_TYPE_Q3_K : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
|
||||
case GGML_TYPE_IQ2_XXS: return nrc_y >= 32 ? q8_k_type : type;
|
||||
case GGML_TYPE_IQ2_XS : return nrc_y >= 32 ? q8_k_type : type;
|
||||
case GGML_TYPE_IQ2_S : return nrc_y >= 16 ? q8_k_type : type;
|
||||
case GGML_TYPE_IQ3_XXS: return nrc_y >= 32 ? q8_k_type : type;
|
||||
case GGML_TYPE_IQ4_XS : return nrc_y >= 32 ? q8_k_type : type;
|
||||
case GGML_TYPE_IQ3_S : return nrc_y >= 32 ? q8_k_type : type;
|
||||
case GGML_TYPE_IQ1_S : return nrc_y >= 32 ? q8_k_type : type;
|
||||
case GGML_TYPE_IQ1_M : return nrc_y >= 32 ? q8_k_type : type;
|
||||
case GGML_TYPE_Q2_K : return nrc_y >= 32 ? q8_k_type : type;
|
||||
case GGML_TYPE_Q3_K : return nrc_y >= 32 ? q8_k_type : type;
|
||||
case GGML_TYPE_Q4_K : return nrc_y >= 32 ? GGML_TYPE_Q8_1 : type;
|
||||
case GGML_TYPE_Q5_K : return nrc_y >= 32 ? GGML_TYPE_Q8_1 : type;
|
||||
case GGML_TYPE_Q6_K : return nrc_y >= 64 ? GGML_TYPE_Q8_0_R8 : type;
|
||||
case GGML_TYPE_IQ2_KS : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
|
||||
case GGML_TYPE_IQ2_K : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
|
||||
case GGML_TYPE_IQ2_KL : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
|
||||
case GGML_TYPE_IQ3_KS : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
|
||||
case GGML_TYPE_IQ3_K : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
|
||||
case GGML_TYPE_IQ4_KS : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
|
||||
case GGML_TYPE_IQ4_KSS: return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
|
||||
case GGML_TYPE_IQ4_K : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
|
||||
case GGML_TYPE_IQ5_KS : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
|
||||
case GGML_TYPE_IQ5_K : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
|
||||
case GGML_TYPE_IQ6_K : return nrc_y >= 32 ? GGML_TYPE_Q8_K_R8 : type;
|
||||
case GGML_TYPE_IQ2_KS : return nrc_y >= 32 ? q8_k_type : type;
|
||||
case GGML_TYPE_IQ2_K : return nrc_y >= 32 ? q8_k_type : type;
|
||||
case GGML_TYPE_IQ2_KL : return nrc_y >= 32 ? q8_k_type : type;
|
||||
case GGML_TYPE_IQ3_KS : return nrc_y >= 32 ? q8_k_type : type;
|
||||
case GGML_TYPE_IQ3_K : return nrc_y >= 32 ? q8_k_type : type;
|
||||
case GGML_TYPE_IQ4_KS : return nrc_y >= 32 ? q8_k_type : type;
|
||||
case GGML_TYPE_IQ4_KSS: return nrc_y >= 32 ? q8_k_type : type;
|
||||
case GGML_TYPE_IQ4_K : return nrc_y >= 32 ? q8_k_type : type;
|
||||
case GGML_TYPE_IQ5_KS : return nrc_y >= 32 ? q8_k_type : type;
|
||||
case GGML_TYPE_IQ5_K : return nrc_y >= 32 ? q8_k_type : type;
|
||||
case GGML_TYPE_IQ6_K : return nrc_y >= 32 ? q8_k_type : type;
|
||||
case GGML_TYPE_Q4_0 : return nrc_y >= 32 ? GGML_TYPE_Q8_0_R8 : type;
|
||||
case GGML_TYPE_Q4_1 : return nrc_y >= 32 ? GGML_TYPE_Q8_1 : type;
|
||||
case GGML_TYPE_Q5_0 : return nrc_y >= 32 ? GGML_TYPE_Q8_0_R8 : type;
|
||||
@ -315,7 +320,8 @@ struct MulMat {
|
||||
#endif
|
||||
return type;
|
||||
}
|
||||
static inline int num_rows(ggml_type type) {
|
||||
static inline int num_rows([[maybe_unused]] ggml_type type) {
|
||||
return 16;
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q2_K_R4:
|
||||
@ -346,7 +352,7 @@ struct MulMat {
|
||||
case GGML_TYPE_Q8_1:
|
||||
case GGML_TYPE_Q8_K_R8: return 8;
|
||||
case GGML_TYPE_Q4_0_R8:
|
||||
case GGML_TYPE_Q8_0_R8:
|
||||
case GGML_TYPE_Q8_K_R16:
|
||||
case GGML_TYPE_BF16_R16: return 16;
|
||||
default: return 1;
|
||||
}
|
||||
@ -381,6 +387,7 @@ struct MulMat {
|
||||
case GGML_TYPE_Q8_KV_R8:
|
||||
case GGML_TYPE_Q8_1:
|
||||
case GGML_TYPE_Q8_K_R8: return 8;
|
||||
case GGML_TYPE_Q8_K_R16:
|
||||
case GGML_TYPE_BF16_R16: return 16;
|
||||
default: return 1;
|
||||
}
|
||||
@ -829,6 +836,7 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& mm, int Ny) {
|
||||
case GGML_TYPE_Q8_K_R8:
|
||||
case GGML_TYPE_Q8_KV:
|
||||
case GGML_TYPE_Q8_KV_R8:
|
||||
case GGML_TYPE_Q8_K_R16:
|
||||
return iqk_set_kernels_kquants(ne00, typeA, typeB, mm.funcs, mm.func16);
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
@ -924,6 +932,7 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& m, int /*Ny*/) {
|
||||
case GGML_TYPE_Q8_K_R8:
|
||||
case GGML_TYPE_Q8_KV:
|
||||
case GGML_TYPE_Q8_KV_R8:
|
||||
case GGML_TYPE_Q8_K_R16:
|
||||
return iqk_set_kernels_kquants(ne00, typeA, typeB, m.funcs, m.func16);
|
||||
case GGML_TYPE_IQ2_KS:
|
||||
case GGML_TYPE_IQ2_K:
|
||||
|
||||
@ -6634,6 +6634,92 @@ void vec_dot_q8_k_r8_q8_k(int n, float * s, size_t bs, const void * vx, size_t b
|
||||
GGML_UNUSED(by);
|
||||
}
|
||||
|
||||
//
|
||||
// ========================================= q8_k_r16
|
||||
//
|
||||
|
||||
void quantize_row_q8_k_r16_ref(const float * x, block_q8_k_r16 * y, int64_t k) {
|
||||
quantize_q8_k_r16(x, (void *)y, 16, k/16, nullptr);
|
||||
}
|
||||
|
||||
void quantize_row_q8_k_r16(const float * x, void * y, int64_t k) {
|
||||
quantize_q8_k_r16(x, y, 16, k/16, nullptr);
|
||||
}
|
||||
|
||||
static void repack_q16_k(int nrows, int n_per_row, const block_q8_K * x, block_q8_k_r16 * y, [[maybe_unused]] bool online) {
|
||||
GGML_ASSERT(nrows%16 == 0);
|
||||
GGML_ASSERT(n_per_row%QK_K == 0);
|
||||
int nblock = n_per_row/QK_K;
|
||||
const block_q8_K * x16[16];
|
||||
for (int row = 0; row < nrows; row += 16) {
|
||||
for (int k = 0; k < 16; ++k) x16[k] = x + nblock*k;
|
||||
for (int ibl = 0; ibl < nblock; ++ibl) {
|
||||
for (int k = 0; k < 16; ++k) {
|
||||
y[ibl].d[k] = GGML_FP32_TO_FP16(x16[k][ibl].d);
|
||||
for (int ib = 0; ib < QK_K/4; ++ib) {
|
||||
for (int i = 0; i < 4; ++i) y[ibl].qs[64*ib + 4*k + i] = x16[k][ibl].qs[4*ib+i];
|
||||
}
|
||||
}
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
for (int l = 0; l < 64; ++l) {
|
||||
auto v = _mm512_xor_si512(_mm512_loadu_si512((const __m512i *)y[ibl].qs + l), _mm512_set1_epi8(-128));
|
||||
_mm512_storeu_si512((__m512i *)y[ibl].qs + l, v);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
x += 16*nblock;
|
||||
y += nblock;
|
||||
}
|
||||
}
|
||||
|
||||
size_t quantize_q8_k_r16(const float * src, void * dst, int64_t nrows, int64_t n_per_row, [[maybe_unused]] const float * imatrix) {
|
||||
GGML_ASSERT(nrows%16 == 0);
|
||||
GGML_ASSERT(n_per_row%QK_K == 0);
|
||||
char * qcur = (char *)dst;
|
||||
auto row_size_0 = ggml_row_size(GGML_TYPE_Q8_K, n_per_row);
|
||||
auto row_size_1 = ggml_row_size(GGML_TYPE_Q8_K_R16, n_per_row);
|
||||
std::vector<char> qtmp(16*row_size_0);
|
||||
for (int row = 0; row < nrows; row += 16) {
|
||||
quantize_row_q8_K32(src, (void *)qtmp.data(), 16*n_per_row);
|
||||
repack_q16_k(16, n_per_row, (const block_q8_K *)qtmp.data(), (block_q8_k_r16 *)qcur, false);
|
||||
qcur += 16*row_size_1;
|
||||
src += 16*n_per_row;
|
||||
}
|
||||
return nrows*row_size_1;
|
||||
}
|
||||
|
||||
void dequantize_row_q8_k_r16(const block_q8_k_r16 * x, float * y, int64_t k) {
|
||||
auto n_per_row = k/16;
|
||||
float * y16[16];
|
||||
for (int k = 0; k < 16; ++k) y16[k] = y + n_per_row*k;
|
||||
int nblock = n_per_row/QK_K;
|
||||
for (int ibl = 0; ibl < nblock; ++ibl) {
|
||||
auto qs = (const uint8_t *)x[ibl].qs;
|
||||
for (int k = 0; k < 16; ++k) {
|
||||
const float d = GGML_FP16_TO_FP32(x[ibl].d[k]);
|
||||
const float m = -128.f*d;
|
||||
for (int ib = 0; ib < QK_K/4; ++ib) {
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
y16[k][QK_K*ibl+4*ib+i] = d * qs[64*ib+4*k+i] + m;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void vec_dot_q8_k_r16_q8_k(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc) {
|
||||
#if GGML_USE_IQK_MULMAT
|
||||
if (iqk_mul_mat(1, 1, n, GGML_TYPE_Q8_K_R16, vx, 0, GGML_TYPE_Q8_K, vy, 0, s, 0, 0, 1)) {
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
GGML_ASSERT(n%QK4_NL == 0);
|
||||
GGML_ASSERT(nrc == 1);
|
||||
GGML_UNUSED(bs);
|
||||
GGML_UNUSED(bx);
|
||||
GGML_UNUSED(by);
|
||||
}
|
||||
|
||||
//
|
||||
// ========================================= q8_KV_r8
|
||||
//
|
||||
|
||||
@ -277,6 +277,12 @@ size_t quantize_q8_k_r8(const float * GGML_RESTRICT src, void * GGML_RESTRICT ds
|
||||
void dequantize_row_q8_k_r8(const block_q8_k_r8 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
void vec_dot_q8_k_r8_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
|
||||
void quantize_row_q8_k_r16_ref(const float * GGML_RESTRICT x, block_q8_k_r16 * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q8_k_r16(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
size_t quantize_q8_k_r16(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
void dequantize_row_q8_k_r16(const block_q8_k_r16 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
void vec_dot_q8_k_r16_q8_k(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
|
||||
void quantize_row_q8_KV_ref(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q8_KV(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
size_t quantize_q8_KV(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user