mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-06-28 04:30:15 -05:00
Enable AVX-VNNI 256-bit path for Q8_K R8 matmul (#1460)
Use the sign trick with dpbusd instead of maddubs+madd+add, replacing 3 AVX2 instructions with 1 fused VNNI instruction. Removes dead HAVE_FANCY_SIMD code left over from the R16 split. Co-authored-by: Adam Caldwell <accaldwell@users.noreply.github.com>
This commit is contained in:
parent
1a7aa3e7fa
commit
b8fa7936bf
@ -1795,7 +1795,7 @@ template <int nrc_y>
|
||||
static void mul_mat_q8_k_r8_q8_k(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
|
||||
GGML_ASSERT(nrc_x%8 == 0);
|
||||
Q8<nrc_y, block_q8_K> q8(info);
|
||||
#ifndef HAVE_FANCY_SIMD
|
||||
#ifndef HAVE_VNNI256
|
||||
auto m1 = _mm256_set1_epi16(1);
|
||||
#endif
|
||||
int nbl = n / QK_K;
|
||||
@ -1811,25 +1811,18 @@ static void mul_mat_q8_k_r8_q8_k(int n, const void * vx, size_t bx, const DataIn
|
||||
qx[1] = _mm256_loadu_si256((const __m256i *)iq8[ibl].qs+4*ib+1);
|
||||
qx[2] = _mm256_loadu_si256((const __m256i *)iq8[ibl].qs+4*ib+2);
|
||||
qx[3] = _mm256_loadu_si256((const __m256i *)iq8[ibl].qs+4*ib+3);
|
||||
#ifndef HAVE_FANCY_SIMD
|
||||
auto s0 = _mm256_sign_epi8(qx[0], qx[0]);
|
||||
auto s1 = _mm256_sign_epi8(qx[1], qx[1]);
|
||||
auto s2 = _mm256_sign_epi8(qx[2], qx[2]);
|
||||
auto s3 = _mm256_sign_epi8(qx[3], qx[3]);
|
||||
#else
|
||||
qx[0] = _mm256_add_epi8(qx[0], _mm256_set1_epi8(127));
|
||||
qx[1] = _mm256_add_epi8(qx[1], _mm256_set1_epi8(127));
|
||||
qx[2] = _mm256_add_epi8(qx[2], _mm256_set1_epi8(127));
|
||||
qx[3] = _mm256_add_epi8(qx[3], _mm256_set1_epi8(127));
|
||||
#endif
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto y128 = _mm_loadu_si128((const __m128i*)q8.y[iy][ibl].qs+ib);
|
||||
auto y = MM256_SET_M128I(y128, y128);
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
isum[iy] = _mm256_dpbusd_epi32(isum[iy], qx[0], _mm256_shuffle_epi32(y, 0x00));
|
||||
isum[iy] = _mm256_dpbusd_epi32(isum[iy], qx[1], _mm256_shuffle_epi32(y, 0x55));
|
||||
isum[iy] = _mm256_dpbusd_epi32(isum[iy], qx[2], _mm256_shuffle_epi32(y, 0xaa));
|
||||
isum[iy] = _mm256_dpbusd_epi32(isum[iy], qx[3], _mm256_shuffle_epi32(y, 0xff));
|
||||
#ifdef HAVE_VNNI256
|
||||
isum[iy] = _mm256_dpbusd_epi32(isum[iy], s0, _mm256_sign_epi8(_mm256_shuffle_epi32(y, 0x00), qx[0]));
|
||||
isum[iy] = _mm256_dpbusd_epi32(isum[iy], s1, _mm256_sign_epi8(_mm256_shuffle_epi32(y, 0x55), qx[1]));
|
||||
isum[iy] = _mm256_dpbusd_epi32(isum[iy], s2, _mm256_sign_epi8(_mm256_shuffle_epi32(y, 0xaa), qx[2]));
|
||||
isum[iy] = _mm256_dpbusd_epi32(isum[iy], s3, _mm256_sign_epi8(_mm256_shuffle_epi32(y, 0xff), qx[3]));
|
||||
#else
|
||||
auto sumi1 = _mm256_madd_epi16(m1, _mm256_maddubs_epi16(s0, _mm256_sign_epi8(_mm256_shuffle_epi32(y, 0x00), qx[0])));
|
||||
auto sumi2 = _mm256_madd_epi16(m1, _mm256_maddubs_epi16(s1, _mm256_sign_epi8(_mm256_shuffle_epi32(y, 0x55), qx[1])));
|
||||
@ -1840,15 +1833,9 @@ static void mul_mat_q8_k_r8_q8_k(int n, const void * vx, size_t bx, const DataIn
|
||||
#endif
|
||||
}
|
||||
}
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
auto m4 = _mm256_mul_ps(d4, _mm256_set1_ps(-127.f));
|
||||
#endif
|
||||
for (int iy = 0; iy < nrc_y; ++iy) {
|
||||
auto d4y = _mm256_mul_ps(d4, _mm256_set1_ps(q8.scale(iy, ibl)));
|
||||
acc[iy] = _mm256_fmadd_ps(d4y, _mm256_cvtepi32_ps(isum[iy]), acc[iy]);
|
||||
#ifdef HAVE_FANCY_SIMD
|
||||
acc[iy] = _mm256_fmadd_ps(m4, _mm256_set1_ps(q8.y[iy][ibl].sum), acc[iy]);
|
||||
#endif
|
||||
isum[iy] = _mm256_setzero_si256();
|
||||
}
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user