Enable AVX-VNNI 256-bit path for Q8_K R8 matmul (#1460)

Use the sign trick with dpbusd instead of maddubs+madd+add,
replacing 3 AVX2 instructions with 1 fused VNNI instruction.
Removes dead HAVE_FANCY_SIMD code left over from the R16 split.

Co-authored-by: Adam Caldwell <accaldwell@users.noreply.github.com>
This commit is contained in:
Adam Caldwell 2026-03-18 22:56:11 -07:00 committed by GitHub
parent 1a7aa3e7fa
commit b8fa7936bf
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -1795,7 +1795,7 @@ template <int nrc_y>
static void mul_mat_q8_k_r8_q8_k(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
GGML_ASSERT(nrc_x%8 == 0);
Q8<nrc_y, block_q8_K> q8(info);
#ifndef HAVE_FANCY_SIMD
#ifndef HAVE_VNNI256
auto m1 = _mm256_set1_epi16(1);
#endif
int nbl = n / QK_K;
@ -1811,25 +1811,18 @@ static void mul_mat_q8_k_r8_q8_k(int n, const void * vx, size_t bx, const DataIn
qx[1] = _mm256_loadu_si256((const __m256i *)iq8[ibl].qs+4*ib+1);
qx[2] = _mm256_loadu_si256((const __m256i *)iq8[ibl].qs+4*ib+2);
qx[3] = _mm256_loadu_si256((const __m256i *)iq8[ibl].qs+4*ib+3);
#ifndef HAVE_FANCY_SIMD
auto s0 = _mm256_sign_epi8(qx[0], qx[0]);
auto s1 = _mm256_sign_epi8(qx[1], qx[1]);
auto s2 = _mm256_sign_epi8(qx[2], qx[2]);
auto s3 = _mm256_sign_epi8(qx[3], qx[3]);
#else
qx[0] = _mm256_add_epi8(qx[0], _mm256_set1_epi8(127));
qx[1] = _mm256_add_epi8(qx[1], _mm256_set1_epi8(127));
qx[2] = _mm256_add_epi8(qx[2], _mm256_set1_epi8(127));
qx[3] = _mm256_add_epi8(qx[3], _mm256_set1_epi8(127));
#endif
for (int iy = 0; iy < nrc_y; ++iy) {
auto y128 = _mm_loadu_si128((const __m128i*)q8.y[iy][ibl].qs+ib);
auto y = MM256_SET_M128I(y128, y128);
#ifdef HAVE_FANCY_SIMD
isum[iy] = _mm256_dpbusd_epi32(isum[iy], qx[0], _mm256_shuffle_epi32(y, 0x00));
isum[iy] = _mm256_dpbusd_epi32(isum[iy], qx[1], _mm256_shuffle_epi32(y, 0x55));
isum[iy] = _mm256_dpbusd_epi32(isum[iy], qx[2], _mm256_shuffle_epi32(y, 0xaa));
isum[iy] = _mm256_dpbusd_epi32(isum[iy], qx[3], _mm256_shuffle_epi32(y, 0xff));
#ifdef HAVE_VNNI256
isum[iy] = _mm256_dpbusd_epi32(isum[iy], s0, _mm256_sign_epi8(_mm256_shuffle_epi32(y, 0x00), qx[0]));
isum[iy] = _mm256_dpbusd_epi32(isum[iy], s1, _mm256_sign_epi8(_mm256_shuffle_epi32(y, 0x55), qx[1]));
isum[iy] = _mm256_dpbusd_epi32(isum[iy], s2, _mm256_sign_epi8(_mm256_shuffle_epi32(y, 0xaa), qx[2]));
isum[iy] = _mm256_dpbusd_epi32(isum[iy], s3, _mm256_sign_epi8(_mm256_shuffle_epi32(y, 0xff), qx[3]));
#else
auto sumi1 = _mm256_madd_epi16(m1, _mm256_maddubs_epi16(s0, _mm256_sign_epi8(_mm256_shuffle_epi32(y, 0x00), qx[0])));
auto sumi2 = _mm256_madd_epi16(m1, _mm256_maddubs_epi16(s1, _mm256_sign_epi8(_mm256_shuffle_epi32(y, 0x55), qx[1])));
@ -1840,15 +1833,9 @@ static void mul_mat_q8_k_r8_q8_k(int n, const void * vx, size_t bx, const DataIn
#endif
}
}
#ifdef HAVE_FANCY_SIMD
auto m4 = _mm256_mul_ps(d4, _mm256_set1_ps(-127.f));
#endif
for (int iy = 0; iy < nrc_y; ++iy) {
auto d4y = _mm256_mul_ps(d4, _mm256_set1_ps(q8.scale(iy, ibl)));
acc[iy] = _mm256_fmadd_ps(d4y, _mm256_cvtepi32_ps(isum[iy]), acc[iy]);
#ifdef HAVE_FANCY_SIMD
acc[iy] = _mm256_fmadd_ps(m4, _mm256_set1_ps(q8.y[iy][ibl].sum), acc[iy]);
#endif
isum[iy] = _mm256_setzero_si256();
}
}