From 0fdac83272a03f535645001120c17dce345a8e2a Mon Sep 17 00:00:00 2001 From: Nexesenex <124105151+Nexesenex@users.noreply.github.com> Date: Sun, 14 Jun 2026 16:07:36 +0200 Subject: [PATCH] Fix Q8_0 graph reduce type Analogous to the BF16 fix in eea6a82b25, this adds proper Q8_0 type handling in ggml_cuda_op_add: - Add k_add_q8_0_f32 kernel: dequantize Q8_0, add F32, store F32 - Add k_add_q8_0_q8_0_f32 kernel: dequantize two Q8_0, add, store F32 - Add Q8_0+Q8_0/Q8_0+F32/F32+Q8_0 branches in the F32 dst (else) block, preventing Q8_0 data from falling through to the incorrect half cast - Expand Q8_0 dst branch to handle F32+Q8_0->Q8_0 (swapped args), not just Q8_0+F32->Q8_0 --- ggml/src/ggml-cuda/binbcast.cu | 41 ++++++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cuda/binbcast.cu b/ggml/src/ggml-cuda/binbcast.cu index 898640dd..20ab2189 100644 --- a/ggml/src/ggml-cuda/binbcast.cu +++ b/ggml/src/ggml-cuda/binbcast.cu @@ -385,6 +385,24 @@ static __global__ void k_add_same_q8_0(int nelem, const block_q8_0 * x, const fl } } +template +static __global__ void k_add_q8_0_f32(int nelem, const block_q8_0 * x, const float * y, float * z) { + int i = blockIdx.x*block_size + threadIdx.x; + if (i >= nelem) return; + int ib = i / QK8_0; + int iq = i % QK8_0; + z[i] = (float)x[ib].d * x[ib].qs[iq] + y[i]; +} + +template +static __global__ void k_add_q8_0_q8_0_f32(int nelem, const block_q8_0 * x, const block_q8_0 * y, float * z) { + int i = blockIdx.x*block_size + threadIdx.x; + if (i >= nelem) return; + int ib = i / QK8_0; + int iq = i % QK8_0; + z[i] = (float)x[ib].d * x[ib].qs[iq] + (float)y[ib].d * y[ib].qs[iq]; +} + void ggml_op_add_same_type(ggml_backend_cuda_context & ctx, enum ggml_type type, size_t nelem, const void * x, const void * y, void * z) { constexpr int kBlockSize = 256; @@ -461,9 +479,16 @@ void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { (const float *)dst->src[0]->data, (const nv_bfloat16 *)dst->src[1]->data, (nv_bfloat16 *)dst->data); } } else if (dst->type == GGML_TYPE_Q8_0) { - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_Q8_0 && dst->src[1]->type == GGML_TYPE_F32); - k_add_same_q8_0<<>>(nelem, + if (dst->src[0]->type == GGML_TYPE_Q8_0 && dst->src[1]->type == GGML_TYPE_F32) { + k_add_same_q8_0<<>>(nelem, (const block_q8_0 *)dst->src[0]->data, (const float *)dst->src[1]->data, (block_q8_0 *)dst->data); + } + else if (dst->src[0]->type == GGML_TYPE_F32 && dst->src[1]->type == GGML_TYPE_Q8_0) { + k_add_same_q8_0<<>>(nelem, + (const block_q8_0 *)dst->src[1]->data, (const float *)dst->src[0]->data, (block_q8_0 *)dst->data); + } else { + GGML_ABORT("Unsupported Q8_0 add combination"); + } } else { if (dst->src[0]->type == GGML_TYPE_F16 && dst->src[1]->type == GGML_TYPE_F16) { k_fast_add_2<<>>(dst->ne[0], nelem, @@ -488,6 +513,18 @@ void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { else if (dst->src[0]->type == GGML_TYPE_F32 && dst->src[1]->type == GGML_TYPE_BF16) { k_fast_add_2<<>>(dst->ne[0], nelem, (const float *)dst->src[0]->data, (const nv_bfloat16 *)dst->src[1]->data, (float *)dst->data); + } + else if (dst->src[0]->type == GGML_TYPE_Q8_0 && dst->src[1]->type == GGML_TYPE_Q8_0) { + k_add_q8_0_q8_0_f32<<>>(nelem, + (const block_q8_0 *)dst->src[0]->data, (const block_q8_0 *)dst->src[1]->data, (float *)dst->data); + } + else if (dst->src[0]->type == GGML_TYPE_Q8_0 && dst->src[1]->type == GGML_TYPE_F32) { + k_add_q8_0_f32<<>>(nelem, + (const block_q8_0 *)dst->src[0]->data, (const float *)dst->src[1]->data, (float *)dst->data); + } + else if (dst->src[0]->type == GGML_TYPE_F32 && dst->src[1]->type == GGML_TYPE_Q8_0) { + k_add_q8_0_f32<<>>(nelem, + (const block_q8_0 *)dst->src[1]->data, (const float *)dst->src[0]->data, (float *)dst->data); } else { k_fast_add_2<<>>(dst->ne[0], nelem, (const float *)dst->src[0]->data, (const half *)dst->src[1]->data, (float *)dst->data);