From 2d3ecd5e19cd64cbae122c385b8fa08483d7fc9e Mon Sep 17 00:00:00 2001 From: Nexes the Elder <124105151+Nexesenex@users.noreply.github.com> Date: Tue, 23 Jun 2026 14:03:22 +0200 Subject: [PATCH] Fix minor CUDA discrepancies (part 2) (#2015) * fix: wrong tensor index in BF16 fused RMS norm add path (norm.cu:1039) The BF16 branch of ggml_cuda_op_fused_rms_rms_add used dst->src[2]->data for the second weight pointer, but should have used dst->src[3]->data. This caused reading float weights from the wrong bf16 input tensor. The F32 and F16 branches both correctly reference src[3], and the assertions at lines 1013-1015 confirm src[3] is the F32 weight tensor. * fix: off-by-one bounds check in 7 dmmv kernels (row > nrows -> row >= nrows) Seven K-quant dequantize_mul_mat_vec kernels used row > nrows for bounds checking instead of row >= nrows. Since rows are 0-indexed (0..nrows-1), the check missed the row == nrows case, allowing a potential out-of-bounds memory write when grid dimensions produce exactly nrows. The templated dequantize_mul_mat_vec kernel at line 667 already used the correct row >= nrows pattern. * fix: typo in function name iqk_mul_mat_vec_q_kerne -> iqk_mul_mat_vec_q_kernel Truncated function name in iqk_mmvq_templates.cuh was missing trailing 'l'. * fix: print actual split_dim value in set_tensor error message (ggml-cuda.cu) fprintf used extra->split_dim == 0 which evaluates to boolean 0 or 1 instead of the actual split dimension value. When this fatal error is hit for an unsupported split_dim, the user could not diagnose which value caused the problem. * fix: wrong src index in gate bias stride for fused up-gate MoE path ggml_cuda_add_id for the gate bias used dst->src[4]->nb[1] as the stride argument instead of dst->src[5]->nb[1]. This was a copy-paste error from the up-bias code (lines 3220-3224) where src[4] is correct. If src[4] and src[5] have different strides, the bias addition produces incorrect results. * fix: wrong row count for gate projection MMQ in fused up-gate MoE path ggml_cuda_op_mul_mat_q for the gate projection (src0_2) used src0_1->ne[1] as row_high instead of src0_2->ne[1]. This copy-paste error causes processing the wrong number of rows if the up and gate projections have different row counts. The gemv path (line ~3563) correctly used src0_2->ne[1]. --- ggml/src/ggml-cuda.cu | 6 +++--- ggml/src/ggml-cuda/dmmv.cu | 14 +++++++------- ggml/src/ggml-cuda/iqk_mmvq_templates.cuh | 4 ++-- ggml/src/ggml-cuda/norm.cu | 2 +- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index e4b41bee..cbc7618d 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -1159,7 +1159,7 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor([[maybe_unused]] } } else { - fprintf(stderr, "%s: not implemented for split dim %d\n", __func__, extra->split_dim == 0); + fprintf(stderr, "%s: not implemented for split dim %d\n", __func__, extra->split_dim); GGML_ABORT("fatal error"); } @@ -3223,7 +3223,7 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten if (dst->src[5]) { ggml_cuda_add_id((const float *)dst_row.data, (const float *)dst->src[5]->data, (const int32_t *)ids->data, (float *)dst_row.data, dst_row.ne[0], dst_row.ne[1], dst_row.ne[2], dst_row.ne[0], dst_row.ne[1], - dst_row.nb[1], dst_row.nb[2], dst->src[4]->nb[1], ids->nb[1], stream); + dst_row.nb[1], dst_row.nb[2], dst->src[5]->nb[1], ids->nb[1], stream); CUDA_CHECK(cudaGetLastError()); } @@ -3569,7 +3569,7 @@ static void ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor CUDA_CHECK(cudaGetLastError()); ggml_cuda_op_mul_mat_q(ctx, src0_2, src1, dst, (const char *)src0_2->data, nullptr, src1_quantized.get(), (float *)dst->data, - 0, src0_1->ne[1], src1->ne[1], ne10_padded, stream); + 0, src0_2->ne[1], src1->ne[1], ne10_padded, stream); CUDA_CHECK(cudaGetLastError()); } else { auto local_dst = *dst; diff --git a/ggml/src/ggml-cuda/dmmv.cu b/ggml/src/ggml-cuda/dmmv.cu index 7a5e3841..f1a48b51 100644 --- a/ggml/src/ggml-cuda/dmmv.cu +++ b/ggml/src/ggml-cuda/dmmv.cu @@ -77,7 +77,7 @@ static __global__ void dequantize_mul_mat_vec_iq2_kt(const void * __restrict__ v const int ncols, int nrows, int64_t row_size) { const int row = blockIdx.x*blockDim.y + threadIdx.y; - if (row > nrows) return; + if (row >= nrows) return; const float * dptr = (const float *)((const char *)vx + row*row_size); const float d = *dptr * 31.75f * 1.05f; @@ -121,7 +121,7 @@ static __global__ void dequantize_mul_mat_vec_iq3_kt(const void * __restrict__ v const int ncols, int nrows, int64_t row_size) { const int row = blockIdx.x*blockDim.y + threadIdx.y; - if (row > nrows) return; + if (row >= nrows) return; const float * dptr = (const float *)((const char *)vx + row*row_size); const float d = *dptr * 31.75f * 1.015f; @@ -171,7 +171,7 @@ static __global__ void dequantize_mul_mat_vec_iq4_kt(const void * __restrict__ v constexpr int kNumGroups = 64; const int row = blockIdx.x*blockDim.y + threadIdx.y; - if (row > nrows) return; + if (row >= nrows) return; const float * dptr = (const float *)((const char *)vx + row*row_size); const float d = dptr[0] * 31.75f * 1.01f; @@ -234,7 +234,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); const int row = blockIdx.x*blockDim.y + threadIdx.y; - if (row > nrows) return; + if (row >= nrows) return; const int num_blocks_per_row = ncols / QK_K; const int ib0 = row*num_blocks_per_row; @@ -303,7 +303,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { const int row = blockIdx.x*blockDim.y + threadIdx.y; - if (row > nrows) return; + if (row >= nrows) return; const int num_blocks_per_row = ncols / QK_K; const int ib0 = row*num_blocks_per_row; @@ -374,7 +374,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { const int row = blockIdx.x*blockDim.y + threadIdx.y; - if (row > nrows) return; + if (row >= nrows) return; const int num_blocks_per_row = ncols / QK_K; const int ib0 = row*num_blocks_per_row; @@ -566,7 +566,7 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); const int row = blockIdx.x*blockDim.y + threadIdx.y; - if (row > nrows) return; + if (row >= nrows) return; const int num_blocks_per_row = ncols / QK_K; const int ib0 = row*num_blocks_per_row; diff --git a/ggml/src/ggml-cuda/iqk_mmvq_templates.cuh b/ggml/src/ggml-cuda/iqk_mmvq_templates.cuh index 426a2b09..842e4065 100644 --- a/ggml/src/ggml-cuda/iqk_mmvq_templates.cuh +++ b/ggml/src/ggml-cuda/iqk_mmvq_templates.cuh @@ -19,7 +19,7 @@ struct ggml_cuda_type_traits { }; template -static __device__ void iqk_mul_mat_vec_q_kerne( +static __device__ void iqk_mul_mat_vec_q_kernel( const void * __restrict__ vx, const void * __restrict__ vy, const float * bias, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst, const int64_t row_size) { @@ -244,7 +244,7 @@ static __global__ void iqk_mul_mat_vec_q( const char * cy = (const char *)vy + i2*nb12; char * cdst = (char *)dst + i2*nb2; const float * b = (const float *)(bias ? ids_data ? (const char *)bias + i02*bias_nb1 : bias : nullptr); - iqk_mul_mat_vec_q_kerne(cx, cy, b, (float *)cdst, ncols_x, nrows_x, nrows_y, nrows_dst, row_size); + iqk_mul_mat_vec_q_kernel(cx, cy, b, (float *)cdst, ncols_x, nrows_x, nrows_y, nrows_dst, row_size); } template diff --git a/ggml/src/ggml-cuda/norm.cu b/ggml/src/ggml-cuda/norm.cu index 52a59be2..0a112529 100644 --- a/ggml/src/ggml-cuda/norm.cu +++ b/ggml/src/ggml-cuda/norm.cu @@ -1046,7 +1046,7 @@ void ggml_cuda_op_fused_rms_rms_add(ggml_backend_cuda_context & ctx, ggml_tensor else if (dst->src[0]->type == GGML_TYPE_BF16) { fused_rms_rms_add_f32_cuda(ncols, nrows, (float *)dst->data, (const nv_bfloat16 *)dst->src[0]->data, (const float *)dst->src[1]->data, - (const nv_bfloat16 *)dst->src[2]->data, (const float *)dst->src[2]->data, + (const nv_bfloat16 *)dst->src[2]->data, (const float *)dst->src[3]->data, eps, ctx.stream()); } else {