Fix minor CUDA discrepancies (part 2) (#2015)

* fix: wrong tensor index in BF16 fused RMS norm add path (norm.cu:1039)

The BF16 branch of ggml_cuda_op_fused_rms_rms_add used dst->src[2]->data
for the second weight pointer, but should have used dst->src[3]->data.
This caused reading float weights from the wrong bf16 input tensor.

The F32 and F16 branches both correctly reference src[3], and the
assertions at lines 1013-1015 confirm src[3] is the F32 weight tensor.

* fix: off-by-one bounds check in 7 dmmv kernels (row > nrows -> row >= nrows)

Seven K-quant dequantize_mul_mat_vec kernels used row > nrows for bounds
checking instead of row >= nrows. Since rows are 0-indexed (0..nrows-1),
the check missed the row == nrows case, allowing a potential out-of-bounds
memory write when grid dimensions produce exactly nrows.

The templated dequantize_mul_mat_vec<type> kernel at line 667 already used
the correct row >= nrows pattern.

* fix: typo in function name iqk_mul_mat_vec_q_kerne -> iqk_mul_mat_vec_q_kernel

Truncated function name in iqk_mmvq_templates.cuh was missing trailing 'l'.

* fix: print actual split_dim value in set_tensor error message (ggml-cuda.cu)

fprintf used extra->split_dim == 0 which evaluates to boolean 0 or 1
instead of the actual split dimension value. When this fatal error is
hit for an unsupported split_dim, the user could not diagnose which
value caused the problem.

* fix: wrong src index in gate bias stride for fused up-gate MoE path

ggml_cuda_add_id for the gate bias used dst->src[4]->nb[1] as the stride
argument instead of dst->src[5]->nb[1]. This was a copy-paste error from
the up-bias code (lines 3220-3224) where src[4] is correct. If src[4]
and src[5] have different strides, the bias addition produces incorrect
results.

* fix: wrong row count for gate projection MMQ in fused up-gate MoE path

ggml_cuda_op_mul_mat_q for the gate projection (src0_2) used
src0_1->ne[1] as row_high instead of src0_2->ne[1]. This copy-paste
error causes processing the wrong number of rows if the up and gate
projections have different row counts. The gemv path (line ~3563)
correctly used src0_2->ne[1].
This commit is contained in:
Nexes the Elder 2026-06-23 14:03:22 +02:00 committed by GitHub
parent 9eaf86a7c7
commit 2d3ecd5e19
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 13 additions and 13 deletions

View File

@ -1159,7 +1159,7 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor([[maybe_unused]]
}
}
else {
fprintf(stderr, "%s: not implemented for split dim %d\n", __func__, extra->split_dim == 0);
fprintf(stderr, "%s: not implemented for split dim %d\n", __func__, extra->split_dim);
GGML_ABORT("fatal error");
}
@ -3223,7 +3223,7 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten
if (dst->src[5]) {
ggml_cuda_add_id((const float *)dst_row.data, (const float *)dst->src[5]->data, (const int32_t *)ids->data,
(float *)dst_row.data, dst_row.ne[0], dst_row.ne[1], dst_row.ne[2], dst_row.ne[0], dst_row.ne[1],
dst_row.nb[1], dst_row.nb[2], dst->src[4]->nb[1], ids->nb[1], stream);
dst_row.nb[1], dst_row.nb[2], dst->src[5]->nb[1], ids->nb[1], stream);
CUDA_CHECK(cudaGetLastError());
}
@ -3569,7 +3569,7 @@ static void ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
CUDA_CHECK(cudaGetLastError());
ggml_cuda_op_mul_mat_q(ctx, src0_2, src1, dst, (const char *)src0_2->data, nullptr, src1_quantized.get(), (float *)dst->data,
0, src0_1->ne[1], src1->ne[1], ne10_padded, stream);
0, src0_2->ne[1], src1->ne[1], ne10_padded, stream);
CUDA_CHECK(cudaGetLastError());
} else {
auto local_dst = *dst;

View File

@ -77,7 +77,7 @@ static __global__ void dequantize_mul_mat_vec_iq2_kt(const void * __restrict__ v
const int ncols, int nrows, int64_t row_size) {
const int row = blockIdx.x*blockDim.y + threadIdx.y;
if (row > nrows) return;
if (row >= nrows) return;
const float * dptr = (const float *)((const char *)vx + row*row_size);
const float d = *dptr * 31.75f * 1.05f;
@ -121,7 +121,7 @@ static __global__ void dequantize_mul_mat_vec_iq3_kt(const void * __restrict__ v
const int ncols, int nrows, int64_t row_size) {
const int row = blockIdx.x*blockDim.y + threadIdx.y;
if (row > nrows) return;
if (row >= nrows) return;
const float * dptr = (const float *)((const char *)vx + row*row_size);
const float d = *dptr * 31.75f * 1.015f;
@ -171,7 +171,7 @@ static __global__ void dequantize_mul_mat_vec_iq4_kt(const void * __restrict__ v
constexpr int kNumGroups = 64;
const int row = blockIdx.x*blockDim.y + threadIdx.y;
if (row > nrows) return;
if (row >= nrows) return;
const float * dptr = (const float *)((const char *)vx + row*row_size);
const float d = dptr[0] * 31.75f * 1.01f;
@ -234,7 +234,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
const int row = blockIdx.x*blockDim.y + threadIdx.y;
if (row > nrows) return;
if (row >= nrows) return;
const int num_blocks_per_row = ncols / QK_K;
const int ib0 = row*num_blocks_per_row;
@ -303,7 +303,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
const int row = blockIdx.x*blockDim.y + threadIdx.y;
if (row > nrows) return;
if (row >= nrows) return;
const int num_blocks_per_row = ncols / QK_K;
const int ib0 = row*num_blocks_per_row;
@ -374,7 +374,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx,
static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
const int row = blockIdx.x*blockDim.y + threadIdx.y;
if (row > nrows) return;
if (row >= nrows) return;
const int num_blocks_per_row = ncols / QK_K;
const int ib0 = row*num_blocks_per_row;
@ -566,7 +566,7 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
const int row = blockIdx.x*blockDim.y + threadIdx.y;
if (row > nrows) return;
if (row >= nrows) return;
const int num_blocks_per_row = ncols / QK_K;
const int ib0 = row*num_blocks_per_row;

View File

@ -19,7 +19,7 @@ struct ggml_cuda_type_traits<GGML_TYPE_IQ1_M_R4> {
};
template <ggml_type type, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda, int ncols_y, int n_interleaved = 1>
static __device__ void iqk_mul_mat_vec_q_kerne(
static __device__ void iqk_mul_mat_vec_q_kernel(
const void * __restrict__ vx, const void * __restrict__ vy,
const float * bias, float * __restrict__ dst,
const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst, const int64_t row_size) {
@ -244,7 +244,7 @@ static __global__ void iqk_mul_mat_vec_q(
const char * cy = (const char *)vy + i2*nb12;
char * cdst = (char *)dst + i2*nb2;
const float * b = (const float *)(bias ? ids_data ? (const char *)bias + i02*bias_nb1 : bias : nullptr);
iqk_mul_mat_vec_q_kerne<type, vdr, vec_dot_q_cuda, ncols_y, n_interleaved>(cx, cy, b, (float *)cdst, ncols_x, nrows_x, nrows_y, nrows_dst, row_size);
iqk_mul_mat_vec_q_kernel<type, vdr, vec_dot_q_cuda, ncols_y, n_interleaved>(cx, cy, b, (float *)cdst, ncols_x, nrows_x, nrows_y, nrows_dst, row_size);
}
template <ggml_type type, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda, int ncols_y, int n_interleaved = 1>

View File

@ -1046,7 +1046,7 @@ void ggml_cuda_op_fused_rms_rms_add(ggml_backend_cuda_context & ctx, ggml_tensor
else if (dst->src[0]->type == GGML_TYPE_BF16) {
fused_rms_rms_add_f32_cuda(ncols, nrows, (float *)dst->data,
(const nv_bfloat16 *)dst->src[0]->data, (const float *)dst->src[1]->data,
(const nv_bfloat16 *)dst->src[2]->data, (const float *)dst->src[2]->data,
(const nv_bfloat16 *)dst->src[2]->data, (const float *)dst->src[3]->data,
eps, ctx.stream());
}
else {