Fix minor CUDA discrepancies (part 2) (#2015)

* fix: wrong tensor index in BF16 fused RMS norm add path (norm.cu:1039)

  The BF16 branch of ggml_cuda_op_fused_rms_rms_add used dst->src[2]->data for the second weight pointer, but should have used dst->src[3]->data. As a result, the float weights were read from the bf16 input tensor (src[2]) instead of from the weight tensor. The F32 and F16 branches both correctly reference src[3], and the assertions at lines 1013-1015 confirm that src[3] is the F32 weight tensor.

* fix: off-by-one bounds check in 7 dmmv kernels (row > nrows -> row >= nrows)

  Seven dequantize_mul_mat_vec kernels (iq2_kt, iq3_kt, iq4_kt, q2_k, q3_k, q4_k and q6_k) used row > nrows for bounds checking instead of row >= nrows. Since rows are 0-indexed (0..nrows-1), the check missed the row == nrows case, allowing a potential out-of-bounds memory read and write whenever the launch grid produces a thread with row == nrows. The templated dequantize_mul_mat_vec<type> kernel at line 667 already used the correct row >= nrows pattern.

* fix: typo in function name iqk_mul_mat_vec_q_kerne -> iqk_mul_mat_vec_q_kernel

  The truncated function name in iqk_mmvq_templates.cuh was missing the trailing 'l'.

* fix: print actual split_dim value in set_tensor error message (ggml-cuda.cu)

  fprintf was passed extra->split_dim == 0, which evaluates to a boolean 0 or 1, instead of the actual split dimension value. When this fatal error was hit for an unsupported split_dim, the user could not tell which value caused the problem.

* fix: wrong src index in gate bias stride for fused up-gate MoE path

  The ggml_cuda_add_id call for the gate bias used dst->src[4]->nb[1] as the stride argument instead of dst->src[5]->nb[1]. This was a copy-paste error from the up-bias code (lines 3220-3224), where src[4] is correct. If src[4] and src[5] have different strides, the bias addition produces incorrect results.

* fix: wrong row count for gate projection MMQ in fused up-gate MoE path

  The ggml_cuda_op_mul_mat_q call for the gate projection (src0_2) used src0_1->ne[1] as row_high instead of src0_2->ne[1]. This copy-paste error causes the wrong number of rows to be processed if the up and gate projections have different row counts. The gemv path (line ~3563) correctly used src0_2->ne[1].
2026-06-28 04:30:15 -05:00 · 2026-06-23 14:03:22 +02:00 · 2026-06-23 14:03:22 +02:00 · 2d3ecd5e19
commit 2d3ecd5e19
parent 9eaf86a7c7
4 changed files with 13 additions and 13 deletions
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@ -1159,7 +1159,7 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor([[maybe_unused]]
        }
    }
    else {
-        fprintf(stderr, "%s: not implemented for split dim %d\n", __func__, extra->split_dim == 0);
+        fprintf(stderr, "%s: not implemented for split dim %d\n", __func__, extra->split_dim);
        GGML_ABORT("fatal error");
    }
@ -3223,7 +3223,7 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten
        if (dst->src[5]) {
            ggml_cuda_add_id((const float *)dst_row.data, (const float *)dst->src[5]->data, (const int32_t *)ids->data,
                    (float *)dst_row.data, dst_row.ne[0], dst_row.ne[1], dst_row.ne[2], dst_row.ne[0], dst_row.ne[1],
-                    dst_row.nb[1], dst_row.nb[2], dst->src[4]->nb[1], ids->nb[1], stream);
+                    dst_row.nb[1], dst_row.nb[2], dst->src[5]->nb[1], ids->nb[1], stream);
            CUDA_CHECK(cudaGetLastError());
        }
@ -3569,7 +3569,7 @@ static void ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
            CUDA_CHECK(cudaGetLastError());
            ggml_cuda_op_mul_mat_q(ctx, src0_2, src1, dst, (const char *)src0_2->data, nullptr, src1_quantized.get(), (float *)dst->data,
-                    0, src0_1->ne[1], src1->ne[1], ne10_padded, stream);
+                    0, src0_2->ne[1], src1->ne[1], ne10_padded, stream);
            CUDA_CHECK(cudaGetLastError());
        } else {
            auto local_dst = *dst;
--- a/ggml/src/ggml-cuda/dmmv.cu
+++ b/ggml/src/ggml-cuda/dmmv.cu
@ -77,7 +77,7 @@ static __global__ void dequantize_mul_mat_vec_iq2_kt(const void * __restrict__ v
        const int ncols, int nrows, int64_t row_size) {
    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    if (row > nrows) return;
+    if (row >= nrows) return;
    const float * dptr = (const float *)((const char *)vx + row*row_size);
    const float d = *dptr * 31.75f * 1.05f;
@ -121,7 +121,7 @@ static __global__ void dequantize_mul_mat_vec_iq3_kt(const void * __restrict__ v
        const int ncols, int nrows, int64_t row_size) {
    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    if (row > nrows) return;
+    if (row >= nrows) return;
    const float * dptr = (const float *)((const char *)vx + row*row_size);
    const float d = *dptr * 31.75f * 1.015f;
@ -171,7 +171,7 @@ static __global__ void dequantize_mul_mat_vec_iq4_kt(const void * __restrict__ v
    constexpr int kNumGroups = 64;
    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    if (row > nrows) return;
+    if (row >= nrows) return;
    const float * dptr = (const float *)((const char *)vx + row*row_size);
    const float d = dptr[0] * 31.75f * 1.01f;
@ -234,7 +234,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    if (row > nrows) return;
+    if (row >= nrows) return;
    const int num_blocks_per_row = ncols / QK_K;
    const int ib0 = row*num_blocks_per_row;
@ -303,7 +303,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
 static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    if (row > nrows) return;
+    if (row >= nrows) return;
    const int num_blocks_per_row = ncols / QK_K;
    const int ib0 = row*num_blocks_per_row;
@ -374,7 +374,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx,
 static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    if (row > nrows) return;
+    if (row >= nrows) return;
    const int num_blocks_per_row = ncols / QK_K;
    const int ib0 = row*num_blocks_per_row;
@ -566,7 +566,7 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    if (row > nrows) return;
+    if (row >= nrows) return;
    const int num_blocks_per_row = ncols / QK_K;
    const int ib0 = row*num_blocks_per_row;
--- a/ggml/src/ggml-cuda/iqk_mmvq_templates.cuh
+++ b/ggml/src/ggml-cuda/iqk_mmvq_templates.cuh
@ -19,7 +19,7 @@ struct ggml_cuda_type_traits<GGML_TYPE_IQ1_M_R4> {
 };
 template <ggml_type type, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda, int ncols_y, int n_interleaved = 1>
-static __device__ void iqk_mul_mat_vec_q_kerne(
+static __device__ void iqk_mul_mat_vec_q_kernel(
    const void * __restrict__ vx, const void * __restrict__ vy,
    const float * bias, float * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst, const int64_t row_size) {
@ -244,7 +244,7 @@ static __global__ void iqk_mul_mat_vec_q(
    const char * cy = (const char *)vy + i2*nb12;
    char * cdst = (char *)dst + i2*nb2;
    const float * b = (const float *)(bias ? ids_data ? (const char *)bias + i02*bias_nb1 : bias : nullptr);
-    iqk_mul_mat_vec_q_kerne<type, vdr, vec_dot_q_cuda, ncols_y, n_interleaved>(cx, cy, b, (float *)cdst, ncols_x, nrows_x, nrows_y, nrows_dst, row_size);
+    iqk_mul_mat_vec_q_kernel<type, vdr, vec_dot_q_cuda, ncols_y, n_interleaved>(cx, cy, b, (float *)cdst, ncols_x, nrows_x, nrows_y, nrows_dst, row_size);
 }
 template <ggml_type type, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda, int ncols_y, int n_interleaved = 1>
--- a/ggml/src/ggml-cuda/norm.cu
+++ b/ggml/src/ggml-cuda/norm.cu
@ -1046,7 +1046,7 @@ void ggml_cuda_op_fused_rms_rms_add(ggml_backend_cuda_context & ctx, ggml_tensor
    else if (dst->src[0]->type == GGML_TYPE_BF16) {
        fused_rms_rms_add_f32_cuda(ncols, nrows, (float *)dst->data,
                (const nv_bfloat16 *)dst->src[0]->data, (const float *)dst->src[1]->data,
-                (const nv_bfloat16 *)dst->src[2]->data, (const float *)dst->src[2]->data,
+                (const nv_bfloat16 *)dst->src[2]->data, (const float *)dst->src[3]->data,
                eps, ctx.stream());
    }
    else {