From 2d3ecd5e19cd64cbae122c385b8fa08483d7fc9e Mon Sep 17 00:00:00 2001
From: Nexes the Elder <124105151+Nexesenex@users.noreply.github.com>
Date: Tue, 23 Jun 2026 14:03:22 +0200
Subject: [PATCH] Fix minor CUDA discrepancies (part 2) (#2015)

* fix: wrong tensor index in BF16 fused RMS norm add path (norm.cu:1039)

The BF16 branch of ggml_cuda_op_fused_rms_rms_add used dst->src[2]->data
for the second weight pointer, but should have used dst->src[3]->data.
This caused reading float weights from the wrong bf16 input tensor.

The F32 and F16 branches both correctly reference src[3], and the
assertions at lines 1013-1015 confirm src[3] is the F32 weight tensor.

* fix: off-by-one bounds check in 7 dmmv kernels (row > nrows -> row >= nrows)

Seven K-quant dequantize_mul_mat_vec kernels used row > nrows for bounds
checking instead of row >= nrows. Since rows are 0-indexed (0..nrows-1),
the check missed the row == nrows case, allowing a potential out-of-bounds
memory write when grid dimensions produce exactly nrows.

The templated dequantize_mul_mat_vec<type> kernel at line 667 already used
the correct row >= nrows pattern.

* fix: typo in function name iqk_mul_mat_vec_q_kerne -> iqk_mul_mat_vec_q_kernel

Truncated function name in iqk_mmvq_templates.cuh was missing trailing 'l'.

* fix: print actual split_dim value in set_tensor error message (ggml-cuda.cu)

fprintf used extra->split_dim == 0 which evaluates to boolean 0 or 1
instead of the actual split dimension value. When this fatal error is
hit for an unsupported split_dim, the user could not diagnose which
value caused the problem.

* fix: wrong src index in gate bias stride for fused up-gate MoE path

ggml_cuda_add_id for the gate bias used dst->src[4]->nb[1] as the stride
argument instead of dst->src[5]->nb[1]. This was a copy-paste error from
the up-bias code (lines 3220-3224) where src[4] is correct. If src[4]
and src[5] have different strides, the bias addition produces incorrect
results.

* fix: wrong row count for gate projection MMQ in fused up-gate MoE path

ggml_cuda_op_mul_mat_q for the gate projection (src0_2) used
src0_1->ne[1] as row_high instead of src0_2->ne[1]. This copy-paste
error causes processing the wrong number of rows if the up and gate
projections have different row counts. The gemv path (line ~3563)
correctly used src0_2->ne[1].
---
 ggml/src/ggml-cuda.cu                     |  6 +++---
 ggml/src/ggml-cuda/dmmv.cu                | 14 +++++++-------
 ggml/src/ggml-cuda/iqk_mmvq_templates.cuh |  4 ++--
 ggml/src/ggml-cuda/norm.cu                |  2 +-
 4 files changed, 13 insertions(+), 13 deletions(-)
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index e4b41bee..cbc7618d 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -1159,7 +1159,7 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor([[maybe_unused]]
         }
     }
     else {
-        fprintf(stderr, "%s: not implemented for split dim %d\n", __func__, extra->split_dim == 0);
+        fprintf(stderr, "%s: not implemented for split dim %d\n", __func__, extra->split_dim);
         GGML_ABORT("fatal error");
     }
 
@@ -3223,7 +3223,7 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten
         if (dst->src[5]) {
             ggml_cuda_add_id((const float *)dst_row.data, (const float *)dst->src[5]->data, (const int32_t *)ids->data,
                     (float *)dst_row.data, dst_row.ne[0], dst_row.ne[1], dst_row.ne[2], dst_row.ne[0], dst_row.ne[1],
-                    dst_row.nb[1], dst_row.nb[2], dst->src[4]->nb[1], ids->nb[1], stream);
+                    dst_row.nb[1], dst_row.nb[2], dst->src[5]->nb[1], ids->nb[1], stream);
             CUDA_CHECK(cudaGetLastError());
         }
 
@@ -3569,7 +3569,7 @@ static void ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
             CUDA_CHECK(cudaGetLastError());
 
             ggml_cuda_op_mul_mat_q(ctx, src0_2, src1, dst, (const char *)src0_2->data, nullptr, src1_quantized.get(), (float *)dst->data,
-                    0, src0_1->ne[1], src1->ne[1], ne10_padded, stream);
+                    0, src0_2->ne[1], src1->ne[1], ne10_padded, stream);
             CUDA_CHECK(cudaGetLastError());
         } else {
             auto local_dst = *dst;
diff --git a/ggml/src/ggml-cuda/dmmv.cu b/ggml/src/ggml-cuda/dmmv.cu
index 7a5e3841..f1a48b51 100644
--- a/ggml/src/ggml-cuda/dmmv.cu
+++ b/ggml/src/ggml-cuda/dmmv.cu
@@ -77,7 +77,7 @@ static __global__ void dequantize_mul_mat_vec_iq2_kt(const void * __restrict__ v
         const int ncols, int nrows, int64_t row_size) {
 
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    if (row > nrows) return;
+    if (row >= nrows) return;
 
     const float * dptr = (const float *)((const char *)vx + row*row_size);
     const float d = *dptr * 31.75f * 1.05f;
@@ -121,7 +121,7 @@ static __global__ void dequantize_mul_mat_vec_iq3_kt(const void * __restrict__ v
         const int ncols, int nrows, int64_t row_size) {
 
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    if (row > nrows) return;
+    if (row >= nrows) return;
 
     const float * dptr = (const float *)((const char *)vx + row*row_size);
     const float d = *dptr * 31.75f * 1.015f;
@@ -171,7 +171,7 @@ static __global__ void dequantize_mul_mat_vec_iq4_kt(const void * __restrict__ v
     constexpr int kNumGroups = 64;
 
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    if (row > nrows) return;
+    if (row >= nrows) return;
 
     const float * dptr = (const float *)((const char *)vx + row*row_size);
     const float d = dptr[0] * 31.75f * 1.01f;
@@ -234,7 +234,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
     static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
 
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    if (row > nrows) return;
+    if (row >= nrows) return;
 
     const int num_blocks_per_row = ncols / QK_K;
     const int ib0 = row*num_blocks_per_row;
@@ -303,7 +303,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
 static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    if (row > nrows) return;
+    if (row >= nrows) return;
 
     const int num_blocks_per_row = ncols / QK_K;
     const int ib0 = row*num_blocks_per_row;
@@ -374,7 +374,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx,
 static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
 
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    if (row > nrows) return;
+    if (row >= nrows) return;
     const int num_blocks_per_row = ncols / QK_K;
     const int ib0 = row*num_blocks_per_row;
 
@@ -566,7 +566,7 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
     static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
 
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    if (row > nrows) return;
+    if (row >= nrows) return;
 
     const int num_blocks_per_row = ncols / QK_K;
     const int ib0 = row*num_blocks_per_row;
diff --git a/ggml/src/ggml-cuda/iqk_mmvq_templates.cuh b/ggml/src/ggml-cuda/iqk_mmvq_templates.cuh
index 426a2b09..842e4065 100644
--- a/ggml/src/ggml-cuda/iqk_mmvq_templates.cuh
+++ b/ggml/src/ggml-cuda/iqk_mmvq_templates.cuh
@@ -19,7 +19,7 @@ struct ggml_cuda_type_traits<GGML_TYPE_IQ1_M_R4> {
 };
 
 template <ggml_type type, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda, int ncols_y, int n_interleaved = 1>
-static __device__ void iqk_mul_mat_vec_q_kerne(
+static __device__ void iqk_mul_mat_vec_q_kernel(
     const void * __restrict__ vx, const void * __restrict__ vy,
     const float * bias, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst, const int64_t row_size) {
@@ -244,7 +244,7 @@ static __global__ void iqk_mul_mat_vec_q(
     const char * cy = (const char *)vy + i2*nb12;
     char * cdst = (char *)dst + i2*nb2;
     const float * b = (const float *)(bias ? ids_data ? (const char *)bias + i02*bias_nb1 : bias : nullptr);
-    iqk_mul_mat_vec_q_kerne<type, vdr, vec_dot_q_cuda, ncols_y, n_interleaved>(cx, cy, b, (float *)cdst, ncols_x, nrows_x, nrows_y, nrows_dst, row_size);
+    iqk_mul_mat_vec_q_kernel<type, vdr, vec_dot_q_cuda, ncols_y, n_interleaved>(cx, cy, b, (float *)cdst, ncols_x, nrows_x, nrows_y, nrows_dst, row_size);
 }
 
 template <ggml_type type, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda, int ncols_y, int n_interleaved = 1>
diff --git a/ggml/src/ggml-cuda/norm.cu b/ggml/src/ggml-cuda/norm.cu
index 52a59be2..0a112529 100644
--- a/ggml/src/ggml-cuda/norm.cu
+++ b/ggml/src/ggml-cuda/norm.cu
@@ -1046,7 +1046,7 @@ void ggml_cuda_op_fused_rms_rms_add(ggml_backend_cuda_context & ctx, ggml_tensor
     else if (dst->src[0]->type == GGML_TYPE_BF16) {
         fused_rms_rms_add_f32_cuda(ncols, nrows, (float *)dst->data,
                 (const nv_bfloat16 *)dst->src[0]->data, (const float *)dst->src[1]->data,
-                (const nv_bfloat16 *)dst->src[2]->data, (const float *)dst->src[2]->data,
+                (const nv_bfloat16 *)dst->src[2]->data, (const float *)dst->src[3]->data,
                 eps, ctx.stream());
     }
     else {