mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-06-28 04:30:15 -05:00
Fix minor CUDA discrepancies (part 2) (#2015)
* fix: wrong tensor index in BF16 fused RMS norm add path (norm.cu:1039)
  The BF16 branch of ggml_cuda_op_fused_rms_rms_add used dst->src[2]->data for the second weight pointer, but should have used dst->src[3]->data. This caused the float weights to be read from the wrong (bf16 input) tensor. The F32 and F16 branches both correctly reference src[3], and the assertions at lines 1013-1015 confirm src[3] is the F32 weight tensor.
* fix: off-by-one bounds check in 7 dmmv kernels (row > nrows -> row >= nrows)
  Seven dequantize_mul_mat_vec kernels (iq2_kt, iq3_kt, iq4_kt, q2_k, q3_k, q4_k, q6_k) used row > nrows for bounds checking instead of row >= nrows. Since rows are 0-indexed (0..nrows-1), the check missed the row == nrows case, allowing a potential out-of-bounds memory write when the grid dimensions produce a thread with row == nrows. The templated dequantize_mul_mat_vec<type> kernel at line 667 already used the correct row >= nrows pattern.
* fix: typo in function name iqk_mul_mat_vec_q_kerne -> iqk_mul_mat_vec_q_kernel
  Truncated function name in iqk_mmvq_templates.cuh was missing trailing 'l'.
* fix: print actual split_dim value in set_tensor error message (ggml-cuda.cu)
  fprintf used extra->split_dim == 0 which evaluates to boolean 0 or 1 instead of the actual split dimension value. When this fatal error is hit for an unsupported split_dim, the user could not diagnose which value caused the problem.
* fix: wrong src index in gate bias stride for fused up-gate MoE path
  ggml_cuda_add_id for the gate bias used dst->src[4]->nb[1] as the stride argument instead of dst->src[5]->nb[1]. This was a copy-paste error from the up-bias code (lines 3220-3224) where src[4] is correct. If src[4] and src[5] have different strides, the bias addition produces incorrect results.
* fix: wrong row count for gate projection MMQ in fused up-gate MoE path
  ggml_cuda_op_mul_mat_q for the gate projection (src0_2) used src0_1->ne[1] as row_high instead of src0_2->ne[1]. This copy-paste error causes processing the wrong number of rows if the up and gate projections have different row counts. The gemv path (line ~3563) correctly used src0_2->ne[1].
This commit is contained in:
parent
9eaf86a7c7
commit
2d3ecd5e19
@ -1159,7 +1159,7 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor([[maybe_unused]]
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
fprintf(stderr, "%s: not implemented for split dim %d\n", __func__, extra->split_dim == 0);
|
fprintf(stderr, "%s: not implemented for split dim %d\n", __func__, extra->split_dim);
|
||||||
GGML_ABORT("fatal error");
|
GGML_ABORT("fatal error");
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -3223,7 +3223,7 @@ static int ggml_cuda_moe_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_ten
|
|||||||
if (dst->src[5]) {
|
if (dst->src[5]) {
|
||||||
ggml_cuda_add_id((const float *)dst_row.data, (const float *)dst->src[5]->data, (const int32_t *)ids->data,
|
ggml_cuda_add_id((const float *)dst_row.data, (const float *)dst->src[5]->data, (const int32_t *)ids->data,
|
||||||
(float *)dst_row.data, dst_row.ne[0], dst_row.ne[1], dst_row.ne[2], dst_row.ne[0], dst_row.ne[1],
|
(float *)dst_row.data, dst_row.ne[0], dst_row.ne[1], dst_row.ne[2], dst_row.ne[0], dst_row.ne[1],
|
||||||
dst_row.nb[1], dst_row.nb[2], dst->src[4]->nb[1], ids->nb[1], stream);
|
dst_row.nb[1], dst_row.nb[2], dst->src[5]->nb[1], ids->nb[1], stream);
|
||||||
CUDA_CHECK(cudaGetLastError());
|
CUDA_CHECK(cudaGetLastError());
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -3569,7 +3569,7 @@ static void ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
|
|||||||
CUDA_CHECK(cudaGetLastError());
|
CUDA_CHECK(cudaGetLastError());
|
||||||
|
|
||||||
ggml_cuda_op_mul_mat_q(ctx, src0_2, src1, dst, (const char *)src0_2->data, nullptr, src1_quantized.get(), (float *)dst->data,
|
ggml_cuda_op_mul_mat_q(ctx, src0_2, src1, dst, (const char *)src0_2->data, nullptr, src1_quantized.get(), (float *)dst->data,
|
||||||
0, src0_1->ne[1], src1->ne[1], ne10_padded, stream);
|
0, src0_2->ne[1], src1->ne[1], ne10_padded, stream);
|
||||||
CUDA_CHECK(cudaGetLastError());
|
CUDA_CHECK(cudaGetLastError());
|
||||||
} else {
|
} else {
|
||||||
auto local_dst = *dst;
|
auto local_dst = *dst;
|
||||||
|
|||||||
@ -77,7 +77,7 @@ static __global__ void dequantize_mul_mat_vec_iq2_kt(const void * __restrict__ v
|
|||||||
const int ncols, int nrows, int64_t row_size) {
|
const int ncols, int nrows, int64_t row_size) {
|
||||||
|
|
||||||
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||||
if (row > nrows) return;
|
if (row >= nrows) return;
|
||||||
|
|
||||||
const float * dptr = (const float *)((const char *)vx + row*row_size);
|
const float * dptr = (const float *)((const char *)vx + row*row_size);
|
||||||
const float d = *dptr * 31.75f * 1.05f;
|
const float d = *dptr * 31.75f * 1.05f;
|
||||||
@ -121,7 +121,7 @@ static __global__ void dequantize_mul_mat_vec_iq3_kt(const void * __restrict__ v
|
|||||||
const int ncols, int nrows, int64_t row_size) {
|
const int ncols, int nrows, int64_t row_size) {
|
||||||
|
|
||||||
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||||
if (row > nrows) return;
|
if (row >= nrows) return;
|
||||||
|
|
||||||
const float * dptr = (const float *)((const char *)vx + row*row_size);
|
const float * dptr = (const float *)((const char *)vx + row*row_size);
|
||||||
const float d = *dptr * 31.75f * 1.015f;
|
const float d = *dptr * 31.75f * 1.015f;
|
||||||
@ -171,7 +171,7 @@ static __global__ void dequantize_mul_mat_vec_iq4_kt(const void * __restrict__ v
|
|||||||
constexpr int kNumGroups = 64;
|
constexpr int kNumGroups = 64;
|
||||||
|
|
||||||
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||||
if (row > nrows) return;
|
if (row >= nrows) return;
|
||||||
|
|
||||||
const float * dptr = (const float *)((const char *)vx + row*row_size);
|
const float * dptr = (const float *)((const char *)vx + row*row_size);
|
||||||
const float d = dptr[0] * 31.75f * 1.01f;
|
const float d = dptr[0] * 31.75f * 1.01f;
|
||||||
@ -234,7 +234,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
|
|||||||
static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
|
static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
|
||||||
|
|
||||||
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||||
if (row > nrows) return;
|
if (row >= nrows) return;
|
||||||
|
|
||||||
const int num_blocks_per_row = ncols / QK_K;
|
const int num_blocks_per_row = ncols / QK_K;
|
||||||
const int ib0 = row*num_blocks_per_row;
|
const int ib0 = row*num_blocks_per_row;
|
||||||
@ -303,7 +303,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
|
|||||||
static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
|
static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
|
||||||
|
|
||||||
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||||
if (row > nrows) return;
|
if (row >= nrows) return;
|
||||||
|
|
||||||
const int num_blocks_per_row = ncols / QK_K;
|
const int num_blocks_per_row = ncols / QK_K;
|
||||||
const int ib0 = row*num_blocks_per_row;
|
const int ib0 = row*num_blocks_per_row;
|
||||||
@ -374,7 +374,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx,
|
|||||||
static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
|
static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
|
||||||
|
|
||||||
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||||
if (row > nrows) return;
|
if (row >= nrows) return;
|
||||||
const int num_blocks_per_row = ncols / QK_K;
|
const int num_blocks_per_row = ncols / QK_K;
|
||||||
const int ib0 = row*num_blocks_per_row;
|
const int ib0 = row*num_blocks_per_row;
|
||||||
|
|
||||||
@ -566,7 +566,7 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
|
|||||||
static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
|
static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
|
||||||
|
|
||||||
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||||
if (row > nrows) return;
|
if (row >= nrows) return;
|
||||||
|
|
||||||
const int num_blocks_per_row = ncols / QK_K;
|
const int num_blocks_per_row = ncols / QK_K;
|
||||||
const int ib0 = row*num_blocks_per_row;
|
const int ib0 = row*num_blocks_per_row;
|
||||||
|
|||||||
@ -19,7 +19,7 @@ struct ggml_cuda_type_traits<GGML_TYPE_IQ1_M_R4> {
|
|||||||
};
|
};
|
||||||
|
|
||||||
template <ggml_type type, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda, int ncols_y, int n_interleaved = 1>
|
template <ggml_type type, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda, int ncols_y, int n_interleaved = 1>
|
||||||
static __device__ void iqk_mul_mat_vec_q_kerne(
|
static __device__ void iqk_mul_mat_vec_q_kernel(
|
||||||
const void * __restrict__ vx, const void * __restrict__ vy,
|
const void * __restrict__ vx, const void * __restrict__ vy,
|
||||||
const float * bias, float * __restrict__ dst,
|
const float * bias, float * __restrict__ dst,
|
||||||
const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst, const int64_t row_size) {
|
const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst, const int64_t row_size) {
|
||||||
@ -244,7 +244,7 @@ static __global__ void iqk_mul_mat_vec_q(
|
|||||||
const char * cy = (const char *)vy + i2*nb12;
|
const char * cy = (const char *)vy + i2*nb12;
|
||||||
char * cdst = (char *)dst + i2*nb2;
|
char * cdst = (char *)dst + i2*nb2;
|
||||||
const float * b = (const float *)(bias ? ids_data ? (const char *)bias + i02*bias_nb1 : bias : nullptr);
|
const float * b = (const float *)(bias ? ids_data ? (const char *)bias + i02*bias_nb1 : bias : nullptr);
|
||||||
iqk_mul_mat_vec_q_kerne<type, vdr, vec_dot_q_cuda, ncols_y, n_interleaved>(cx, cy, b, (float *)cdst, ncols_x, nrows_x, nrows_y, nrows_dst, row_size);
|
iqk_mul_mat_vec_q_kernel<type, vdr, vec_dot_q_cuda, ncols_y, n_interleaved>(cx, cy, b, (float *)cdst, ncols_x, nrows_x, nrows_y, nrows_dst, row_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <ggml_type type, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda, int ncols_y, int n_interleaved = 1>
|
template <ggml_type type, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda, int ncols_y, int n_interleaved = 1>
|
||||||
|
|||||||
@ -1046,7 +1046,7 @@ void ggml_cuda_op_fused_rms_rms_add(ggml_backend_cuda_context & ctx, ggml_tensor
|
|||||||
else if (dst->src[0]->type == GGML_TYPE_BF16) {
|
else if (dst->src[0]->type == GGML_TYPE_BF16) {
|
||||||
fused_rms_rms_add_f32_cuda(ncols, nrows, (float *)dst->data,
|
fused_rms_rms_add_f32_cuda(ncols, nrows, (float *)dst->data,
|
||||||
(const nv_bfloat16 *)dst->src[0]->data, (const float *)dst->src[1]->data,
|
(const nv_bfloat16 *)dst->src[0]->data, (const float *)dst->src[1]->data,
|
||||||
(const nv_bfloat16 *)dst->src[2]->data, (const float *)dst->src[2]->data,
|
(const nv_bfloat16 *)dst->src[2]->data, (const float *)dst->src[3]->data,
|
||||||
eps, ctx.stream());
|
eps, ctx.stream());
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user