ggml : address integer overflows in binary ops CUDA implementation (#24706)

* ggml : address integer overflows in binary ops CUDA implementation

* ggml : add size_t casts to avoid integer overflows

* ggml : add more asserts checking integer overflows in binary ops CUDA implementation

---------

Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>
This commit is contained in:
fairydreaming 2026-06-25 10:06:44 +02:00 committed by GitHub
parent 3e61ea0e2f
commit f728adab68
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -34,26 +34,26 @@ template <float (*bin_op)(const float, const float),
static __global__ void k_bin_bcast(const src0_t * src0,
const src1_t * src1,
dst_t * dst,
const int ne0,
const int ne1,
const int ne2,
const uint32_t ne0,
const uint32_t ne1,
const uint32_t ne2,
const uint3 ne3,
const uint3 ne10,
const uint3 ne11,
const uint3 ne12,
const uint3 ne13,
/*const int s0,*/
const int s1,
const int s2,
const int s3,
const int s00,
const int s01,
const int s02,
const int s03,
const int s10,
const int s11,
const int s12,
const int s13,
/*const uint32_t s0,*/
const uint32_t s1,
const uint32_t s2,
const uint32_t s3,
const uint32_t s00,
const uint32_t s01,
const uint32_t s02,
const uint32_t s03,
const uint32_t s10,
const uint32_t s11,
const uint32_t s12,
const uint32_t s13,
src1_ptrs... src1s) {
ggml_cuda_pdl_lc();
const uint32_t i0s = blockDim.x * blockIdx.x + threadIdx.x;
@ -61,7 +61,7 @@ static __global__ void k_bin_bcast(const src0_t * src0,
const uint32_t i2 = fastdiv((blockDim.z * blockIdx.z + threadIdx.z), ne3);
const uint32_t i3 = (blockDim.z * blockIdx.z + threadIdx.z) - (i2 * ne3.z);
if (i0s >= (uint32_t)ne0 || i1 >= (uint32_t)ne1 || i2 >= (uint32_t)ne2 || i3 >= ne3.z) {
if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3.z) {
return;
}
@ -69,25 +69,32 @@ static __global__ void k_bin_bcast(const src0_t * src0,
const uint32_t i12 = fastmodulo(i2, ne12);
const uint32_t i13 = fastmodulo(i3, ne13);
const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
const size_t i_src0 = size_t( i3)*s03 + size_t( i2)*s02 + size_t( i1)*s01;
const size_t i_src1 = size_t(i13)*s13 + size_t(i12)*s12 + size_t(i11)*s11;
const size_t i_dst = size_t( i3)*s3 + size_t( i2)*s2 + size_t( i1)*s1;
const src0_t * src0_row = src0 ? (src0 + i_src0) : nullptr;
dst_t * dst_row = dst + i_dst;
const uint32_t s0 = blockDim.x * gridDim.x;
ggml_cuda_pdl_sync();
for (int i0 = i0s; i0 < ne0; i0 += blockDim.x * gridDim.x) {
for (uint32_t i0 = i0s; i0 < ne0; i0 += s0) {
const uint32_t i10 = fastmodulo(i0, ne10);
float result = src0_row ? (float) src0_row[i0*s00] : 0.0f;
float result = src0_row ? (float) src0_row[size_t(i0)*s00] : 0.0f;
if constexpr (sizeof...(src1_ptrs) > 0) {
result = (..., (result = bin_op(result, (float)src1s[i_src1 + i10*s10])));
result = (..., (result = bin_op(result, (float)src1s[i_src1 + size_t(i10)*s10])));
} else {
result = bin_op(result, (float)src1[i_src1 + i10*s10]);
result = bin_op(result, (float)src1[i_src1 + size_t(i10)*s10]);
}
dst_row[i0] = (dst_t) result;
// protect i0 from overflow
if (ne0 - i0 <= s0) {
break;
}
}
}
@ -110,19 +117,19 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0,
const uint3 ne12,
const uint3 ne13,
/*const int s0,*/
const int s1,
const int s2,
const int s3,
const int s00,
const int s01,
const int s02,
const int s03,
const int s10,
const int s11,
const int s12,
const int s13,
const uint32_t s1,
const uint32_t s2,
const uint32_t s3,
const uint32_t s00,
const uint32_t s01,
const uint32_t s02,
const uint32_t s03,
const uint32_t s10,
const uint32_t s11,
const uint32_t s12,
const uint32_t s13,
src1_ptrs... src1s) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
const uint32_t i = blockDim.x*blockIdx.x + threadIdx.x;
const uint32_t i3 = fastdiv(i, prod_012);
const uint32_t i2 = fastdiv(i - i3 * prod_012.z, prod_01);
@ -133,25 +140,25 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0,
return;
}
const int i11 = fastmodulo(i1, ne11);
const int i12 = fastmodulo(i2, ne12);
const int i13 = fastmodulo(i3, ne13);
const uint32_t i11 = fastmodulo(i1, ne11);
const uint32_t i12 = fastmodulo(i2, ne12);
const uint32_t i13 = fastmodulo(i3, ne13);
const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
const size_t i_src0 = size_t( i3)*s03 + size_t( i2)*s02 + size_t( i1)*s01;
const size_t i_src1 = size_t(i13)*s13 + size_t(i12)*s12 + size_t(i11)*s11;
const size_t i_dst = size_t( i3)*s3 + size_t( i2)*s2 + size_t( i1)*s1;
const src0_t * src0_row = src0 ? (src0 + i_src0) : nullptr;
dst_t * dst_row = dst + i_dst;
const int i10 = fastmodulo(i0, ne10);
const uint32_t i10 = fastmodulo(i0, ne10);
ggml_cuda_pdl_sync();
float result = src0_row ? (float) src0_row[i0*s00] : 0.0f;
float result = src0_row ? (float) src0_row[size_t(i0)*s00] : 0.0f;
if constexpr (sizeof...(src1_ptrs) > 0) {
result = (..., (result = bin_op(result, (float)src1s[i_src1 + i10*s10])));
result = (..., (result = bin_op(result, (float)src1s[i_src1 + size_t(i10)*s10])));
} else {
result = bin_op(result, (float)src1[i_src1 + i10*s10]);
result = bin_op(result, (float)src1[i_src1 + size_t(i10)*s10]);
}
dst_row[i0] = (dst_t) result;
@ -248,6 +255,31 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
size_t s02 = nb02 / sizeof(src0_t);
size_t s03 = nb03 / sizeof(src0_t);
GGML_ASSERT(ne0 <= std::numeric_limits<uint32_t>::max());
GGML_ASSERT(ne1 <= std::numeric_limits<uint32_t>::max());
GGML_ASSERT(ne2 <= std::numeric_limits<uint32_t>::max());
GGML_ASSERT(ne3 <= std::numeric_limits<uint32_t>::max());
//GGML_ASSERT(s0 <= std::numeric_limits<uint32_t>::max());
GGML_ASSERT(s1 <= std::numeric_limits<uint32_t>::max());
GGML_ASSERT(s2 <= std::numeric_limits<uint32_t>::max());
GGML_ASSERT(s3 <= std::numeric_limits<uint32_t>::max());
GGML_ASSERT(s00 <= std::numeric_limits<uint32_t>::max());
GGML_ASSERT(s01 <= std::numeric_limits<uint32_t>::max());
GGML_ASSERT(s02 <= std::numeric_limits<uint32_t>::max());
GGML_ASSERT(s03 <= std::numeric_limits<uint32_t>::max());
GGML_ASSERT(s10 <= std::numeric_limits<uint32_t>::max());
GGML_ASSERT(s11 <= std::numeric_limits<uint32_t>::max());
GGML_ASSERT(s12 <= std::numeric_limits<uint32_t>::max());
GGML_ASSERT(s13 <= std::numeric_limits<uint32_t>::max());
GGML_ASSERT(cne1[0] <= std::numeric_limits<uint32_t>::max());
GGML_ASSERT(cne1[1] <= std::numeric_limits<uint32_t>::max());
GGML_ASSERT(cne1[2] <= std::numeric_limits<uint32_t>::max());
GGML_ASSERT(cne1[3] <= std::numeric_limits<uint32_t>::max());
GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
@ -263,6 +295,8 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
GGML_ASSERT(nb13 % sizeof(src1_t) == 0);
GGML_ASSERT(ne2 * ne3 <= std::numeric_limits<unsigned int>::max());
const int block_size = 128;
int64_t hne0 = std::max(ne0 / 2LL, 1LL);
@ -281,7 +315,13 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
const uint3 ne13 = init_fastdiv_values((uint32_t) cne1[3]);
if (block_nums.z > 65535 || block_nums.y > 65535) {
int block_num = (ne0 * ne1 * ne2 * ne3 + block_size - 1) / block_size;
int64_t block_num = (ne0 * ne1 * ne2 * ne3 + block_size - 1) / block_size;
GGML_ASSERT(block_num <= std::numeric_limits<uint32_t>::max());
GGML_ASSERT(block_num * block_size <= std::numeric_limits<uint32_t>::max());
GGML_ASSERT(ne0 * ne1 <= std::numeric_limits<uint32_t>::max());
GGML_ASSERT(ne0 * ne1 * ne2 <= std::numeric_limits<uint32_t>::max());
const uint3 prod_012 = init_fastdiv_values((uint32_t) (ne0 * ne1 * ne2));
const uint3 prod_01 = init_fastdiv_values((uint32_t) (ne0 * ne1));
const uint3 ne0_fastdiv = init_fastdiv_values((uint32_t) ne0);
@ -298,6 +338,10 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
s10, s11, s12, s13, (const src1_t *) dst->src[I + 1]->data...);
}
} else {
GGML_ASSERT(int64_t(block_nums.x) * block_dims.x <= std::numeric_limits<uint32_t>::max());
GGML_ASSERT(int64_t(block_nums.y) * block_dims.y <= std::numeric_limits<uint32_t>::max());
GGML_ASSERT(int64_t(block_nums.z) * block_dims.z <= std::numeric_limits<uint32_t>::max());
const uint3 ne3_fastdiv = init_fastdiv_values((uint32_t) ne3);
{
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream);