From f728adab683387b2bdb085b3854a0f3487a13d0a Mon Sep 17 00:00:00 2001 From: fairydreaming <166155368+fairydreaming@users.noreply.github.com> Date: Thu, 25 Jun 2026 10:06:44 +0200 Subject: [PATCH] ggml : address integer overflows in binary ops CUDA implementation (#24706) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * ggml : address integer overflows in binary ops CUDA implementation * ggml : add size_t casts to avoid integer overflows * ggml : add more asserts checking integer overflows in binary ops CUDA implementation --------- Co-authored-by: Stanisław Szymczyk --- ggml/src/ggml-cuda/binbcast.cu | 136 ++++++++++++++++++++++----------- 1 file changed, 90 insertions(+), 46 deletions(-) diff --git a/ggml/src/ggml-cuda/binbcast.cu b/ggml/src/ggml-cuda/binbcast.cu index c25f42b32b..2e38077bf6 100644 --- a/ggml/src/ggml-cuda/binbcast.cu +++ b/ggml/src/ggml-cuda/binbcast.cu @@ -34,26 +34,26 @@ template = (uint32_t)ne0 || i1 >= (uint32_t)ne1 || i2 >= (uint32_t)ne2 || i3 >= ne3.z) { + if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3.z) { return; } @@ -69,25 +69,32 @@ static __global__ void k_bin_bcast(const src0_t * src0, const uint32_t i12 = fastmodulo(i2, ne12); const uint32_t i13 = fastmodulo(i3, ne13); - const size_t i_src0 = i3*s03 + i2*s02 + i1*s01; - const size_t i_src1 = i13*s13 + i12*s12 + i11*s11; - const size_t i_dst = i3*s3 + i2*s2 + i1*s1; + const size_t i_src0 = size_t( i3)*s03 + size_t( i2)*s02 + size_t( i1)*s01; + const size_t i_src1 = size_t(i13)*s13 + size_t(i12)*s12 + size_t(i11)*s11; + const size_t i_dst = size_t( i3)*s3 + size_t( i2)*s2 + size_t( i1)*s1; const src0_t * src0_row = src0 ? (src0 + i_src0) : nullptr; dst_t * dst_row = dst + i_dst; + const uint32_t s0 = blockDim.x * gridDim.x; + ggml_cuda_pdl_sync(); - for (int i0 = i0s; i0 < ne0; i0 += blockDim.x * gridDim.x) { + for (uint32_t i0 = i0s; i0 < ne0; i0 += s0) { const uint32_t i10 = fastmodulo(i0, ne10); - float result = src0_row ? (float) src0_row[i0*s00] : 0.0f; + float result = src0_row ? (float) src0_row[size_t(i0)*s00] : 0.0f; if constexpr (sizeof...(src1_ptrs) > 0) { - result = (..., (result = bin_op(result, (float)src1s[i_src1 + i10*s10]))); + result = (..., (result = bin_op(result, (float)src1s[i_src1 + size_t(i10)*s10]))); } else { - result = bin_op(result, (float)src1[i_src1 + i10*s10]); + result = bin_op(result, (float)src1[i_src1 + size_t(i10)*s10]); } dst_row[i0] = (dst_t) result; + + // protect i0 from overflow + if (ne0 - i0 <= s0) { + break; + } } } @@ -110,19 +117,19 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0, const uint3 ne12, const uint3 ne13, /*const int s0,*/ - const int s1, - const int s2, - const int s3, - const int s00, - const int s01, - const int s02, - const int s03, - const int s10, - const int s11, - const int s12, - const int s13, + const uint32_t s1, + const uint32_t s2, + const uint32_t s3, + const uint32_t s00, + const uint32_t s01, + const uint32_t s02, + const uint32_t s03, + const uint32_t s10, + const uint32_t s11, + const uint32_t s12, + const uint32_t s13, src1_ptrs... src1s) { - const int i = blockDim.x*blockIdx.x + threadIdx.x; + const uint32_t i = blockDim.x*blockIdx.x + threadIdx.x; const uint32_t i3 = fastdiv(i, prod_012); const uint32_t i2 = fastdiv(i - i3 * prod_012.z, prod_01); @@ -133,25 +140,25 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0, return; } - const int i11 = fastmodulo(i1, ne11); - const int i12 = fastmodulo(i2, ne12); - const int i13 = fastmodulo(i3, ne13); + const uint32_t i11 = fastmodulo(i1, ne11); + const uint32_t i12 = fastmodulo(i2, ne12); + const uint32_t i13 = fastmodulo(i3, ne13); - const size_t i_src0 = i3*s03 + i2*s02 + i1*s01; - const size_t i_src1 = i13*s13 + i12*s12 + i11*s11; - const size_t i_dst = i3*s3 + i2*s2 + i1*s1; + const size_t i_src0 = size_t( i3)*s03 + size_t( i2)*s02 + size_t( i1)*s01; + const size_t i_src1 = size_t(i13)*s13 + size_t(i12)*s12 + size_t(i11)*s11; + const size_t i_dst = size_t( i3)*s3 + size_t( i2)*s2 + size_t( i1)*s1; const src0_t * src0_row = src0 ? (src0 + i_src0) : nullptr; dst_t * dst_row = dst + i_dst; - const int i10 = fastmodulo(i0, ne10); + const uint32_t i10 = fastmodulo(i0, ne10); ggml_cuda_pdl_sync(); - float result = src0_row ? (float) src0_row[i0*s00] : 0.0f; + float result = src0_row ? (float) src0_row[size_t(i0)*s00] : 0.0f; if constexpr (sizeof...(src1_ptrs) > 0) { - result = (..., (result = bin_op(result, (float)src1s[i_src1 + i10*s10]))); + result = (..., (result = bin_op(result, (float)src1s[i_src1 + size_t(i10)*s10]))); } else { - result = bin_op(result, (float)src1[i_src1 + i10*s10]); + result = bin_op(result, (float)src1[i_src1 + size_t(i10)*s10]); } dst_row[i0] = (dst_t) result; @@ -248,6 +255,31 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor * size_t s02 = nb02 / sizeof(src0_t); size_t s03 = nb03 / sizeof(src0_t); + GGML_ASSERT(ne0 <= std::numeric_limits::max()); + GGML_ASSERT(ne1 <= std::numeric_limits::max()); + GGML_ASSERT(ne2 <= std::numeric_limits::max()); + GGML_ASSERT(ne3 <= std::numeric_limits::max()); + + //GGML_ASSERT(s0 <= std::numeric_limits::max()); + GGML_ASSERT(s1 <= std::numeric_limits::max()); + GGML_ASSERT(s2 <= std::numeric_limits::max()); + GGML_ASSERT(s3 <= std::numeric_limits::max()); + + GGML_ASSERT(s00 <= std::numeric_limits::max()); + GGML_ASSERT(s01 <= std::numeric_limits::max()); + GGML_ASSERT(s02 <= std::numeric_limits::max()); + GGML_ASSERT(s03 <= std::numeric_limits::max()); + + GGML_ASSERT(s10 <= std::numeric_limits::max()); + GGML_ASSERT(s11 <= std::numeric_limits::max()); + GGML_ASSERT(s12 <= std::numeric_limits::max()); + GGML_ASSERT(s13 <= std::numeric_limits::max()); + + GGML_ASSERT(cne1[0] <= std::numeric_limits::max()); + GGML_ASSERT(cne1[1] <= std::numeric_limits::max()); + GGML_ASSERT(cne1[2] <= std::numeric_limits::max()); + GGML_ASSERT(cne1[3] <= std::numeric_limits::max()); + GGML_ASSERT(nb0 % sizeof(dst_t) == 0); GGML_ASSERT(nb1 % sizeof(dst_t) == 0); GGML_ASSERT(nb2 % sizeof(dst_t) == 0); @@ -263,6 +295,8 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor * GGML_ASSERT(nb12 % sizeof(src1_t) == 0); GGML_ASSERT(nb13 % sizeof(src1_t) == 0); + GGML_ASSERT(ne2 * ne3 <= std::numeric_limits::max()); + const int block_size = 128; int64_t hne0 = std::max(ne0 / 2LL, 1LL); @@ -281,7 +315,13 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor * const uint3 ne13 = init_fastdiv_values((uint32_t) cne1[3]); if (block_nums.z > 65535 || block_nums.y > 65535) { - int block_num = (ne0 * ne1 * ne2 * ne3 + block_size - 1) / block_size; + int64_t block_num = (ne0 * ne1 * ne2 * ne3 + block_size - 1) / block_size; + + GGML_ASSERT(block_num <= std::numeric_limits::max()); + GGML_ASSERT(block_num * block_size <= std::numeric_limits::max()); + GGML_ASSERT(ne0 * ne1 <= std::numeric_limits::max()); + GGML_ASSERT(ne0 * ne1 * ne2 <= std::numeric_limits::max()); + const uint3 prod_012 = init_fastdiv_values((uint32_t) (ne0 * ne1 * ne2)); const uint3 prod_01 = init_fastdiv_values((uint32_t) (ne0 * ne1)); const uint3 ne0_fastdiv = init_fastdiv_values((uint32_t) ne0); @@ -298,6 +338,10 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor * s10, s11, s12, s13, (const src1_t *) dst->src[I + 1]->data...); } } else { + GGML_ASSERT(int64_t(block_nums.x) * block_dims.x <= std::numeric_limits::max()); + GGML_ASSERT(int64_t(block_nums.y) * block_dims.y <= std::numeric_limits::max()); + GGML_ASSERT(int64_t(block_nums.z) * block_dims.z <= std::numeric_limits::max()); + const uint3 ne3_fastdiv = init_fastdiv_values((uint32_t) ne3); { const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream);