mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-27 23:50:20 -05:00
ggml : address integer overflows in binary ops CUDA implementation (#24706)
* ggml : address integer overflows in binary ops CUDA implementation * ggml : add size_t casts to avoid integer overflows * ggml : add more asserts checking integer overflows in binary ops CUDA implementation --------- Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>
This commit is contained in:
parent
3e61ea0e2f
commit
f728adab68
@ -34,26 +34,26 @@ template <float (*bin_op)(const float, const float),
|
||||
static __global__ void k_bin_bcast(const src0_t * src0,
|
||||
const src1_t * src1,
|
||||
dst_t * dst,
|
||||
const int ne0,
|
||||
const int ne1,
|
||||
const int ne2,
|
||||
const uint32_t ne0,
|
||||
const uint32_t ne1,
|
||||
const uint32_t ne2,
|
||||
const uint3 ne3,
|
||||
const uint3 ne10,
|
||||
const uint3 ne11,
|
||||
const uint3 ne12,
|
||||
const uint3 ne13,
|
||||
/*const int s0,*/
|
||||
const int s1,
|
||||
const int s2,
|
||||
const int s3,
|
||||
const int s00,
|
||||
const int s01,
|
||||
const int s02,
|
||||
const int s03,
|
||||
const int s10,
|
||||
const int s11,
|
||||
const int s12,
|
||||
const int s13,
|
||||
/*const uint32_t s0,*/
|
||||
const uint32_t s1,
|
||||
const uint32_t s2,
|
||||
const uint32_t s3,
|
||||
const uint32_t s00,
|
||||
const uint32_t s01,
|
||||
const uint32_t s02,
|
||||
const uint32_t s03,
|
||||
const uint32_t s10,
|
||||
const uint32_t s11,
|
||||
const uint32_t s12,
|
||||
const uint32_t s13,
|
||||
src1_ptrs... src1s) {
|
||||
ggml_cuda_pdl_lc();
|
||||
const uint32_t i0s = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
@ -61,7 +61,7 @@ static __global__ void k_bin_bcast(const src0_t * src0,
|
||||
const uint32_t i2 = fastdiv((blockDim.z * blockIdx.z + threadIdx.z), ne3);
|
||||
const uint32_t i3 = (blockDim.z * blockIdx.z + threadIdx.z) - (i2 * ne3.z);
|
||||
|
||||
if (i0s >= (uint32_t)ne0 || i1 >= (uint32_t)ne1 || i2 >= (uint32_t)ne2 || i3 >= ne3.z) {
|
||||
if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3.z) {
|
||||
return;
|
||||
}
|
||||
|
||||
@ -69,25 +69,32 @@ static __global__ void k_bin_bcast(const src0_t * src0,
|
||||
const uint32_t i12 = fastmodulo(i2, ne12);
|
||||
const uint32_t i13 = fastmodulo(i3, ne13);
|
||||
|
||||
const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
|
||||
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
|
||||
const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
|
||||
const size_t i_src0 = size_t( i3)*s03 + size_t( i2)*s02 + size_t( i1)*s01;
|
||||
const size_t i_src1 = size_t(i13)*s13 + size_t(i12)*s12 + size_t(i11)*s11;
|
||||
const size_t i_dst = size_t( i3)*s3 + size_t( i2)*s2 + size_t( i1)*s1;
|
||||
|
||||
const src0_t * src0_row = src0 ? (src0 + i_src0) : nullptr;
|
||||
dst_t * dst_row = dst + i_dst;
|
||||
|
||||
const uint32_t s0 = blockDim.x * gridDim.x;
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
for (int i0 = i0s; i0 < ne0; i0 += blockDim.x * gridDim.x) {
|
||||
for (uint32_t i0 = i0s; i0 < ne0; i0 += s0) {
|
||||
const uint32_t i10 = fastmodulo(i0, ne10);
|
||||
|
||||
float result = src0_row ? (float) src0_row[i0*s00] : 0.0f;
|
||||
float result = src0_row ? (float) src0_row[size_t(i0)*s00] : 0.0f;
|
||||
if constexpr (sizeof...(src1_ptrs) > 0) {
|
||||
result = (..., (result = bin_op(result, (float)src1s[i_src1 + i10*s10])));
|
||||
result = (..., (result = bin_op(result, (float)src1s[i_src1 + size_t(i10)*s10])));
|
||||
} else {
|
||||
result = bin_op(result, (float)src1[i_src1 + i10*s10]);
|
||||
result = bin_op(result, (float)src1[i_src1 + size_t(i10)*s10]);
|
||||
}
|
||||
|
||||
dst_row[i0] = (dst_t) result;
|
||||
|
||||
// protect i0 from overflow
|
||||
if (ne0 - i0 <= s0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -110,19 +117,19 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0,
|
||||
const uint3 ne12,
|
||||
const uint3 ne13,
|
||||
/*const int s0,*/
|
||||
const int s1,
|
||||
const int s2,
|
||||
const int s3,
|
||||
const int s00,
|
||||
const int s01,
|
||||
const int s02,
|
||||
const int s03,
|
||||
const int s10,
|
||||
const int s11,
|
||||
const int s12,
|
||||
const int s13,
|
||||
const uint32_t s1,
|
||||
const uint32_t s2,
|
||||
const uint32_t s3,
|
||||
const uint32_t s00,
|
||||
const uint32_t s01,
|
||||
const uint32_t s02,
|
||||
const uint32_t s03,
|
||||
const uint32_t s10,
|
||||
const uint32_t s11,
|
||||
const uint32_t s12,
|
||||
const uint32_t s13,
|
||||
src1_ptrs... src1s) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
const uint32_t i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
const uint32_t i3 = fastdiv(i, prod_012);
|
||||
const uint32_t i2 = fastdiv(i - i3 * prod_012.z, prod_01);
|
||||
@ -133,25 +140,25 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0,
|
||||
return;
|
||||
}
|
||||
|
||||
const int i11 = fastmodulo(i1, ne11);
|
||||
const int i12 = fastmodulo(i2, ne12);
|
||||
const int i13 = fastmodulo(i3, ne13);
|
||||
const uint32_t i11 = fastmodulo(i1, ne11);
|
||||
const uint32_t i12 = fastmodulo(i2, ne12);
|
||||
const uint32_t i13 = fastmodulo(i3, ne13);
|
||||
|
||||
const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
|
||||
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
|
||||
const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
|
||||
const size_t i_src0 = size_t( i3)*s03 + size_t( i2)*s02 + size_t( i1)*s01;
|
||||
const size_t i_src1 = size_t(i13)*s13 + size_t(i12)*s12 + size_t(i11)*s11;
|
||||
const size_t i_dst = size_t( i3)*s3 + size_t( i2)*s2 + size_t( i1)*s1;
|
||||
|
||||
const src0_t * src0_row = src0 ? (src0 + i_src0) : nullptr;
|
||||
dst_t * dst_row = dst + i_dst;
|
||||
|
||||
const int i10 = fastmodulo(i0, ne10);
|
||||
const uint32_t i10 = fastmodulo(i0, ne10);
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
float result = src0_row ? (float) src0_row[i0*s00] : 0.0f;
|
||||
float result = src0_row ? (float) src0_row[size_t(i0)*s00] : 0.0f;
|
||||
if constexpr (sizeof...(src1_ptrs) > 0) {
|
||||
result = (..., (result = bin_op(result, (float)src1s[i_src1 + i10*s10])));
|
||||
result = (..., (result = bin_op(result, (float)src1s[i_src1 + size_t(i10)*s10])));
|
||||
} else {
|
||||
result = bin_op(result, (float)src1[i_src1 + i10*s10]);
|
||||
result = bin_op(result, (float)src1[i_src1 + size_t(i10)*s10]);
|
||||
}
|
||||
|
||||
dst_row[i0] = (dst_t) result;
|
||||
@ -248,6 +255,31 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
|
||||
size_t s02 = nb02 / sizeof(src0_t);
|
||||
size_t s03 = nb03 / sizeof(src0_t);
|
||||
|
||||
GGML_ASSERT(ne0 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(ne1 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(ne2 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(ne3 <= std::numeric_limits<uint32_t>::max());
|
||||
|
||||
//GGML_ASSERT(s0 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(s1 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(s2 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(s3 <= std::numeric_limits<uint32_t>::max());
|
||||
|
||||
GGML_ASSERT(s00 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(s01 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(s02 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(s03 <= std::numeric_limits<uint32_t>::max());
|
||||
|
||||
GGML_ASSERT(s10 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(s11 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(s12 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(s13 <= std::numeric_limits<uint32_t>::max());
|
||||
|
||||
GGML_ASSERT(cne1[0] <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(cne1[1] <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(cne1[2] <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(cne1[3] <= std::numeric_limits<uint32_t>::max());
|
||||
|
||||
GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
|
||||
GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
|
||||
GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
|
||||
@ -263,6 +295,8 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
|
||||
GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
|
||||
GGML_ASSERT(nb13 % sizeof(src1_t) == 0);
|
||||
|
||||
GGML_ASSERT(ne2 * ne3 <= std::numeric_limits<unsigned int>::max());
|
||||
|
||||
const int block_size = 128;
|
||||
|
||||
int64_t hne0 = std::max(ne0 / 2LL, 1LL);
|
||||
@ -281,7 +315,13 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
|
||||
const uint3 ne13 = init_fastdiv_values((uint32_t) cne1[3]);
|
||||
|
||||
if (block_nums.z > 65535 || block_nums.y > 65535) {
|
||||
int block_num = (ne0 * ne1 * ne2 * ne3 + block_size - 1) / block_size;
|
||||
int64_t block_num = (ne0 * ne1 * ne2 * ne3 + block_size - 1) / block_size;
|
||||
|
||||
GGML_ASSERT(block_num <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(block_num * block_size <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(ne0 * ne1 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(ne0 * ne1 * ne2 <= std::numeric_limits<uint32_t>::max());
|
||||
|
||||
const uint3 prod_012 = init_fastdiv_values((uint32_t) (ne0 * ne1 * ne2));
|
||||
const uint3 prod_01 = init_fastdiv_values((uint32_t) (ne0 * ne1));
|
||||
const uint3 ne0_fastdiv = init_fastdiv_values((uint32_t) ne0);
|
||||
@ -298,6 +338,10 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
|
||||
s10, s11, s12, s13, (const src1_t *) dst->src[I + 1]->data...);
|
||||
}
|
||||
} else {
|
||||
GGML_ASSERT(int64_t(block_nums.x) * block_dims.x <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(int64_t(block_nums.y) * block_dims.y <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(int64_t(block_nums.z) * block_dims.z <= std::numeric_limits<uint32_t>::max());
|
||||
|
||||
const uint3 ne3_fastdiv = init_fastdiv_values((uint32_t) ne3);
|
||||
{
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream);
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user