diff --git a/ggml/src/ggml-cuda/quantize.cu b/ggml/src/ggml-cuda/quantize.cu index 2bef2315..762110f7 100644 --- a/ggml/src/ggml-cuda/quantize.cu +++ b/ggml/src/ggml-cuda/quantize.cu @@ -129,7 +129,7 @@ static __global__ void quantize_mmq_q8_1( } } - const float d = amax/127.f; + float d = amax/127.f; const float d_inv = d > 0 ? 1/d : 0.f; char4 q; q.x = roundf(xi.x*d_inv); @@ -162,6 +162,8 @@ static __global__ void quantize_mmq_q8_1( } if (ds_layout == MMQ_Q8_1_DS_LAYOUT_DS4) { + d = max(-65504.0f, min(65504.f, d)); + sum = max(-65504.0f, min(65504.f, sum)); y[ib].ds4[iqs/32] = make_half2(d, sum); } else { y[ib].d4[iqs/32] = d; diff --git a/ggml/src/ggml-cuda/quantize_id.cu b/ggml/src/ggml-cuda/quantize_id.cu index 80a9f222..9324c2c3 100644 --- a/ggml/src/ggml-cuda/quantize_id.cu +++ b/ggml/src/ggml-cuda/quantize_id.cu @@ -91,9 +91,11 @@ static __global__ void quantize_mmq_q8_1( return; } - const float d = 1.0f / d_inv; + float d = 1.0f / d_inv; if (ds_layout == MMQ_Q8_1_DS_LAYOUT_DS4) { + d = max(-65504.0f, min(65504.f, d)); + sum = max(-65504.0f, min(65504.f, sum)); y[ib].ds4[iqs/32] = make_half2(d, sum); } else { y[ib].d4[iqs/32] = d;