diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index ded622cfd6..a23b0405ce 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -2565,7 +2565,10 @@ private: n_keep = std::min(slot.n_ctx - 4, n_keep); const int n_left = slot.prompt.n_tokens() - n_keep; - const int n_discard = slot.task->params.n_discard ? slot.task->params.n_discard : (n_left / 2); + int n_discard = slot.task->params.n_discard ? slot.task->params.n_discard : (n_left / 2); + + // ref: https://github.com/ggml-org/llama.cpp/pull/24786 + n_discard = std::clamp(n_discard, 0, std::max(0, n_left - 1)); SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);