server : skip checkpoints beyond pos_next (#24411)

* server : skip checkpoints beyond pos_next * cont : update comment + TODO + ref --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2026-06-27 23:50:20 -05:00 · 2026-06-11 02:18:12 -05:00 · 2026-06-11 02:18:12 -05:00 · db94854ff5
commit db94854ff5
parent ac4cddeb0d
1 changed files with 7 additions and 0 deletions
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@ -2046,6 +2046,9 @@ private:

        auto & cur = slot.prompt.checkpoints.emplace_back();

+        // [TAG_CHECKPOINTS_FIX_POS_MIN]
+        // TODO: here we incorrectly determine that the saved checkpoint data covers the [pos_min, pos_max] range
+        //       this is not true for SWA models: https://github.com/ggml-org/llama.cpp/pull/24411#issuecomment-4677983225
        cur.update_pos(slot.prompt.n_tokens() - n_tokens_cur, pos_min, pos_max);

        cur.update_tgt(ctx_tgt,       slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
@ -2860,6 +2863,10 @@ private:
                                            // guarantee that a checkpoint will result in at least one token being processed [TAG_PROMPT_LOGITS]
                                            LOG_INF("slot %12.*s: id %2d | task %d | Checking checkpoint with [%d, %d] against %d...\n", 12,
                                                func_name, (slot).id, ((slot).task ? (slot).task->id : -1), cur.pos_min, cur.pos_max, pos_min_thold);
+                                            // workaround for [TAG_CHECKPOINTS_FIX_POS_MIN]
+                                            if (cur.pos_max > pos_next) {
+                                                return false;
+                                            }
                                            return cur.pos_min < pos_min_thold || cur.pos_min == 0;
                                        }
                                    );