From d51036a0c44832a68ad66683bd3a03dbcbeaa3c2 Mon Sep 17 00:00:00 2001
From: Samuel Oliveira Alves
 <107287165+SamuelOliveirads@users.noreply.github.com>
Date: Fri, 22 May 2026 02:14:47 -0300
Subject: [PATCH] fix: reset KV cache and prompt state in server_slot and
 server_context (#1860)

---
 examples/server/server-context.cpp | 43 +++++++++++++++++++++++++++++-
 1 file changed, 42 insertions(+), 1 deletion(-)

diff --git a/examples/server/server-context.cpp b/examples/server/server-context.cpp
index a1292615..e5568de5 100644
--- a/examples/server/server-context.cpp
+++ b/examples/server/server-context.cpp
@@ -608,6 +608,8 @@ void server_slot::reset() {
     stopping_word = "";
     n_past = 0;
     n_past_prompt = 0;
+    n_discarded_prompt = 0;
+    n_kept_prompt = 0;
     n_sent_text = 0;
     drafted.clear();
     drafted_spec_type = COMMON_SPECULATIVE_TYPE_NONE;
@@ -2056,6 +2058,16 @@ void server_context::kv_cache_clear() {
 
     // clear the entire KV cache
     llama_kv_cache_clear(ctx);
+    for (auto & slot : slots) {
+        if (slot.spec == nullptr) {
+            continue;
+        }
+
+        common_speculative_clear_sequence_hidden(slot.spec, slot.id);
+        if (auto * ctx_companion = common_speculative_get_companion_ctx(slot.spec); ctx_companion != nullptr) {
+            llama_kv_cache_clear(ctx_companion);
+        }
+    }
     clean_kv_cache = false;
 }
 
@@ -2107,6 +2119,22 @@ bool server_context::system_prompt_set(const std::string& sys_prompt) {
     // release all slots
     for (server_slot& slot : slots) {
         slot.release();
+        slot.cache_tokens.clear();
+        slot.n_past = 0;
+        slot.n_past_prompt = 0;
+        slot.n_past_offset = 0;
+        slot.n_discarded_prompt = 0;
+        slot.n_kept_prompt = 0;
+        slot.n_prompt_tokens_cache = 0;
+        slot.server_cached_prompt.checkpoints.clear();
+        slot.checkpoint_pos = 0;
+        slot.do_checkpoint = false;
+        if (slot.ctx_sampling != nullptr) {
+            common_sampler_reset(slot.ctx_sampling);
+        }
+    }
+    if (prompt_cache) {
+        prompt_cache->states.clear();
     }
 
     system_need_update = true;
@@ -3904,9 +3932,15 @@ void server_context::batch_pending_prompt(const int32_t n_ubatch, const int32_t
                 slot.cache_tokens.keep_first(slot.n_past);
                 int p0 = (int)system_tokens.size() + slot.n_past;
                 p0 = system_tokens.size() + slot.cache_tokens.pos_next();
-                if (!llama_kv_cache_seq_rm(ctx, slot.id, p0, -1)) {
+                auto * ctx_companion = slot.spec ? common_speculative_get_companion_ctx(slot.spec) : nullptr;
+                const bool target_trimmed = llama_kv_cache_seq_rm(ctx, slot.id, p0, -1);
+                const bool companion_trimmed = ctx_companion == nullptr || llama_kv_cache_seq_rm(ctx_companion, slot.id, p0, -1);
+                if (!target_trimmed || !companion_trimmed) {
                     // could not partially delete (likely using a non-Transformer model)
                     llama_kv_cache_seq_rm(ctx, slot.id, -1, -1);
+                    if (ctx_companion != nullptr) {
+                        llama_kv_cache_seq_rm(ctx_companion, slot.id, -1, -1);
+                    }
 
                     p0 = (int)system_tokens.size();
                     if (p0 != 0) {
@@ -3915,9 +3949,16 @@ void server_context::batch_pending_prompt(const int32_t n_ubatch, const int32_t
                     }
 
                     // there is no common part left (except for the system prompt)
+                    slot.cache_tokens.clear();
                     slot.n_past = 0;
+                    slot.n_past_prompt = 0;
+                    slot.n_past_offset = 0;
+                    slot.n_discarded_prompt = 0;
+                    slot.n_kept_prompt = 0;
                     slot.n_past_se = 0;
+                    slot.n_prompt_tokens_cache = 0;
                     slot.ga_i = 0;
+                    slot.server_cached_prompt.checkpoints.clear();
                     // TODO: is the system prompt ever in the sampling context?
                     common_sampler_reset(slot.ctx_sampling);
                 }