server: reset cache tokens after prompt processing (pp) stops (#1787)

Co-authored-by: firecoperana <firecoperana>
2026-06-28 04:30:15 -05:00 · 2026-05-13 01:05:32 -05:00 · 2026-05-13 01:05:32 -05:00 · cdc288bc97
commit cdc288bc97
parent f9a93c37e2
2 changed files with 12 additions and 7 deletions
--- a/examples/server/server-context.cpp
+++ b/examples/server/server-context.cpp
@@ -721,9 +721,8 @@ void server_slot::release() {
        command = SLOT_COMMAND_RELEASE;
        state = SLOT_STATE_IDLE;
        task.reset();
-        llama_decode_reset();
    }
-
+    llama_decode_reset();
 }


@@ -4545,8 +4544,14 @@ void server_context::process_batch_tokens(int32_t & n_batch) {
                for (auto& slot : slots) {
                    slot.state = SLOT_STATE_PROCESSING;
                    slot.command = SLOT_COMMAND_NONE;
-                    slot.release();
-                    if (ret != user_cancel) {
+                    if (ret == user_cancel) {
+                        llama_pos cur_pos = llama_kv_cache_seq_pos_max(slot.ctx, slot.id);
+                        slot.n_past = slot.cache_tokens.size_up_to_pos(cur_pos + 1);
+                        slot.cache_tokens.keep_first(slot.n_past);
+                        slot.release();
+                    }
+                    else {
+                        slot.release();
                        LLAMA_LOG_INFO("n_past = %d\n", (int)slot.cache_tokens.size());
                        send_error(slot, "Input prompt is too big compared to KV size. Please try increasing KV size.");
                    }
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -4688,9 +4688,6 @@ static int llama_decode_internal(
                kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(max_cell, pad)));
            }
        }
-        if (stop_internal_decode) {
-            return -3;
-        }

 #if IK_PRINT_TIMING
        auto tim2 = ggml_time_us();
@@ -4922,6 +4919,9 @@ static int llama_decode_internal(
            // empty context, but for the sake of correctness let's just do it.
            lctx.prev.reset();
        }
+        if (stop_internal_decode) {
+            return -3;
+        }
    }

    // set to total number of outputs in the batch, for use in llama_get_logits_ith