server: reset cache tokens after pp stops (#1787)

Co-authored-by: firecoperana <firecoperana>
This commit is contained in:
firecoperana 2026-05-13 01:05:32 -05:00 committed by GitHub
parent f9a93c37e2
commit cdc288bc97
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 12 additions and 7 deletions

View File

@@ -721,9 +721,8 @@ void server_slot::release() {
command = SLOT_COMMAND_RELEASE;
state = SLOT_STATE_IDLE;
task.reset();
llama_decode_reset();
}
llama_decode_reset();
}
@@ -4545,8 +4544,14 @@ void server_context::process_batch_tokens(int32_t & n_batch) {
for (auto& slot : slots) {
slot.state = SLOT_STATE_PROCESSING;
slot.command = SLOT_COMMAND_NONE;
slot.release();
if (ret != user_cancel) {
if (ret == user_cancel) {
llama_pos cur_pos = llama_kv_cache_seq_pos_max(slot.ctx, slot.id);
slot.n_past = slot.cache_tokens.size_up_to_pos(cur_pos + 1);
slot.cache_tokens.keep_first(slot.n_past);
slot.release();
}
else {
slot.release();
LLAMA_LOG_INFO("n_past = %d\n", (int)slot.cache_tokens.size());
send_error(slot, "Input prompt is too big compared to KV size. Please try increasing KV size.");
}

View File

@@ -4688,9 +4688,6 @@ static int llama_decode_internal(
kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(max_cell, pad)));
}
}
if (stop_internal_decode) {
return -3;
}
#if IK_PRINT_TIMING
auto tim2 = ggml_time_us();
@@ -4922,6 +4919,9 @@ static int llama_decode_internal(
// empty context, but for the sake of correctness let's just do it.
lctx.prev.reset();
}
if (stop_internal_decode) {
return -3;
}
}
// set to total number of outputs in the batch, for use in llama_get_logits_ith