mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-27 23:50:20 -05:00
server : do not clear slots without unified KV cache (#24190)
* Always export idle slots to RAM Without this, a slot's VRAM cache may not be written to RAM. If this slot happens to be busy then later on, this triggers needless preprocessing in another slot. * cont : clean-up --------- Co-authored-by: Christoph Weiss <weiss@wsoptics.de> Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent
f0152efe40
commit
961e9a3e46
@ -1360,7 +1360,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
add_opt(common_arg(
|
||||
{"--cache-idle-slots"},
|
||||
{"--no-cache-idle-slots"},
|
||||
"save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)",
|
||||
"save idle slots to the prompt cache on new task, and clear them when using unified KV (default: enabled, requires cache-ram)",
|
||||
[](common_params & params, bool value) {
|
||||
params.cache_idle_slots = value;
|
||||
}
|
||||
|
||||
@ -165,7 +165,7 @@ For the full list of features, please refer to [server's changelog](https://gith
|
||||
| `-cms, --checkpoint-min-step N` | minimum spacing between context checkpoints in tokens (default: 256, 0 = no minimum)<br/>(env: LLAMA_ARG_CHECKPOINT_MIN_SPACING_NT) |
|
||||
| `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) |
|
||||
| `-kvu, --kv-unified, -no-kvu, --no-kv-unified` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)<br/>(env: LLAMA_ARG_KV_UNIFIED) |
|
||||
| `--cache-idle-slots, --no-cache-idle-slots` | save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)<br/>(env: LLAMA_ARG_CACHE_IDLE_SLOTS) |
|
||||
| `--cache-idle-slots, --no-cache-idle-slots` | save idle slots to the prompt cache on new task, and clear them when using unified KV (default: enabled, requires cache-ram)<br/>(env: LLAMA_ARG_CACHE_IDLE_SLOTS) |
|
||||
| `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
|
||||
| `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode |
|
||||
| `-sp, --special` | special tokens output enabled (default: false) |
|
||||
|
||||
@ -127,7 +127,11 @@ struct server_slot {
|
||||
|
||||
server_prompt prompt;
|
||||
|
||||
void prompt_save(server_prompt_cache & prompt_cache) const {
|
||||
bool prompt_save(server_prompt_cache & prompt_cache) const {
|
||||
if (prompt.tokens.size() == 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
GGML_ASSERT(prompt.data.size() == 0);
|
||||
|
||||
const size_t cur_size_tgt = llama_state_seq_get_size_ext(ctx_tgt, id, LLAMA_STATE_SEQ_FLAGS_NONE);
|
||||
@ -140,13 +144,15 @@ struct server_slot {
|
||||
|
||||
auto * cur = prompt_cache.alloc(prompt, cur_size_tgt, cur_size_dft);
|
||||
if (cur == nullptr) {
|
||||
return;
|
||||
return false;
|
||||
}
|
||||
|
||||
llama_state_seq_get_data_ext(ctx_tgt, cur->data.main.data(), cur_size_tgt, id, LLAMA_STATE_SEQ_FLAGS_NONE);
|
||||
if (ctx_dft) {
|
||||
llama_state_seq_get_data_ext(ctx_dft, cur->data.drft.data(), cur_size_dft, id, LLAMA_STATE_SEQ_FLAGS_NONE);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool prompt_load(server_prompt_cache & prompt_cache, const server_tokens & tokens) {
|
||||
@ -739,17 +745,6 @@ private:
|
||||
llama_batch_free(batch);
|
||||
}
|
||||
|
||||
void slot_save_and_clear(server_slot & slot) {
|
||||
if (slot.prompt.n_tokens() == 0) {
|
||||
return;
|
||||
}
|
||||
SLT_INF(slot, "%s", "saving idle slot to prompt cache\n");
|
||||
SLT_DBG(slot, "%s", "__TEST_TAG_CACHE_IDLE_SLOT__\n");
|
||||
slot.prompt_save(*prompt_cache);
|
||||
slot.prompt_clear(false);
|
||||
prompt_cache->update();
|
||||
}
|
||||
|
||||
void handle_sleeping_state(bool new_state) {
|
||||
GGML_ASSERT(sleeping != new_state);
|
||||
if (new_state) {
|
||||
@ -1186,14 +1181,17 @@ private:
|
||||
metrics.init();
|
||||
|
||||
if (params_base.cache_idle_slots) {
|
||||
if (!params_base.kv_unified) {
|
||||
SRV_WRN("%s", "--cache-idle-slots requires --kv-unified, disabling\n");
|
||||
params_base.cache_idle_slots = false;
|
||||
} else if (params_base.cache_ram_mib == 0) {
|
||||
if (params_base.cache_ram_mib == 0) {
|
||||
SRV_WRN("%s", "--cache-idle-slots requires --cache-ram, disabling\n");
|
||||
params_base.cache_idle_slots = false;
|
||||
} else {
|
||||
SRV_INF("%s", "idle slots will be saved to prompt cache and cleared upon starting a new task\n");
|
||||
if (params_base.kv_unified) {
|
||||
SRV_INF("%s", "idle slots will be saved to prompt cache and cleared upon starting a new task\n");
|
||||
} else {
|
||||
// without a unified KV cache, clearing a slot frees no reusable room, so we only
|
||||
// publish a RAM-cache copy of idle slots (their KV stays in VRAM) [TAG_IDLE_SLOT_CLEAR]
|
||||
SRV_INF("%s", "idle slots will be saved to prompt cache upon starting a new task\n");
|
||||
}
|
||||
SRV_DBG("%s", "__TEST_TAG_CACHE_IDLE_SLOTS_ENABLED__\n");
|
||||
}
|
||||
}
|
||||
@ -1357,8 +1355,6 @@ private:
|
||||
}
|
||||
|
||||
if (ret) {
|
||||
const auto & tokens = ret->prompt.tokens;
|
||||
|
||||
update_cache = update_cache && prompt_cache;
|
||||
|
||||
// cache prompts only for completion tasks
|
||||
@ -1369,10 +1365,7 @@ private:
|
||||
|
||||
const int64_t t_start = ggml_time_us();
|
||||
|
||||
// don't save the slot's state if its context is empty
|
||||
if (tokens.size() > 0) {
|
||||
ret->prompt_save(*prompt_cache);
|
||||
}
|
||||
ret->prompt_save(*prompt_cache);
|
||||
|
||||
if (!ret->prompt_load(*prompt_cache, task.tokens)) {
|
||||
ret->prompt_clear(false);
|
||||
@ -2120,9 +2113,19 @@ private:
|
||||
}
|
||||
|
||||
if (params_base.cache_idle_slots) {
|
||||
for (auto & s : slots) {
|
||||
if (!s.is_processing()) {
|
||||
slot_save_and_clear(s);
|
||||
for (auto & slot : slots) {
|
||||
if (!slot.is_processing()) {
|
||||
SLT_INF(slot, "%s", "saving idle slot to prompt cache\n");
|
||||
|
||||
if (slot.prompt_save(*prompt_cache)) {
|
||||
SLT_DBG(slot, "%s", "__TEST_TAG_CACHE_IDLE_SLOT__\n");
|
||||
prompt_cache->update();
|
||||
}
|
||||
|
||||
if (params_base.kv_unified) {
|
||||
// [TAG_IDLE_SLOT_CLEAR]
|
||||
slot.prompt_clear(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user