diff --git a/common/arg.cpp b/common/arg.cpp index 0213a67c80..df506d2d0e 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1360,7 +1360,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"--cache-idle-slots"}, {"--no-cache-idle-slots"}, - "save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)", + "save idle slots to the prompt cache on new task, and clear them when using unified KV (default: enabled, requires cache-ram)", [](common_params & params, bool value) { params.cache_idle_slots = value; } diff --git a/tools/server/README.md b/tools/server/README.md index f507b8c181..b414910590 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -165,7 +165,7 @@ For the full list of features, please refer to [server's changelog](https://gith | `-cms, --checkpoint-min-step N` | minimum spacing between context checkpoints in tokens (default: 256, 0 = no minimum)
(env: LLAMA_ARG_CHECKPOINT_MIN_SPACING_NT) | | `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)
(env: LLAMA_ARG_CACHE_RAM) | | `-kvu, --kv-unified, -no-kvu, --no-kv-unified` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)
(env: LLAMA_ARG_KV_UNIFIED) | -| `--cache-idle-slots, --no-cache-idle-slots` | save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)
(env: LLAMA_ARG_CACHE_IDLE_SLOTS) | +| `--cache-idle-slots, --no-cache-idle-slots` | save idle slots to the prompt cache on new task, and clear them when using unified KV (default: enabled, requires cache-ram)
(env: LLAMA_ARG_CACHE_IDLE_SLOTS) | | `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)
(env: LLAMA_ARG_CONTEXT_SHIFT) | | `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode | | `-sp, --special` | special tokens output enabled (default: false) | diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 6fa302e132..a7c4f7b56e 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -127,7 +127,11 @@ struct server_slot { server_prompt prompt; - void prompt_save(server_prompt_cache & prompt_cache) const { + bool prompt_save(server_prompt_cache & prompt_cache) const { + if (prompt.tokens.size() == 0) { + return false; + } + GGML_ASSERT(prompt.data.size() == 0); const size_t cur_size_tgt = llama_state_seq_get_size_ext(ctx_tgt, id, LLAMA_STATE_SEQ_FLAGS_NONE); @@ -140,13 +144,15 @@ struct server_slot { auto * cur = prompt_cache.alloc(prompt, cur_size_tgt, cur_size_dft); if (cur == nullptr) { - return; + return false; } llama_state_seq_get_data_ext(ctx_tgt, cur->data.main.data(), cur_size_tgt, id, LLAMA_STATE_SEQ_FLAGS_NONE); if (ctx_dft) { llama_state_seq_get_data_ext(ctx_dft, cur->data.drft.data(), cur_size_dft, id, LLAMA_STATE_SEQ_FLAGS_NONE); } + + return true; } bool prompt_load(server_prompt_cache & prompt_cache, const server_tokens & tokens) { @@ -739,17 +745,6 @@ private: llama_batch_free(batch); } - void slot_save_and_clear(server_slot & slot) { - if (slot.prompt.n_tokens() == 0) { - return; - } - SLT_INF(slot, "%s", "saving idle slot to prompt cache\n"); - SLT_DBG(slot, "%s", "__TEST_TAG_CACHE_IDLE_SLOT__\n"); - slot.prompt_save(*prompt_cache); - slot.prompt_clear(false); - prompt_cache->update(); - } - void handle_sleeping_state(bool new_state) { GGML_ASSERT(sleeping != new_state); if (new_state) { @@ -1186,14 +1181,17 @@ private: metrics.init(); if (params_base.cache_idle_slots) { - if (!params_base.kv_unified) { - SRV_WRN("%s", "--cache-idle-slots requires --kv-unified, disabling\n"); - params_base.cache_idle_slots = false; - } else if (params_base.cache_ram_mib == 0) { + if (params_base.cache_ram_mib == 0) { SRV_WRN("%s", "--cache-idle-slots requires --cache-ram, disabling\n"); params_base.cache_idle_slots = false; } else { - SRV_INF("%s", "idle slots will be saved to prompt cache and cleared upon starting a new task\n"); + if (params_base.kv_unified) { + SRV_INF("%s", "idle slots will be saved to prompt cache and cleared upon starting a new task\n"); + } else { + // without a unified KV cache, clearing a slot frees no reusable room, so we only + // publish a RAM-cache copy of idle slots (their KV stays in VRAM) [TAG_IDLE_SLOT_CLEAR] + SRV_INF("%s", "idle slots will be saved to prompt cache upon starting a new task\n"); + } SRV_DBG("%s", "__TEST_TAG_CACHE_IDLE_SLOTS_ENABLED__\n"); } } @@ -1357,8 +1355,6 @@ private: } if (ret) { - const auto & tokens = ret->prompt.tokens; - update_cache = update_cache && prompt_cache; // cache prompts only for completion tasks @@ -1369,10 +1365,7 @@ private: const int64_t t_start = ggml_time_us(); - // don't save the slot's state if its context is empty - if (tokens.size() > 0) { - ret->prompt_save(*prompt_cache); - } + ret->prompt_save(*prompt_cache); if (!ret->prompt_load(*prompt_cache, task.tokens)) { ret->prompt_clear(false); @@ -2120,9 +2113,19 @@ private: } if (params_base.cache_idle_slots) { - for (auto & s : slots) { - if (!s.is_processing()) { - slot_save_and_clear(s); + for (auto & slot : slots) { + if (!slot.is_processing()) { + SLT_INF(slot, "%s", "saving idle slot to prompt cache\n"); + + if (slot.prompt_save(*prompt_cache)) { + SLT_DBG(slot, "%s", "__TEST_TAG_CACHE_IDLE_SLOT__\n"); + prompt_cache->update(); + } + + if (params_base.kv_unified) { + // [TAG_IDLE_SLOT_CLEAR] + slot.prompt_clear(false); + } } } }