mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-27 23:50:20 -05:00
server : consolidate slot selection into get_available_slot (#24755)
Absorb the get_slot_by_id logic into get_available_slot so that slot selection is handled by a single function call. When a specific slot id is requested, the LCP (longest common prefix) similarity check still runs, restricted to that slot, so that the prompt cache is updated properly.

Assisted-by: pi:llama.cpp/Qwen3.6-27B
This commit is contained in:
parent
8141e730f1
commit
80452d65b9
@@ -1395,11 +1395,23 @@ private:
|
|||||||
|
|
||||||
bool update_cache = false;
|
bool update_cache = false;
|
||||||
|
|
||||||
|
// if a specific slot is requested, use it (still goes through cache update logic below)
|
||||||
|
if (task.id_slot != -1) {
|
||||||
|
ret = get_slot_by_id(task.id_slot);
|
||||||
|
if (ret) {
|
||||||
|
SLT_INF(*ret, "selected slot by id (%d)\n", task.id_slot);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// find the slot that has at least n% prompt similarity
|
// find the slot that has at least n% prompt similarity
|
||||||
if (ret == nullptr && slot_prompt_similarity != 0.0f) {
|
if (slot_prompt_similarity != 0.0f) {
|
||||||
float sim_best = 0;
|
float sim_best = 0;
|
||||||
|
|
||||||
for (server_slot & slot : slots) {
|
for (server_slot & slot : slots) {
|
||||||
|
if (task.id_slot != -1 && slot.id != task.id_slot) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
// skip the slot if it is not available
|
// skip the slot if it is not available
|
||||||
if (slot.is_processing()) {
|
if (slot.is_processing()) {
|
||||||
continue;
|
continue;
|
||||||
@@ -1426,8 +1438,10 @@ private:
|
|||||||
if (ret != nullptr) {
|
if (ret != nullptr) {
|
||||||
const float f_keep = (sim_best*task.tokens.size()) / ret->prompt.tokens.size();
|
const float f_keep = (sim_best*task.tokens.size()) / ret->prompt.tokens.size();
|
||||||
|
|
||||||
SLT_INF(*ret, "selected slot by LCP similarity, sim_best = %.3f (> %.3f thold), f_keep = %.3f\n",
|
if (task.id_slot == -1) {
|
||||||
sim_best, slot_prompt_similarity, f_keep);
|
SLT_INF(*ret, "selected slot by LCP similarity, sim_best = %.3f (> %.3f thold), f_keep = %.3f\n",
|
||||||
|
sim_best, slot_prompt_similarity, f_keep);
|
||||||
|
}
|
||||||
|
|
||||||
// if we are about to lose a large portion of the existing context - save it in the prompt cache
|
// if we are about to lose a large portion of the existing context - save it in the prompt cache
|
||||||
if (f_keep < 0.5f) {
|
if (f_keep < 0.5f) {
|
||||||
@@ -2180,10 +2194,9 @@ private:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const int id_slot = task.id_slot;
|
|
||||||
const int id_task = task.id;
|
const int id_task = task.id;
|
||||||
|
|
||||||
server_slot * slot = id_slot != -1 ? get_slot_by_id(id_slot) : get_available_slot(task);
|
server_slot * slot = get_available_slot(task);
|
||||||
|
|
||||||
//
|
//
|
||||||
// slot scheduling logic
|
// slot scheduling logic
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user