server: fix double submits of infill (#1944)

Co-authored-by: firecoperana <firecoperana>
This commit is contained in:
firecoperana 2026-06-10 00:48:15 -05:00 committed by GitHub
parent 71d5aa21f7
commit 2a1148384c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 14 additions and 12 deletions

View File

@ -24,7 +24,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
result->grammar = nullptr;
result->rbudget = nullptr;
struct llama_grammar* grmr;
struct llama_grammar* grmr = nullptr;
const std::string & grammar_str = common_grammar_value(params.grammar);
if (grammar_str.compare(0, 11, "%llguidance") == 0) {
#ifdef LLAMA_USE_LLGUIDANCE

View File

@ -1295,7 +1295,7 @@ void server_tokens::push_back(server_tokens& tokens) {
// Assert if we are copying MTMD chunks to a server_tokens that does not have mtmd.
// We could also just check, but this will prevent silently dropping MTMD data.
GGML_ASSERT(has_mtmd);
for (auto it = tokens.map_idx_to_media.begin(); it != tokens.map_idx_to_media.end(); ) {
for (auto it = tokens.map_idx_to_media.begin(); it != tokens.map_idx_to_media.end(); it++) {
auto* chunk = tokens.map_idx_to_media[it->first].get();
mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk));
map_idx_to_media[start_idx + it->first] = std::move(new_chunk);

View File

@ -2661,7 +2661,7 @@ void server_context::apply_server_biases(server_slot& slot) {
}
}
void server_context::request_completion(int id_task, int id_multi, json data, bool infill, bool embedding, server_tokens&& inputs) {
void server_context::request_completion(int id_task, int id_multi, json data, bool infill, bool embedding, server_tokens & inputs) {
server_task task;
task.id = id_task;
task.id_multi = id_multi;
@ -2670,7 +2670,7 @@ void server_context::request_completion(int id_task, int id_multi, json data, bo
task.infill = infill;
task.embedding = embedding;
task.type = SERVER_TASK_TYPE_COMPLETION;
task.tokens = std::move(inputs);
task.tokens = inputs.clone();
// when a completion task's prompt array is not a singleton, we split it into multiple requests
// otherwise, it's a single-prompt task, we actually queue it
// if there are numbers in the prompt array, it will be treated as an array of tokens
@ -2709,7 +2709,8 @@ void server_context::request_cancel(int id_task) {
}
void server_context::split_multiprompt_task(int id_multi, server_task& multiprompt_task) {
const int prompt_count = multiprompt_task.data.at("prompt").size();
auto prompts = multiprompt_task.data.at("prompt");
const int prompt_count = prompts.size();
if (prompt_count <= 1) {
send_error(multiprompt_task, "error while handling multiple prompts");
return;
@ -2727,11 +2728,11 @@ void server_context::split_multiprompt_task(int id_multi, server_task& multiprom
// add subtasks
for (int i = 0; i < prompt_count; i++) {
json subtask_data = multiprompt_task.data;
subtask_data["prompt"] = subtask_data.at("prompt")[i];
subtask_data["prompt"] = prompts[i];
// subtasks inherit everything else (infill mode, embedding mode, etc.)
request_completion(subtask_ids[i], id_multi, subtask_data, multiprompt_task.infill, multiprompt_task.embedding,
std::move(multiprompt_task.tokens));
multiprompt_task.tokens);
}
}

View File

@ -350,7 +350,7 @@ struct server_context {
void apply_server_biases(server_slot& slot);
void request_completion(int id_task, int id_multi, json data, bool infill, bool embedding, server_tokens&& inputs);
void request_completion(int id_task, int id_multi, json data, bool infill, bool embedding, server_tokens & inputs);
void request_cancel(int id_task);

View File

@ -1358,10 +1358,11 @@ int main(int argc, char ** argv) {
const auto handle_infill = [&ctx_server, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
log_prompt(ctx_server.params_base, json::parse(req.body));
json data = json::parse(req.body);
const int id_task = ctx_server.queue_tasks.get_new_id();
server_tokens token; // dummy tokens
ctx_server.queue_results.add_waiting_task_id(id_task);
ctx_server.request_completion(id_task, -1, data, true, false, std::move(token));
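// Avoid double submits: the request is passed to handle_completions_impl below, so it is no longer also queued here via request_completion.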
//const int id_task = ctx_server.queue_tasks.get_new_id();
//server_tokens token; // dummy tokens
//ctx_server.queue_results.add_waiting_task_id(id_task);
//ctx_server.request_completion(id_task, -1, data, true, false, token);
std::vector<raw_buffer> files; // dummy
handle_completions_impl(
SERVER_TASK_TYPE_INFILL,