server: fix double submits of infill (#1944)

Co-authored-by: firecoperana <firecoperana>
2026-06-28 04:30:15 -05:00 · 2026-06-10 00:48:15 -05:00 · 2026-06-10 00:48:15 -05:00 · 2a1148384c
commit 2a1148384c
parent 71d5aa21f7
5 changed files with 14 additions and 12 deletions
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@ -24,7 +24,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
    result->grammar = nullptr;
    result->rbudget = nullptr;

-    struct llama_grammar* grmr;
+    struct llama_grammar* grmr = nullptr;
    const std::string & grammar_str = common_grammar_value(params.grammar);
    if (grammar_str.compare(0, 11, "%llguidance") == 0) {
 #ifdef LLAMA_USE_LLGUIDANCE
--- a/examples/server/server-common.cpp
+++ b/examples/server/server-common.cpp
@ -1295,7 +1295,7 @@ void server_tokens::push_back(server_tokens& tokens) {
        // Assert if we are copying MTMD chunks to a server_tokens that does not have mtmd.
        // We could also just check, but this will prevent silently dropping MTMD data.
        GGML_ASSERT(has_mtmd);
-        for (auto it = tokens.map_idx_to_media.begin(); it != tokens.map_idx_to_media.end(); ) {
+        for (auto it = tokens.map_idx_to_media.begin(); it != tokens.map_idx_to_media.end(); it++) {
            auto* chunk = tokens.map_idx_to_media[it->first].get();
            mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk));
            map_idx_to_media[start_idx + it->first] = std::move(new_chunk);
--- a/examples/server/server-context.cpp
+++ b/examples/server/server-context.cpp
@ -2661,7 +2661,7 @@ void server_context::apply_server_biases(server_slot& slot) {
    }
 }

-void server_context::request_completion(int id_task, int id_multi, json data, bool infill, bool embedding, server_tokens&& inputs) {
+void server_context::request_completion(int id_task, int id_multi, json data, bool infill, bool embedding, server_tokens & inputs) {
    server_task task;
    task.id = id_task;
    task.id_multi = id_multi;
@ -2670,7 +2670,7 @@ void server_context::request_completion(int id_task, int id_multi, json data, bo
    task.infill = infill;
    task.embedding = embedding;
    task.type = SERVER_TASK_TYPE_COMPLETION;
-    task.tokens = std::move(inputs);
+    task.tokens = inputs.clone();
    // when a completion task's prompt array is not a singleton, we split it into multiple requests
    // otherwise, it's a single-prompt task, we actually queue it
    // if there's numbers in the prompt array it will be treated as an array of tokens
@ -2709,7 +2709,8 @@ void server_context::request_cancel(int id_task) {
 }

 void server_context::split_multiprompt_task(int id_multi, server_task& multiprompt_task) {
-    const int prompt_count = multiprompt_task.data.at("prompt").size();
+    auto prompts = multiprompt_task.data.at("prompt");
+    const int prompt_count = prompts.size();
    if (prompt_count <= 1) {
        send_error(multiprompt_task, "error while handling multiple prompts");
        return;
@ -2727,11 +2728,11 @@ void server_context::split_multiprompt_task(int id_multi, server_task& multiprom
    // add subtasks
    for (int i = 0; i < prompt_count; i++) {
        json subtask_data = multiprompt_task.data;
-        subtask_data["prompt"] = subtask_data.at("prompt")[i];
+        subtask_data["prompt"] = prompts[i];

        // subtasks inherit everything else (infill mode, embedding mode, etc.)
        request_completion(subtask_ids[i], id_multi, subtask_data, multiprompt_task.infill, multiprompt_task.embedding,
-            std::move(multiprompt_task.tokens));
+            multiprompt_task.tokens);
    }
 }

--- a/examples/server/server-context.h
+++ b/examples/server/server-context.h
@ -350,7 +350,7 @@ struct server_context {

    void apply_server_biases(server_slot& slot);

-    void request_completion(int id_task, int id_multi, json data, bool infill, bool embedding, server_tokens&& inputs);
+    void request_completion(int id_task, int id_multi, json data, bool infill, bool embedding, server_tokens & inputs);

    void request_cancel(int id_task);

--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -1358,10 +1358,11 @@ int main(int argc, char ** argv) {
    const auto handle_infill = [&ctx_server, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
        log_prompt(ctx_server.params_base, json::parse(req.body));
        json data = json::parse(req.body);
-        const int id_task = ctx_server.queue_tasks.get_new_id();
-        server_tokens token; // dummy tokens
-        ctx_server.queue_results.add_waiting_task_id(id_task);
-        ctx_server.request_completion(id_task, -1, data, true, false, std::move(token));
+        //avoid double submits
+        //const int id_task = ctx_server.queue_tasks.get_new_id();
+        //server_tokens token; // dummy tokens
+        //ctx_server.queue_results.add_waiting_task_id(id_task);
+        //ctx_server.request_completion(id_task, -1, data, true, false, token);
        std::vector<raw_buffer> files; // dummy
        handle_completions_impl(
            SERVER_TASK_TYPE_INFILL,