diff --git a/common/common.h b/common/common.h index 92064a0e4d..f2c7ee027b 100644 --- a/common/common.h +++ b/common/common.h @@ -277,6 +277,7 @@ struct common_params_sampling { std::vector reasoning_budget_end; // end tag token sequence std::vector reasoning_budget_forced; // forced sequence (message + end tag) std::string reasoning_budget_message; // message injected before end tag when budget exhausted + bool reasoning_control = false; // create the budget sampler on demand so reasoning can be ended at runtime bool backend_sampling = false; diff --git a/common/sampling.cpp b/common/sampling.cpp index bbfa9a9ecd..85f8ed50b3 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -293,7 +293,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st } // reasoning budget sampler (skip when budget is unlimited unless a lazy grammar is active, which needs rbudget for thinking-block suppression) - if (!params.reasoning_budget_start.empty() && !params.reasoning_budget_end.empty() && (params.grammar_lazy || params.reasoning_budget_tokens >= 0)) { + if (!params.reasoning_budget_start.empty() && !params.reasoning_budget_end.empty() && (params.grammar_lazy || params.reasoning_budget_tokens >= 0 || params.reasoning_control)) { rbudget = common_reasoning_budget_init( vocab, params.reasoning_budget_start, diff --git a/tools/server/README.md b/tools/server/README.md index df30ca6464..f1eeec36aa 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -1244,6 +1244,8 @@ The `response_format` parameter supports both plain JSON output (e.g. `{"type": `reasoning_format`: The reasoning format to be parsed. If set to `none`, it will output the raw generated text. +`reasoning_control`: Arms realtime reasoning control for this completion so it can be ended early via `/v1/chat/completions/control`. Defaults to `false`. + `generation_prompt`: The generation prompt that was prefilled in by the template. Prepended to model output before parsing. `parse_tool_calls`: Whether to parse the generated tool call. @@ -1350,6 +1352,22 @@ The server supports parsing and returning reasoning via the `reasoning_content` Reasoning input (preserve reasoning in history) is also supported by some specific templates. For more details, please refer to [PR#18994](https://github.com/ggml-org/llama.cpp/pull/18994). +### POST `/v1/chat/completions/control`: Control a running chat completion in real time + +Acts on an in-flight completion identified by its `id` (the `id` field streamed back by `/v1/chat/completions`). The request is processed in parallel with the SSE stream, so the client sends it while still reading tokens. + +*Options:* + +`id`: (Required) The chat completion id to act on. A completion that has already finished matches nothing and the call is a no-op. + +`action`: (Required) The control action to perform. Currently the only supported value is `reasoning_end`, which forces the end of the current reasoning block so the model moves on to the final answer. Requires `reasoning_control: true` on the original completion request. + +`model`: (Required in router mode) The model name, used to route the request to the right instance. Ignored in single model mode. + +**Response format** + +Returns a JSON object with a boolean `success` field, and an optional `message` field describing the reason when `success` is `false`. + ### POST `/v1/responses`: OpenAI-compatible Responses API *Options:* diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp index fb71792fe6..4c3f16a0a3 100644 --- a/tools/server/server-common.cpp +++ b/tools/server/server-common.cpp @@ -1132,6 +1132,7 @@ json oaicompat_chat_params_parse( llama_params["reasoning_budget_start_tag"] = chat_params.thinking_start_tag; llama_params["reasoning_budget_end_tag"] = chat_params.thinking_end_tag; llama_params["reasoning_budget_message"] = opt.reasoning_budget_message; + llama_params["reasoning_control"] = json_value(body, "reasoning_control", false); } } diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 44fca83c6b..fae73f09f8 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -1263,6 +1263,20 @@ private: return nullptr; } + server_slot * get_slot_by_cmpl_id(const std::string & cmpl_id) { + if (cmpl_id.empty()) { + return nullptr; + } + + for (server_slot & slot : slots) { + if (slot.is_processing() && slot.task && slot.task->params.oaicompat_cmpl_id == cmpl_id) { + return &slot; + } + } + + return nullptr; + } + server_slot * get_available_slot(const server_task & task) { server_slot * ret = nullptr; @@ -2114,6 +2128,37 @@ private: } } } break; + case SERVER_TASK_TYPE_CONTROL: + { + auto res = std::make_unique(); + res->id = task.id; + + server_slot * slot = get_slot_by_cmpl_id(task.params.control_cmpl_id); + if (slot == nullptr) { + res->success = false; + res->message = "no active completion for this id"; + queue_results.send(std::move(res)); + break; + } + + if (task.params.control_action == "reasoning_end") { + // the budget sampler only exists when reasoning control was armed + if (!slot->task->params.sampling.reasoning_control) { + res->success = false; + res->message = "reasoning control not enabled for this completion"; + queue_results.send(std::move(res)); + break; + } + // act on the live slot mid generation, never defer + common_sampler_reasoning_budget_force(slot->smpl.get()); + res->success = true; + } else { + res->success = false; + res->message = "unknown control action"; + } + + queue_results.send(std::move(res)); + } break; case SERVER_TASK_TYPE_NEXT_RESPONSE: { // do nothing @@ -4266,6 +4311,43 @@ void server_routes::init_routes() { TASK_RESPONSE_TYPE_OAI_CHAT); }; + this->post_control = [this](const server_http_req & req) { + auto res = create_response(); + const json body = json::parse(req.body); + + const std::string cmpl_id = json_value(body, "id", std::string()); + const std::string action = json_value(body, "action", std::string()); + if (cmpl_id.empty()) { + res->error(format_error_response("missing completion id", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + if (action != "reasoning_end") { + res->error(format_error_response("unknown control action", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + + auto & rd = res->rd; + { + server_task task(SERVER_TASK_TYPE_CONTROL); + task.id = rd.get_new_id(); + task.params.control_cmpl_id = cmpl_id; + task.params.control_action = action; + rd.post_task(std::move(task)); + } + + auto result = rd.next(req.should_stop); + if (!result) { + GGML_ASSERT(req.should_stop()); + return res; + } + if (result->is_error()) { + res->error(result->to_json()); + return res; + } + res->ok(result->to_json()); + return res; + }; + this->post_responses_oai = [this](const server_http_req & req) { auto res = create_response(); std::vector files; diff --git a/tools/server/server-context.h b/tools/server/server-context.h index 65853438c9..73caff54a4 100644 --- a/tools/server/server-context.h +++ b/tools/server/server-context.h @@ -110,6 +110,7 @@ struct server_routes { server_http_context::handler_t post_completions; server_http_context::handler_t post_completions_oai; server_http_context::handler_t post_chat_completions; + server_http_context::handler_t post_control; server_http_context::handler_t post_responses_oai; server_http_context::handler_t post_transcriptions_oai; server_http_context::handler_t post_anthropic_messages; diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index ff80be6ccb..33de2e4d9c 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -499,6 +499,7 @@ task_params server_task::params_from_json_cmpl( const auto end_tag = json_value(data, "reasoning_budget_end_tag", std::string()); const auto message = json_value(data, "reasoning_budget_message", std::string()); params.sampling.reasoning_budget_tokens = budget; + params.sampling.reasoning_control = json_value(data, "reasoning_control", false); if (!start_tag.empty()) { params.sampling.reasoning_budget_start = common_tokenize(vocab, start_tag, false, true); diff --git a/tools/server/server-task.h b/tools/server/server-task.h index d47dc690cf..bdadcff765 100644 --- a/tools/server/server-task.h +++ b/tools/server/server-task.h @@ -19,6 +19,7 @@ enum server_task_type { SERVER_TASK_TYPE_RERANK, SERVER_TASK_TYPE_INFILL, SERVER_TASK_TYPE_CANCEL, + SERVER_TASK_TYPE_CONTROL, SERVER_TASK_TYPE_NEXT_RESPONSE, SERVER_TASK_TYPE_METRICS, SERVER_TASK_TYPE_SLOT_SAVE, @@ -84,6 +85,10 @@ struct task_params { std::string oaicompat_model; std::string oaicompat_cmpl_id; + // realtime control (SERVER_TASK_TYPE_CONTROL) + std::string control_action; + std::string control_cmpl_id; + // per-request parameters for chat parsing common_chat_parser_params chat_parser_params; @@ -551,6 +556,19 @@ struct server_task_result_slot_erase : server_task_result { virtual json to_json() override; }; +struct server_task_result_control : server_task_result { + bool success = false; + std::string message; // optional detail when success is false + + virtual json to_json() override { + json out = json { { "success", success } }; + if (!message.empty()) { + out["message"] = message; + } + return out; + } +}; + struct server_task_result_get_lora : server_task_result { struct lora { common_adapter_lora_info info; diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 4d56d45e83..769e80a802 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -149,6 +149,7 @@ int llama_server(int argc, char ** argv) { routes.post_completions = models_routes->proxy_post; routes.post_completions_oai = models_routes->proxy_post; routes.post_chat_completions = models_routes->proxy_post; + routes.post_control = models_routes->proxy_post; routes.post_responses_oai = models_routes->proxy_post; routes.post_transcriptions_oai = models_routes->proxy_post; routes.post_anthropic_messages = models_routes->proxy_post; @@ -185,6 +186,7 @@ int llama_server(int argc, char ** argv) { ctx_http.post("/v1/completions", ex_wrapper(routes.post_completions_oai)); ctx_http.post("/chat/completions", ex_wrapper(routes.post_chat_completions)); ctx_http.post("/v1/chat/completions", ex_wrapper(routes.post_chat_completions)); + ctx_http.post("/v1/chat/completions/control", ex_wrapper(routes.post_control)); ctx_http.post("/v1/responses", ex_wrapper(routes.post_responses_oai)); ctx_http.post("/responses", ex_wrapper(routes.post_responses_oai)); ctx_http.post("/v1/audio/transcriptions", ex_wrapper(routes.post_transcriptions_oai)); diff --git a/tools/ui/src/lib/components/app/chat/ChatForm/ChatForm.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatForm.svelte index 46ac823349..99faecb6b4 100644 --- a/tools/ui/src/lib/components/app/chat/ChatForm/ChatForm.svelte +++ b/tools/ui/src/lib/components/app/chat/ChatForm/ChatForm.svelte @@ -541,6 +541,7 @@ canSend={canSubmit} {disabled} {isLoading} + isReasoning={chatStore.isReasoning} {isRecording} {showAddButton} {showModelSelector} diff --git a/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActions.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActions.svelte index a94293dd95..627107ef50 100644 --- a/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActions.svelte +++ b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActions.svelte @@ -1,6 +1,7 @@
{/if} + {#if isReasoning} + + {/if} + {#if isLoading && !canSubmit}