From 354ebac8cb92e93eb6f22bd507d4249b6846b90d Mon Sep 17 00:00:00 2001
From: Pascal <admin@serveurperso.com>
Date: Tue, 2 Jun 2026 07:26:20 +0200
Subject: [PATCH] server: real-time reasoning interruption via control endpoint
 (#23971)

* server: real-time reasoning interruption via control endpoint

Builds on the manual reasoning budget trigger from #23949. Adds a
CONTROL task that mirrors the CANCEL path on the live slot and calls
common_sampler_reasoning_budget_force to end thinking mid-generation.
POST /v1/chat/completions/control with { id_slot, action }; the opt-in
reasoning_control flag arms the budget sampler on demand. Works in both
router and single model mode. Minimal WebUI button as a skeleton for
further UI work.

* ui: track reasoning phase via explicit streaming state

Add isReasoning to the chat store, mirroring the isLoading pattern:
per conversation map, private setter, public accessor and reactive
export. Set from the stream callbacks, true on reasoning chunks, false
on the first content chunk, reset on stream end and resynced on
conversation switch. The skip button now keys off isReasoning so it
shows only during the thinking phase, not the whole generation.

* ui: extract control endpoint and action into constants

Move the chat completion routes, the slots route and the reasoning
control action out of chat.service into api-endpoints and a dedicated
control-actions module. No behavior change, drops the magic strings so
the control protocol has a single source of truth.

* server: target reasoning control by completion id

Address @ngxson review on the control endpoint.

Switch from id_slot to the chat completion id to avoid a TOCTOU: the
slot can be reassigned between the lookup and the control request, so
matching the live completion (oaicompat_cmpl_id) is safe and a finished
one simply matches nothing. Rename the action to reasoning_end, guard
it on the reasoning_control flag of the target slot, and reduce the
response to {success} with an optional message.

* ui: target reasoning control by completion id

Keep the streamed completion id on the message and post it back to the
control endpoint instead of probing /slots. Drops the slot discovery
and the TOCTOU that came with it. Action renamed to reasoning_end,
response read as {success}.

* server: address review from @ngxson

Move the control fields into task_params and drop the redundant
comments on the control path.

* server: document the reasoning control endpoint

* Update tools/ui/src/lib/types/database.d.ts

Co-authored-by: Aleksander Grygier <aleksander.grygier@gmail.com>

* ui: rename cmplId to completionId

Per @allozaur review, clearer name for the streamed completion id.

* ui: wire completion id capture through the agentic flow

The webui streams through the agentic flow, which relayed onModel but
not onCompletionId, so the completion id never reached the message and
the control request was never sent. Relay it through the flow and its
callbacks type, declare id on the chunk type, and log an explicit error
when the button fires without a usable id.

* ui: target reasoning control model from the message

The model is a property of the completion, so read it from the streaming
message like the id, not from the model dropdown which is unrelated UI
state. Makes the request self-consistent by construction instead of just
unlikely to drift.

---------

Co-authored-by: Aleksander Grygier <aleksander.grygier@gmail.com>
---
 common/common.h                               |  1 +
 common/sampling.cpp                           |  2 +-
 tools/server/README.md                        | 18 ++++
 tools/server/server-common.cpp                |  1 +
 tools/server/server-context.cpp               | 82 +++++++++++++++++++
 tools/server/server-context.h                 |  1 +
 tools/server/server-task.cpp                  |  1 +
 tools/server/server-task.h                    | 18 ++++
 tools/server/server.cpp                       |  2 +
 .../app/chat/ChatForm/ChatForm.svelte         |  1 +
 .../ChatFormActions/ChatFormActions.svelte    | 25 +++++-
 tools/ui/src/lib/constants/api-endpoints.ts   | 11 +++
 tools/ui/src/lib/constants/control-actions.ts |  7 ++
 tools/ui/src/lib/constants/index.ts           |  1 +
 tools/ui/src/lib/services/chat.service.ts     | 67 ++++++++++++++-
 tools/ui/src/lib/stores/agentic.svelte.ts     |  2 +
 tools/ui/src/lib/stores/chat.svelte.ts        | 37 +++++++++
 tools/ui/src/lib/types/agentic.d.ts           |  1 +
 tools/ui/src/lib/types/api.d.ts               |  1 +
 tools/ui/src/lib/types/chat.d.ts              |  1 +
 tools/ui/src/lib/types/database.d.ts          |  2 +
 tools/ui/src/lib/types/settings.d.ts          |  1 +
 22 files changed, 277 insertions(+), 6 deletions(-)
 create mode 100644 tools/ui/src/lib/constants/control-actions.ts

diff --git a/common/common.h b/common/common.h
index 92064a0e4d..f2c7ee027b 100644
--- a/common/common.h
+++ b/common/common.h
@@ -277,6 +277,7 @@ struct common_params_sampling {
     std::vector<llama_token> reasoning_budget_end;             // end tag token sequence
     std::vector<llama_token> reasoning_budget_forced;          // forced sequence (message + end tag)
     std::string              reasoning_budget_message;         // message injected before end tag when budget exhausted
+    bool                     reasoning_control = false;        // create the budget sampler on demand so reasoning can be ended at runtime
 
     bool backend_sampling = false;
 
diff --git a/common/sampling.cpp b/common/sampling.cpp
index bbfa9a9ecd..85f8ed50b3 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -293,7 +293,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
     }
 
     // reasoning budget sampler (skip when budget is unlimited unless a lazy grammar is active, which needs rbudget for thinking-block suppression)
-    if (!params.reasoning_budget_start.empty() && !params.reasoning_budget_end.empty() && (params.grammar_lazy || params.reasoning_budget_tokens >= 0)) {
+    if (!params.reasoning_budget_start.empty() && !params.reasoning_budget_end.empty() && (params.grammar_lazy || params.reasoning_budget_tokens >= 0 || params.reasoning_control)) {
         rbudget = common_reasoning_budget_init(
             vocab,
             params.reasoning_budget_start,
diff --git a/tools/server/README.md b/tools/server/README.md
index df30ca6464..f1eeec36aa 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -1244,6 +1244,8 @@ The `response_format` parameter supports both plain JSON output (e.g. `{"type":
 
 `reasoning_format`: The reasoning format to be parsed. If set to `none`, it will output the raw generated text.
 
+`reasoning_control`: Enables real-time reasoning control for this completion, so that its reasoning can be ended early via `POST /v1/chat/completions/control`. Defaults to `false`.
+
 `generation_prompt`: The generation prompt that was prefilled in by the template. Prepended to model output before parsing.
 
 `parse_tool_calls`: Whether to parse the generated tool call.
@@ -1350,6 +1352,22 @@ The server supports parsing and returning reasoning via the `reasoning_content`
 
 Reasoning input (preserve reasoning in history) is also supported by some specific templates. For more details, please refer to [PR#18994](https://github.com/ggml-org/llama.cpp/pull/18994).
 
+### POST `/v1/chat/completions/control`: Control a running chat completion in real time
+
+Acts on an in-flight completion identified by its `id` (the `id` field streamed back by `/v1/chat/completions`). The request is processed in parallel with the SSE stream, so the client sends it while still reading tokens.
+
+*Options:*
+
+`id`: (Required) The chat completion id to act on. A completion that has already finished matches nothing: the call changes nothing and returns `success: false`.
+
+`action`: (Required) The control action to perform. Currently the only supported value is `reasoning_end`, which forces the end of the current reasoning block so the model moves on to the final answer. Requires `reasoning_control: true` on the original completion request.
+
+`model`: (Required in router mode) The model name, used to route the request to the right instance. Ignored in single model mode.
+
+**Response format**
+
+Returns a JSON object with a boolean `success` field, and an optional `message` field describing the reason when `success` is `false`.
+
 ### POST `/v1/responses`: OpenAI-compatible Responses API
 
 *Options:*
diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp
index fb71792fe6..4c3f16a0a3 100644
--- a/tools/server/server-common.cpp
+++ b/tools/server/server-common.cpp
@@ -1132,6 +1132,7 @@ json oaicompat_chat_params_parse(
             llama_params["reasoning_budget_start_tag"] = chat_params.thinking_start_tag;
             llama_params["reasoning_budget_end_tag"] = chat_params.thinking_end_tag;
             llama_params["reasoning_budget_message"] = opt.reasoning_budget_message;
+            llama_params["reasoning_control"] = json_value(body, "reasoning_control", false);
         }
     }
 
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 44fca83c6b..fae73f09f8 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -1263,6 +1263,20 @@ private:
         return nullptr;
     }
 
+    // returns the slot currently processing the completion with this id, or nullptr when none is live
+    server_slot * get_slot_by_cmpl_id(const std::string & cmpl_id) {
+        if (cmpl_id.empty()) {
+            return nullptr;
+        }
+        for (auto & candidate : slots) {
+            const bool live = candidate.is_processing() && candidate.task;
+            if (live && candidate.task->params.oaicompat_cmpl_id == cmpl_id) {
+                return &candidate;
+            }
+        }
+        return nullptr;
+    }
+
     server_slot * get_available_slot(const server_task & task) {
         server_slot * ret = nullptr;
 
@@ -2114,6 +2128,37 @@ private:
                         }
                     }
                 } break;
+            case SERVER_TASK_TYPE_CONTROL:
+                {
+                    auto res = std::make_unique<server_task_result_control>();
+                    res->id = task.id;
+
+                    server_slot * slot = get_slot_by_cmpl_id(task.params.control_cmpl_id);
+                    if (slot == nullptr) {
+                        res->success = false;
+                        res->message = "no active completion for this id";
+                        queue_results.send(std::move(res));
+                        break;
+                    }
+
+                    if (task.params.control_action == "reasoning_end") {
+                        // the budget sampler only exists when reasoning control was armed
+                        if (!slot->task->params.sampling.reasoning_control) {
+                            res->success = false;
+                            res->message = "reasoning control not enabled for this completion";
+                            queue_results.send(std::move(res));
+                            break;
+                        }
+                        // act on the live slot mid generation, never defer
+                        common_sampler_reasoning_budget_force(slot->smpl.get());
+                        res->success = true;
+                    } else {
+                        res->success = false;
+                        res->message = "unknown control action";
+                    }
+
+                    queue_results.send(std::move(res));
+                } break;
             case SERVER_TASK_TYPE_NEXT_RESPONSE:
                 {
                     // do nothing
@@ -4266,6 +4311,43 @@ void server_routes::init_routes() {
             TASK_RESPONSE_TYPE_OAI_CHAT);
     };
 
+    this->post_control = [this](const server_http_req & req) {
+        auto res = create_response();
+        const json body = json::parse(req.body);
+        // validate both fields up front so a malformed request is rejected before any task is posted
+        const std::string cmpl_id = json_value(body, "id", std::string());
+        const std::string action  = json_value(body, "action", std::string());
+        if (cmpl_id.empty()) {
+            res->error(format_error_response("missing completion id", ERROR_TYPE_INVALID_REQUEST));
+            return res;
+        }
+        if (action != "reasoning_end") {
+            res->error(format_error_response("unknown control action", ERROR_TYPE_INVALID_REQUEST));
+            return res;
+        }
+        // post a SERVER_TASK_TYPE_CONTROL task carrying the target id and action through rd
+        auto & rd = res->rd;
+        {
+            server_task task(SERVER_TASK_TYPE_CONTROL);
+            task.id              = rd.get_new_id();
+            task.params.control_cmpl_id = cmpl_id;
+            task.params.control_action  = action;
+            rd.post_task(std::move(task));
+        }
+        // wait for the result, a null result is only expected once req.should_stop() reports true
+        auto result = rd.next(req.should_stop);
+        if (!result) {
+            GGML_ASSERT(req.should_stop());
+            return res;
+        }
+        if (result->is_error()) {
+            res->error(result->to_json());
+            return res;
+        }
+        res->ok(result->to_json());
+        return res;
+    };
+
     this->post_responses_oai = [this](const server_http_req & req) {
         auto res = create_response();
         std::vector<raw_buffer> files;
diff --git a/tools/server/server-context.h b/tools/server/server-context.h
index 65853438c9..73caff54a4 100644
--- a/tools/server/server-context.h
+++ b/tools/server/server-context.h
@@ -110,6 +110,7 @@ struct server_routes {
     server_http_context::handler_t post_completions;
     server_http_context::handler_t post_completions_oai;
     server_http_context::handler_t post_chat_completions;
+    server_http_context::handler_t post_control;
     server_http_context::handler_t post_responses_oai;
     server_http_context::handler_t post_transcriptions_oai;
     server_http_context::handler_t post_anthropic_messages;
diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp
index ff80be6ccb..33de2e4d9c 100644
--- a/tools/server/server-task.cpp
+++ b/tools/server/server-task.cpp
@@ -499,6 +499,7 @@ task_params server_task::params_from_json_cmpl(
         const auto end_tag   = json_value(data, "reasoning_budget_end_tag", std::string());
         const auto message   = json_value(data, "reasoning_budget_message", std::string());
         params.sampling.reasoning_budget_tokens = budget;
+        params.sampling.reasoning_control = json_value(data, "reasoning_control", false);
 
         if (!start_tag.empty()) {
             params.sampling.reasoning_budget_start = common_tokenize(vocab, start_tag, false, true);
diff --git a/tools/server/server-task.h b/tools/server/server-task.h
index d47dc690cf..bdadcff765 100644
--- a/tools/server/server-task.h
+++ b/tools/server/server-task.h
@@ -19,6 +19,7 @@ enum server_task_type {
     SERVER_TASK_TYPE_RERANK,
     SERVER_TASK_TYPE_INFILL,
     SERVER_TASK_TYPE_CANCEL,
+    SERVER_TASK_TYPE_CONTROL,
     SERVER_TASK_TYPE_NEXT_RESPONSE,
     SERVER_TASK_TYPE_METRICS,
     SERVER_TASK_TYPE_SLOT_SAVE,
@@ -84,6 +85,10 @@ struct task_params {
     std::string        oaicompat_model;
     std::string        oaicompat_cmpl_id;
 
+    // realtime control (SERVER_TASK_TYPE_CONTROL)
+    std::string        control_action;
+    std::string        control_cmpl_id;
+
     // per-request parameters for chat parsing
     common_chat_parser_params chat_parser_params;
 
@@ -551,6 +556,19 @@ struct server_task_result_slot_erase : server_task_result {
     virtual json to_json() override;
 };
 
+// outcome of a SERVER_TASK_TYPE_CONTROL task, serialized as "success" plus "message" when one is set
+struct server_task_result_control : server_task_result {
+    bool        success = false;
+    std::string message; // failure detail, left out of the JSON when empty
+
+    virtual json to_json() override {
+        if (message.empty()) {
+            return json { { "success", success } };
+        }
+        return json { { "success", success }, { "message", message } };
+    }
+};
+
 struct server_task_result_get_lora : server_task_result {
     struct lora {
         common_adapter_lora_info info;
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 4d56d45e83..769e80a802 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -149,6 +149,7 @@ int llama_server(int argc, char ** argv) {
         routes.post_completions            = models_routes->proxy_post;
         routes.post_completions_oai        = models_routes->proxy_post;
         routes.post_chat_completions       = models_routes->proxy_post;
+        routes.post_control                = models_routes->proxy_post;
         routes.post_responses_oai          = models_routes->proxy_post;
         routes.post_transcriptions_oai     = models_routes->proxy_post;
         routes.post_anthropic_messages     = models_routes->proxy_post;
@@ -185,6 +186,7 @@ int llama_server(int argc, char ** argv) {
     ctx_http.post("/v1/completions",           ex_wrapper(routes.post_completions_oai));
     ctx_http.post("/chat/completions",         ex_wrapper(routes.post_chat_completions));
     ctx_http.post("/v1/chat/completions",      ex_wrapper(routes.post_chat_completions));
+    ctx_http.post("/v1/chat/completions/control", ex_wrapper(routes.post_control));
     ctx_http.post("/v1/responses",             ex_wrapper(routes.post_responses_oai));
     ctx_http.post("/responses",                ex_wrapper(routes.post_responses_oai));
     ctx_http.post("/v1/audio/transcriptions",  ex_wrapper(routes.post_transcriptions_oai));
diff --git a/tools/ui/src/lib/components/app/chat/ChatForm/ChatForm.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatForm.svelte
index 46ac823349..99faecb6b4 100644
--- a/tools/ui/src/lib/components/app/chat/ChatForm/ChatForm.svelte
+++ b/tools/ui/src/lib/components/app/chat/ChatForm/ChatForm.svelte
@@ -541,6 +541,7 @@
 				canSend={canSubmit}
 				{disabled}
 				{isLoading}
+				isReasoning={chatStore.isReasoning}
 				{isRecording}
 				{showAddButton}
 				{showModelSelector}
diff --git a/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActions.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActions.svelte
index a94293dd95..627107ef50 100644
--- a/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActions.svelte
+++ b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActions.svelte
@@ -1,6 +1,7 @@
 <script lang="ts">
-	import { Square } from '@lucide/svelte';
+	import { Square, SkipForward } from '@lucide/svelte';
 	import { Button } from '$lib/components/ui/button';
+	import { ChatService } from '$lib/services';
 	import {
 		ChatFormActionsAdd,
 		ChatFormActionModels,
@@ -21,6 +22,7 @@
 		class?: string;
 		disabled?: boolean;
 		isLoading?: boolean;
+		isReasoning?: boolean;
 		isRecording?: boolean;
 		showAddButton?: boolean;
 		showModelSelector?: boolean;
@@ -39,6 +41,7 @@
 		class: className = '',
 		disabled = false,
 		isLoading = false,
+		isReasoning = false,
 		isRecording = false,
 		showAddButton = true,
 		showModelSelector = true,
@@ -84,6 +87,11 @@
 	export function openModelSelector() {
 		selectorModelRef?.open();
 	}
+	// the streaming assistant message carries both the completion id and the model that
+	// produced it, targeting reasoning control from the same source keeps them consistent
+	let activeMessage = $derived(
+		conversationsStore.activeMessages[conversationsStore.activeMessages.length - 1]
+	);
 </script>
 
 <div
@@ -123,6 +131,21 @@
 		/>
 	{/if}
 
+	{#if isReasoning}
+		<Button
+			type="button"
+			variant="secondary"
+			onclick={() =>
+				ChatService.stopReasoning(activeMessage?.completionId ?? '', activeMessage?.model)}
+			class="group h-8 w-8 rounded-full p-0"
+			title="Skip reasoning"
+		>
+			<span class="sr-only">Skip reasoning</span>
+
+			<SkipForward class="h-4 w-4 stroke-muted-foreground group-hover:stroke-foreground" />
+		</Button>
+	{/if}
+
 	{#if isLoading && !canSubmit}
 		<Button
 			type="button"
diff --git a/tools/ui/src/lib/constants/api-endpoints.ts b/tools/ui/src/lib/constants/api-endpoints.ts
index f89ebe4213..9eb6c74e75 100644
--- a/tools/ui/src/lib/constants/api-endpoints.ts
+++ b/tools/ui/src/lib/constants/api-endpoints.ts
@@ -4,6 +4,17 @@ export const API_MODELS = {
 	UNLOAD: '/models/unload'
 };
 
+// chat completion routes, the control route drives realtime inference (e.g. end reasoning)
+export const API_CHAT = {
+	COMPLETIONS: './v1/chat/completions',
+	CONTROL: './v1/chat/completions/control'
+};
+
+// slot introspection, requires the --slots flag on the server
+export const API_SLOTS = {
+	LIST: './slots'
+};
+
 export const API_TOOLS = {
 	LIST: '/tools',
 	EXECUTE: '/tools'
diff --git a/tools/ui/src/lib/constants/control-actions.ts b/tools/ui/src/lib/constants/control-actions.ts
new file mode 100644
index 0000000000..935ae9542a
--- /dev/null
+++ b/tools/ui/src/lib/constants/control-actions.ts
@@ -0,0 +1,7 @@
+// action verbs accepted in the `action` field of the realtime control endpoint (API_CHAT.CONTROL)
+// kept in their own module, apart from the endpoint paths, because they are protocol level values
+export const CONTROL_ACTION = {
+	END_REASONING: 'reasoning_end'
+} as const;
+/** Union of the action strings listed in CONTROL_ACTION. */
+export type ControlAction = (typeof CONTROL_ACTION)[keyof typeof CONTROL_ACTION];
diff --git a/tools/ui/src/lib/constants/index.ts b/tools/ui/src/lib/constants/index.ts
index d3a4348024..eb85910370 100644
--- a/tools/ui/src/lib/constants/index.ts
+++ b/tools/ui/src/lib/constants/index.ts
@@ -15,6 +15,7 @@ export * from './cli-flags';
 export * from './code-blocks';
 export * from './code';
 export * from './context-keys';
+export * from './control-actions';
 export * from './css-classes';
 export * from './floating-ui-constraints';
 export * from './formatters';
diff --git a/tools/ui/src/lib/services/chat.service.ts b/tools/ui/src/lib/services/chat.service.ts
index d6c7e36d70..09d616bd76 100644
--- a/tools/ui/src/lib/services/chat.service.ts
+++ b/tools/ui/src/lib/services/chat.service.ts
@@ -6,7 +6,10 @@ import {
 	ATTACHMENT_LABEL_MCP_PROMPT,
 	ATTACHMENT_LABEL_MCP_RESOURCE,
 	LEGACY_AGENTIC_REGEX,
-	SETTINGS_KEYS
+	SETTINGS_KEYS,
+	API_CHAT,
+	API_SLOTS,
+	CONTROL_ACTION
 } from '$lib/constants';
 import {
 	AttachmentType,
@@ -126,6 +129,7 @@ export class ChatService {
 			onReasoningChunk,
 			onToolCallChunk,
 			onModel,
+			onCompletionId,
 			onTimings,
 			// Tools for function calling
 			tools,
@@ -239,6 +243,9 @@ export class ChatService {
 			? ReasoningFormat.NONE
 			: ReasoningFormat.AUTO;
 
+		// arms the budget sampler so reasoning can be ended at runtime via the control endpoint
+		requestBody.reasoning_control = true;
+
 		if (continueFinalMessage) {
 			requestBody.continue_final_message = true;
 			requestBody.add_generation_prompt = false;
@@ -289,7 +296,7 @@ export class ChatService {
 		}
 
 		try {
-			const response = await fetch(`./v1/chat/completions`, {
+			const response = await fetch(API_CHAT.COMPLETIONS, {
 				method: 'POST',
 				headers: getJsonHeaders(),
 				body: JSON.stringify(requestBody),
@@ -315,6 +322,7 @@ export class ChatService {
 					onReasoningChunk,
 					onToolCallChunk,
 					onModel,
+					onCompletionId,
 					onTimings,
 					conversationId,
 					signal
@@ -379,7 +387,7 @@ export class ChatService {
 	 */
 	static async areAllSlotsIdle(model?: string | null, signal?: AbortSignal): Promise<boolean> {
 		try {
-			const url = model ? `./slots?model=${encodeURIComponent(model)}` : './slots';
+			const url = model ? `${API_SLOTS.LIST}?model=${encodeURIComponent(model)}` : API_SLOTS.LIST;
 			const res = await fetch(url, { signal });
 			if (!res.ok) return true;
 
@@ -390,6 +398,50 @@ export class ChatService {
 		}
 	}
 
+	/**
+	 * Asks the server to end the reasoning block of an in-flight completion.
+	 * A completion that already finished matches nothing server side and yields false.
+	 * @param completionId - chat completion id streamed back as `id`, an empty id is rejected locally
+	 * @param model - sent along when set so the router can forward to the right child
+	 * @returns true only when the response is ok and carries `success: true`, false otherwise
+	 */
+	static async stopReasoning(completionId: string, model?: string | null): Promise<boolean> {
+		if (!completionId) {
+			console.error(
+				'stopReasoning: no completion id for the active message, cannot target the running completion'
+			);
+			return false;
+		}
+
+		// the model is attached only when set, single model servers ignore it
+		const payload: Record<string, unknown> = model
+			? { id: completionId, action: CONTROL_ACTION.END_REASONING, model }
+			: { id: completionId, action: CONTROL_ACTION.END_REASONING };
+
+		try {
+			const response = await fetch(API_CHAT.CONTROL, {
+				method: 'POST',
+				headers: getJsonHeaders(),
+				body: JSON.stringify(payload)
+			});
+
+			// a body that is not valid JSON counts as a failed request instead of throwing
+			const reply = await response.json().catch(() => null);
+			const succeeded = response.ok && reply?.success === true;
+			if (succeeded) return true;
+
+			console.error('stopReasoning: control request failed', {
+				status: response.status,
+				completionId,
+				response: reply
+			});
+			return false;
+		} catch (error) {
+			console.error('stopReasoning: control request threw', { completionId, error });
+			return false;
+		}
+	}
+
 	/**
 	 * Sends a fire-and-forget request to pre-encode the conversation in the server's KV cache.
 	 * After a response completes, this re-submits the full conversation
@@ -457,7 +509,7 @@ export class ChatService {
 		}
 
 		try {
-			await fetch(`./v1/chat/completions`, {
+			await fetch(API_CHAT.COMPLETIONS, {
 				method: 'POST',
 				headers: getJsonHeaders(),
 				body: JSON.stringify(requestBody),
@@ -502,6 +554,7 @@ export class ChatService {
 		onReasoningChunk?: (chunk: string) => void,
 		onToolCallChunk?: (chunk: string) => void,
 		onModel?: (model: string) => void,
+		onCompletionId?: (id: string) => void,
 		onTimings?: (timings?: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => void,
 		conversationId?: string,
 		abortSignal?: AbortSignal
@@ -519,6 +572,7 @@ export class ChatService {
 		let lastTimings: ChatMessageTimings | undefined;
 		let streamFinished = false;
 		let modelEmitted = false;
+		let idEmitted = false;
 		let toolCallIndexOffset = 0;
 		let hasOpenToolCallBatch = false;
 
@@ -603,6 +657,11 @@ export class ChatService {
 								onModel?.(chunkModel);
 							}
 
+							if (parsed.id && !idEmitted) {
+								idEmitted = true;
+								onCompletionId?.(parsed.id);
+							}
+
 							if (promptProgress) {
 								ChatService.notifyTimings(undefined, promptProgress, onTimings);
 							}
diff --git a/tools/ui/src/lib/stores/agentic.svelte.ts b/tools/ui/src/lib/stores/agentic.svelte.ts
index 4866995b4a..947737d7c1 100644
--- a/tools/ui/src/lib/stores/agentic.svelte.ts
+++ b/tools/ui/src/lib/stores/agentic.svelte.ts
@@ -488,6 +488,7 @@ class AgenticStore {
 			onToolCallsStreaming,
 			onAttachments,
 			onModel,
+			onCompletionId,
 			onAssistantTurnComplete,
 			createToolResultMessage,
 			createAssistantMessage,
@@ -597,6 +598,7 @@ class AgenticStore {
 							}
 						},
 						onModel,
+						onCompletionId,
 						onTimings: (timings?: ChatMessageTimings, progress?: ChatMessagePromptProgress) => {
 							onTimings?.(timings, progress);
 							if (timings) {
diff --git a/tools/ui/src/lib/stores/chat.svelte.ts b/tools/ui/src/lib/stores/chat.svelte.ts
index f2f13f25dc..667fb417c3 100644
--- a/tools/ui/src/lib/stores/chat.svelte.ts
+++ b/tools/ui/src/lib/stores/chat.svelte.ts
@@ -63,7 +63,10 @@ class ChatStore {
 	currentResponse = $state('');
 	errorDialogState = $state<ErrorDialogState | null>(null);
 	isLoading = $state(false);
+	// true while the active conversation streams reasoning content but no visible content yet
+	isReasoning = $state(false);
 	chatLoadingStates = new SvelteMap<string, boolean>();
+	chatReasoningStates = new SvelteMap<string, boolean>();
 	chatStreamingStates = new SvelteMap<string, { response: string; messageId: string }>();
 	private abortControllers = new SvelteMap<string, AbortController>();
 	private preEncodeAbortController: AbortController | null = null;
@@ -94,6 +97,17 @@ class ChatStore {
 		} else {
 			this.chatLoadingStates.delete(convId);
 			if (convId === conversationsStore.activeConversation?.id) this.isLoading = false;
+			this.setChatReasoning(convId, false);
+		}
+	}
+
+	private setChatReasoning(convId: string, reasoning: boolean): void {
+		// only conversations that are currently reasoning keep an entry in the map
+		if (reasoning) this.chatReasoningStates.set(convId, true);
+		else this.chatReasoningStates.delete(convId);
+		// the reactive isReasoning flag mirrors the active conversation only
+		if (convId === conversationsStore.activeConversation?.id) {
+			this.isReasoning = reasoning;
 		}
 	}
 	private setChatStreaming(convId: string, response: string, messageId: string): void {
@@ -110,6 +124,7 @@ class ChatStore {
 	}
 	syncLoadingStateForChat(convId: string): void {
 		this.isLoading = this.chatLoadingStates.get(convId) || false;
+		this.isReasoning = this.chatReasoningStates.get(convId) || false;
 		const s = this.chatStreamingStates.get(convId);
 		this.currentResponse = s?.response || '';
 		this.isStreamingActive = s !== undefined;
@@ -265,6 +280,10 @@ class ChatStore {
 		return this.chatLoadingStates.get(convId) || false;
 	}
 
+	/** Whether the given conversation has a reasoning entry in chatReasoningStates, false when absent. */
+	isChatReasoningPublic(convId: string): boolean {
+		return this.chatReasoningStates.get(convId) ?? false;
+	}
 	private isChatLoadingInternal(convId: string): boolean {
 		return this.chatLoadingStates.has(convId) || this.chatStreamingStates.has(convId);
 	}
@@ -655,6 +674,17 @@ class ChatStore {
 			}
 		};
 
+		let completionIdRecorded = false;
+		const recordCompletionId = (id: string): void => {
+			if (!id || completionIdRecorded) return;
+			completionIdRecorded = true;
+			const idx = conversationsStore.findMessageIndex(currentMessageId);
+			conversationsStore.updateMessageAtIndex(idx, { completionId: id });
+			DatabaseService.updateMessage(currentMessageId, { completionId: id }).catch(() => {
+				completionIdRecorded = false;
+			});
+		};
+
 		const updateStreamingUI = () => {
 			this.setChatStreaming(convId, streamedContent, currentMessageId);
 			const idx = conversationsStore.findMessageIndex(currentMessageId);
@@ -676,6 +706,7 @@ class ChatStore {
 			onChunk: (chunk: string) => {
 				streamedContent += chunk;
 				updateStreamingUI();
+				this.setChatReasoning(convId, false);
 			},
 			onReasoningChunk: (chunk: string) => {
 				streamedReasoningContent += chunk;
@@ -685,6 +716,7 @@ class ChatStore {
 				conversationsStore.updateMessageAtIndex(idx, {
 					reasoningContent: streamedReasoningContent
 				});
+				this.setChatReasoning(convId, true);
 			},
 			onToolCallsStreaming: (toolCalls) => {
 				const idx = conversationsStore.findMessageIndex(currentMessageId);
@@ -702,6 +734,7 @@ class ChatStore {
 				DatabaseService.updateMessage(messageId, { extra: updatedExtras }).catch(console.error);
 			},
 			onModel: (modelName: string) => recordModel(modelName),
+			onCompletionId: (id: string) => recordCompletionId(id),
 			onTurnComplete: (intermediateTimings: ChatMessageTimings) => {
 				// Update the first assistant message with cumulative agentic timings
 				const idx = conversationsStore.findMessageIndex(assistantMessage.id);
@@ -887,6 +920,7 @@ class ChatStore {
 				onChunk: streamCallbacks.onChunk,
 				onReasoningChunk: streamCallbacks.onReasoningChunk,
 				onModel: streamCallbacks.onModel,
+				onCompletionId: streamCallbacks.onCompletionId,
 				onTimings: streamCallbacks.onTimings,
 				onComplete: async (
 					finalContent?: string,
@@ -1373,6 +1407,7 @@ class ChatStore {
 						appendedContent += chunk;
 						hasReceivedContent = true;
 						updateStreamingContent(originalContent + appendedContent);
+						this.setChatReasoning(msg.convId, false);
 					},
 					onReasoningChunk: (chunk: string) => {
 						appendedReasoning += chunk;
@@ -1382,6 +1417,7 @@ class ChatStore {
 						conversationsStore.updateMessageAtIndex(idx, {
 							reasoningContent: originalReasoning + appendedReasoning
 						});
+						this.setChatReasoning(msg.convId, true);
 					},
 					onTimings: (timings?: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => {
 						const tokensPerSecond =
@@ -1924,6 +1960,7 @@ export const isChatLoading = (convId: string) => chatStore.isChatLoadingPublic(c
 export const isChatStreaming = () => chatStore.isStreaming();
 export const isEditing = () => chatStore.isEditing();
 export const isLoading = () => chatStore.isLoading;
+export const isReasoning = () => chatStore.isReasoning;
 export const pendingEditMessageId = () => chatStore.pendingEditMessageId;
 export const chatHasPendingMessage = (convId: string) => chatStore.hasPendingMessage(convId);
 export const chatPendingMessageContent = (convId: string) =>
diff --git a/tools/ui/src/lib/types/agentic.d.ts b/tools/ui/src/lib/types/agentic.d.ts
index b94998384f..bcec10c671 100644
--- a/tools/ui/src/lib/types/agentic.d.ts
+++ b/tools/ui/src/lib/types/agentic.d.ts
@@ -93,6 +93,7 @@ export interface AgenticFlowCallbacks {
 	onAttachments?: (messageId: string, extras: DatabaseMessageExtra[]) => void;
 	/** Model name detected from response */
 	onModel?: (model: string) => void;
+	onCompletionId?: (id: string) => void;
 	/** Current assistant turn's streaming is complete - save to DB */
 	onAssistantTurnComplete?: (
 		content: string,
diff --git a/tools/ui/src/lib/types/api.d.ts b/tools/ui/src/lib/types/api.d.ts
index c780351801..f620d67351 100644
--- a/tools/ui/src/lib/types/api.d.ts
+++ b/tools/ui/src/lib/types/api.d.ts
@@ -271,6 +271,7 @@ export interface ApiChatCompletionToolCall extends ApiChatCompletionToolCallDelt
 }
 
 export interface ApiChatCompletionStreamChunk {
+	id?: string;
 	object?: string;
 	model?: string;
 	choices: Array<{
diff --git a/tools/ui/src/lib/types/chat.d.ts b/tools/ui/src/lib/types/chat.d.ts
index acedd0769a..b2158807d4 100644
--- a/tools/ui/src/lib/types/chat.d.ts
+++ b/tools/ui/src/lib/types/chat.d.ts
@@ -97,6 +97,7 @@ export interface ChatStreamCallbacks {
 	onToolCallsStreaming?: (toolCalls: ApiChatCompletionToolCall[]) => void;
 	onAttachments?: (messageId: string, extras: DatabaseMessageExtra[]) => void;
 	onModel?: (model: string) => void;
+	onCompletionId?: (id: string) => void;
 	onTimings?: (timings?: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => void;
 	onAssistantTurnComplete?: (
 		content: string,
diff --git a/tools/ui/src/lib/types/database.d.ts b/tools/ui/src/lib/types/database.d.ts
index 02bdb3ee60..aecc250e81 100644
--- a/tools/ui/src/lib/types/database.d.ts
+++ b/tools/ui/src/lib/types/database.d.ts
@@ -112,6 +112,8 @@ export interface DatabaseMessage {
 	reasoningContent?: string;
 	/** Serialized JSON array of tool calls made by assistant messages */
 	toolCalls?: string;
+	/** Chat completion id streamed by the server, used to target realtime control (e.g. end reasoning) */
+	completionId?: string;
 	/** Tool call ID for tool result messages (role: 'tool') */
 	toolCallId?: string;
 	children: string[];
diff --git a/tools/ui/src/lib/types/settings.d.ts b/tools/ui/src/lib/types/settings.d.ts
index 03818091a1..44e83f8f33 100644
--- a/tools/ui/src/lib/types/settings.d.ts
+++ b/tools/ui/src/lib/types/settings.d.ts
@@ -101,6 +101,7 @@ export interface SettingsChatServiceOptions {
 	onToolCallChunk?: (chunk: string) => void;
 	onAttachments?: (extras: DatabaseMessageExtra[]) => void;
 	onModel?: (model: string) => void;
+	onCompletionId?: (id: string) => void;
 	onTimings?: (timings?: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => void;
 	onComplete?: (
 		response: string,