diff --git a/common/common.h b/common/common.h
index 92064a0e4d..f2c7ee027b 100644
--- a/common/common.h
+++ b/common/common.h
@@ -277,6 +277,7 @@ struct common_params_sampling {
     std::vector<llama_token> reasoning_budget_end;             // end tag token sequence
     std::vector<llama_token> reasoning_budget_forced;          // forced sequence (message + end tag)
     std::string              reasoning_budget_message;         // message injected before end tag when budget exhausted
+    bool                     reasoning_control = false;        // create the budget sampler on demand so reasoning can be ended at runtime
 
     bool backend_sampling = false;
 
diff --git a/common/sampling.cpp b/common/sampling.cpp
index bbfa9a9ecd..85f8ed50b3 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -293,7 +293,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
     }
 
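-    // reasoning budget sampler (skip when budget is unlimited unless a lazy grammar is active, which needs rbudget for thinking-block suppression)
+    // reasoning budget sampler (skip when budget is unlimited unless a lazy grammar is active, which needs rbudget for thinking-block suppression, or reasoning control is armed so reasoning can be ended at runtime)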
-    if (!params.reasoning_budget_start.empty() && !params.reasoning_budget_end.empty() && (params.grammar_lazy || params.reasoning_budget_tokens >= 0)) {
+    if (!params.reasoning_budget_start.empty() && !params.reasoning_budget_end.empty() && (params.grammar_lazy || params.reasoning_budget_tokens >= 0 || params.reasoning_control)) {
         rbudget = common_reasoning_budget_init(
             vocab,
             params.reasoning_budget_start,
diff --git a/tools/server/README.md b/tools/server/README.md
index df30ca6464..f1eeec36aa 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -1244,6 +1244,8 @@ The `response_format` parameter supports both plain JSON output (e.g. `{"type":
 
 `reasoning_format`: The reasoning format to be parsed. If set to `none`, it will output the raw generated text.
 
+`reasoning_control`: Arms realtime reasoning control for this completion so that its reasoning can be ended early via `/v1/chat/completions/control`. Defaults to `false`.
+
 `generation_prompt`: The generation prompt that was prefilled in by the template. Prepended to model output before parsing.
 
 `parse_tool_calls`: Whether to parse the generated tool call.
@@ -1350,6 +1352,22 @@ The server supports parsing and returning reasoning via the `reasoning_content`
 
 Reasoning input (preserve reasoning in history) is also supported by some specific templates. For more details, please refer to [PR#18994](https://github.com/ggml-org/llama.cpp/pull/18994).
 
+### POST `/v1/chat/completions/control`: Control a running chat completion in real time
+
+Acts on an in-flight completion identified by its `id` (the `id` field streamed back by `/v1/chat/completions`). The request is processed in parallel with the SSE stream, so a client can send it while it is still reading tokens.
+
+*Options:*
+
+`id`: (Required) The chat completion id to act on. If no in-flight completion matches (for example because it has already finished), nothing is changed and the response reports `success: false` with a `message`.
+
+`action`: (Required) The control action to perform. Currently the only supported value is `reasoning_end`, which forces the end of the current reasoning block so the model moves on to the final answer. Requires `reasoning_control: true` on the original completion request.
+
+`model`: (Required in router mode) The model name, used to route the request to the right instance. Ignored in single model mode.
+
+**Response format**
+
+Returns a JSON object with a boolean `success` field, and an optional `message` field describing the reason when `success` is `false`.
+
 ### POST `/v1/responses`: OpenAI-compatible Responses API
 
 *Options:*
diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp
index fb71792fe6..4c3f16a0a3 100644
--- a/tools/server/server-common.cpp
+++ b/tools/server/server-common.cpp
@@ -1132,6 +1132,7 @@ json oaicompat_chat_params_parse(
             llama_params["reasoning_budget_start_tag"] = chat_params.thinking_start_tag;
             llama_params["reasoning_budget_end_tag"] = chat_params.thinking_end_tag;
             llama_params["reasoning_budget_message"] = opt.reasoning_budget_message;
+            llama_params["reasoning_control"] = json_value(body, "reasoning_control", false);
         }
     }
 
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 44fca83c6b..fae73f09f8 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -1263,6 +1263,20 @@ private:
         return nullptr;
     }
 
+    // first processing slot whose task carries this completion id, nullptr when none (NOTE(review): confirm ids are unique per slot)
+    server_slot * get_slot_by_cmpl_id(const std::string & cmpl_id) {
+        if (cmpl_id.empty()) {
+            return nullptr;
+        }
+        for (server_slot & candidate : slots) {
+            const bool busy = candidate.is_processing() && candidate.task;
+            if (busy && candidate.task->params.oaicompat_cmpl_id == cmpl_id) {
+                return &candidate;
+            }
+        }
+        return nullptr;
+    }
+
     server_slot * get_available_slot(const server_task & task) {
         server_slot * ret = nullptr;
 
@@ -2114,6 +2128,37 @@ private:
                         }
                     }
                 } break;
+            case SERVER_TASK_TYPE_CONTROL:
+                {
+                    auto res = std::make_unique<server_task_result_control>();
+                    res->id = task.id;
+
+                    server_slot * slot = get_slot_by_cmpl_id(task.params.control_cmpl_id);
+                    if (slot == nullptr) {
+                        res->success = false;
+                        res->message = "no active completion for this id";
+                        queue_results.send(std::move(res));
+                        break;
+                    }
+
+                    if (task.params.control_action == "reasoning_end") {
+                        // the budget sampler only exists when reasoning control was armed
+                        if (!slot->task->params.sampling.reasoning_control) {
+                            res->success = false;
+                            res->message = "reasoning control not enabled for this completion";
+                            queue_results.send(std::move(res));
+                            break;
+                        }
+                        // act on the live slot mid generation, never defer
+                        common_sampler_reasoning_budget_force(slot->smpl.get());
+                        res->success = true;
+                    } else {
+                        res->success = false;
+                        res->message = "unknown control action";
+                    }
+
+                    queue_results.send(std::move(res));
+                } break;
             case SERVER_TASK_TYPE_NEXT_RESPONSE:
                 {
                     // do nothing
@@ -4266,6 +4311,43 @@ void server_routes::init_routes() {
             TASK_RESPONSE_TYPE_OAI_CHAT);
     };
 
+    this->post_control = [this](const server_http_req & req) {
+        auto res = create_response();
+        const json payload = json::parse(req.body);
+
+        // validate up front so a malformed request is answered without posting a task
+        const std::string action  = json_value(payload, "action", std::string());
+        const std::string cmpl_id = json_value(payload, "id", std::string());
+        if (cmpl_id.empty()) {
+            res->error(format_error_response("missing completion id", ERROR_TYPE_INVALID_REQUEST));
+            return res;
+        }
+        if (action != "reasoning_end") {
+            res->error(format_error_response("unknown control action", ERROR_TYPE_INVALID_REQUEST));
+            return res;
+        }
+
+        // queue the control task, its result is read back through the same response reader
+        {
+            server_task ctrl(SERVER_TASK_TYPE_CONTROL);
+            ctrl.id                     = res->rd.get_new_id();
+            ctrl.params.control_cmpl_id = cmpl_id;
+            ctrl.params.control_action  = action;
+            res->rd.post_task(std::move(ctrl));
+        }
+
+        // a null result is only expected once req.should_stop() reports true
+        auto result = res->rd.next(req.should_stop);
+        if (!result) {
+            GGML_ASSERT(req.should_stop());
+        } else if (result->is_error()) {
+            res->error(result->to_json());
+        } else {
+            res->ok(result->to_json());
+        }
+        return res;
+    };
+
     this->post_responses_oai = [this](const server_http_req & req) {
         auto res = create_response();
         std::vector<raw_buffer> files;
diff --git a/tools/server/server-context.h b/tools/server/server-context.h
index 65853438c9..73caff54a4 100644
--- a/tools/server/server-context.h
+++ b/tools/server/server-context.h
@@ -110,6 +110,7 @@ struct server_routes {
     server_http_context::handler_t post_completions;
     server_http_context::handler_t post_completions_oai;
     server_http_context::handler_t post_chat_completions;
+    server_http_context::handler_t post_control;
     server_http_context::handler_t post_responses_oai;
     server_http_context::handler_t post_transcriptions_oai;
     server_http_context::handler_t post_anthropic_messages;
diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp
index ff80be6ccb..33de2e4d9c 100644
--- a/tools/server/server-task.cpp
+++ b/tools/server/server-task.cpp
@@ -499,6 +499,7 @@ task_params server_task::params_from_json_cmpl(
         const auto end_tag   = json_value(data, "reasoning_budget_end_tag", std::string());
         const auto message   = json_value(data, "reasoning_budget_message", std::string());
         params.sampling.reasoning_budget_tokens = budget;
+        params.sampling.reasoning_control = json_value(data, "reasoning_control", false);
 
         if (!start_tag.empty()) {
             params.sampling.reasoning_budget_start = common_tokenize(vocab, start_tag, false, true);
diff --git a/tools/server/server-task.h b/tools/server/server-task.h
index d47dc690cf..bdadcff765 100644
--- a/tools/server/server-task.h
+++ b/tools/server/server-task.h
@@ -19,6 +19,7 @@ enum server_task_type {
     SERVER_TASK_TYPE_RERANK,
     SERVER_TASK_TYPE_INFILL,
     SERVER_TASK_TYPE_CANCEL,
+    SERVER_TASK_TYPE_CONTROL,
     SERVER_TASK_TYPE_NEXT_RESPONSE,
     SERVER_TASK_TYPE_METRICS,
     SERVER_TASK_TYPE_SLOT_SAVE,
@@ -84,6 +85,10 @@ struct task_params {
     std::string        oaicompat_model;
     std::string        oaicompat_cmpl_id;
 
+    // realtime control (SERVER_TASK_TYPE_CONTROL)
+    std::string        control_action;
+    std::string        control_cmpl_id;
+
     // per-request parameters for chat parsing
     common_chat_parser_params chat_parser_params;
 
@@ -551,6 +556,19 @@ struct server_task_result_slot_erase : server_task_result {
     virtual json to_json() override;
 };
 
+struct server_task_result_control : server_task_result {
+    bool        success = false;
+    std::string message; // optional detail when success is false
+
+    virtual json to_json() override {
+        json out = json { { "success", success } };
+        if (!message.empty()) {
+            out["message"] = message;
+        }
+        return out;
+    }
+};
+
 struct server_task_result_get_lora : server_task_result {
     struct lora {
         common_adapter_lora_info info;
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 4d56d45e83..769e80a802 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -149,6 +149,7 @@ int llama_server(int argc, char ** argv) {
         routes.post_completions            = models_routes->proxy_post;
         routes.post_completions_oai        = models_routes->proxy_post;
         routes.post_chat_completions       = models_routes->proxy_post;
+        routes.post_control                = models_routes->proxy_post;
         routes.post_responses_oai          = models_routes->proxy_post;
         routes.post_transcriptions_oai     = models_routes->proxy_post;
         routes.post_anthropic_messages     = models_routes->proxy_post;
@@ -185,6 +186,7 @@ int llama_server(int argc, char ** argv) {
     ctx_http.post("/v1/completions",           ex_wrapper(routes.post_completions_oai));
     ctx_http.post("/chat/completions",         ex_wrapper(routes.post_chat_completions));
     ctx_http.post("/v1/chat/completions",      ex_wrapper(routes.post_chat_completions));
+    ctx_http.post("/v1/chat/completions/control", ex_wrapper(routes.post_control));
     ctx_http.post("/v1/responses",             ex_wrapper(routes.post_responses_oai));
     ctx_http.post("/responses",                ex_wrapper(routes.post_responses_oai));
     ctx_http.post("/v1/audio/transcriptions",  ex_wrapper(routes.post_transcriptions_oai));
diff --git a/tools/ui/src/lib/components/app/chat/ChatForm/ChatForm.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatForm.svelte
index 46ac823349..99faecb6b4 100644
--- a/tools/ui/src/lib/components/app/chat/ChatForm/ChatForm.svelte
+++ b/tools/ui/src/lib/components/app/chat/ChatForm/ChatForm.svelte
@@ -541,6 +541,7 @@
 				canSend={canSubmit}
 				{disabled}
 				{isLoading}
+				isReasoning={chatStore.isReasoning}
 				{isRecording}
 				{showAddButton}
 				{showModelSelector}
diff --git a/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActions.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActions.svelte
index a94293dd95..627107ef50 100644
--- a/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActions.svelte
+++ b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActions.svelte
@@ -1,6 +1,7 @@
 <script lang="ts">
-	import { Square } from '@lucide/svelte';
+	import { Square, SkipForward } from '@lucide/svelte';
 	import { Button } from '$lib/components/ui/button';
+	import { ChatService } from '$lib/services';
 	import {
 		ChatFormActionsAdd,
 		ChatFormActionModels,
@@ -21,6 +22,7 @@
 		class?: string;
 		disabled?: boolean;
 		isLoading?: boolean;
+		isReasoning?: boolean;
 		isRecording?: boolean;
 		showAddButton?: boolean;
 		showModelSelector?: boolean;
@@ -39,6 +41,7 @@
 		class: className = '',
 		disabled = false,
 		isLoading = false,
+		isReasoning = false,
 		isRecording = false,
 		showAddButton = true,
 		showModelSelector = true,
@@ -84,6 +87,11 @@
 	export function openModelSelector() {
 		selectorModelRef?.open();
 	}
+	// the streaming assistant message carries both the completion id and the model that
+	// produced it, targeting reasoning control from the same source keeps them consistent
+	let activeMessage = $derived(
+		conversationsStore.activeMessages[conversationsStore.activeMessages.length - 1]
+	);
 </script>
 
 <div
@@ -123,6 +131,21 @@
 		/>
 	{/if}
 
+	{#if isReasoning}
+		<Button
+			type="button"
+			variant="secondary"
+			onclick={() =>
+				ChatService.stopReasoning(activeMessage?.completionId ?? '', activeMessage?.model)}
+			class="group h-8 w-8 rounded-full p-0"
+			title="Skip reasoning"
+		>
+			<span class="sr-only">Skip reasoning</span>
+
+			<SkipForward class="h-4 w-4 stroke-muted-foreground group-hover:stroke-foreground" />
+		</Button>
+	{/if}
+
 	{#if isLoading && !canSubmit}
 		<Button
 			type="button"
diff --git a/tools/ui/src/lib/constants/api-endpoints.ts b/tools/ui/src/lib/constants/api-endpoints.ts
index f89ebe4213..9eb6c74e75 100644
--- a/tools/ui/src/lib/constants/api-endpoints.ts
+++ b/tools/ui/src/lib/constants/api-endpoints.ts
@@ -4,6 +4,17 @@ export const API_MODELS = {
 	UNLOAD: '/models/unload'
 };
 
+// chat completion routes, the control route drives realtime inference (e.g. end reasoning)
+export const API_CHAT = {
+	COMPLETIONS: './v1/chat/completions',
+	CONTROL: './v1/chat/completions/control'
+};
+
+// slot introspection, requires the --slots flag on the server
+export const API_SLOTS = {
+	LIST: './slots'
+};
+
 export const API_TOOLS = {
 	LIST: '/tools',
 	EXECUTE: '/tools'
diff --git a/tools/ui/src/lib/constants/control-actions.ts b/tools/ui/src/lib/constants/control-actions.ts
new file mode 100644
index 0000000000..935ae9542a
--- /dev/null
+++ b/tools/ui/src/lib/constants/control-actions.ts
@@ -0,0 +1,7 @@
+// actions accepted by the realtime inference control endpoint (API_CHAT.CONTROL)
+// kept separate from the endpoint paths since these are protocol level verbs
+export const CONTROL_ACTION = {
+	END_REASONING: 'reasoning_end'
+} as const;
+
+export type ControlAction = (typeof CONTROL_ACTION)[keyof typeof CONTROL_ACTION];
diff --git a/tools/ui/src/lib/constants/index.ts b/tools/ui/src/lib/constants/index.ts
index d3a4348024..eb85910370 100644
--- a/tools/ui/src/lib/constants/index.ts
+++ b/tools/ui/src/lib/constants/index.ts
@@ -15,6 +15,7 @@ export * from './cli-flags';
 export * from './code-blocks';
 export * from './code';
 export * from './context-keys';
+export * from './control-actions';
 export * from './css-classes';
 export * from './floating-ui-constraints';
 export * from './formatters';
diff --git a/tools/ui/src/lib/services/chat.service.ts b/tools/ui/src/lib/services/chat.service.ts
index d6c7e36d70..09d616bd76 100644
--- a/tools/ui/src/lib/services/chat.service.ts
+++ b/tools/ui/src/lib/services/chat.service.ts
@@ -6,7 +6,10 @@ import {
 	ATTACHMENT_LABEL_MCP_PROMPT,
 	ATTACHMENT_LABEL_MCP_RESOURCE,
 	LEGACY_AGENTIC_REGEX,
-	SETTINGS_KEYS
+	SETTINGS_KEYS,
+	API_CHAT,
+	API_SLOTS,
+	CONTROL_ACTION
 } from '$lib/constants';
 import {
 	AttachmentType,
@@ -126,6 +129,7 @@ export class ChatService {
 			onReasoningChunk,
 			onToolCallChunk,
 			onModel,
+			onCompletionId,
 			onTimings,
 			// Tools for function calling
 			tools,
@@ -239,6 +243,9 @@ export class ChatService {
 			? ReasoningFormat.NONE
 			: ReasoningFormat.AUTO;
 
+		// arms the budget sampler so reasoning can be ended at runtime via the control endpoint
+		requestBody.reasoning_control = true;
+
 		if (continueFinalMessage) {
 			requestBody.continue_final_message = true;
 			requestBody.add_generation_prompt = false;
@@ -289,7 +296,7 @@ export class ChatService {
 		}
 
 		try {
-			const response = await fetch(`./v1/chat/completions`, {
+			const response = await fetch(API_CHAT.COMPLETIONS, {
 				method: 'POST',
 				headers: getJsonHeaders(),
 				body: JSON.stringify(requestBody),
@@ -315,6 +322,7 @@ export class ChatService {
 					onReasoningChunk,
 					onToolCallChunk,
 					onModel,
+					onCompletionId,
 					onTimings,
 					conversationId,
 					signal
@@ -379,7 +387,7 @@ export class ChatService {
 	 */
 	static async areAllSlotsIdle(model?: string | null, signal?: AbortSignal): Promise<boolean> {
 		try {
-			const url = model ? `./slots?model=${encodeURIComponent(model)}` : './slots';
+			const url = model ? `${API_SLOTS.LIST}?model=${encodeURIComponent(model)}` : API_SLOTS.LIST;
 			const res = await fetch(url, { signal });
 			if (!res.ok) return true;
 
@@ -390,6 +398,50 @@ export class ChatService {
 		}
 	}
 
+	/**
+	 * Asks the server to end the reasoning block of a completion that is still
+	 * running, addressed by its chat completion id (streamed back as `id`).
+	 * Targeting the id instead of a slot index avoids a TOCTOU: a finished
+	 * completion matches nothing server side. `model` is sent when set so the
+	 * router forwards to the right child. Resolves to true on success only.
+	 */
+	static async stopReasoning(completionId: string, model?: string | null): Promise<boolean> {
+		if (!completionId) {
+			console.error(
+				'stopReasoning: no completion id for the active message, cannot target the running completion'
+			);
+			return false;
+		}
+
+		const payload: Record<string, unknown> = {
+			id: completionId,
+			action: CONTROL_ACTION.END_REASONING,
+			...(model ? { model } : {})
+		};
+
+		try {
+			const response = await fetch(API_CHAT.CONTROL, {
+				method: 'POST',
+				headers: getJsonHeaders(),
+				body: JSON.stringify(payload)
+			});
+
+			const parsed = await response.json().catch(() => null);
+			const succeeded = response.ok && parsed?.success === true;
+			if (!succeeded) {
+				console.error('stopReasoning: control request failed', {
+					status: response.status,
+					completionId,
+					response: parsed
+				});
+			}
+			return succeeded;
+		} catch (error) {
+			console.error('stopReasoning: control request threw', { completionId, error });
+			return false;
+		}
+	}
+
 	/**
 	 * Sends a fire-and-forget request to pre-encode the conversation in the server's KV cache.
 	 * After a response completes, this re-submits the full conversation
@@ -457,7 +509,7 @@ export class ChatService {
 		}
 
 		try {
-			await fetch(`./v1/chat/completions`, {
+			await fetch(API_CHAT.COMPLETIONS, {
 				method: 'POST',
 				headers: getJsonHeaders(),
 				body: JSON.stringify(requestBody),
@@ -502,6 +554,7 @@ export class ChatService {
 		onReasoningChunk?: (chunk: string) => void,
 		onToolCallChunk?: (chunk: string) => void,
 		onModel?: (model: string) => void,
+		onCompletionId?: (id: string) => void,
 		onTimings?: (timings?: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => void,
 		conversationId?: string,
 		abortSignal?: AbortSignal
@@ -519,6 +572,7 @@ export class ChatService {
 		let lastTimings: ChatMessageTimings | undefined;
 		let streamFinished = false;
 		let modelEmitted = false;
+		let idEmitted = false;
 		let toolCallIndexOffset = 0;
 		let hasOpenToolCallBatch = false;
 
@@ -603,6 +657,11 @@ export class ChatService {
 								onModel?.(chunkModel);
 							}
 
+							if (parsed.id && !idEmitted) {
+								idEmitted = true;
+								onCompletionId?.(parsed.id);
+							}
+
 							if (promptProgress) {
 								ChatService.notifyTimings(undefined, promptProgress, onTimings);
 							}
diff --git a/tools/ui/src/lib/stores/agentic.svelte.ts b/tools/ui/src/lib/stores/agentic.svelte.ts
index 4866995b4a..947737d7c1 100644
--- a/tools/ui/src/lib/stores/agentic.svelte.ts
+++ b/tools/ui/src/lib/stores/agentic.svelte.ts
@@ -488,6 +488,7 @@ class AgenticStore {
 			onToolCallsStreaming,
 			onAttachments,
 			onModel,
+			onCompletionId,
 			onAssistantTurnComplete,
 			createToolResultMessage,
 			createAssistantMessage,
@@ -597,6 +598,7 @@ class AgenticStore {
 							}
 						},
 						onModel,
+						onCompletionId,
 						onTimings: (timings?: ChatMessageTimings, progress?: ChatMessagePromptProgress) => {
 							onTimings?.(timings, progress);
 							if (timings) {
diff --git a/tools/ui/src/lib/stores/chat.svelte.ts b/tools/ui/src/lib/stores/chat.svelte.ts
index f2f13f25dc..667fb417c3 100644
--- a/tools/ui/src/lib/stores/chat.svelte.ts
+++ b/tools/ui/src/lib/stores/chat.svelte.ts
@@ -63,7 +63,10 @@ class ChatStore {
 	currentResponse = $state('');
 	errorDialogState = $state<ErrorDialogState | null>(null);
 	isLoading = $state(false);
+	// true while the active conversation streams reasoning content but no visible content yet
+	isReasoning = $state(false);
 	chatLoadingStates = new SvelteMap<string, boolean>();
+	chatReasoningStates = new SvelteMap<string, boolean>();
 	chatStreamingStates = new SvelteMap<string, { response: string; messageId: string }>();
 	private abortControllers = new SvelteMap<string, AbortController>();
 	private preEncodeAbortController: AbortController | null = null;
@@ -94,6 +97,17 @@ class ChatStore {
 		} else {
 			this.chatLoadingStates.delete(convId);
 			if (convId === conversationsStore.activeConversation?.id) this.isLoading = false;
+			this.setChatReasoning(convId, false);
+		}
+	}
+
+	private setChatReasoning(convId: string, reasoning: boolean): void {
+		if (reasoning) {
+			this.chatReasoningStates.set(convId, true);
+			if (convId === conversationsStore.activeConversation?.id) this.isReasoning = true;
+		} else {
+			this.chatReasoningStates.delete(convId);
+			if (convId === conversationsStore.activeConversation?.id) this.isReasoning = false;
 		}
 	}
 	private setChatStreaming(convId: string, response: string, messageId: string): void {
@@ -110,6 +124,7 @@ class ChatStore {
 	}
 	syncLoadingStateForChat(convId: string): void {
 		this.isLoading = this.chatLoadingStates.get(convId) || false;
+		this.isReasoning = this.chatReasoningStates.get(convId) || false;
 		const s = this.chatStreamingStates.get(convId);
 		this.currentResponse = s?.response || '';
 		this.isStreamingActive = s !== undefined;
@@ -265,6 +280,10 @@ class ChatStore {
 		return this.chatLoadingStates.get(convId) || false;
 	}
 
+	isChatReasoningPublic(convId: string): boolean {
+		return this.chatReasoningStates.get(convId) || false;
+	}
+
 	private isChatLoadingInternal(convId: string): boolean {
 		return this.chatLoadingStates.has(convId) || this.chatStreamingStates.has(convId);
 	}
@@ -655,6 +674,17 @@ class ChatStore {
 			}
 		};
 
+		let recordedCompletionId = ''; // last id stored, each streamed turn carries its own id
+		const recordCompletionId = (id: string): void => {
+			if (!id || id === recordedCompletionId) return; // skip empty and repeated ids only
+			recordedCompletionId = id;
+			const idx = conversationsStore.findMessageIndex(currentMessageId);
+			conversationsStore.updateMessageAtIndex(idx, { completionId: id });
+			DatabaseService.updateMessage(currentMessageId, { completionId: id }).catch(() => {
+				if (recordedCompletionId === id) recordedCompletionId = ''; // let this id be retried
+			});
+		};
+
 		const updateStreamingUI = () => {
 			this.setChatStreaming(convId, streamedContent, currentMessageId);
 			const idx = conversationsStore.findMessageIndex(currentMessageId);
@@ -676,6 +706,7 @@ class ChatStore {
 			onChunk: (chunk: string) => {
 				streamedContent += chunk;
 				updateStreamingUI();
+				this.setChatReasoning(convId, false);
 			},
 			onReasoningChunk: (chunk: string) => {
 				streamedReasoningContent += chunk;
@@ -685,6 +716,7 @@ class ChatStore {
 				conversationsStore.updateMessageAtIndex(idx, {
 					reasoningContent: streamedReasoningContent
 				});
+				this.setChatReasoning(convId, true);
 			},
 			onToolCallsStreaming: (toolCalls) => {
 				const idx = conversationsStore.findMessageIndex(currentMessageId);
@@ -702,6 +734,7 @@ class ChatStore {
 				DatabaseService.updateMessage(messageId, { extra: updatedExtras }).catch(console.error);
 			},
 			onModel: (modelName: string) => recordModel(modelName),
+			onCompletionId: (id: string) => recordCompletionId(id),
 			onTurnComplete: (intermediateTimings: ChatMessageTimings) => {
 				// Update the first assistant message with cumulative agentic timings
 				const idx = conversationsStore.findMessageIndex(assistantMessage.id);
@@ -887,6 +920,7 @@ class ChatStore {
 				onChunk: streamCallbacks.onChunk,
 				onReasoningChunk: streamCallbacks.onReasoningChunk,
 				onModel: streamCallbacks.onModel,
+				onCompletionId: streamCallbacks.onCompletionId,
 				onTimings: streamCallbacks.onTimings,
 				onComplete: async (
 					finalContent?: string,
@@ -1373,6 +1407,7 @@ class ChatStore {
 						appendedContent += chunk;
 						hasReceivedContent = true;
 						updateStreamingContent(originalContent + appendedContent);
+						this.setChatReasoning(msg.convId, false);
 					},
 					onReasoningChunk: (chunk: string) => {
 						appendedReasoning += chunk;
@@ -1382,6 +1417,7 @@ class ChatStore {
 						conversationsStore.updateMessageAtIndex(idx, {
 							reasoningContent: originalReasoning + appendedReasoning
 						});
+						this.setChatReasoning(msg.convId, true);
 					},
 					onTimings: (timings?: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => {
 						const tokensPerSecond =
@@ -1924,6 +1960,7 @@ export const isChatLoading = (convId: string) => chatStore.isChatLoadingPublic(c
 export const isChatStreaming = () => chatStore.isStreaming();
 export const isEditing = () => chatStore.isEditing();
 export const isLoading = () => chatStore.isLoading;
+export const isReasoning = () => chatStore.isReasoning;
 export const pendingEditMessageId = () => chatStore.pendingEditMessageId;
 export const chatHasPendingMessage = (convId: string) => chatStore.hasPendingMessage(convId);
 export const chatPendingMessageContent = (convId: string) =>
diff --git a/tools/ui/src/lib/types/agentic.d.ts b/tools/ui/src/lib/types/agentic.d.ts
index b94998384f..bcec10c671 100644
--- a/tools/ui/src/lib/types/agentic.d.ts
+++ b/tools/ui/src/lib/types/agentic.d.ts
@@ -93,6 +93,7 @@ export interface AgenticFlowCallbacks {
 	onAttachments?: (messageId: string, extras: DatabaseMessageExtra[]) => void;
 	/** Model name detected from response */
 	onModel?: (model: string) => void;
+	onCompletionId?: (id: string) => void;
 	/** Current assistant turn's streaming is complete - save to DB */
 	onAssistantTurnComplete?: (
 		content: string,
diff --git a/tools/ui/src/lib/types/api.d.ts b/tools/ui/src/lib/types/api.d.ts
index c780351801..f620d67351 100644
--- a/tools/ui/src/lib/types/api.d.ts
+++ b/tools/ui/src/lib/types/api.d.ts
@@ -271,6 +271,7 @@ export interface ApiChatCompletionToolCall extends ApiChatCompletionToolCallDelt
 }
 
 export interface ApiChatCompletionStreamChunk {
+	id?: string;
 	object?: string;
 	model?: string;
 	choices: Array<{
diff --git a/tools/ui/src/lib/types/chat.d.ts b/tools/ui/src/lib/types/chat.d.ts
index acedd0769a..b2158807d4 100644
--- a/tools/ui/src/lib/types/chat.d.ts
+++ b/tools/ui/src/lib/types/chat.d.ts
@@ -97,6 +97,7 @@ export interface ChatStreamCallbacks {
 	onToolCallsStreaming?: (toolCalls: ApiChatCompletionToolCall[]) => void;
 	onAttachments?: (messageId: string, extras: DatabaseMessageExtra[]) => void;
 	onModel?: (model: string) => void;
+	onCompletionId?: (id: string) => void;
 	onTimings?: (timings?: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => void;
 	onAssistantTurnComplete?: (
 		content: string,
diff --git a/tools/ui/src/lib/types/database.d.ts b/tools/ui/src/lib/types/database.d.ts
index 02bdb3ee60..aecc250e81 100644
--- a/tools/ui/src/lib/types/database.d.ts
+++ b/tools/ui/src/lib/types/database.d.ts
@@ -112,6 +112,8 @@ export interface DatabaseMessage {
 	reasoningContent?: string;
 	/** Serialized JSON array of tool calls made by assistant messages */
 	toolCalls?: string;
+	/** Chat completion id streamed by the server, used to target realtime control (e.g. end reasoning) */
+	completionId?: string;
 	/** Tool call ID for tool result messages (role: 'tool') */
 	toolCallId?: string;
 	children: string[];
diff --git a/tools/ui/src/lib/types/settings.d.ts b/tools/ui/src/lib/types/settings.d.ts
index 03818091a1..44e83f8f33 100644
--- a/tools/ui/src/lib/types/settings.d.ts
+++ b/tools/ui/src/lib/types/settings.d.ts
@@ -101,6 +101,7 @@ export interface SettingsChatServiceOptions {
 	onToolCallChunk?: (chunk: string) => void;
 	onAttachments?: (extras: DatabaseMessageExtra[]) => void;
 	onModel?: (model: string) => void;
+	onCompletionId?: (id: string) => void;
 	onTimings?: (timings?: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => void;
 	onComplete?: (
 		response: string,