server : hint preserve_thinking when supported by chat template

Detect whether the chat template supports the 'preserve_thinking' kwarg (by checking for its presence in the template source) and, when the user has not already set it, print a hint suggesting they enable it via --chat-template-kwargs. This is particularly useful for models like Qwen3.6, where preserve_thinking is recommended but many users are unaware of the option. ref: https://docs.z.ai/guides/capabilities/thinking-mode#preserved-thinking. Assisted-by: pi:llama.cpp/Qwen3.6-27B
2026-06-27 23:50:20 -05:00 · 2026-06-27 17:39:39 +03:00 · 2026-06-27 17:39:39 +03:00 · eae7149824
commit eae7149824
parent 0ed235ea2c
3 changed files with 16 additions and 0 deletions
--- a/common/chat.cpp
+++ b/common/chat.cpp
@ -367,6 +367,13 @@ bool common_chat_templates_support_enable_thinking(const common_chat_templates *
    return params.supports_thinking;
 }

+// Returns true when the chat template source mentions the preserve_thinking kwarg.
+// Intended for deciding whether to print a hint telling the user to enable it.
+bool common_chat_templates_support_preserve_thinking(const common_chat_templates * chat_templates) {
+    const std::string template_source = common_chat_templates_source(chat_templates);
+    return std::string::npos != template_source.find("preserve_thinking");
+}
+
 std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messages) {
    std::vector<common_chat_msg> msgs;

--- a/common/chat.h
+++ b/common/chat.h
@ -347,6 +347,7 @@ common_reasoning_format common_reasoning_format_from_name(const std::string & fo
 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);

 bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates);
+bool common_chat_templates_support_preserve_thinking(const common_chat_templates * chat_templates);

 // Parses a JSON array of messages in OpenAI's chat completion API format.
 std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const nlohmann::ordered_json & messages);
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@ -1517,6 +1517,14 @@ private:
            const bool enable_thinking = params_base.enable_reasoning != 0 && template_supports_thinking;
            SRV_INF("%s: chat template, thinking = %d\n", __func__, enable_thinking);

+            // hint: suggest preserve_thinking if the template supports it but user hasn't set it
+            if (params_base.use_jinja && common_chat_templates_support_preserve_thinking(chat_templates.get())) {
+                auto it = params_base.default_template_kwargs.find("preserve_thinking");
+                if (it == params_base.default_template_kwargs.end()) {
+                    SRV_WRN("%s\n", "chat template supports 'preserve_thinking' - consider using --chat-template-kwargs \"{\\\"preserve_thinking\\\": true}\"");
+                }
+            }
+
            // IMPORTANT: chat_params is reused across sleeping / resuming states,
            //            never store llama_context/llama_model pointers in chat_params,
            //            as they may be invalidated after sleeping