server : hint preserve_thinking when supported by chat template

Detect whether the chat template supports the 'preserve_thinking' kwarg
(by checking for its presence in the template source) and, when the user
has not already set it, print a hint suggesting they enable it via
--chat-template-kwargs.

This is particularly useful for models like Qwen3.6 where preserve_thinking
is recommended but many users are unaware of the option.

ref: https://docs.z.ai/guides/capabilities/thinking-mode#preserved-thinking

Assisted-by: pi:llama.cpp/Qwen3.6-27B
This commit is contained in:
Georgi Gerganov 2026-06-27 17:39:39 +03:00
parent 0ed235ea2c
commit eae7149824
3 changed files with 16 additions and 0 deletions

View File

@ -367,6 +367,13 @@ bool common_chat_templates_support_enable_thinking(const common_chat_templates *
return params.supports_thinking;
}
// Reports whether the chat template source mentions the 'preserve_thinking' kwarg.
// This is a plain substring match on the template source text; the result is meant
// for deciding whether to print a hint telling the user the kwarg can be enabled.
bool common_chat_templates_support_preserve_thinking(const common_chat_templates * chat_templates) {
    const std::string template_source = common_chat_templates_source(chat_templates);
    const bool mentions_kwarg = template_source.find("preserve_thinking") != std::string::npos;
    return mentions_kwarg;
}
std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messages) {
std::vector<common_chat_msg> msgs;

View File

@ -347,6 +347,7 @@ common_reasoning_format common_reasoning_format_from_name(const std::string & fo
common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates);
// Returns true if the chat template source contains the string "preserve_thinking".
bool common_chat_templates_support_preserve_thinking(const common_chat_templates * chat_templates);
// Parses a JSON array of messages in OpenAI's chat completion API format.
std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const nlohmann::ordered_json & messages);

View File

@ -1517,6 +1517,14 @@ private:
const bool enable_thinking = params_base.enable_reasoning != 0 && template_supports_thinking;
SRV_INF("%s: chat template, thinking = %d\n", __func__, enable_thinking);
// hint: when jinja is in use and the template mentions 'preserve_thinking', but no
// default template kwarg was provided for it, tell the user how to turn it on
if (params_base.use_jinja && common_chat_templates_support_preserve_thinking(chat_templates.get())) {
    const auto & default_kwargs = params_base.default_template_kwargs;
    const bool already_set = default_kwargs.find("preserve_thinking") != default_kwargs.end();
    if (!already_set) {
        SRV_WRN("%s\n", "chat template supports 'preserve_thinking' - consider using --chat-template-kwargs \"{\\\"preserve_thinking\\\": true}\"");
    }
}
// IMPORTANT: chat_params is reused across sleeping / resuming states,
// never store llama_context/llama_model pointers in chat_params,
// as they may be invalidated after sleeping