diff --git a/common/chat.cpp b/common/chat.cpp
index 0cee80434e..8f42fef844 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -367,6 +367,17 @@ bool common_chat_templates_support_enable_thinking(const common_chat_templates *
     return params.supports_thinking;
 }
 
+// Heuristically reports whether the chat template knows the `preserve_thinking` kwarg.
+//
+// This is a plain substring search over the string returned by
+// common_chat_templates_source(); the template is neither rendered nor parsed, so any
+// occurrence of the text "preserve_thinking" (even inside a template comment) counts.
+// Intended only for deciding whether to print a hint telling the user to enable it.
+bool common_chat_templates_support_preserve_thinking(const common_chat_templates * chat_templates) {
+    const std::string src = common_chat_templates_source(chat_templates);
+    return src.find("preserve_thinking") != std::string::npos;
+}
+
 std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messages) {
     std::vector<common_chat_msg> msgs;
 
diff --git a/common/chat.h b/common/chat.h
index 7898f1623f..54010a45eb 100644
--- a/common/chat.h
+++ b/common/chat.h
@@ -347,6 +347,8 @@ common_reasoning_format common_reasoning_format_from_name(const std::string & fo
 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
 
 bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates);
+// Returns true if the chat template source contains the text "preserve_thinking" (substring heuristic).
+bool common_chat_templates_support_preserve_thinking(const common_chat_templates * chat_templates);
 
 // Parses a JSON array of messages in OpenAI's chat completion API format.
 std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const nlohmann::ordered_json & messages);
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 5c33a418f5..b731072b4b 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -1517,6 +1517,15 @@ private:
         const bool enable_thinking = params_base.enable_reasoning != 0 && template_supports_thinking;
         SRV_INF("%s: chat template, thinking = %d\n", __func__, enable_thinking);
 
+        // hint: the template mentions 'preserve_thinking' but the user's default template kwargs
+        // have no such key (any explicit value, true or false, suppresses the hint)
+        if (params_base.use_jinja && common_chat_templates_support_preserve_thinking(chat_templates.get())) {
+            auto it = params_base.default_template_kwargs.find("preserve_thinking");
+            if (it == params_base.default_template_kwargs.end()) {
+                SRV_WRN("%s\n", "chat template supports 'preserve_thinking' - consider using --chat-template-kwargs \"{\\\"preserve_thinking\\\": true}\"");
+            }
+        }
+
         // IMPORTANT: chat_params is reused across sleeping / resuming states,
         // never store llama_context/llama_model pointers in chat_params,
         // as they may be invalidated after sleeping