common/autoparser: fixes for newline handling / forced tool calls (#22654)

* chat/autoparser: the fixes * Move optspace() to chat-peg-parser, comment out server tests invalidated due to content now allowed with forced tool calls. * Trim whitespace on apply instead
2026-06-27 23:50:20 -05:00 · 2026-05-04 13:18:11 +02:00 · 2026-05-04 13:18:11 +02:00 · a4701c98f7
commit a4701c98f7
parent 994118a183
10 changed files with 392 additions and 97 deletions
--- a/common/chat-auto-parser-generator.cpp
+++ b/common/chat-auto-parser-generator.cpp
@ -136,10 +136,10 @@ common_peg_parser analyze_reasoning::build_parser(parser_build_context & ctx) co
        if (!end.empty()) {
            if (!start.empty()) {
                // Standard tag-based: optional(<think>reasoning</think>)
-                return p.optional(start + p.reasoning(p.until(end)) + end + p.space());
+                return p.optional(p.optspace(start) + p.reasoning(p.until(trim_whitespace(end))) + p.optspace(end));
            }
            // Delimiter-style (empty start)
-            return p.optional(p.reasoning(p.until(end)) + end + p.space());
+            return p.optional(p.reasoning(p.until(trim_whitespace(end))) + p.optspace(end));
        }
    }
@ -186,7 +186,6 @@ common_peg_parser analyze_tools::build_parser(parser_build_context & ctx) const
 common_peg_parser analyze_tools::build_tool_parser_json_native(parser_build_context & ctx) const {
    auto &       p           = ctx.p;
    const auto & inputs      = ctx.inputs;
    bool         force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
    // Build effective field names with dot notation if function_field is set
    std::string name_field = format.name_field;
@ -225,8 +224,7 @@ common_peg_parser analyze_tools::build_tool_parser_json_native(parser_build_cont
        tool_start = format.per_call_start;
    }
-    return ctx.reasoning_parser + (force_tools ? p.eps() : p.optional(p.content(p.until(tool_start)))) + tools_parser +
+    return ctx.reasoning_parser + p.optional(p.content(p.until(tool_start))) + tools_parser + p.end();
           p.end();
 }
 common_peg_parser analyze_tools::build_func_parser(common_chat_peg_builder & p, const std::string & name,
@ -270,7 +268,6 @@ common_peg_parser analyze_tools::build_func_parser(common_chat_peg_builder & p,
 common_peg_parser analyze_tools::build_tool_parser_tag_json(parser_build_context & ctx) const {
    auto &       p           = ctx.p;
    const auto & inputs      = ctx.inputs;
    bool         force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
    common_peg_parser tool_choice = p.choice();
@ -336,14 +333,12 @@ common_peg_parser analyze_tools::build_tool_parser_tag_json(parser_build_context
    std::string trigger_marker       = !format.section_start.empty() ? format.section_start : format.per_call_start;
    auto        content_before_tools = trigger_marker.empty() ? p.eps() : p.until(trigger_marker);
-    return ctx.reasoning_parser + (force_tools ? p.eps() : p.optional(p.content(content_before_tools))) + tool_calls +
+    return ctx.reasoning_parser + p.optional(p.content(content_before_tools)) + tool_calls + p.end();
           p.end();
 }
 common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_context & ctx) const {
    auto &       p           = ctx.p;
    const auto & inputs      = ctx.inputs;
    bool         force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
    auto until_suffix = p.rule("until-suffix", p.until(arguments.value_suffix));
@ -471,8 +466,7 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
    std::string trigger_marker       = !format.section_start.empty() ? format.section_start : format.per_call_start;
    auto        content_before_tools = trigger_marker.empty() ? p.eps() : p.until(trigger_marker);
-    return ctx.reasoning_parser + (force_tools ? p.eps() : p.optional(p.content(content_before_tools))) + tool_calls +
+    return ctx.reasoning_parser + p.optional(p.content(content_before_tools)) + tool_calls + p.end();
           p.end();
 }
 }  // namespace autoparser
--- a/common/chat-diff-analyzer.cpp
+++ b/common/chat-diff-analyzer.cpp
@ -342,7 +342,7 @@ void analyze_reasoning::compare_thinking_enabled() {
    if (left_trimmed.empty() && !diff.right.empty()) {
        if (!right_trimmed.empty() && string_ends_with(comparison->output_B, right_trimmed)) {
            if (start.empty()) {
-                start = trim_leading_whitespace(diff.right);
+                start = diff.right;
                mode  = reasoning_mode::TAG_BASED;
            }
        }
@ -353,7 +353,7 @@ void analyze_reasoning::compare_thinking_enabled() {
                if (seg.size() >= 2 && seg[seg.size() - 1].value == left_trimmed && seg[seg.size() - 2].type == segment_type::MARKER) {
                    start = seg[seg.size() - 2].value;
                }
-                end = trim_trailing_whitespace(diff.left);
+                end = diff.left;
                mode = reasoning_mode::TAG_BASED;
            }
        }
@ -445,14 +445,14 @@ void analyze_reasoning::compare_reasoning_scope() {
        auto result = parser_wrapped.parse_anywhere_and_extract(comparison->output_B);
        if (result.result.success()) {
            start = result.tags["pre"];
-            end = trim_trailing_whitespace(result.tags["post"]);
+            end = result.tags["post"];
        } else {
            auto parser_delimiter = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
                return p.literal(reasoning_content) + p.space() + p.optional(p.tag("post", (p.marker() + p.space())));
            });
            result = parser_delimiter.parse_anywhere_and_extract(comparison->output_B);
            if (result.result.success()) {
-                end = trim_trailing_whitespace(result.tags["post"]);
+                end = result.tags["post"];
            } else {
                LOG_DBG(ANSI_ORANGE "%s: Unable to extract reasoning markers, falling back to reasoning = NONE\n" ANSI_RESET, __func__);
                mode = reasoning_mode::NONE;
--- a/common/chat-peg-parser.cpp
+++ b/common/chat-peg-parser.cpp
@ -816,6 +816,32 @@ common_peg_parser common_chat_peg_builder::prefix(const std::string & s, const s
    return literal(s.substr(0, s.rfind(delimiter)));
 }
 // Build a parser for `tag` in which the surrounding whitespace is optional.
 //
 // The tag is split into three parts: its leading whitespace, a core running from
 // the first to the last non-whitespace character, and its trailing whitespace.
 // Every leading/trailing whitespace character becomes its own optional
 // single-character literal, so any of them may be missing from the input, while
 // the core (including any whitespace inside it) must match exactly.
 //
 // For a tag that is empty or consists only of whitespace, every character is
 // emitted as an optional literal and the core literal is the empty string.
 common_peg_parser common_chat_peg_builder::optspace(const std::string & tag) {
    // std::isspace has undefined behaviour for negative char values (e.g. the bytes
    // of multi-byte UTF-8 sequences), so always classify through unsigned char.
    const auto is_space = [](char c) { return std::isspace(static_cast<unsigned char>(c)) != 0; };

    auto parser = eps();
    // Both boundaries default to tag.size(), which is what an all-whitespace tag keeps.
    size_t end_of_prefix_space = tag.size();
    size_t start_of_suffix_space = tag.size();
    // Index of the first non-whitespace character.
    for (size_t i = 0; i < tag.size(); i++) {
        if (!is_space(tag[i])) {
            end_of_prefix_space = i;
            break;
        }
    }
    // One past the index of the last non-whitespace character.
    for (size_t i = tag.size(); i > 0; i--) {
        if (!is_space(tag[i - 1])) {
            start_of_suffix_space = i;
            break;
        }
    }
    // Leading whitespace: each character may or may not be present.
    for (size_t i = 0; i < end_of_prefix_space; i++) {
        parser += optional(literal(std::string(1, tag[i])));
    }
    // Mandatory core of the tag.
    parser += literal(tag.substr(end_of_prefix_space, start_of_suffix_space - end_of_prefix_space));
    // Trailing whitespace: each character may or may not be present.
    for (size_t i = start_of_suffix_space; i < tag.size(); i++) {
        parser += optional(literal(std::string(1, tag[i])));
    }
    return parser;
 }
 common_peg_parser common_chat_peg_builder::standard_json_tools(
                                                       const std::string &              section_start,
                                                       const std::string &              section_end,
--- a/common/chat-peg-parser.h
+++ b/common/chat-peg-parser.h
@ -96,6 +96,9 @@ class common_chat_peg_builder : public common_peg_parser_builder {
    // Return a parser that parses the prefix of a string, up to a given delimiter.
    common_peg_parser prefix(const std::string & s, const std::string & delimiter = {});
    // Return a parser that matches `tag` with its surrounding whitespace made optional:
    // each leading and trailing whitespace character of `tag` may be absent from the input,
    // while the span between the first and last non-whitespace characters must match exactly.
    common_peg_parser optspace(const std::string & tag);
    // Legacy-compatible helper for building standard JSON tool calls
    // Used by tests and manual parsers
    // name_key/args_key: JSON key names for function name and arguments
--- a/common/chat.cpp
+++ b/common/chat.cpp
@ -2221,8 +2221,8 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
        auto auto_params = autoparser::peg_generator::generate_parser(tmpl, params, autoparser);
        auto_params.supports_thinking = autoparser.reasoning.mode != autoparser::reasoning_mode::NONE;
        if (auto_params.supports_thinking) {
-            auto_params.thinking_start_tag = autoparser.reasoning.start;
+            auto_params.thinking_start_tag = trim_whitespace(autoparser.reasoning.start);
-            auto_params.thinking_end_tag   = autoparser.reasoning.end;
+            auto_params.thinking_end_tag   = trim_whitespace(autoparser.reasoning.end);
        }
        auto_params.generation_prompt = params.generation_prompt;
        common_peg_arena arena;
--- a/common/reasoning-budget.cpp
+++ b/common/reasoning-budget.cpp
@ -158,6 +158,8 @@ static void common_reasoning_budget_apply(struct llama_sampler * smpl, llama_tok
    for (size_t i = 0; i < cur_p->size; i++) {
        if (cur_p->data[i].id != forced) {
            cur_p->data[i].logit = -INFINITY;
        } else {
            cur_p->data[i].logit = +INFINITY; // force the token
        }
    }
 }
--- a/scripts/server-test-function-call.py
+++ b/scripts/server-test-function-call.py
@ -79,7 +79,7 @@ def print_info(msg):
 # ---------------------------------------------------------------------------
-def chat_completion(url, messages, tools=None, stream=False):
+def chat_completion(url, messages, tools=None, stream=False, force_tools=False):
    payload = {
        "messages": messages,
        "stream": stream,
@ -87,7 +87,10 @@ def chat_completion(url, messages, tools=None, stream=False):
    }
    if tools:
        payload["tools"] = tools
-        payload["tool_choice"] = "auto"
+        if force_tools:
            payload["tool_choice"] = "required"
        else:
            payload["tool_choice"] = "auto"
    try:
        response = requests.post(url, json=payload, stream=stream)
@ -160,7 +163,13 @@ def chat_completion(url, messages, tools=None, stream=False):
    return result
-def run_agentic_loop(url, messages, tools, mock_tool_responses, stream, max_turns=6):
def all_tools_called(tools, all_tool_calls):
    """Return True once every tool offered in `tools` has been called at least once.

    Both arguments are iterables of dicts carrying the tool name under
    ["function"]["name"]. Calls to names that are not listed in `tools` are
    ignored rather than making the check fail forever, so the caller can stop
    forcing tool calls as soon as all offered tools have been exercised.
    """
    offered_names = {tool["function"]["name"] for tool in tools}
    called_names = {call["function"]["name"] for call in all_tool_calls}
    return offered_names <= called_names
 def run_agentic_loop(url, messages, tools, mock_tool_responses, stream, max_turns=6, force_tools=False):
    """
    Drive the multi-turn tool-call loop:
      1. Send messages to model.
@ -172,8 +181,8 @@ def run_agentic_loop(url, messages, tools, mock_tool_responses, stream, max_turn
    msgs = list(messages)
    all_tool_calls: list[dict] = []
-    for _ in range(max_turns):
+    for t in range(max_turns):
-        result = chat_completion(url, msgs, tools=tools, stream=stream)
+        result = chat_completion(url, msgs, tools=tools, stream=stream, force_tools=(force_tools and not all_tools_called(tools, all_tool_calls)))
        if result is None:
            return all_tool_calls, None
@ -235,10 +244,10 @@ def run_agentic_loop(url, messages, tools, mock_tool_responses, stream, max_turn
 # ---------------------------------------------------------------------------
-def run_test(url, test_case, stream):
+def run_test(url, test_case, stream, force_tools):
    name = test_case["name"]
    mode = f"{'stream' if stream else 'non-stream'}"
-    print_header(f"{name}  [{mode}]")
+    print_header(f"{name} [{mode}, force_tools={force_tools}] ")
    all_tool_calls, final_content = run_agentic_loop(
        url,
@ -246,6 +255,7 @@ def run_test(url, test_case, stream):
        tools=test_case["tools"],
        mock_tool_responses=test_case["mock_tool_responses"],
        stream=stream,
        force_tools=force_tools
    )
    if final_content is None and not all_tool_calls:
@ -1093,6 +1103,9 @@ def main():
    parser.add_argument(
        "--stream-only", action="store_true", help="Only run streaming mode tests"
    )
    parser.add_argument(
        "--force-tools", action="store_true", help="Change tool mode to forced instead of auto"
    )
    parser.add_argument(
        "--test",
        help="Run only the test whose name contains this substring (case-insensitive)",
@ -1103,10 +1116,13 @@ def main():
    print_info(f"Testing server at {url}")
    modes = []
    force_tools = False
    if not args.stream_only:
        modes.append(False)
    if not args.no_stream:
        modes.append(True)
    if args.force_tools:
        force_tools = True
    cases: list[dict] = ALL_TEST_CASES
    if args.test:
@ -1121,7 +1137,7 @@ def main():
    for stream in modes:
        for case in cases:
            total += 1
-            if run_test(url, case, stream=stream):
+            if run_test(url, case, stream=stream, force_tools=force_tools):
                passed += 1
    color = GREEN if passed == total else RED
--- a/tests/test-chat.cpp
+++ b/tests/test-chat.cpp
@ -542,6 +542,36 @@ static common_chat_tool edit_tool{
    })",
 };
 // Test fixture: a "manage_todo_list" tool whose JSON schema has a single required
 // parameter, `todos`, declared as an array.
 static common_chat_tool manage_todo_list_tool{
    /* .name = */ "manage_todo_list",
    /* .description = */ "Create or update the todo list",
    /* .parameters = */ R"({
        "type": "object",
        "properties": {
            "todos": {
                "type": "array",
                "description": "List of TODO list items"
            }
        },
        "required": ["todos"]
    })",
 };
 // Test fixture: a "run_in_terminal" tool whose JSON schema has a single required
 // parameter, `command`, declared as a string.
 static common_chat_tool run_in_terminal_tool{
    /* .name = */ "run_in_terminal",
    /* .description = */ "Run a shell command.",
    /* .parameters = */ R"({
        "type": "object",
        "properties": {
            "command": {
                "type": "string",
                "description": "Shell command to run"
            }
        },
        "required": ["command"]
    })",
 };
 static common_chat_tool magic_tool{
    /* .name = */ "magic",
    /* .description = */ "Magic tool that takes a hash",
@ -1379,6 +1409,16 @@ class peg_test_builder {
        return *this;
    }
    // Set the tool choice mode stored in the test case's params (e.g.
    // COMMON_CHAT_TOOL_CHOICE_REQUIRED). Returns *this so calls can be chained.
    peg_test_builder & tool_choice(common_chat_tool_choice choice) {
        tc_.params.tool_choice = choice;
        return *this;
    }
    // Replace the message list stored in the test case's params with `messages`
    // (taken by value and moved in). Returns *this so calls can be chained.
    peg_test_builder & messages(std::vector<common_chat_msg> messages) {
        tc_.params.messages = std::move(messages);
        return *this;
    }
    // Execute the test
    void run() {
        // Check template filter
@ -1755,23 +1795,23 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
               "hello()\n"
               "</parameter>\n"
               "</function>\n"
-               "</tool_call>"
+               "</tool_call>")
            )
            .enable_thinking(true)
            .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
            .tools({
                python_tool
        })
-            .expect_reasoning("Let's call a tool: <tool_call>\n"
+            .expect_reasoning(
-               "<function=python>\n"
+                "Let's call a tool: <tool_call>\n"
-               "<parameter=code>\n"
+                "<function=python>\n"
-               "def hello():\n"
+                "<parameter=code>\n"
-               "    print(\"Not the real call!\")\n"
+                "def hello():\n"
-               "\n"
+                "    print(\"Not the real call!\")\n"
-               "hello()\n"
+                "\n"
-               "</parameter>\n"
+                "hello()\n"
-               "</function>\n"
+                "</parameter>\n"
-               "</tool_call>")
+                "</function>\n"
                "</tool_call>")
            .expect_tool_calls({
                { "python", "{\"code\": \"def hello():\\n    print(\\\"Hello, world!\\\")\\n\\nhello()\"}", {} },
            })
@ -1800,6 +1840,219 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
            .tools({ empty_args_tool_no_properties })
            .expect(message_with_tool_calls("empty_args_no_props", "{}"))
            .run();
        // Edge cases when reasoning traces are not sent
        tst.test(
               "<think>\n\n</think>\n\n"
               "<tool_call>\n"
               "<function=special_function>\n"
               "<parameter=arg1>\n1\n</parameter>\n"
               "</function>\n"
               "</tool_call>")
            .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
            .tools({
                special_function_tool
        })
            .expect_reasoning("<think>\n\n")
            .expect_tool_calls({ { "special_function", "{\"arg1\": 1}", "" } })
            .run();
        tst.test(
               "</think>\n\n"
               "<tool_call>\n"
               "<function=special_function>\n"
               "<parameter=arg1>\n1\n</parameter>\n"
               "</function>\n"
               "</tool_call>")
            .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
            .tools({
                special_function_tool
        })
            .expect_reasoning("")
            .expect_tool_calls({ { "special_function", "{\"arg1\": 1}", "" } })
            .run();
        tst.test(
               "</think>\n\n"
               "<tool_call>\n"
               "<function=run_in_terminal>\n"
               "<parameter=command>\n"
               "pwd\n"
               "</parameter>\n"
               "</function>\n"
               "</tool_call>")
            .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
            .enable_thinking(true)
            .tools({
                run_in_terminal_tool
        })
            .expect_tool_calls({
                { "run_in_terminal", R"({"command": "pwd"})", {} },
            })
            .run();
        tst.test(
               "</think>\n\n"
               "Let me inspect the current directory.\n"
               "<tool_call>\n"
               "<function=run_in_terminal>\n"
               "<parameter=command>\n"
               "pwd\n"
               "</parameter>\n"
               "</function>\n"
               "</tool_call>")
            .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
            .enable_thinking(true)
            .tools({
                run_in_terminal_tool
        })
            .expect_content("Let me inspect the current directory.\n")
            .expect_tool_calls({
                { "run_in_terminal", R"({"command": "pwd"})", {} },
            })
            .run();
        tst.test(
               "</think>\n\n"
               "Let me inspect the current directory.\n"
               "<tool_call>\n"
               "<function=run_in_terminal>\n"
               "<parameter=command>\n"
               "pwd\n"
               "</parameter>\n"
               "</function>\n"
               "</tool_call>")
            .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
            .enable_thinking(true)
            .tools({
                run_in_terminal_tool
        })
            .tool_choice(COMMON_CHAT_TOOL_CHOICE_REQUIRED)
            .expect_content("Let me inspect the current directory.\n")
            .expect_tool_calls({
                { "run_in_terminal", R"({"command": "pwd"})", {} },
            })
            .run();
        tst.test(
               "I should inspect the directory.\n"
               "</think>\n\n"
               "Let me inspect it now.\n"
               "<tool_call>\n"
               "<function=run_in_terminal>\n"
               "<parameter=command>\n"
               "pwd\n"
               "</parameter>\n"
               "</function>\n"
               "</tool_call>")
            .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
            .enable_thinking(true)
            .tools({
                run_in_terminal_tool
        })
            .expect_reasoning("I should inspect the directory.")
            .expect_content("Let me inspect it now.\n")
            .expect_tool_calls({
                { "run_in_terminal", R"({"command": "pwd"})", {} },
            })
            .run();
        tst.test(
               "I might call <tool_call> later, but I am still thinking.\n"
               "</think>\n\n"
               "Final answer without tools.")
            .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
            .enable_thinking(true)
            .tools({ run_in_terminal_tool })
            .expect_reasoning("I might call <tool_call> later, but I am still thinking.")
            .expect_content("Final answer without tools.")
            .run();
        {
            common_chat_msg user_start;
            user_start.role    = "user";
            user_start.content = "Create a todo list, then inspect the repository.";
            common_chat_msg assistant_todos =
                simple_assist_msg("", "", "manage_todo_list",
                                  R"({"todos":[{"item":"Inspect repository","selected":false}]})", "call_todos");
            common_chat_msg tool_result;
            tool_result.role         = "tool";
            tool_result.content      = "Successfully wrote todo list";
            tool_result.tool_call_id = "call_todos";
            common_chat_msg user_continue;
            user_continue.role    = "user";
            user_continue.content = "Proceed.";
            tst.test(
                   "I need to run a terminal command.\n"
                   "</think>\n\n"
                   "<tool_call>\n"
                   "<function=run_in_terminal>\n"
                   "<parameter=command>\n"
                   "pwd\n"
                   "</parameter>\n"
                   "</function>\n"
                   "</tool_call>")
                .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
                .enable_thinking(true)
                .tools({
                    manage_todo_list_tool, run_in_terminal_tool
            })
                .messages({ user_start, assistant_todos, tool_result, user_continue })
                .expect_reasoning("I need to run a terminal command.")
                .expect_tool_calls({
                    { "run_in_terminal", R"({"command": "pwd"})", {} },
                })
                .run();
            tst.test(
                   "I need to run a terminal command.\n"
                   "</think>\n\n"
                   "Let me inspect the current directory.\n"
                   "<tool_call>\n"
                   "<function=run_in_terminal>\n"
                   "<parameter=command>\n"
                   "pwd\n"
                   "</parameter>\n"
                   "</function>\n"
                   "</tool_call>")
                .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
                .enable_thinking(true)
                .tools({
                    manage_todo_list_tool, run_in_terminal_tool
            })
                .tool_choice(COMMON_CHAT_TOOL_CHOICE_REQUIRED)
                .messages({ user_start, assistant_todos, tool_result, user_continue })
                .expect_reasoning("I need to run a terminal command.")
                .expect_content("Let me inspect the current directory.\n")
                .expect_tool_calls({
                    { "run_in_terminal", R"({"command": "pwd"})", {} },
                })
                .run();
            tst.test(
                   "</think>\n\n"
                   "<tool_call>\n"
                   "<function=run_in_terminal>\n"
                   "<parameter=command>\n"
                   "pwd\n"
                   "</parameter>\n"
                   "</function>\n"
                   "</tool_call>")
                .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
                .enable_thinking(true)
                .tools({
                    manage_todo_list_tool, run_in_terminal_tool
            })
                .messages({ user_start, assistant_todos, tool_result, user_continue })
                .expect_tool_calls({
                    { "run_in_terminal", R"({"command": "pwd"})", {} },
                })
                .run();
        }
    }
    {
--- a/tests/test-reasoning-budget.cpp
+++ b/tests/test-reasoning-budget.cpp
@ -70,20 +70,20 @@ static void test_reasoning_budget(
        llama_sampler_apply(sampler, &cur_p);
        // Check if forcing is active (all logits except one should be -INFINITY)
-        size_t finite_count = 0;
+        size_t not_neg_inf = 0;
-        llama_token finite_token = -1;
+        llama_token not_neg_inf_token = -1;
        for (size_t j = 0; j < cur.size(); j++) {
-            if (std::isfinite(cur[j].logit)) {
+            if (std::isfinite(cur[j].logit) || cur[j].logit > 0) { // +INFINITY
-                finite_count++;
+                not_neg_inf++;
-                finite_token = cur[j].id;
+                not_neg_inf_token = cur[j].id;
            }
        }
        llama_sampler_accept(sampler, sequence[i]);
-        fprintf(stderr, "    i=%zu: token=%d, finite_count=%zu, finite_token=%d\n", i, (int)sequence[i], finite_count, (int)finite_token);
+        fprintf(stderr, "    i=%zu: token=%d, not_neg_inf_count=%zu, not_neg_inf_token=%d\n", i, (int)sequence[i], not_neg_inf, (int)not_neg_inf_token);
-        if (finite_count == 1) {
+        if (not_neg_inf == 1) {
            if (actual_force_start == SIZE_MAX) {
                actual_force_start = i;
            }
--- a/tools/server/tests/unit/test_tool_call.py
+++ b/tools/server/tests/unit/test_tool_call.py
@ -126,69 +126,70 @@ def do_test_completion_with_required_tool_tiny(server: ServerProcess, tool: dict
            actual_arguments = json.loads(actual_arguments)
        assert argument_key in actual_arguments, f"tool arguments: {actual_arguments}, expected: {argument_key}"
 # PR #22654: commented out because content is now allowed before tool calls when tool_choice is "required",
 # so the grammar alone can no longer force the tiny model to emit a tool call.
 #
 # @pytest.mark.parametrize("stream", [CompletionMode.NORMAL, CompletionMode.STREAMED])
 # @pytest.mark.parametrize("template_name,tool,argument_key", [
 #     ("Qwen3-Coder",                                   TEST_TOOL,            "success"),
 #     ("Qwen3-Coder",                                   TEST_TOOL,            "success"),
 #     ("meta-llama-Llama-3.3-70B-Instruct",             TEST_TOOL,            "success"),
 #     ("meta-llama-Llama-3.3-70B-Instruct",             TEST_TOOL,            "success"),
 #     ("meta-llama-Llama-3.3-70B-Instruct",             PYTHON_TOOL,          "code"),
 #     ("meta-llama-Llama-3.3-70B-Instruct",             PYTHON_TOOL,          "code"),
 # ])
 # def test_completion_with_required_tool_tiny_fast(template_name: str, tool: dict, argument_key: str | None, stream: CompletionMode):
 #     global server
 #     n_predict = 1024
 #     # server = ServerPreset.stories15m_moe()
 #     server.jinja = True
 #     server.n_predict = n_predict
 #     server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
 #     server.start()
 #     do_test_completion_with_required_tool_tiny(server, tool, argument_key, n_predict, stream=stream == CompletionMode.STREAMED, temperature=0.0, top_k=1, top_p=1.0)
-@pytest.mark.parametrize("stream", [CompletionMode.NORMAL, CompletionMode.STREAMED])
+# @pytest.mark.slow
-@pytest.mark.parametrize("template_name,tool,argument_key", [
+# @pytest.mark.parametrize("stream", [CompletionMode.NORMAL, CompletionMode.STREAMED])
-    ("Qwen3-Coder",                                   TEST_TOOL,            "success"),
+# @pytest.mark.parametrize("template_name,tool,argument_key", [
-    ("Qwen3-Coder",                                   TEST_TOOL,            "success"),
+#     ("meta-llama-Llama-3.1-8B-Instruct",              TEST_TOOL,            "success"),
-    ("meta-llama-Llama-3.3-70B-Instruct",             TEST_TOOL,            "success"),
+#     ("meta-llama-Llama-3.1-8B-Instruct",              PYTHON_TOOL,          "code"),
    ("meta-llama-Llama-3.3-70B-Instruct",             TEST_TOOL,            "success"),
    ("meta-llama-Llama-3.3-70B-Instruct",             PYTHON_TOOL,          "code"),
    ("meta-llama-Llama-3.3-70B-Instruct",             PYTHON_TOOL,          "code"),
 ])
 def test_completion_with_required_tool_tiny_fast(template_name: str, tool: dict, argument_key: str | None, stream: CompletionMode):
    global server
    n_predict = 1024
    # server = ServerPreset.stories15m_moe()
    server.jinja = True
    server.n_predict = n_predict
    server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
    server.start()
    do_test_completion_with_required_tool_tiny(server, tool, argument_key, n_predict, stream=stream == CompletionMode.STREAMED, temperature=0.0, top_k=1, top_p=1.0)
 #     ("meetkai-functionary-medium-v3.1",               TEST_TOOL,            "success"),
 #     ("meetkai-functionary-medium-v3.1",               PYTHON_TOOL,          "code"),
-@pytest.mark.slow
+#     ("meetkai-functionary-medium-v3.2",               TEST_TOOL,            "success"),
-@pytest.mark.parametrize("stream", [CompletionMode.NORMAL, CompletionMode.STREAMED])
+#     # Functionary v3.2 format supports raw python content, which w/ a dummy stories model will never end on its own.
-@pytest.mark.parametrize("template_name,tool,argument_key", [
+#     # ("meetkai-functionary-medium-v3.2",               PYTHON_TOOL,          "code"),
    ("meta-llama-Llama-3.1-8B-Instruct",              TEST_TOOL,            "success"),
    ("meta-llama-Llama-3.1-8B-Instruct",              PYTHON_TOOL,          "code"),
-    ("meetkai-functionary-medium-v3.1",               TEST_TOOL,            "success"),
+#     ("NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use", TEST_TOOL,            "success"),
-    ("meetkai-functionary-medium-v3.1",               PYTHON_TOOL,          "code"),
+#     ("NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use", PYTHON_TOOL,          "code"),
-    ("meetkai-functionary-medium-v3.2",               TEST_TOOL,            "success"),
+#     ("meta-llama-Llama-3.2-3B-Instruct",              TEST_TOOL,            "success"),
-    # Functionary v3.2 format supports raw python content, which w/ a dummy stories model will never end on its own.
+#     ("meta-llama-Llama-3.2-3B-Instruct",              PYTHON_TOOL,          "code"),
    # ("meetkai-functionary-medium-v3.2",               PYTHON_TOOL,          "code"),
-    ("NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use", TEST_TOOL,            "success"),
+#     ("mistralai-Mistral-Nemo-Instruct-2407",          TEST_TOOL,            "success"),
-    ("NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use", PYTHON_TOOL,          "code"),
+#     ("mistralai-Mistral-Nemo-Instruct-2407",          PYTHON_TOOL,          "code"),
-    ("meta-llama-Llama-3.2-3B-Instruct",              TEST_TOOL,            "success"),
+#     ("NousResearch-Hermes-3-Llama-3.1-8B-tool_use",   TEST_TOOL,            "success"),
-    ("meta-llama-Llama-3.2-3B-Instruct",              PYTHON_TOOL,          "code"),
+#     ("NousResearch-Hermes-3-Llama-3.1-8B-tool_use",   PYTHON_TOOL,          "code"),
-    ("mistralai-Mistral-Nemo-Instruct-2407",          TEST_TOOL,            "success"),
+#     ("deepseek-ai-DeepSeek-R1-Distill-Llama-8B",      TEST_TOOL,            "success"),
-    ("mistralai-Mistral-Nemo-Instruct-2407",          PYTHON_TOOL,          "code"),
+#     ("deepseek-ai-DeepSeek-R1-Distill-Llama-8B",      PYTHON_TOOL,          "code"),
-    ("NousResearch-Hermes-3-Llama-3.1-8B-tool_use",   TEST_TOOL,            "success"),
+#     ("fireworks-ai-llama-3-firefunction-v2",          TEST_TOOL,            "success"),
-    ("NousResearch-Hermes-3-Llama-3.1-8B-tool_use",   PYTHON_TOOL,          "code"),
+#     # ("fireworks-ai-llama-3-firefunction-v2",          PYTHON_TOOL,          "codeFalse), True),
 #     # ("fireworks-ai-llama-3-firefunction-v2",          PYTHON_TOOL,          "code"),
-    ("deepseek-ai-DeepSeek-R1-Distill-Llama-8B",      TEST_TOOL,            "success"),
+# ])
-    ("deepseek-ai-DeepSeek-R1-Distill-Llama-8B",      PYTHON_TOOL,          "code"),
+# def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict, argument_key: str | None, stream: CompletionMode):
-
+#     global server
-    ("fireworks-ai-llama-3-firefunction-v2",          TEST_TOOL,            "success"),
+#     n_predict = 512
-    # ("fireworks-ai-llama-3-firefunction-v2",          PYTHON_TOOL,          "codeFalse), True),
+#     # server = ServerPreset.stories15m_moe()
-    # ("fireworks-ai-llama-3-firefunction-v2",          PYTHON_TOOL,          "code"),
+#     server.jinja = True
-
+#     server.n_predict = n_predict
-])
+#     server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
-def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict, argument_key: str | None, stream: CompletionMode):
+#     server.start(timeout_seconds=TIMEOUT_START_SLOW)
-    global server
+#     do_test_completion_with_required_tool_tiny(server, tool, argument_key, n_predict, stream=stream == CompletionMode.STREAMED)
    n_predict = 512
    # server = ServerPreset.stories15m_moe()
    server.jinja = True
    server.n_predict = n_predict
    server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
    server.start(timeout_seconds=TIMEOUT_START_SLOW)
    do_test_completion_with_required_tool_tiny(server, tool, argument_key, n_predict, stream=stream == CompletionMode.STREAMED)
@pytest.mark.slow