chat: Cohere2MoE/North Code: parse unopened thinking under --reasoning off (follow-up to #1968) (#2012)

* Handle Cohere2MoE unopened thinking before tools * Cohere2MoE: route unopened thinking to reasoning_content; test in active target Follow-up to #1968. Gate extract_reasoning on reasoning_format only (drop the "&& enable_thinking" addition) so the unopened-thinking handling does not also change where an opened thinking block is routed. Under --reasoning off (enable_thinking=false, reasoning_format defaults to DEEPSEEK) an orphaned thinking block is now quarantined in reasoning_content with clean content and a native tool call, instead of leaking the thinking prose into the user-facing answer. Move the Cohere2MoE end-to-end parser cases into tests/test-chat-auto-parser.cpp, which CMake actually builds. tests/test-chat.cpp has been disabled in tests/CMakeLists.txt since #723, so cohere coverage added there never ran in CI; revert the local band-aids to that file. * Cohere2MoE: harden parser from NMC eval findings --------- Co-authored-by: Joel Farthing <262452229+joelfarthing@users.noreply.github.com>
2026-06-28 04:30:15 -05:00 · 2026-06-24 02:04:41 -05:00 · 2026-06-24 02:04:41 -05:00 · 8686ea708b
commit 8686ea708b
parent 5a4fa17947
2 changed files with 185 additions and 7 deletions
--- a/common/chat.cpp
+++ b/common/chat.cpp
@ -2144,22 +2144,42 @@ static common_chat_params common_chat_params_init_cohere2moe(const common_chat_t
    };

    auto has_tools         = inputs.tools.is_array() && !inputs.tools.empty();
+    // Surface reasoning as reasoning_content whenever the output format requests it
+    // (Cohere2MoE has a non-empty THINK_START, so this matches the narrow condition
+    // from the reasoning-delimiter work). Under --reasoning off the model may still
+    // emit an (un)opened thinking block; keeping it in reasoning_content quarantines
+    // it from the user-facing content rather than leaking it into the answer.
    auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
    auto include_grammar   = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE;

    auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
        auto generation_prompt = p.prefix(inputs.generation_prompt, THINK_START);
-        auto end               = p.optional(p.literal(TURN_END)) + p.end();
+        // Cohere2MoE can emit a stray text terminator after an action envelope.
+        auto end               = p.optional(p.literal(TEXT_END)) + p.optional(p.literal(TURN_END)) + p.end();
+
+        auto thinking_body = [&]() {
+            return p.until_one_of({ THINK_END, TEXT_START, ACTION_START });
+        };

        common_peg_parser reasoning = p.eps();
        if (extract_reasoning) {
-            reasoning = p.optional(p.literal(THINK_START) +
-                                   p.reasoning(p.until_one_of({ THINK_END, TEXT_START, ACTION_START })) +
-                                   p.optional(p.literal(THINK_END)));
+            auto opened = p.literal(THINK_START) +
+                          p.reasoning(thinking_body()) +
+                          p.optional(p.literal(THINK_END));
+            auto unopened = p.reasoning(thinking_body()) + p.literal(THINK_END);
+            reasoning = p.optional(p.choice({ opened, unopened }));
+        } else if (inputs.enable_thinking) {
+            auto opened = p.content(p.literal(THINK_START) +
+                                    thinking_body() +
+                                    p.optional(p.literal(THINK_END)));
+            auto unopened = p.content(thinking_body() + p.literal(THINK_END));
+            reasoning = p.optional(p.choice({ opened, unopened }));
        } else {
-            reasoning = p.optional(p.content(p.literal(THINK_START) +
-                                             p.until_one_of({ THINK_END, TEXT_START, ACTION_START }) +
-                                             p.optional(p.literal(THINK_END))));
+            auto opened = p.literal(THINK_START) +
+                          p.content(thinking_body()) +
+                          p.optional(p.literal(THINK_END));
+            auto unopened = p.content(thinking_body()) + p.literal(THINK_END);
+            reasoning = p.optional(p.choice({ opened, unopened }));
        }

        auto text_content = p.literal(TEXT_START) + p.content(p.until(TEXT_END)) + p.optional(p.literal(TEXT_END));
--- a/tests/test-chat-auto-parser.cpp
+++ b/tests/test-chat-auto-parser.cpp
@ -62,6 +62,9 @@ static void test_nemotron_tool_format(testing & t);
 static void test_cohere_reasoning_detection(testing & t);
 static void test_cohere_analysis(testing & t);

+// End-to-end Cohere2MoE (North Code) dedicated PEG parser coverage.
+static void test_cohere2moe_parser(testing & t);
+
 // SmolLM3 template analysis tests
 static void test_smollm3_analysis(testing & t);

@ -98,6 +101,7 @@ int main(int argc, char * argv[]) {
    t.test("segments", test_marker_separation);
    t.test("seed_oss_diffs", test_seed_oss_tool_analysis);
    t.test("cohere", test_cohere_analysis);
+    t.test("cohere2moe_parser", test_cohere2moe_parser);
    t.test("nemotron", test_nemotron_analysis);
    t.test("smollm3", test_smollm3_analysis);
    t.test("standard_json_tools", test_standard_json_tools_formats);
@ -1967,3 +1971,157 @@ static void test_tagged_args_with_embedded_quotes(testing & t) {
    }
 }

+// End-to-end coverage for the dedicated Cohere2MoE (North Code) parser:
+// template apply -> PEG parse -> assert message. Exercises the reasoning-mode
+// matrix, including the unopened-thinking-under---reasoning-off case (#1968
+// follow-up). Routing rule: reasoning surfaces to reasoning_content whenever the
+// output format != NONE (DEEPSEEK), and folds into content under NONE.
+static void test_cohere2moe_parser(testing & t) {
+    std::ifstream fin("models/templates/Cohere2MoE.jinja", std::ios::binary);
+    std::ostringstream buf; buf << fin.rdbuf();
+    std::string src = buf.str();
+    t.assert_true("Cohere2MoE template loaded", src.length() > 0);
+    if (src.empty()) {
+        return;
+    }
+
+    common_chat_templates_ptr tmpls(common_chat_templates_init(/* model = */ nullptr, src));
+
+    common_chat_tool special_function{
+        /* .name        = */ "special_function",
+        /* .description = */ "I'm special",
+        /* .parameters  = */ R"({"type":"object","properties":{"arg1":{"type":"integer"}},"required":["arg1"]})",
+    };
+    common_chat_tool python{
+        /* .name        = */ "python",
+        /* .description = */ "Run Python code",
+        /* .parameters  = */ R"({"type":"object","properties":{"code":{"type":"string"}},"required":["code"]})",
+    };
+
+    common_chat_msg user;
+    user.role    = "user";
+    user.content = "Hey";
+
+    const std::string act =
+        "<|START_ACTION|>[\n"
+        "    {\"tool_call_id\": \"0\", \"tool_name\": \"special_function\", \"parameters\": {\"arg1\": 1}}\n"
+        "]<|END_ACTION|>";
+    const std::string act_numeric_id =
+        "<|START_ACTION|>[\n"
+        "    {\"tool_call_id\": 0, \"tool_name\": \"special_function\", \"parameters\": {\"arg1\": 1}}\n"
+        "]<|END_ACTION|>";
+    const std::string text_resp = "<|START_TEXT|>Hello, world!<|END_TEXT|>";
+
+    struct cohere_case {
+        const char *             name;
+        std::string              input;
+        common_reasoning_format  reasoning_format;
+        bool                     enable_thinking;
+        common_chat_tool_choice  tool_choice;
+        std::string              exp_content;
+        std::string              exp_reasoning;
+        size_t                   exp_tool_calls;
+    };
+
+    const std::vector<cohere_case> cases = {
+        // #1968 follow-up fix: orphaned thinking (no <|START_THINKING|>) under --reasoning off.
+        { "unopened/DEEPSEEK -> reasoning_content", "I'm\nthinking<|END_THINKING|>" + act,
+          COMMON_REASONING_FORMAT_DEEPSEEK, false, COMMON_CHAT_TOOL_CHOICE_AUTO, "", "I'm\nthinking", 1 },
+        { "unopened/NONE -> content", "I'm\nthinking<|END_THINKING|>" + act,
+          COMMON_REASONING_FORMAT_NONE, false, COMMON_CHAT_TOOL_CHOICE_AUTO, "I'm\nthinking", "", 1 },
+        { "unopened/DEEPSEEK/required -> reasoning_content", "I'm\nthinking<|END_THINKING|>" + act,
+          COMMON_REASONING_FORMAT_DEEPSEEK, false, COMMON_CHAT_TOOL_CHOICE_REQUIRED, "", "I'm\nthinking", 1 },
+        { "unopened text/DEEPSEEK -> reasoning_content + content", "I'm\nthinking<|END_THINKING|>" + text_resp,
+          COMMON_REASONING_FORMAT_DEEPSEEK, false, COMMON_CHAT_TOOL_CHOICE_AUTO, "Hello, world!", "I'm\nthinking", 0 },
+        { "tool-choice-none text/DEEPSEEK -> reasoning_content + content", "I'm\nthinking<|END_THINKING|>" + text_resp,
+          COMMON_REASONING_FORMAT_DEEPSEEK, false, COMMON_CHAT_TOOL_CHOICE_NONE, "Hello, world!", "I'm\nthinking", 0 },
+        // Regression: reasoning enabled still routes thinking to reasoning_content.
+        { "thinking-on/DEEPSEEK -> reasoning_content", "I'm\nthinking<|END_THINKING|>" + act,
+          COMMON_REASONING_FORMAT_DEEPSEEK, true, COMMON_CHAT_TOOL_CHOICE_AUTO, "", "I'm\nthinking", 1 },
+        { "thinking-on/NONE -> tagged content", "<|START_THINKING|>I'm\nthinking<|END_THINKING|>" + act,
+          COMMON_REASONING_FORMAT_NONE, true, COMMON_CHAT_TOOL_CHOICE_AUTO,
+          "<|START_THINKING|>I'm\nthinking<|END_THINKING|>", "", 1 },
+        // Regression: existing #1968 shapes still parse to clean native tool calls.
+        { "bare-end/DEEPSEEK -> clean call", "<|END_THINKING|>" + act,
+          COMMON_REASONING_FORMAT_DEEPSEEK, false, COMMON_CHAT_TOOL_CHOICE_AUTO, "", "", 1 },
+        { "bare-end/trailing-end-text/DEEPSEEK -> clean call", "<|END_THINKING|>" + act + "<|END_TEXT|>",
+          COMMON_REASONING_FORMAT_DEEPSEEK, false, COMMON_CHAT_TOOL_CHOICE_AUTO, "", "", 1 },
+        { "bare-end/numeric-id/DEEPSEEK -> clean call", "<|END_THINKING|>" + act_numeric_id,
+          COMMON_REASONING_FORMAT_DEEPSEEK, false, COMMON_CHAT_TOOL_CHOICE_AUTO, "", "", 1 },
+        { "empty-block/DEEPSEEK -> clean call", "<|START_THINKING|><|END_THINKING|>" + act,
+          COMMON_REASONING_FORMAT_DEEPSEEK, false, COMMON_CHAT_TOOL_CHOICE_AUTO, "", "", 1 },
+        { "no-thinking/DEEPSEEK -> clean call", act,
+          COMMON_REASONING_FORMAT_DEEPSEEK, false, COMMON_CHAT_TOOL_CHOICE_AUTO, "", "", 1 },
+    };
+
+    for (const auto & c : cases) {
+        common_chat_templates_inputs inputs;
+        inputs.messages         = { user };
+        inputs.tools            = { special_function };
+        inputs.tool_choice      = c.tool_choice;
+        inputs.reasoning_format = c.reasoning_format;
+        inputs.enable_thinking  = c.enable_thinking;
+
+        auto params = common_chat_templates_apply(tmpls.get(), inputs);
+        auto pos    = params.generation_prompt.rfind("<|START_THINKING|>");
+
+        common_peg_arena arena;
+        arena.load(params.parser);
+
+        common_chat_parser_params pp(params);
+        if (pos != std::string::npos) {
+            pp.generation_prompt = params.generation_prompt.substr(0, pos);
+        }
+
+        auto msg = common_chat_peg_parse(arena, c.input, /* is_partial = */ false, pp);
+
+        t.assert_equal(std::string(c.name) + " : content",    c.exp_content,    msg.content);
+        t.assert_equal(std::string(c.name) + " : reasoning",  c.exp_reasoning,  msg.reasoning_content);
+        t.assert_equal(std::string(c.name) + " : tool calls", c.exp_tool_calls, msg.tool_calls.size());
+        if (c.exp_tool_calls == 1 && msg.tool_calls.size() == 1) {
+            t.assert_equal(std::string(c.name) + " : tool name", std::string("special_function"), msg.tool_calls[0].name);
+            t.assert_equal(std::string(c.name) + " : tool id",   std::string("0"),                msg.tool_calls[0].id);
+        }
+    }
+
+    const std::string parallel_act =
+        "<|START_ACTION|>[\n"
+        "    {\"tool_call_id\": \"0\", \"tool_name\": \"special_function\", \"parameters\": {\"arg1\": 1}},\n"
+        "    {\"tool_call_id\": \"1\", \"tool_name\": \"python\", \"parameters\": {\"code\": \"print('hey')\"}}\n"
+        "]<|END_ACTION|>";
+
+    common_chat_templates_inputs parallel_inputs;
+    parallel_inputs.messages            = { user };
+    parallel_inputs.tools               = { special_function, python };
+    parallel_inputs.tool_choice         = COMMON_CHAT_TOOL_CHOICE_AUTO;
+    parallel_inputs.reasoning_format    = COMMON_REASONING_FORMAT_DEEPSEEK;
+    parallel_inputs.enable_thinking     = false;
+    parallel_inputs.parallel_tool_calls = true;
+
+    auto parallel_params = common_chat_templates_apply(tmpls.get(), parallel_inputs);
+    auto parallel_pos    = parallel_params.generation_prompt.rfind("<|START_THINKING|>");
+
+    common_peg_arena parallel_arena;
+    parallel_arena.load(parallel_params.parser);
+
+    common_chat_parser_params parallel_pp(parallel_params);
+    if (parallel_pos != std::string::npos) {
+        parallel_pp.generation_prompt = parallel_params.generation_prompt.substr(0, parallel_pos);
+    }
+
+    auto parallel_msg = common_chat_peg_parse(
+        parallel_arena,
+        "I'm\nthinking<|END_THINKING|>" + parallel_act,
+        /* is_partial = */ false,
+        parallel_pp);
+
+    t.assert_equal("parallel : content", std::string(""), parallel_msg.content);
+    t.assert_equal("parallel : reasoning", std::string("I'm\nthinking"), parallel_msg.reasoning_content);
+    t.assert_equal("parallel : tool calls", 2u, parallel_msg.tool_calls.size());
+    if (parallel_msg.tool_calls.size() == 2) {
+        t.assert_equal("parallel : tool 0 name", std::string("special_function"), parallel_msg.tool_calls[0].name);
+        t.assert_equal("parallel : tool 0 id",   std::string("0"),                parallel_msg.tool_calls[0].id);
+        t.assert_equal("parallel : tool 1 name", std::string("python"),           parallel_msg.tool_calls[1].name);
+        t.assert_equal("parallel : tool 1 id",   std::string("1"),                parallel_msg.tool_calls[1].id);
+    }
+}