From 6786edd14a4007d6d9b5aceb91bd6fe1fe220cc3 Mon Sep 17 00:00:00 2001 From: Piotr Wilkin Date: Mon, 15 Jun 2026 15:00:53 +0200 Subject: [PATCH] chat: fix an "oldie but goodie" grammar generator bug that surfaced during last changes --- common/peg-parser.cpp | 24 +++++++++++++++++-- tests/test-chat.cpp | 55 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 2 deletions(-) diff --git a/common/peg-parser.cpp b/common/peg-parser.cpp index 310bebf735..d4b491a80e 100644 --- a/common/peg-parser.cpp +++ b/common/peg-parser.cpp @@ -1507,6 +1507,7 @@ static std::string gbnf_excluding_pattern(const std::vector & strin auto pieces = matcher.collect_prefix_and_next(); std::string pattern; + std::string trailing; // optional proper-prefix of a delimiter, allowed only at the very end for (size_t i = 0; i < pieces.size(); ++i) { if (i > 0) { pattern += " | "; @@ -1522,13 +1523,32 @@ static std::string gbnf_excluding_pattern(const std::vector & strin } if (!pre.empty()) { - pattern += gbnf_format_literal(common_unicode_cpts_to_utf8(pre)) + " [^" + cls + "]"; + std::string pre_literal = gbnf_format_literal(common_unicode_cpts_to_utf8(pre)); + pattern += pre_literal + " [^" + cls + "]"; + // Each interior alternative consumes a delimiter-prefix plus a disambiguating + // char, so the repetition alone cannot match a value that *ends* on a proper + // prefix of a delimiter (e.g. a trailing "\n" when the delimiter is + // "\n\n"). The runtime until() (greedy first-match) accepts such + // values, so without this the grammar would reject input the parser accepts. + // Allow the value to terminate on any proper prefix as an optional tail. + // This makes the grammar a slight superset of the runtime language (a value + // may end on the longest prefix, which greedy first-match would not itself + // produce); harmless for constrained generation, which only needs to admit + // every runtime-valid string. + if (!trailing.empty()) { + trailing += " | "; + } + trailing += pre_literal; } else { pattern += "[^" + cls + "]"; } } - return "(" + pattern + ")*"; + std::string result = "(" + pattern + ")*"; + if (!trailing.empty()) { + result += " (" + trailing + ")?"; + } + return result; } static std::unordered_set collect_reachable_rules( diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index e65af46c63..548071c906 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -2024,6 +2024,61 @@ static void test_template_output_peg_parsers(bool detailed_debug) { }) .run(); + tst.test( + "\n" + "\n" + "\n" + "foo.c\n" + "\n" + "\n" + "#iclunde\n" + "\n" + "\n" + "#include\n" + "\n" + "\n" + "") + .enable_thinking(false) + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .tools({ + edit_tool + }) + .expect_tool_calls({ + { "edit", "{\"filename\": \"foo.c\", \"oldString\": \"#iclunde\", \"newString\": \"#include\"}", {} }, + }) + .run(); + + // a parameter value that itself ends in a newline (e.g. a source file with a + // trailing newline). The structural delimiter is "\n\n", so the value + // "#include\n" renders as "...#include\n\n\n". The trailing newline must + // be preserved faithfully (no stripping), and the generated grammar must admit a + // value ending on a delimiter prefix. Regression test for gbnf_excluding_pattern. + tst.test( + "\n" + "\n" + "\n" + "foo.c\n" + "\n" + "\n" + "#iclunde\n" + "\n" + "\n" + "#include\n" + "\n" + "\n" + "\n" + "") + .enable_thinking(false) + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .tools({ + edit_tool + }) + .expect_tool_calls({ + { "edit", "{\"filename\": \"foo.c\", \"oldString\": \"#iclunde\", \"newString\": \"#include\\n\"}", {} }, + }) + .run(); + + // test code that starts with indent tst.test( "\n"