diff --git a/common/peg-parser.cpp b/common/peg-parser.cpp index 310bebf735..d4b491a80e 100644 --- a/common/peg-parser.cpp +++ b/common/peg-parser.cpp @@ -1507,6 +1507,7 @@ static std::string gbnf_excluding_pattern(const std::vector & strin auto pieces = matcher.collect_prefix_and_next(); std::string pattern; + std::string trailing; // optional proper-prefix of a delimiter, allowed only at the very end for (size_t i = 0; i < pieces.size(); ++i) { if (i > 0) { pattern += " | "; @@ -1522,13 +1523,32 @@ static std::string gbnf_excluding_pattern(const std::vector & strin } if (!pre.empty()) { - pattern += gbnf_format_literal(common_unicode_cpts_to_utf8(pre)) + " [^" + cls + "]"; + std::string pre_literal = gbnf_format_literal(common_unicode_cpts_to_utf8(pre)); + pattern += pre_literal + " [^" + cls + "]"; + // Each interior alternative consumes a delimiter-prefix plus a disambiguating + // char, so the repetition alone cannot match a value that *ends* on a proper + // prefix of a delimiter (e.g. a trailing "\n" when the delimiter is + // "\n\n"). The runtime until() (greedy first-match) accepts such + // values, so without this the grammar would reject input the parser accepts. + // Allow the value to terminate on any proper prefix as an optional tail. + // This makes the grammar a slight superset of the runtime language (a value + // may end on the longest prefix, which greedy first-match would not itself + // produce); harmless for constrained generation, which only needs to admit + // every runtime-valid string. + if (!trailing.empty()) { + trailing += " | "; + } + trailing += pre_literal; } else { pattern += "[^" + cls + "]"; } } - return "(" + pattern + ")*"; + std::string result = "(" + pattern + ")*"; + if (!trailing.empty()) { + result += " (" + trailing + ")?"; + } + return result; } static std::unordered_set collect_reachable_rules( diff --git a/tests/peg-parser/test-gbnf-generation.cpp b/tests/peg-parser/test-gbnf-generation.cpp index fe4bbbdd16..00111e6a19 100644 --- a/tests/peg-parser/test-gbnf-generation.cpp +++ b/tests/peg-parser/test-gbnf-generation.cpp @@ -129,7 +129,7 @@ void test_gbnf_generation(testing &t) { }); assert_gbnf_equal(t, R"""( - root ::= ([^<] | "<" [^/] | "])* + root ::= ([^<] | "<" [^/] | "])* ("<" | "\n" + "\n" + "\n" + "foo.c\n" + "\n" + "\n" + "#iclunde\n" + "\n" + "\n" + "#include\n" + "\n" + "\n" + "") + .enable_thinking(false) + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .tools({ + edit_tool + }) + .expect_tool_calls({ + { "edit", "{\"filename\": \"foo.c\", \"oldString\": \"#iclunde\", \"newString\": \"#include\"}", {} }, + }) + .run(); + + // a parameter value that itself ends in a newline (e.g. a source file with a + // trailing newline). The structural delimiter is "\n\n", so the value + // "#include\n" renders as "...#include\n\n\n". The trailing newline must + // be preserved faithfully (no stripping), and the generated grammar must admit a + // value ending on a delimiter prefix. Regression test for gbnf_excluding_pattern. + tst.test( + "\n" + "\n" + "\n" + "foo.c\n" + "\n" + "\n" + "#iclunde\n" + "\n" + "\n" + "#include\n" + "\n" + "\n" + "\n" + "") + .enable_thinking(false) + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .tools({ + edit_tool + }) + .expect_tool_calls({ + { "edit", "{\"filename\": \"foo.c\", \"oldString\": \"#iclunde\", \"newString\": \"#include\\n\"}", {} }, + }) + .run(); + + // test code that starts with indent tst.test( "\n"