From 52b3df0023659b142ce29f75c7a82cf437769c33 Mon Sep 17 00:00:00 2001 From: Aldehir Rojas Date: Sun, 21 Jun 2026 16:20:58 -0500 Subject: [PATCH] common/peg : implement ac parser for stricter grammar generation (#24869) * common/peg : implement ac parser * cont : extract functions * cont : tidy up * cont : remove a test * cont : move ac() def --- common/chat-auto-parser-generator.cpp | 9 +- common/peg-parser.cpp | 131 +++++++++++++++++----- common/peg-parser.h | 15 ++- tests/peg-parser/test-gbnf-generation.cpp | 69 ++++++++++++ 4 files changed, 190 insertions(+), 34 deletions(-) diff --git a/common/chat-auto-parser-generator.cpp b/common/chat-auto-parser-generator.cpp index 37ca55c8df..36aab7ecbe 100644 --- a/common/chat-auto-parser-generator.cpp +++ b/common/chat-auto-parser-generator.cpp @@ -395,10 +395,11 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte arguments.name_suffix) + arguments.value_prefix + (schema_info.resolves_to_string(param_schema) ? 
- p.tool_arg_string_value(until_suffix) : - p.tool_arg_json_value(p.schema( - p.json(), "tool-" + name + "-arg-" + param_name + "-schema", param_schema, false))) + - p.tool_arg_close(p.literal(arguments.value_suffix))); + p.ac(p.tool_arg_string_value(until_suffix) + + p.tool_arg_close(p.literal(arguments.value_suffix)), arguments.value_suffix) : + (p.tool_arg_json_value(p.schema( + p.json(), "tool-" + name + "-arg-" + param_name + "-schema", param_schema, false)) + + p.tool_arg_close(p.literal(arguments.value_suffix))))); auto named_arg = p.rule("tool-" + name + "-arg-" + param_name, arg); if (is_required) { diff --git a/common/peg-parser.cpp b/common/peg-parser.cpp index 506b902451..807e952d90 100644 --- a/common/peg-parser.cpp +++ b/common/peg-parser.cpp @@ -921,6 +921,10 @@ struct parser_executor { common_peg_parse_result operator()(const common_peg_gbnf_parser & p) { return arena.parse(p.child, ctx, start_pos); } + + common_peg_parse_result operator()(const common_peg_ac_parser & p) { + return arena.parse(p.child, ctx, start_pos); + } }; common_peg_parse_result common_peg_arena::parse(common_peg_parse_context & ctx, size_t start) const { @@ -989,7 +993,8 @@ void common_peg_arena::resolve_refs() { std::is_same_v || std::is_same_v || std::is_same_v || - std::is_same_v) { + std::is_same_v || + std::is_same_v) { p.child = resolve_ref(p.child); } else if constexpr (std::is_same_v) { p.child = resolve_ref(p.child); @@ -1070,6 +1075,8 @@ std::string common_peg_arena::dump_impl(common_peg_parser_id return "Atomic(" + dump_impl(p.child, visited) + ")"; } else if constexpr (std::is_same_v) { return "Gbnf(" + p.grammar + ", " + dump_impl(p.child, visited) + ")"; + } else if constexpr (std::is_same_v) { + return "Ac(" + string_join(p.delimiters, " | ") + ", " + dump_impl(p.child, visited) + ")"; } else if constexpr (std::is_same_v) { return "Any"; } else if constexpr (std::is_same_v) { @@ -1479,6 +1486,13 @@ common_peg_parser common_peg_parser_builder::json_member(const 
std::string & key }); } +common_peg_parser common_peg_parser_builder::ac(const common_peg_parser & p, const std::vector<std::string> & delimiters) { + if (delimiters.empty()) { + throw std::runtime_error("ac parser requires at least one delimiter"); + } + return add(common_peg_ac_parser{p, delimiters}); +} + static std::string gbnf_escape_char_class(uint32_t c) { if (c == '-' || c == ']' || c == '[' || c == '\\') { return "\\" + std::string(1, (char) c); @@ -1529,14 +1543,22 @@ static std::string gbnf_escape_char_class(uint32_t c) { return std::string(buf); } -// GBNF grammar matching strings that contain no string in `strings` as a -// substring. Emits the complement of an Aho-Corasick automaton DFA and returns -// the start state rule name. -// -// ref: https://github.com/ggml-org/llama.cpp/pull/24839 -static std::string gbnf_excluding_grammar(const common_grammar_builder & builder, - const std::string & prefix, - const std::vector<std::string> & strings) { +static std::string gbnf_char_class(const std::vector<uint32_t> & chars, bool negate) { + std::string s = negate ? "[^" : "["; + for (uint32_t ch : chars) { + s += gbnf_escape_char_class(ch); + } + return s + "]"; +} + +static std::string gbnf_ac_grammar( + const common_grammar_builder & builder, + const std::string & prefix, + const std::vector<std::string> & strings, + const std::function<std::string(const std::vector<uint32_t> &, + const std::map<size_t, std::vector<uint32_t>> &, + const std::vector<uint32_t> &, + const std::function<std::string(size_t)> &)> & build_rule) { aho_corasick ac(strings); auto state_name = [&](size_t s) -> std::string { @@ -1548,42 +1570,30 @@ static std::string gbnf_excluding_grammar(const common_grammar_builder & builder return prefix + "-" + num; }; - auto char_class = [](const std::vector<uint32_t> & chars, bool negate) { - std::string s = negate ? 
"[^" : "["; - for (uint32_t ch : chars) { - s += gbnf_escape_char_class(ch); - } - return s + "]"; - }; - for (size_t q = 0; q < ac.num_states(); q++) { if (ac.is_terminal(q)) { - continue; // match states are dropped + continue; // match states } std::map<size_t, std::vector<uint32_t>> buckets; - std::vector<uint32_t> excluded; + std::vector<uint32_t> completing; // chars that complete a delimiter + std::vector<uint32_t> specific; // chars with an explicit transition for (uint32_t c : ac.alphabet) { size_t d = ac.next(q, c); if (ac.is_terminal(d)) { - excluded.push_back(c); // completes a forbidden string -> omit + completing.push_back(c); + specific.push_back(c); } else if (d != 0) { buckets[d].push_back(c); // specific non-root destination - excluded.push_back(c); + specific.push_back(c); } } - std::string rhs = "|"; // every state is accepting - for (const auto & [d, chars] : buckets) { - rhs += " " + char_class(chars, false) + " " + state_name(d) + " |"; - } - rhs += " " + char_class(excluded, true) + " " + state_name(0); - - builder.add_rule(state_name(q), rhs); + builder.add_rule(state_name(q), build_rule(completing, buckets, specific, state_name)); } // An empty delimiter makes the start state terminal. Emit an entry rule - // that matches nothing so the returned reference stays valid. + // that matches the empty string so the returned reference stays valid. if (ac.is_terminal(0)) { builder.add_rule(prefix, "|"); } @@ -1591,6 +1601,54 @@ static std::string gbnf_excluding_grammar(const common_grammar_builder & builder return state_name(0); } +// GBNF grammar matching strings that contain no string in `strings` as a +// substring. Emits the complement of an Aho-Corasick automaton DFA and returns +// the start state rule name. 
+// +// ref: https://github.com/ggml-org/llama.cpp/pull/24839 +static std::string gbnf_excluding_grammar(const common_grammar_builder & builder, + const std::string & prefix, + const std::vector<std::string> & strings) { + return gbnf_ac_grammar(builder, prefix, strings, + [](const std::vector<uint32_t> & /*completing*/, + const std::map<size_t, std::vector<uint32_t>> & buckets, + const std::vector<uint32_t> & specific, + const std::function<std::string(size_t)> & state_name) { + // every state is accepting and completing chars get no + // alternative, so a forbidden string can never be matched + std::string rhs = "|"; + for (const auto & [d, chars] : buckets) { + rhs += " " + gbnf_char_class(chars, false) + " " + state_name(d) + " |"; + } + rhs += " " + gbnf_char_class(specific, true) + " " + state_name(0); + return rhs; + }); +} + +// GBNF grammar matching everything up to and including the first occurrence of +// any string in `strings`. Emits the Aho-Corasick automaton DFA and returns +// the start state rule name. 
+static std::string gbnf_including_grammar(const common_grammar_builder & builder, + const std::string & prefix, + const std::vector<std::string> & strings) { + return gbnf_ac_grammar(builder, prefix, strings, + [](const std::vector<uint32_t> & completing, + const std::map<size_t, std::vector<uint32_t>> & buckets, + const std::vector<uint32_t> & specific, + const std::function<std::string(size_t)> & state_name) { + std::vector<std::string> alts; + if (!completing.empty()) { + alts.push_back(gbnf_char_class(completing, false)); // terminate on match + } + for (const auto & [d, chars] : buckets) { + alts.push_back(gbnf_char_class(chars, false) + " " + state_name(d)); + } + // every other character keeps scanning from the start state + alts.push_back(gbnf_char_class(specific, true) + " " + state_name(0)); + return string_join(alts, " | "); + }); +} + static std::set collect_reachable_rules( const common_peg_arena & arena, const common_peg_parser_id & rule @@ -1628,6 +1686,7 @@ static std::set collect_reachable_rules( std::is_same_v || std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v) { visit(p.child); } else 
if constexpr (std::is_same_v) { @@ -1822,6 +1881,8 @@ void common_peg_arena::build_grammar(const common_grammar_builder & builder, boo return to_gbnf(p.child); } else if constexpr (std::is_same_v) { return p.grammar; + } else if constexpr (std::is_same_v) { + return gbnf_including_grammar(builder, "ac-" + std::to_string(id), p.delimiters); } else { static_assert(is_always_false_v); } @@ -1958,6 +2019,8 @@ static nlohmann::json serialize_parser_variant(const common_peg_parser_variant & }; } else if constexpr (std::is_same_v) { return json{{"type", "gbnf"}, {"child", p.child}, {"grammar", p.grammar}}; + } else if constexpr (std::is_same_v) { + return json{{"type", "ac"}, {"child", p.child}, {"delimiters", p.delimiters}}; } }, variant); } @@ -2130,6 +2193,16 @@ static common_peg_parser_variant deserialize_parser_variant(const nlohmann::json }; } + if (type == "ac") { + if (!j.contains("child") || !j.contains("delimiters") || !j["delimiters"].is_array() || j["delimiters"].empty()) { + throw std::runtime_error("ac parser requires 'child' and a non-empty 'delimiters' array"); + } + return common_peg_ac_parser{ + j["child"].get<common_peg_parser_id>(), + j["delimiters"].get<std::vector<std::string>>(), + }; + } + throw std::runtime_error("Unknown parser type: " + type); } diff --git a/common/peg-parser.h b/common/peg-parser.h index 132173a64c..c198499dd9 100644 --- a/common/peg-parser.h +++ b/common/peg-parser.h @@ -275,6 +275,11 @@ struct common_peg_gbnf_parser { std::string grammar; }; +struct common_peg_ac_parser { + common_peg_parser_id child; + std::vector<std::string> delimiters; +}; + // Variant holding all parser types using common_peg_parser_variant = std::variant< common_peg_epsilon_parser, @@ -296,7 +301,8 @@ using common_peg_parser_variant = std::variant< common_peg_ref_parser, common_peg_atomic_parser, common_peg_tag_parser, - common_peg_gbnf_parser + common_peg_gbnf_parser, + common_peg_ac_parser >; class common_peg_arena { @@ -514,6 +520,13 @@ class common_peg_parser_builder { // the child's grammar. 
Parsing delegates entirely to the child. common_peg_parser gbnf(const common_peg_parser & p, const std::string & grammar) { return add(common_peg_gbnf_parser{p, grammar}); } + // Wraps a child parser but emits a GBNF grammar built from the Aho-Corasick + // automaton of `delimiters`, matching everything up to and including the + // first delimiter. Parsing delegates entirely to the child, which is + // responsible for consuming the delimiter (e.g. until(D) + literal(D)). + common_peg_parser ac(const common_peg_parser & p, const std::vector<std::string> & delimiters); + common_peg_parser ac(const common_peg_parser & p, const std::string & delimiter) { return ac(p, std::vector{delimiter}); } + void set_root(const common_peg_parser & p); common_peg_arena build(); diff --git a/tests/peg-parser/test-gbnf-generation.cpp b/tests/peg-parser/test-gbnf-generation.cpp index 45d692ca60..60066a817b 100644 --- a/tests/peg-parser/test-gbnf-generation.cpp +++ b/tests/peg-parser/test-gbnf-generation.cpp @@ -212,6 +212,75 @@ void test_gbnf_generation(testing &t) { )""", gbnf); }); + t.test("ac grammar", [](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { + return p.ac(p.until("") + p.literal(""), ""); + }); + + auto gbnf = build_grammar([&](const common_grammar_builder & builder) { + parser.build_grammar(builder); + }); + + assert_gbnf_equal(t, R"""( + ac-3 ::= [<] ac-3-01 | [^<] ac-3 + ac-3-01 ::= [<] ac-3-01 | [/] ac-3-02 | [^/<] ac-3 + ac-3-02 ::= [<] ac-3-01 | [t] ac-3-03 | [^] | [<] ac-3-01 | [^<>] ac-3 + root ::= ac-3 + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", gbnf); + }); + + t.test("ac grammar terminates at first delimiter", [](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { + return p.ac(p.until("\n</parameter>\n") + p.literal("\n</parameter>\n"), "\n</parameter>\n"); + }); + + auto gbnf = build_grammar([&](const common_grammar_builder & builder) { + parser.build_grammar(builder); + }); + + assert_gbnf_equal(t, R"""( + ac-3 ::= [\n] ac-3-01 | 
[^\n] ac-3 + ac-3-01 ::= [\n] ac-3-01 | [<] ac-3-02 | [^\n<] ac-3 + ac-3-02 ::= [\n] ac-3-01 | [/] ac-3-03 | [^\n/] ac-3 + ac-3-03 ::= [\n] ac-3-01 | [p] ac-3-04 | [^\np] ac-3 + ac-3-04 ::= [\n] ac-3-01 | [a] ac-3-05 | [^\na] ac-3 + ac-3-05 ::= [\n] ac-3-01 | [r] ac-3-06 | [^\nr] ac-3 + ac-3-06 ::= [\n] ac-3-01 | [a] ac-3-07 | [^\na] ac-3 + ac-3-07 ::= [\n] ac-3-01 | [m] ac-3-08 | [^\nm] ac-3 + ac-3-08 ::= [\n] ac-3-01 | [e] ac-3-09 | [^\ne] ac-3 + ac-3-09 ::= [\n] ac-3-01 | [t] ac-3-10 | [^\nt] ac-3 + ac-3-10 ::= [\n] ac-3-01 | [e] ac-3-11 | [^\ne] ac-3 + ac-3-11 ::= [\n] ac-3-01 | [r] ac-3-12 | [^\nr] ac-3 + ac-3-12 ::= [\n] ac-3-01 | [>] ac-3-13 | [^\n>] ac-3 + ac-3-13 ::= [\n] | [^\n] ac-3 + root ::= ac-3 + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", gbnf); + }); + + t.test("ac grammar multiple delimiters", [](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { + return p.ac(p.eps(), std::vector<std::string>{"ab", "cd", "ef"}); + }); + + auto gbnf = build_grammar([&](const common_grammar_builder & builder) { + parser.build_grammar(builder); + }); + + assert_gbnf_equal(t, R"""( + ac-1 ::= [a] ac-1-01 | [c] ac-1-03 | [e] ac-1-05 | [^ace] ac-1 + ac-1-01 ::= [b] | [a] ac-1-01 | [c] ac-1-03 | [e] ac-1-05 | [^abce] ac-1 + ac-1-03 ::= [d] | [a] ac-1-01 | [c] ac-1-03 | [e] ac-1-05 | [^acde] ac-1 + ac-1-05 ::= [f] | [a] ac-1-01 | [c] ac-1-03 | [e] ac-1-05 | [^acef] ac-1 + root ::= ac-1 + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", gbnf); + }); + t.test("complex expressions with parentheses", [](testing &t) { auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.one_or_more(p.literal("a") | p.literal("b"));