diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index e2c4d6ce22..b18607cd65 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -233,27 +233,27 @@ struct BuiltinRule { }; static std::unordered_map PRIMITIVE_RULES = { - {"boolean", {"(\"true\" | \"false\") space", {}}}, + {"boolean", {"(\"true\" | \"false\")", {}}}, {"decimal-part", {"[0-9]{1,16}", {}}}, {"integral-part", {"[0] | [1-9] [0-9]{0,15}", {}}}, - {"number", {"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)? space", {"integral-part", "decimal-part"}}}, - {"integer", {"(\"-\"? integral-part) space", {"integral-part"}}}, + {"number", {"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)?", {"integral-part", "decimal-part"}}}, + {"integer", {"(\"-\"? integral-part)", {"integral-part"}}}, {"value", {"object | array | string | number | boolean | null", {"object", "array", "string", "number", "boolean", "null"}}}, - {"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}}, - {"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}}, - {"uuid", {"\"\\\"\" [0-9a-fA-F]{8} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{12} \"\\\"\" space", {}}}, + {"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? space \"}\"", {"string", "value"}}}, + {"array", {"\"[\" space ( value (\",\" space value)* )? space \"]\"", {"value"}}}, + {"uuid", {"\"\\\"\" [0-9a-fA-F]{8} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{12} \"\\\"\"", {}}}, {"char", {"[^\"\\\\\\x7F\\x00-\\x1F] | [\\\\] ([\"\\\\bfnrt] | \"u\" [0-9a-fA-F]{4})", {}}}, - {"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}}, - {"null", {"\"null\" space", {}}}, + {"string", {"\"\\\"\" char* \"\\\"\"", {"char"}}}, + {"null", {"\"null\"", {}}}, }; static std::unordered_map STRING_FORMAT_RULES = { {"date", {"[0-9]{4} \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}}, {"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9]{3} )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}}, {"date-time", {"date \"T\" time", {"date", "time"}}}, - {"date-string", {"\"\\\"\" date \"\\\"\" space", {"date"}}}, - {"time-string", {"\"\\\"\" time \"\\\"\" space", {"time"}}}, - {"date-time-string", {"\"\\\"\" date-time \"\\\"\" space", {"date-time"}}} + {"date-string", {"\"\\\"\" date \"\\\"\"", {"date"}}}, + {"time-string", {"\"\\\"\" time \"\\\"\"", {"time"}}}, + {"date-time-string", {"\"\\\"\" date-time \"\\\"\"", {"date-time"}}} }; static bool is_reserved_name(const std::string & name) { @@ -551,16 +551,16 @@ private: } return join_seq(); }; - return _add_rule(name, "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space"); + return _add_rule(name, "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\""); } /* Returns a rule that matches a JSON string that is none of the provided strings not_strings({"a"}) - -> ["] ( [a] char+ | [^"a] char* )? ["] space + -> ["] ( [a] char+ | [^"a] char* )? ["] not_strings({"and", "also"}) - -> ["] ( [a] ([l] ([s] ([o] char+ | [^"o] char*) | [^"s] char*) | [n] ([d] char+ | [^"d] char*) | [^"ln] char*) | [^"a] char* )? ["] space + -> ["] ( [a] ([l] ([s] ([o] char+ | [^"o] char*) | [^"s] char*) | [n] ([d] char+ | [^"d] char*) | [^"ln] char*) | [^"a] char* )? ["] */ std::string _not_strings(const std::vector & strings) { @@ -619,7 +619,7 @@ private: if (!trie.is_end_of_string) { out << "?"; } - out << " [\"] space"; + out << " [\"]"; return out.str(); } @@ -725,7 +725,7 @@ private: rule += " )?"; } - rule += " \"}\" space"; + rule += " space \"}\""; return rule; } @@ -858,14 +858,14 @@ public: return _add_rule(rule_name, _generate_union_rule(name, schema_types)); } if (schema.contains("const")) { - return _add_rule(rule_name, _generate_constant_rule(schema["const"]) + " space"); + return _add_rule(rule_name, _generate_constant_rule(schema["const"])); } if (schema.contains("enum")) { std::vector enum_values; for (const auto & v : schema["enum"]) { enum_values.push_back(_generate_constant_rule(v)); } - return _add_rule(rule_name, "(" + string_join(enum_values, " | ") + ") space"); + return _add_rule(rule_name, "(" + string_join(enum_values, " | ") + ")"); } if ((schema_type.is_null() || schema_type == "object") && (schema.contains("properties") || @@ -933,7 +933,7 @@ public: } } if (!enum_intersection.empty()) { - return _add_rule(rule_name, "(" + string_join(enum_intersection, " | ") + ") space"); + return _add_rule(rule_name, "(" + string_join(enum_intersection, " | ") + ")"); } } return _add_rule(rule_name, _build_object_rule(properties, required, hybrid_name, json())); @@ -948,7 +948,7 @@ public: } rule += visit(items[i], name + (name.empty() ? "" : "-") + "tuple-" + std::to_string(i)); } - rule += " \"]\" space"; + rule += " space \"]\""; return _add_rule(rule_name, rule); } std::string item_rule_name = visit(items, name + (name.empty() ? "" : "-") + "item"); @@ -956,7 +956,7 @@ public: json max_items_json = schema.contains("maxItems") ? schema["maxItems"] : json(); int max_items = max_items_json.is_number_integer() ? max_items_json.get() : std::numeric_limits::max(); - return _add_rule(rule_name, "\"[\" space " + build_repetition(item_rule_name, min_items, max_items, "\",\" space") + " \"]\" space"); + return _add_rule(rule_name, "\"[\" space " + build_repetition(item_rule_name, min_items, max_items, "\",\" space") + " space \"]\""); } if ((schema_type.is_null() || schema_type == "string") && schema.contains("pattern")) { return _visit_pattern(schema["pattern"], rule_name); @@ -972,7 +972,7 @@ public: std::string char_rule = _add_primitive("char", PRIMITIVE_RULES.at("char")); int min_len = schema.contains("minLength") ? schema["minLength"].get() : 0; int max_len = schema.contains("maxLength") ? schema["maxLength"].get() : std::numeric_limits::max(); - return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space"); + return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\""); } if (schema_type == "integer" && (schema.contains("minimum") || schema.contains("exclusiveMinimum") || schema.contains("maximum") || schema.contains("exclusiveMaximum"))) { int64_t min_value = std::numeric_limits::min(); @@ -990,7 +990,7 @@ public: std::stringstream out; out << "("; build_min_max_int(min_value, max_value, out); - out << ") space"; + out << ")"; return _add_rule(rule_name, out.str()); } if (schema.empty() || schema_type == "object") { diff --git a/common/peg-parser.cpp b/common/peg-parser.cpp index d4b491a80e..ff0d24d43f 100644 --- a/common/peg-parser.cpp +++ b/common/peg-parser.cpp @@ -1342,7 +1342,7 @@ common_peg_parser common_peg_parser_builder::json_object() { common_peg_parser common_peg_parser_builder::json_array() { return rule("json-array", [this]() { auto ws = space(); - auto elements = sequence({json(), zero_or_more(sequence({literal(","), ws, json()}))}); + auto elements = sequence({json(), zero_or_more(sequence({ws, literal(","), ws, json()}))}); return sequence({ literal("["), ws, diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py index 077fcfacac..83abd259da 100755 --- a/examples/json_schema_to_grammar.py +++ b/examples/json_schema_to_grammar.py @@ -198,18 +198,18 @@ class BuiltinRule: SPACE_RULE = '| " " | "\\n"{1,2} [ \\t]{0,20}' PRIMITIVE_RULES = { - 'boolean' : BuiltinRule('("true" | "false") space', []), + 'boolean' : BuiltinRule('("true" | "false")', []), 'decimal-part' : BuiltinRule('[0-9]{1,16}', []), 'integral-part': BuiltinRule('[0] | [1-9] [0-9]{0,15}', []), - 'number' : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']), - 'integer' : BuiltinRule('("-"? integral-part) space', ['integral-part']), + 'number' : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)?', ['integral-part', 'decimal-part']), + 'integer' : BuiltinRule('("-"? integral-part)', ['integral-part']), 'value' : BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']), - 'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']), - 'array' : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']), - 'uuid' : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\"" space', []), + 'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? space "}"', ['string', 'value']), + 'array' : BuiltinRule('"[" space ( value ("," space value)* )? space "]"', ['value']), + 'uuid' : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\""', []), 'char' : BuiltinRule(r'[^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})', []), - 'string' : BuiltinRule(r'"\"" char* "\"" space', ['char']), - 'null' : BuiltinRule('"null" space', []), + 'string' : BuiltinRule(r'"\"" char* "\""', ['char']), + 'null' : BuiltinRule('"null"', []), } # TODO: support "uri", "email" string formats @@ -217,9 +217,9 @@ STRING_FORMAT_RULES = { 'date' : BuiltinRule('[0-9]{4} "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []), 'time' : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9]{3} )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []), 'date-time' : BuiltinRule('date "T" time', ['date', 'time']), - 'date-string' : BuiltinRule('"\\"" date "\\"" space', ['date']), - 'time-string' : BuiltinRule('"\\"" time "\\"" space', ['time']), - 'date-time-string': BuiltinRule('"\\"" date-time "\\"" space', ['date-time']), + 'date-string' : BuiltinRule('"\\"" date "\\""', ['date']), + 'time-string' : BuiltinRule('"\\"" time "\\""', ['time']), + 'date-time-string': BuiltinRule('"\\"" date-time "\\""', ['date-time']), } DOTALL = '[\\U00000000-\\U0010FFFF]' @@ -319,7 +319,7 @@ class SchemaConverter: out.append(f'[^"{"".join(rejects)}] {char_rule}*') visit(trie) - out.append(f' ){"" if trie.is_end_of_string else "?"} ["] space') + out.append(f' ){"" if trie.is_end_of_string else "?"} ["]') return ''.join(out) def _add_rule(self, name, rule): @@ -549,7 +549,7 @@ class SchemaConverter: return self._add_rule( name, to_rule(transform()) if self._raw_pattern \ - else "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space") + else "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\"") def _resolve_ref(self, ref): @@ -580,10 +580,10 @@ class SchemaConverter: return self._add_rule(rule_name, self._generate_union_rule(name, [{**schema, 'type': t} for t in schema_type])) elif 'const' in schema: - return self._add_rule(rule_name, self._generate_constant_rule(schema['const']) + ' space') + return self._add_rule(rule_name, self._generate_constant_rule(schema['const'])) elif 'enum' in schema: - rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in schema['enum'])) + ') space' + rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in schema['enum'])) + ')' return self._add_rule(rule_name, rule) elif schema_type in (None, 'object') and \ @@ -624,7 +624,7 @@ class SchemaConverter: enum_intersection &= s if enum_intersection: - rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in sorted(enum_intersection))) + ') space' + rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in sorted(enum_intersection))) + ')' return self._add_rule(rule_name, rule) return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=None)) @@ -638,12 +638,12 @@ class SchemaConverter: ' "," space '.join( self.visit(item, f'{name}{"-" if name else ""}tuple-{i}') for i, item in enumerate(items)) + - ' "]" space') + ' space "]"') else: item_rule_name = self.visit(items, f'{name}{"-" if name else ""}item') min_items = schema.get("minItems", 0) max_items = schema.get("maxItems") - return self._add_rule(rule_name, '"[" space ' + _build_repetition(item_rule_name, min_items, max_items, separator_rule='"," space') + ' "]" space') + return self._add_rule(rule_name, '"[" space ' + _build_repetition(item_rule_name, min_items, max_items, separator_rule='"," space') + ' space "]"') elif schema_type in (None, 'string') and 'pattern' in schema: return self._visit_pattern(schema['pattern'], rule_name) @@ -663,7 +663,7 @@ class SchemaConverter: min_len = schema.get('minLength', 0) max_len = schema.get('maxLength') - return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\"" space') + return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\""') elif schema_type in (None, 'integer') and \ ('minimum' in schema or 'exclusiveMinimum' in schema or 'maximum' in schema or 'exclusiveMaximum' in schema): @@ -680,7 +680,7 @@ class SchemaConverter: out = ["("] _generate_min_max_int(min_value, max_value, out) - out.append(") space") + out.append(")") return self._add_rule(rule_name, ''.join(out)) elif (schema_type == 'object') or (len(schema) == 0): @@ -765,7 +765,7 @@ class SchemaConverter: rule += ' )' rule += ' )?' - rule += ' "}" space' + rule += ' space "}"' return rule diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index 902a4c135a..30aa35e137 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -5022,14 +5022,14 @@ static void test_template_output_peg_parsers(bool detailed_debug) { tst.test("Hello, world!\nWhat's up?").tools({ special_function_tool }).expect(message_assist).expect_reconstruction().run(); tst.test( - "```json\n\"42\" \n```") + "```json\n\"42\"\n```") .reasoning_format(COMMON_REASONING_FORMAT_AUTO) .json_schema(const_schema) .expect_content(R"("42")") .run(); tst.test( - "\"42\" \n") + "\"42\"\n") .reasoning_format(COMMON_REASONING_FORMAT_AUTO) .json_schema(const_schema) .expect_content(R"("42")") diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp index b4362852c3..f095274cd1 100755 --- a/tests/test-json-schema-to-grammar.cpp +++ b/tests/test-json-schema-to-grammar.cpp @@ -92,7 +92,7 @@ static void test_all(const std::string & lang, std::function