JSON schema conversion: faster repetitions, min/maxLength for strings, cap number length (#6555)

* json: rename python schema converter to make import easier

* server: skip null json_schema / grammar fields

* json: deps management for primitive rules (+ allow null values)

* json: optimize repetitions for minItems/maxItems and regexps: `a{,3}` goes from `"a"? "a"? "a"?` (explosive combos) to `(a (a (a)?)?)?`

* grammars: add troubleshooting section to readme

* json: cap length of numbers to 15 digits before/after decimal point

(avoids infinite gen, e.g. "one third" -> `0.333333333333...`)

* json: unify all repetition code (w/ or w/o sep)

* json: support string minLength/maxLength

* server+json: update server/README w/ result_format

* nits

* json: fix type error w/ python 3.8

* json: fix server/README (json_schema in /completion vs. result_format in /v1/chat/completions)

* json: simplify DOT `{"type": "string", "pattern": "^.$"}`

* json: remove recursion in opt_repetitions (avoids Python stack overflow)

* json: rm dead code

* json: rm useless assert & ggml.h import
This commit is contained in:
Olivier Chafik 2024-04-12 19:43:38 +01:00 committed by GitHub
parent fbbc030ba9
commit ab9a3240a9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 2348 additions and 1929 deletions

View File

@ -11,35 +11,101 @@
using json = nlohmann::ordered_json; using json = nlohmann::ordered_json;
template <typename Iterator>
static std::string join(Iterator begin, Iterator end, const std::string & separator);
static std::string repeat(const std::string & str, size_t n);
// Builds a grammar expression that matches between `min_items` and `max_items`
// occurrences of `item_rule`.
//
//  - item_rule:            grammar expression for a single item.
//  - min_items:            number of mandatory items.
//  - max_items:            upper bound; std::numeric_limits<int>::max() means "unbounded".
//  - separator_rule:       optional expression emitted between consecutive items
//                          (empty string means no separator).
//  - item_rule_is_literal: when true, `item_rule` is a double-quoted literal; without a
//                          separator the mandatory repetitions are merged into one literal.
//
// Optional repetitions are emitted nested, e.g. up to 3 items -> `(a (a (a)?)?)?`,
// rather than as a flat `a? a? a?` sequence.
//
// If `max_items` is smaller than `min_items`, no optional part is emitted (only the
// mandatory items are returned) instead of passing a negative count to `repeat`.
static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "", bool item_rule_is_literal = false) {
    const bool has_separator = !separator_rule.empty();
    const bool unbounded = max_items == std::numeric_limits<int>::max();

    // Shortcuts for the two forms that have a dedicated postfix operator.
    if (!has_separator) {
        if (min_items == 0 && max_items == 1) {
            return item_rule + "?";
        }
        if (min_items == 1 && unbounded) {
            return item_rule + "+";
        }
    }

    // Mandatory part: `min_items` copies of the item.
    std::string result;
    if (min_items > 0) {
        if (item_rule_is_literal && !has_separator) {
            // Strip the surrounding quotes, repeat the payload, and re-quote it once.
            result = "\"" + repeat(std::string(item_rule.begin() + 1, item_rule.end() - 1), min_items) + "\"";
        } else {
            std::vector<std::string> items(min_items, item_rule);
            result = join(items.begin(), items.end(), has_separator ? " " + separator_rule + " " : " ");
        }
    }

    if (unbounded) {
        const std::string item_operator = "(" + (has_separator ? separator_rule + " " : "") + item_rule + ")";
        if (min_items == 0 && has_separator) {
            // The first item carries no separator, so the whole list is wrapped as optional.
            return "(" + item_rule + " " + item_operator + "*)?";
        }
        if (min_items > 0 && max_items != min_items) {
            result += " ";
        }
        return result + item_operator + "*";
    }

    // Computed in a wider type so that the subtraction cannot overflow `int`.
    const long long optional_count = static_cast<long long>(max_items) - min_items;
    if (optional_count <= 0) {
        // Nothing optional to add (also covers the inconsistent max_items < min_items case).
        return result;
    }

    // Every optional item except possibly the first one is preceded by the separator.
    const std::string sep_item = has_separator ? separator_rule + " " + item_rule : item_rule;
    // The first optional item only needs a separator when mandatory items precede it.
    const std::string & first_item = min_items > 0 ? sep_item : item_rule;

    if (min_items > 0) {
        result += " ";
    }
    result += "(" + first_item;
    if (optional_count > 1) {
        result += " " + repeat("(" + sep_item + " ", static_cast<size_t>(optional_count - 1));
        result.pop_back(); // strip trailing space
    }
    result += repeat(")?", static_cast<size_t>(optional_count));
    return result;
}
const std::string SPACE_RULE = "\" \"?"; const std::string SPACE_RULE = "\" \"?";
std::unordered_map<std::string, std::string> PRIMITIVE_RULES = { struct BuiltinRule {
{"boolean", "(\"true\" | \"false\") space"}, std::string content;
{"number", "(\"-\"? ([0-9] | [1-9] [0-9]*)) (\".\" [0-9]+)? ([eE] [-+]? [0-9]+)? space"}, std::vector<std::string> deps;
{"integer", "(\"-\"? ([0-9] | [1-9] [0-9]*)) space"},
{"value", "object | array | string | number | boolean"},
{"object", "\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space"},
{"array", "\"[\" space ( value (\",\" space value)* )? \"]\" space"},
{"uuid", "\"\\\"\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] \"\\\"\" space"},
{"string", " \"\\\"\" (\n"
" [^\"\\\\] |\n"
" \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])\n"
" )* \"\\\"\" space"},
{"null", "\"null\" space"}
}; };
std::vector<std::string> OBJECT_RULE_NAMES = {"object", "array", "string", "number", "boolean", "null", "value"};
std::unordered_map<std::string, std::string> DATE_RULES = { const std::string _up_to_15_digits = build_repetition("[0-9]", 0, 15);
{"date", "[0-9] [0-9] [0-9] [0-9] \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )"},
{"time", "([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9] [0-9] [0-9] )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )"}, std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
{"date-time", "date \"T\" time"}, {"boolean", {"(\"true\" | \"false\") space", {}}},
{"date-string", "\"\\\"\" date \"\\\"\" space"}, {"decimal-part", {"[0-9] " + _up_to_15_digits, {}}},
{"time-string", "\"\\\"\" time \"\\\"\" space"}, {"integral-part", {"[0-9] | [1-9] " + _up_to_15_digits, {}}},
{"date-time-string", "\"\\\"\" date-time \"\\\"\" space"} {"number", {"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)? space", {"integral-part", "decimal-part"}}},
{"integer", {"(\"-\"? integral-part) space", {"integral-part"}}},
{"value", {"object | array | string | number | boolean | null", {"object", "array", "string", "number", "boolean", "null"}}},
{"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}},
{"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}},
{"uuid", {"\"\\\"\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] \"\\\"\" space", {}}},
{"char", {"[^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])", {}}},
{"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}},
{"null", {"\"null\" space", {}}},
};
std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
{"date", {"[0-9] [0-9] [0-9] [0-9] \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
{"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9] [0-9] [0-9] )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
{"date-time", {"date \"T\" time", {"date", "time"}}},
{"date-string", {"\"\\\"\" date \"\\\"\" space", {"date"}}},
{"time-string", {"\"\\\"\" time \"\\\"\" space", {"time"}}},
{"date-time-string", {"\"\\\"\" date-time \"\\\"\" space", {"date-time"}}}
}; };
static bool is_reserved_name(const std::string & name) { static bool is_reserved_name(const std::string & name) {
@ -47,7 +113,7 @@ static bool is_reserved_name(const std::string & name) {
if (RESERVED_NAMES.empty()) { if (RESERVED_NAMES.empty()) {
RESERVED_NAMES.insert("root"); RESERVED_NAMES.insert("root");
for (const auto &p : PRIMITIVE_RULES) RESERVED_NAMES.insert(p.first); for (const auto &p : PRIMITIVE_RULES) RESERVED_NAMES.insert(p.first);
for (const auto &p : DATE_RULES) RESERVED_NAMES.insert(p.first); for (const auto &p : STRING_FORMAT_RULES) RESERVED_NAMES.insert(p.first);
} }
return RESERVED_NAMES.find(name) != RESERVED_NAMES.end(); return RESERVED_NAMES.find(name) != RESERVED_NAMES.end();
} }
@ -192,7 +258,7 @@ private:
if (_dotall) { if (_dotall) {
rule = "[\\U00000000-\\U0010FFFF]"; rule = "[\\U00000000-\\U0010FFFF]";
} else { } else {
rule = "[\\U00000000-\\x09\\x0B\\x0C\\x0E-\\U0010FFFF]"; rule = "[^\\x0A\\x0D]";
} }
return _add_rule("dot", rule); return _add_rule("dot", rule);
}; };
@ -308,13 +374,6 @@ private:
auto &sub = last.first; auto &sub = last.first;
auto sub_is_literal = last.second; auto sub_is_literal = last.second;
if (min_times == 0 && max_times == std::numeric_limits<int>::max()) {
sub += "*";
} else if (min_times == 0 && max_times == 1) {
sub += "?";
} else if (min_times == 1 && max_times == std::numeric_limits<int>::max()) {
sub += "+";
} else {
if (!sub_is_literal) { if (!sub_is_literal) {
std::string & sub_id = sub_rule_ids[sub]; std::string & sub_id = sub_rule_ids[sub];
if (sub_id.empty()) { if (sub_id.empty()) {
@ -322,33 +381,14 @@ private:
} }
sub = sub_id; sub = sub_id;
} }
std::string result; seq.back().first = build_repetition(
if (sub_is_literal && min_times > 0) { sub_is_literal ? "\"" + sub + "\"" : sub,
result = "\"" + repeat(sub.substr(1, sub.length() - 2), min_times) + "\""; min_times,
} else { max_times,
for (int j = 0; j < min_times; j++) { "",
if (j > 0) { sub_is_literal
result += " "; );
}
result += sub;
}
}
if (min_times > 0 && min_times < max_times) {
result += " ";
}
if (max_times == std::numeric_limits<int>::max()) {
result += sub + "*";
} else {
for (int j = min_times; j < max_times; j++) {
if (j > min_times) {
result += " ";
}
result += sub + "?";
}
}
seq.back().first = result;
seq.back().second = false; seq.back().second = false;
}
} else { } else {
std::string literal; std::string literal;
auto is_non_literal = [&](char c) { auto is_non_literal = [&](char c) {
@ -424,7 +464,7 @@ private:
if (additional_properties.is_object() || (additional_properties.is_boolean() && additional_properties.get<bool>())) { if (additional_properties.is_object() || (additional_properties.is_boolean() && additional_properties.get<bool>())) {
std::string sub_name = name + (name.empty() ? "" : "-") + "additional"; std::string sub_name = name + (name.empty() ? "" : "-") + "additional";
std::string value_rule = visit(additional_properties.is_object() ? additional_properties : json::object(), sub_name + "-value"); std::string value_rule = visit(additional_properties.is_object() ? additional_properties : json::object(), sub_name + "-value");
std::string kv_rule = _add_rule(sub_name + "-kv", _add_rule("string", PRIMITIVE_RULES.at("string")) + " \":\" space " + value_rule); std::string kv_rule = _add_rule(sub_name + "-kv", _add_primitive("string", PRIMITIVE_RULES.at("string")) + " \":\" space " + value_rule);
prop_kv_rule_names["*"] = kv_rule; prop_kv_rule_names["*"] = kv_rule;
optional_props.push_back("*"); optional_props.push_back("*");
} }
@ -486,6 +526,25 @@ private:
return rule; return rule;
} }
// Registers the builtin rule `rule` under `name` via _add_rule, then makes sure every
// rule listed in `rule.deps` is registered as well (recursively).
//
// Dependencies are looked up first in PRIMITIVE_RULES, then in STRING_FORMAT_RULES.
// An unknown dependency is reported through `_errors` and skipped; a dependency whose
// name is already present in `_rules` is not added again.
//
// Returns the name returned by _add_rule for `name`.
std::string _add_primitive(const std::string & name, const BuiltinRule & rule) {
    auto n = _add_rule(name, rule.content);

    for (const auto & dep : rule.deps) {
        auto it = PRIMITIVE_RULES.find(dep);
        if (it == PRIMITIVE_RULES.end()) {
            it = STRING_FORMAT_RULES.find(dep);
            if (it == STRING_FORMAT_RULES.end()) {
                _errors.push_back("Rule " + dep + " not known");
                continue;
            }
        }
        // NOTE(review): presumably this check is also what stops mutually dependent
        // rules (e.g. value <-> object) from recursing forever — confirm in _add_rule.
        if (_rules.find(dep) == _rules.end()) {
            _add_primitive(dep, it->second);
        }
    }
    return n;
}
public: public:
SchemaConverter( SchemaConverter(
const std::function<json(const std::string &)> & fetch_json, const std::function<json(const std::string &)> & fetch_json,
@ -647,49 +706,33 @@ public:
return _add_rule(rule_name, rule); return _add_rule(rule_name, rule);
} else { } else {
std::string item_rule_name = visit(items, name + (name.empty() ? "" : "-") + "item"); std::string item_rule_name = visit(items, name + (name.empty() ? "" : "-") + "item");
std::string list_item_operator = "( \",\" space " + item_rule_name + " )";
std::string successive_items;
int min_items = schema.contains("minItems") ? schema["minItems"].get<int>() : 0; int min_items = schema.contains("minItems") ? schema["minItems"].get<int>() : 0;
json max_items_json = schema.contains("maxItems") ? schema["maxItems"] : json(); json max_items_json = schema.contains("maxItems") ? schema["maxItems"] : json();
int max_items = max_items_json.is_number_integer() ? max_items_json.get<int>() : -1; int max_items = max_items_json.is_number_integer() ? max_items_json.get<int>() : std::numeric_limits<int>::max();
if (min_items > 0) {
successive_items += repeat(list_item_operator, min_items - 1); return _add_rule(rule_name, "\"[\" space " + build_repetition(item_rule_name, min_items, max_items, "\",\" space") + " \"]\" space");
min_items--;
}
if (max_items >= 0 && max_items > min_items) {
successive_items += repeat(list_item_operator + "?", max_items - min_items - 1);
} else {
successive_items += list_item_operator + "*";
}
std::string rule;
if (min_items == 0) {
rule = "\"[\" space ( " + item_rule_name + " " + successive_items + " )? \"]\" space";
} else {
rule = "\"[\" space " + item_rule_name + " " + successive_items + " \"]\" space";
}
return _add_rule(rule_name, rule);
} }
} else if ((schema_type.is_null() || schema_type == "string") && schema.contains("pattern")) { } else if ((schema_type.is_null() || schema_type == "string") && schema.contains("pattern")) {
return _visit_pattern(schema["pattern"], rule_name); return _visit_pattern(schema["pattern"], rule_name);
} else if ((schema_type.is_null() || schema_type == "string") && std::regex_match(schema_format, std::regex("^uuid[1-5]?$"))) { } else if ((schema_type.is_null() || schema_type == "string") && std::regex_match(schema_format, std::regex("^uuid[1-5]?$"))) {
return _add_rule(rule_name == "root" ? "root" : schema_format, PRIMITIVE_RULES.at("uuid")); return _add_primitive(rule_name == "root" ? "root" : schema_format, PRIMITIVE_RULES.at("uuid"));
} else if ((schema_type.is_null() || schema_type == "string") && DATE_RULES.find(schema_format) != DATE_RULES.end()) { } else if ((schema_type.is_null() || schema_type == "string") && STRING_FORMAT_RULES.find(schema_format + "-string") != STRING_FORMAT_RULES.end()) {
for (const auto & kv : DATE_RULES) { auto prim_name = schema_format + "-string";
_add_rule(kv.first, kv.second); return _add_rule(rule_name, _add_primitive(prim_name, STRING_FORMAT_RULES.at(prim_name)));
} } else if (schema_type == "string" && (schema.contains("minLength") || schema.contains("maxLength"))) {
return schema_format + "-string"; std::string char_rule = _add_primitive("char", PRIMITIVE_RULES.at("char"));
int min_len = schema.contains("minLength") ? schema["minLength"].get<int>() : 0;
int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
} else if (schema.empty() || schema_type == "object") { } else if (schema.empty() || schema_type == "object") {
for (const auto & n : OBJECT_RULE_NAMES) { return _add_rule(rule_name, _add_primitive("object", PRIMITIVE_RULES.at("object")));
_add_rule(n, PRIMITIVE_RULES.at(n));
}
return _add_rule(rule_name, "object");
} else { } else {
if (!schema_type.is_string() || PRIMITIVE_RULES.find(schema_type.get<std::string>()) == PRIMITIVE_RULES.end()) { if (!schema_type.is_string() || PRIMITIVE_RULES.find(schema_type.get<std::string>()) == PRIMITIVE_RULES.end()) {
_errors.push_back("Unrecognized schema: " + schema.dump()); _errors.push_back("Unrecognized schema: " + schema.dump());
return ""; return "";
} }
// TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero // TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
return _add_rule(rule_name == "root" ? "root" : schema_type.get<std::string>(), PRIMITIVE_RULES.at(schema_type.get<std::string>())); return _add_primitive(rule_name == "root" ? "root" : schema_type.get<std::string>(), PRIMITIVE_RULES.at(schema_type.get<std::string>()));
} }
} }

View File

@ -6,37 +6,94 @@ import re
import sys import sys
from typing import Any, Dict, List, Set, Tuple, Union from typing import Any, Dict, List, Set, Tuple, Union
def _build_repetition(item_rule, min_items, max_items, separator_rule=None, item_rule_is_literal=False):
if not separator_rule:
if min_items == 0 and max_items == 1:
return f'{item_rule}?'
elif min_items == 1 and max_items is None:
return f'{item_rule}+'
result = ''
if min_items > 0:
if item_rule_is_literal and separator_rule is None:
result = '"' + (item_rule[1:-1] * min_items) + '"'
else:
result = (f' {separator_rule} ' if separator_rule else ' ').join([item_rule] * min_items)
def opt_repetitions(up_to_n, prefix_with_sep=False):
'''
- n=4, no sep: '(a (a (a (a)?)?)?)?'
- n=4, sep=',', prefix: '("," a ("," a ("," a ("," a)?)?)?)?'
- n=4, sep=',', no prefix: '(a ("," a ("," a ("," a)?)?)?)?'
'''
content = f'{separator_rule} {item_rule}' if prefix_with_sep and separator_rule else item_rule
if up_to_n == 0:
return ''
elif up_to_n == 1:
return f'({content})?'
elif separator_rule and not prefix_with_sep:
return f'({content} {opt_repetitions(up_to_n - 1, prefix_with_sep=True)})?'
else:
return (f'({content} ' * up_to_n).rstrip() + (')?' * up_to_n)
if min_items > 0 and max_items != min_items:
result += ' '
if max_items is not None:
result += opt_repetitions(max_items - min_items, prefix_with_sep=min_items > 0)
else:
item_operator = f'({separator_rule + " " if separator_rule else ""}{item_rule})'
if min_items == 0 and separator_rule:
result = f'({item_rule} {item_operator}*)?'
else:
result += f'{item_operator}*'
return result
class BuiltinRule:
    '''A builtin grammar rule: its body plus the names of the rules that body references.'''

    def __init__(self, content: str, deps: list = None):
        # Fall back to a fresh list when no (or an empty) dependency list is given.
        dependencies = deps if deps else []
        self.content = content
        self.deps = dependencies
_up_to_15_digits = _build_repetition('[0-9]', 0, 15)
# whitespace is constrained to a single space char to prevent model "running away" in # whitespace is constrained to a single space char to prevent model "running away" in
# whitespace. Also maybe improves generation quality? # whitespace. Also maybe improves generation quality?
SPACE_RULE = '" "?' SPACE_RULE = '" "?'
PRIMITIVE_RULES = { PRIMITIVE_RULES = {
'boolean': '("true" | "false") space', 'boolean' : BuiltinRule('("true" | "false") space', []),
'number': '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space', 'decimal-part' : BuiltinRule('[0-9] ' + _up_to_15_digits, []),
'integer': '("-"? ([0-9] | [1-9] [0-9]*)) space', 'integral-part': BuiltinRule('[0-9] | [1-9] ' + _up_to_15_digits, []),
'value' : 'object | array | string | number | boolean', 'number' : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']),
'object' : '"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', 'integer' : BuiltinRule('("-"? integral-part) space', ['integral-part']),
'array' : '"[" space ( value ("," space value)* )? "]" space', 'value' : BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']),
'uuid' : '"\\"" ' + ' "-" '.join('[0-9a-fA-F]' * n for n in [8, 4, 4, 4, 12]) + ' "\\"" space', 'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
'string': r''' "\"" ( 'array' : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
[^"\\] | 'uuid' : BuiltinRule(r'"\"" ' + ' "-" '.join('[0-9a-fA-F]' * n for n in [8, 4, 4, 4, 12]) + r' "\"" space', []),
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) 'char' : BuiltinRule(r'[^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])', []),
)* "\"" space''', 'string' : BuiltinRule(r'"\"" char* "\"" space', ['char']),
'null': '"null" space', 'null' : BuiltinRule('"null" space', []),
} }
OBJECT_RULE_NAMES = ['object', 'array', 'string', 'number', 'boolean', 'null', 'value']
# TODO: support "uri", "email" string formats # TODO: support "uri", "email" string formats
DATE_RULES = { STRING_FORMAT_RULES = {
'date' : '[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', 'date' : BuiltinRule('[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
'time' : '([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', 'time' : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
'date-time': 'date "T" time', 'date-time' : BuiltinRule('date "T" time', ['date', 'time']),
'date-string': '"\\"" date "\\"" space', 'date-string' : BuiltinRule('"\\"" date "\\"" space', ['date']),
'time-string': '"\\"" time "\\"" space', 'time-string' : BuiltinRule('"\\"" time "\\"" space', ['time']),
'date-time-string': '"\\"" date-time "\\"" space', 'date-time-string': BuiltinRule('"\\"" date-time "\\"" space', ['date-time']),
} }
RESERVED_NAMES = set(["root", *PRIMITIVE_RULES.keys(), *DATE_RULES.keys()]) DOTALL = '[\\U00000000-\\U0010FFFF]'
DOT = '[^\\x0A\\x0D]'
RESERVED_NAMES = set(["root", "dot", *PRIMITIVE_RULES.keys(), *STRING_FORMAT_RULES.keys()])
INVALID_RULE_CHARS_RE = re.compile(r'[^a-zA-Z0-9-]+') INVALID_RULE_CHARS_RE = re.compile(r'[^a-zA-Z0-9-]+')
GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"]') GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"]')
@ -46,8 +103,6 @@ GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']'
NON_LITERAL_SET = set('|.()[]{}*+?') NON_LITERAL_SET = set('|.()[]{}*+?')
ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('[]()|{}*+?') ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('[]()|{}*+?')
DATE_PATTERN = '[0-9]{4}-(0[1-9]|1[0-2])-([0-2][0-9]|3[0-1])'
TIME_PATTERN = '([01][0-9]|2[0-3])(:[0-5][0-9]){2}(\\.[0-9]{1,3})?(Z|[+-](([01][0-9]|2[0-3]):[0-5][0-9]))' # Cap millisecond precision w/ 3 digits
class SchemaConverter: class SchemaConverter:
def __init__(self, *, prop_order, allow_fetch, dotall, raw_pattern): def __init__(self, *, prop_order, allow_fetch, dotall, raw_pattern):
@ -55,7 +110,9 @@ class SchemaConverter:
self._allow_fetch = allow_fetch self._allow_fetch = allow_fetch
self._dotall = dotall self._dotall = dotall
self._raw_pattern = raw_pattern self._raw_pattern = raw_pattern
self._rules = {'space': SPACE_RULE} self._rules = {
'space': SPACE_RULE,
}
self._refs = {} self._refs = {}
self._refs_being_resolved = set() self._refs_being_resolved = set()
@ -65,6 +122,29 @@ class SchemaConverter:
) )
return f'"{escaped}"' return f'"{escaped}"'
def not_literal(self, literal: str, dotall: bool = True, maybe_escaped_underscores = False) -> str:
    '''
        Build a nested grammar expression from the characters of `literal`: at each
        position it offers either "any char but this one", or this char followed by
        an optional continuation for the next position.

        not_literal('a') -> '([^a])'
        not_literal('abc') -> '([^a] | "a" ([^b] | "b" ([^c])?)?)'

        With maybe_escaped_underscores=True, an underscore additionally excludes a
        backslash from the negated class and accepts an optionally backslash-escaped
        underscore as an alternative.

        NOTE(review): `dotall` is accepted but not used by this method.
    '''
    assert len(literal) > 0, 'Empty literal not supported'

    # Built iteratively: one nesting level is opened per character (except the last),
    # and all of them are closed at the end, so long literals do not hit the
    # interpreter's recursion limit.
    last_index = len(literal) - 1
    parts = ['(']
    for i, c in enumerate(literal):
        if maybe_escaped_underscores and c == '_':
            parts.append(f'[^{c}\\\\]')
            parts.append(' | ')
            parts.append(f'"\\\\"? "{c}"')
        else:
            parts.append(f'[^{c}]')
        if i < last_index:
            parts.append(' | ')
            parts.append(self._format_literal(c))
            parts.append(' (')
    parts.append(')?' * last_index)
    parts.append(')')
    return ''.join(parts)
def _add_rule(self, name, rule): def _add_rule(self, name, rule):
esc_name = INVALID_RULE_CHARS_RE.sub('-', name) esc_name = INVALID_RULE_CHARS_RE.sub('-', name)
if esc_name not in self._rules or self._rules[esc_name] == rule: if esc_name not in self._rules or self._rules[esc_name] == rule:
@ -169,10 +249,10 @@ class SchemaConverter:
def get_dot(): def get_dot():
if self._dotall: if self._dotall:
rule = '[\\U00000000-\\U0010FFFF]' rule = DOTALL
else: else:
# Accept any character... except \n and \r line break chars (\x0A and \xOD) # Accept any character... except \n and \r line break chars (\x0A and \xOD)
rule = '[\\U00000000-\\x09\\x0B\\x0C\\x0E-\\U0010FFFF]' rule = DOT
return self._add_rule(f'dot', rule) return self._add_rule(f'dot', rule)
def join_seq(): def join_seq():
@ -246,13 +326,6 @@ class SchemaConverter:
(sub, sub_is_literal) = seq[-1] (sub, sub_is_literal) = seq[-1]
if min_times == 0 and max_times is None:
seq[-1] = (f'{sub}*', False)
elif min_times == 0 and max_times == 1:
seq[-1] = (f'{sub}?', False)
elif min_times == 1 and max_times is None:
seq[-1] = (f'{sub}+', False)
else:
if not sub_is_literal: if not sub_is_literal:
id = sub_rule_ids.get(sub) id = sub_rule_ids.get(sub)
if id is None: if id is None:
@ -260,12 +333,7 @@ class SchemaConverter:
sub_rule_ids[sub] = id sub_rule_ids[sub] = id
sub = id sub = id
seq[-1] = ( seq[-1] = (_build_repetition(f'"{sub}"' if sub_is_literal else sub, min_times, max_times, item_rule_is_literal=sub_is_literal), False)
' '.join(
([f'"{sub[1:-1] * min_times}"'] if sub_is_literal else [sub] * min_times) +
([f'{sub}?'] * (max_times - min_times) if max_times is not None else [f'{sub}*'])),
False
)
else: else:
literal = '' literal = ''
while i < length: while i < length:
@ -373,49 +441,47 @@ class SchemaConverter:
' "]" space') ' "]" space')
else: else:
item_rule_name = self.visit(items, f'{name}{"-" if name else ""}item') item_rule_name = self.visit(items, f'{name}{"-" if name else ""}item')
list_item_operator = f'( "," space {item_rule_name} )'
successive_items = ""
min_items = schema.get("minItems", 0) min_items = schema.get("minItems", 0)
max_items = schema.get("maxItems") max_items = schema.get("maxItems")
if min_items > 0: return self._add_rule(rule_name, '"[" space ' + _build_repetition(item_rule_name, min_items, max_items, separator_rule='"," space') + ' "]" space')
successive_items = list_item_operator * (min_items - 1)
min_items -= 1
if max_items is not None and max_items > min_items:
successive_items += (list_item_operator + "?") * (max_items - min_items - 1)
else:
successive_items += list_item_operator + "*"
if min_items == 0:
rule = f'"[" space ( {item_rule_name} {successive_items} )? "]" space'
else:
rule = f'"[" space {item_rule_name} {successive_items} "]" space'
return self._add_rule(rule_name, rule)
elif schema_type in (None, 'string') and 'pattern' in schema: elif schema_type in (None, 'string') and 'pattern' in schema:
return self._visit_pattern(schema['pattern'], rule_name) return self._visit_pattern(schema['pattern'], rule_name)
elif schema_type in (None, 'string') and re.match(r'^uuid[1-5]?$', schema_format or ''): elif schema_type in (None, 'string') and re.match(r'^uuid[1-5]?$', schema_format or ''):
return self._add_rule( return self._add_primitive(
'root' if rule_name == 'root' else schema_format, 'root' if rule_name == 'root' else schema_format,
PRIMITIVE_RULES['uuid'] PRIMITIVE_RULES['uuid']
) )
elif schema_type in (None, 'string') and schema_format in DATE_RULES: elif schema_type in (None, 'string') and f'{schema_format}-string' in STRING_FORMAT_RULES:
for t, r in DATE_RULES.items(): prim_name = f'{schema_format}-string'
self._add_rule(t, r) return self._add_rule(rule_name, self._add_primitive(prim_name, STRING_FORMAT_RULES[prim_name]))
return schema_format + '-string'
elif schema_type == 'string' and ('minLength' in schema or 'maxLength' in schema):
char_rule = self._add_primitive('char', PRIMITIVE_RULES['char'])
min_len = schema.get('minLength', 0)
max_len = schema.get('maxLength')
return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\"" space')
elif (schema_type == 'object') or (len(schema) == 0): elif (schema_type == 'object') or (len(schema) == 0):
for n in OBJECT_RULE_NAMES: return self._add_rule(rule_name, self._add_primitive('object', PRIMITIVE_RULES['object']))
self._add_rule(n, PRIMITIVE_RULES[n])
return self._add_rule(rule_name, 'object')
else: else:
assert schema_type in PRIMITIVE_RULES, f'Unrecognized schema: {schema}' assert schema_type in PRIMITIVE_RULES, f'Unrecognized schema: {schema}'
# TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero # TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
return self._add_rule( return self._add_primitive('root' if rule_name == 'root' else schema_type, PRIMITIVE_RULES[schema_type])
'root' if rule_name == 'root' else schema_type,
PRIMITIVE_RULES[schema_type] def _add_primitive(self, name: str, rule: BuiltinRule):
) n = self._add_rule(name, rule.content)
for dep in rule.deps:
dep_rule = PRIMITIVE_RULES.get(dep) or STRING_FORMAT_RULES.get(dep)
assert dep_rule, f'Rule {dep} not known'
if dep not in self._rules:
self._add_primitive(dep, dep_rule)
return n
def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], name: str, additional_properties: Union[bool, Any]): def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], name: str, additional_properties: Union[bool, Any]):
prop_order = self._prop_order prop_order = self._prop_order
@ -437,7 +503,7 @@ class SchemaConverter:
value_rule = self.visit({} if additional_properties == True else additional_properties, f'{sub_name}-value') value_rule = self.visit({} if additional_properties == True else additional_properties, f'{sub_name}-value')
prop_kv_rule_names["*"] = self._add_rule( prop_kv_rule_names["*"] = self._add_rule(
f'{sub_name}-kv', f'{sub_name}-kv',
self._add_rule('string', PRIMITIVE_RULES['string']) + f' ":" space {value_rule}' self._add_primitive('string', PRIMITIVE_RULES['string']) + f' ":" space {value_rule}'
) )
optional_props.append("*") optional_props.append("*")

View File

@ -8,7 +8,7 @@ print(subprocess.check_output(
"python", "python",
os.path.join( os.path.join(
os.path.dirname(os.path.realpath(__file__)), os.path.dirname(os.path.realpath(__file__)),
"json-schema-to-grammar.py"), "json_schema_to_grammar.py"),
*rest, *rest,
"-", "-",
"--raw-pattern", "--raw-pattern",

View File

@ -11,6 +11,7 @@ Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
* Continuous batching * Continuous batching
* Multimodal (wip) * Multimodal (wip)
* Monitoring endpoints * Monitoring endpoints
* Schema-constrained JSON response format
The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggerganov/llama.cpp/issues/4216). The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggerganov/llama.cpp/issues/4216).
@ -250,6 +251,8 @@ node index.js
`grammar`: Set grammar for grammar-based sampling. Default: no grammar `grammar`: Set grammar for grammar-based sampling. Default: no grammar
`json_schema`: Set a JSON schema for grammar-based sampling (e.g. `{"items": {"type": "string"}, "minItems": 10, "maxItems": 100}` for a list of strings, or `{}` for any JSON). See [tests](../../tests/test-json-schema-to-grammar.cpp) for supported features. Default: no JSON schema.
`seed`: Set the random number generator (RNG) seed. Default: `-1`, which is a random seed. `seed`: Set the random number generator (RNG) seed. Default: `-1`, which is a random seed.
`ignore_eos`: Ignore end of stream token and continue generating. Default: `false` `ignore_eos`: Ignore end of stream token and continue generating. Default: `false`
@ -365,6 +368,8 @@ Notice that each `probs` is an array of length `n_probs`.
See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such as `mirostat` are supported. See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such as `mirostat` are supported.
The `response_format` parameter supports both plain JSON output (e.g. `{"type": "json_object"}`) and schema-constrained JSON (e.g. `{"type": "json_object", "schema": {"type": "string", "minLength": 10, "maxLength": 100}}`), similar to other OpenAI-inspired API providers.
*Examples:* *Examples:*
You can use either Python `openai` library with appropriate checkpoints: You can use either Python `openai` library with appropriate checkpoints:

File diff suppressed because it is too large Load Diff

View File

@ -1,33 +1,95 @@
// WARNING: This file was ported from json-schema-to-grammar.py, please fix bugs / add features there first. // WARNING: This file was ported from json_schema_to_grammar.py, please fix bugs / add features there first.
const SPACE_RULE = '" "?'; const SPACE_RULE = '" "?';
/**
 * Builds a grammar expression that repeats `itemRule` between `minItems` and
 * `maxItems` times, optionally interleaving `separatorRule` between items.
 *
 * Bounded optional repetitions are emitted in nested form, e.g.
 * `(x (x (x)?)?)?`, rather than the flat form `x? x? x?`.
 *
 * @param {string} itemRule - Rule name or literal to repeat.
 * @param {number} minItems - Minimum number of repetitions.
 * @param {number|undefined} maxItems - Maximum number of repetitions;
 *   `undefined` or `Infinity` means unbounded.
 * @param {{separatorRule?: string, itemRuleIsLiteral?: boolean}} [opts]
 *   `separatorRule`: rule placed between consecutive items (default: none).
 *   `itemRuleIsLiteral`: `itemRule` is a quoted literal, so the mandatory
 *   repetitions can be merged into one literal when there is no separator.
 * @returns {string} The grammar expression.
 */
function _buildRepetition(itemRule, minItems, maxItems, opts={}) {
  const separatorRule = opts.separatorRule ?? '';
  const itemRuleIsLiteral = opts.itemRuleIsLiteral ?? false;
  const hasSeparator = separatorRule !== '';

  // Callers may express "no upper bound" as Infinity. Treat it like undefined,
  // otherwise Array.from({ length: Infinity }) below throws a RangeError.
  const maxCount = maxItems === Infinity ? undefined : maxItems;

  // Shortcuts for the plain `?` and `+` operators.
  if (!hasSeparator) {
    if (minItems === 0 && maxCount === 1) {
      return `${itemRule}?`;
    } else if (minItems === 1 && maxCount === undefined) {
      return `${itemRule}+`;
    }
  }

  // Mandatory part: exactly minItems occurrences.
  let result = '';
  if (minItems > 0) {
    if (itemRuleIsLiteral && !hasSeparator) {
      // Strip the surrounding quotes, repeat the content, and re-quote it.
      result = `"${itemRule.slice(1, -1).repeat(minItems)}"`;
    } else {
      result = Array.from({ length: minItems }, () => itemRule)
        .join(hasSeparator ? ` ${separatorRule} ` : ' ');
    }
  }

  // Optional part: up to `upToN` further occurrences, nested.
  const optRepetitions = (upToN, prefixWithSep=false) => {
    const content = hasSeparator && prefixWithSep ? `${separatorRule} ${itemRule}` : itemRule;
    if (upToN === 0) {
      return '';
    } else if (upToN === 1) {
      return `(${content})?`;
    } else if (hasSeparator && !prefixWithSep) {
      // The first optional item has no leading separator; all later ones do.
      return `(${content} ${optRepetitions(upToN - 1, true)})?`;
    } else {
      return Array.from({ length: upToN }, () => `(${content}`).join(' ').trim() + Array.from({ length: upToN }, () => ')?').join('');
    }
  };

  if (minItems > 0 && maxCount !== minItems) {
    result += ' ';
  }

  if (maxCount !== undefined) {
    result += optRepetitions(maxCount - minItems, minItems > 0);
  } else {
    const itemOperator = `(${hasSeparator ? separatorRule + ' ' : ''}${itemRule})`;
    if (minItems === 0 && hasSeparator) {
      result = `(${itemRule} ${itemOperator}*)?`;
    } else {
      result += `${itemOperator}*`;
    }
  }

  return result;
}
/**
 * A built-in grammar rule: its textual definition (`content`) plus the names
 * of the other rules it refers to (`deps`).
 */
class BuiltinRule {
  /**
   * @param {string} content - Grammar text of the rule.
   * @param {string[]} [deps] - Names of rules referenced by `content`;
   *   any falsy value is stored as an empty array.
   */
  constructor(content, deps) {
    const dependencies = deps ? deps : [];
    Object.assign(this, { content, deps: dependencies });
  }
}
const UP_TO_15_DIGITS = _buildRepetition('[0-9]', 0, 15);
const PRIMITIVE_RULES = { const PRIMITIVE_RULES = {
boolean: '("true" | "false") space', boolean : new BuiltinRule('("true" | "false") space', []),
number: '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space', 'decimal-part' : new BuiltinRule('[0-9] ' + UP_TO_15_DIGITS, []),
integer: '("-"? ([0-9] | [1-9] [0-9]*)) space', 'integral-part': new BuiltinRule('[0-9] | [1-9] ' + UP_TO_15_DIGITS, []),
value: 'object | array | string | number | boolean', number : new BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']),
object: '"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', integer : new BuiltinRule('("-"? integral-part) space', ['integral-part']),
array: '"[" space ( value ("," space value)* )? "]" space', value : new BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']),
uuid: '"\\"" ' + [8, 4, 4, 4, 12].map(n => [...new Array(n)].map(_ => '[0-9a-fA-F]').join('')).join(' "-" ') + ' "\\"" space', object : new BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
string: ` "\\"" ( array : new BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
[^"\\\\] | uuid : new BuiltinRule('"\\"" ' + [8, 4, 4, 4, 12].map(n => [...new Array(n)].map(_ => '[0-9a-fA-F]').join('')).join(' "-" ') + ' "\\"" space', []),
"\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) char : new BuiltinRule(`[^"\\\\] | "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])`, []),
)* "\\"" space`, string : new BuiltinRule(`"\\"" char* "\\"" space`, ['char']),
null: '"null" space', null : new BuiltinRule('"null" space', []),
}; };
const OBJECT_RULE_NAMES = ['object', 'array', 'string', 'number', 'boolean', 'null', 'value'];
// TODO: support "uri", "email" string formats // TODO: support "uri", "email" string formats
const DATE_RULES = { const STRING_FORMAT_RULES = {
'date' : '[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', 'date' : new BuiltinRule('[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
'time' : '([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', 'time' : new BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
'date-time': 'date "T" time', 'date-time' : new BuiltinRule('date "T" time', ['date', 'time']),
'date-string': '"\\"" date "\\"" space', 'date-string' : new BuiltinRule('"\\"" date "\\"" space', ['date']),
'time-string': '"\\"" time "\\"" space', 'time-string' : new BuiltinRule('"\\"" time "\\"" space', ['time']),
'date-time-string': '"\\"" date-time "\\"" space', 'date-time-string': new BuiltinRule('"\\"" date-time "\\"" space', ['date-time']),
}; }
const RESERVED_NAMES = {'root': true, ...PRIMITIVE_RULES, ...DATE_RULES}; const RESERVED_NAMES = {'root': true, ...PRIMITIVE_RULES, ...STRING_FORMAT_RULES};
const INVALID_RULE_CHARS_RE = /[^\dA-Za-z-]+/g; const INVALID_RULE_CHARS_RE = /[^\dA-Za-z-]+/g;
const GRAMMAR_LITERAL_ESCAPE_RE = /[\n\r"]/g; const GRAMMAR_LITERAL_ESCAPE_RE = /[\n\r"]/g;
@ -158,7 +220,7 @@ export class SchemaConverter {
rule = '[\\U00000000-\\U0010FFFF]'; rule = '[\\U00000000-\\U0010FFFF]';
} else { } else {
// Accept any character... except \n and \r line break chars (\x0A and \x0D) // Accept any character... except \n and \r line break chars (\x0A and \x0D)
rule = '[\\U00000000-\\x09\\x0B\\x0C\\x0E-\\U0010FFFF]'; rule = '[^\\x0A\\x0D]';
} }
return this._addRule('dot', rule); return this._addRule('dot', rule);
}; };
@ -259,13 +321,6 @@ export class SchemaConverter {
let [sub, subIsLiteral] = seq[seq.length - 1]; let [sub, subIsLiteral] = seq[seq.length - 1];
if (minTimes === 0 && maxTimes === Infinity) {
seq[seq.length - 1] = [`${sub}*`, false];
} else if (minTimes === 0 && maxTimes === 1) {
seq[seq.length - 1] = [`${sub}?`, false];
} else if (minTimes === 1 && maxTimes === Infinity) {
seq[seq.length - 1] = [`${sub}+`, false];
} else {
if (!subIsLiteral) { if (!subIsLiteral) {
let id = subRuleIds[sub]; let id = subRuleIds[sub];
if (id === undefined) { if (id === undefined) {
@ -275,10 +330,10 @@ export class SchemaConverter {
sub = id; sub = id;
} }
const repeatedSub = Array.from({ length: minTimes }, () => subIsLiteral ? `"${sub.slice(1, -1).repeat(minTimes)}"` : sub); seq[seq.length - 1] = [
const optionalSub = maxTimes !== undefined ? Array.from({ length: maxTimes - minTimes }, () => `${sub}?`) : [`${sub}*`]; _buildRepetition(subIsLiteral ? `"${sub}"` : sub, minTimes, maxTimes, {itemRuleIsLiteral: subIsLiteral}),
seq[seq.length - 1] = [repeatedSub.concat(optionalSub).join(' '), false]; false
} ];
} else { } else {
let literal = ''; let literal = '';
while (i < length) { while (i < length) {
@ -394,49 +449,50 @@ export class SchemaConverter {
); );
} else { } else {
const itemRuleName = this.visit(items, `${name ?? ''}${name ? '-' : ''}item`); const itemRuleName = this.visit(items, `${name ?? ''}${name ? '-' : ''}item`);
const listItemOperator = `( "," space ${itemRuleName} )`; const minItems = schema.minItems || 0;
let successiveItems = '';
let minItems = schema.minItems || 0;
const maxItems = schema.maxItems; const maxItems = schema.maxItems;
if (minItems > 0) { return this._addRule(ruleName, '"[" space ' + _buildRepetition(itemRuleName, minItems, maxItems, {separatorRule: '"," space'}) + ' "]" space');
successiveItems = listItemOperator.repeat(minItems - 1);
minItems--;
}
if (maxItems !== undefined && maxItems > minItems) {
successiveItems += `${listItemOperator}?`.repeat(maxItems - minItems - 1);
} else {
successiveItems += `${listItemOperator}*`;
}
const rule = minItems === 0
? `"[" space ( ${itemRuleName} ${successiveItems} )? "]" space`
: `"[" space ${itemRuleName} ${successiveItems} "]" space`;
return this._addRule(ruleName, rule);
} }
} else if ((schemaType === undefined || schemaType === 'string') && 'pattern' in schema) { } else if ((schemaType === undefined || schemaType === 'string') && 'pattern' in schema) {
return this._visitPattern(schema.pattern, ruleName); return this._visitPattern(schema.pattern, ruleName);
} else if ((schemaType === undefined || schemaType === 'string') && /^uuid[1-5]?$/.test(schema.format || '')) { } else if ((schemaType === undefined || schemaType === 'string') && /^uuid[1-5]?$/.test(schema.format || '')) {
return this._addRule( return this._addPrimitive(
ruleName === 'root' ? 'root' : schemaFormat, ruleName === 'root' ? 'root' : schemaFormat,
PRIMITIVE_RULES['uuid']) PRIMITIVE_RULES['uuid']
} else if ((schemaType === undefined || schemaType === 'string') && schema.format in DATE_RULES) { );
for (const [t, r] of Object.entries(DATE_RULES)) { } else if ((schemaType === undefined || schemaType === 'string') && `${schema.format}-string` in STRING_FORMAT_RULES) {
this._addRule(t, r); const primName = `${schema.format}-string`
} return this._addRule(ruleName, this._addPrimitive(primName, STRING_FORMAT_RULES[primName]));
return schemaFormat + '-string'; } else if (schemaType === 'string' && ('minLength' in schema || 'maxLength' in schema)) {
const charRuleName = this._addPrimitive('char', PRIMITIVE_RULES['char']);
const minLen = schema.minLength || 0;
const maxLen = schema.maxLength;
return this._addRule(ruleName, '"\\\"" ' + _buildRepetition(charRuleName, minLen, maxLen) + ' "\\\"" space');
} else if ((schemaType === 'object') || (Object.keys(schema).length === 0)) { } else if ((schemaType === 'object') || (Object.keys(schema).length === 0)) {
for (const n of OBJECT_RULE_NAMES) { return this._addRule(ruleName, this._addPrimitive('object', PRIMITIVE_RULES['object']));
this._addRule(n, PRIMITIVE_RULES[n]);
}
return this._addRule(ruleName, 'object');
} else { } else {
if (!(schemaType in PRIMITIVE_RULES)) { if (!(schemaType in PRIMITIVE_RULES)) {
throw new Error(`Unrecognized schema: ${JSON.stringify(schema)}`); throw new Error(`Unrecognized schema: ${JSON.stringify(schema)}`);
} }
// TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero // TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
return this._addRule(ruleName === 'root' ? 'root' : schemaType, PRIMITIVE_RULES[schemaType]); return this._addPrimitive(ruleName === 'root' ? 'root' : schemaType, PRIMITIVE_RULES[schemaType]);
} }
} }
_addPrimitive(name, rule) {
let n = this._addRule(name, rule.content);
for (const dep of rule.deps) {
const depRule = PRIMITIVE_RULES[dep] || STRING_FORMAT_RULES[dep];
if (!depRule) {
throw new Error(`Rule ${dep} not known`);
}
if (!(dep in this._rules)) {
this._addPrimitive(dep, depRule);
}
}
return n;
}
_buildObjectRule(properties, required, name, additionalProperties) { _buildObjectRule(properties, required, name, additionalProperties) {
const propOrder = this._propOrder; const propOrder = this._propOrder;
// sort by position in prop_order (if specified) then by original order // sort by position in prop_order (if specified) then by original order
@ -462,7 +518,7 @@ export class SchemaConverter {
const valueRule = this.visit(additionalProperties === true ? {} : additionalProperties, `${subName}-value`); const valueRule = this.visit(additionalProperties === true ? {} : additionalProperties, `${subName}-value`);
propKvRuleNames['*'] = this._addRule( propKvRuleNames['*'] = this._addRule(
`${subName}-kv`, `${subName}-kv`,
`${this._addRule('string', PRIMITIVE_RULES['string'])} ":" space ${valueRule}`); `${this._addPrimitive('string', PRIMITIVE_RULES['string'])} ":" space ${valueRule}`);
optionalProps.push('*'); optionalProps.push('*');
} }

View File

@ -859,7 +859,7 @@ struct server_context {
slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep); slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
// process "json_schema" and "grammar" // process "json_schema" and "grammar"
if (data.contains("json_schema") && data.contains("grammar")) { if (data.contains("json_schema") && !data["json_schema"].is_null() && data.contains("grammar") && !data["grammar"].is_null()) {
send_error(task, "Either \"json_schema\" or \"grammar\" can be specified, but not both", ERROR_TYPE_INVALID_REQUEST); send_error(task, "Either \"json_schema\" or \"grammar\" can be specified, but not both", ERROR_TYPE_INVALID_REQUEST);
return false; return false;
} else if (data.contains("json_schema") && !data.contains("grammar")) { } else if (data.contains("json_schema") && !data.contains("grammar")) {

View File

@ -1,7 +1,7 @@
#!/bin/bash #!/bin/bash
# #
# ./examples/ts-type-to-grammar.sh "{a:string,b:string,c?:string}" # ./examples/ts-type-to-grammar.sh "{a:string,b:string,c?:string}"
# python examples/json-schema-to-grammar.py https://json.schemastore.org/tsconfig.json # python examples/json_schema_to_grammar.py https://json.schemastore.org/tsconfig.json
# #
set -euo pipefail set -euo pipefail
@ -25,4 +25,4 @@ npx ts-json-schema-generator --unstable --no-top-ref --path "$DTS_FILE" --type M
# https://github.com/YousefED/typescript-json-schema # https://github.com/YousefED/typescript-json-schema
# npx typescript-json-schema --defaultProps --required "$DTS_FILE" MyType | tee "$SCHEMA_FILE" >&2 # npx typescript-json-schema --defaultProps --required "$DTS_FILE" MyType | tee "$SCHEMA_FILE" >&2
./examples/json-schema-to-grammar.py "$SCHEMA_FILE" ./examples/json_schema_to_grammar.py "$SCHEMA_FILE"

View File

@ -89,3 +89,13 @@ This guide provides a brief overview. Check out the GBNF files in this directory
``` ```
./main -m <model> --grammar-file grammars/some-grammar.gbnf -p 'Some prompt' ./main -m <model> --grammar-file grammars/some-grammar.gbnf -p 'Some prompt'
``` ```
## Troubleshooting
Grammars currently have performance gotchas (see https://github.com/ggerganov/llama.cpp/issues/4218).
### Efficient optional repetitions
A common pattern is to allow repetitions of a pattern `x` up to N times.
While semantically correct, the syntax `x? x? x?.... x?` (with N repetitions) will result in extremely slow inference. Instead, you can write `(x (x (x ... (x)?...)?)?)?` (with N-deep nesting).

View File

@ -104,16 +104,16 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
R"""( R"""(
array ::= "[" space ( value ("," space value)* )? "]" space array ::= "[" space ( value ("," space value)* )? "]" space
boolean ::= ("true" | "false") space boolean ::= ("true" | "false") space
char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
decimal-part ::= [0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
integral-part ::= [0-9] | [1-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
null ::= "null" space null ::= "null" space
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
root ::= object root ::= object
space ::= " "? space ::= " "?
string ::= "\"" ( string ::= "\"" char* "\"" space
[^"\\] | value ::= object | array | string | number | boolean | null
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
value ::= object | array | string | number | boolean
)""" )"""
}); });
@ -133,10 +133,13 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
date-string ::= "\"" date "\"" space date-string ::= "\"" date "\"" space
date-time ::= date "T" time date-time ::= date "T" time
date-time-string ::= "\"" date-time "\"" space date-time-string ::= "\"" date-time "\"" space
root ::= "[" space date-string "," space uuid "," space time-string "," space date-time-string "]" space root ::= "[" space tuple-0 "," space uuid "," space tuple-2 "," space tuple-3 "]" space
space ::= " "? space ::= " "?
time ::= ([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] ) time ::= ([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )
time-string ::= "\"" time "\"" space time-string ::= "\"" time "\"" space
tuple-0 ::= date-string
tuple-2 ::= time-string
tuple-3 ::= date-time-string
uuid ::= "\"" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "-" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "-" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "-" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "-" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "\"" space uuid ::= "\"" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "-" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "-" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "-" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "-" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "\"" space
)""" )"""
}); });
@ -148,10 +151,65 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
"type": "string" "type": "string"
})""", })""",
R"""( R"""(
root ::= "\"" ( char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
[^"\\] | root ::= "\"" char* "\"" space
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) space ::= " "?
)* "\"" space )"""
});
test({
SUCCESS,
"string w/ min length 1",
R"""({
"type": "string",
"minLength": 1
})""",
R"""(
char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
root ::= "\"" char+ "\"" space
space ::= " "?
)"""
});
test({
SUCCESS,
"string w/ min length 3",
R"""({
"type": "string",
"minLength": 3
})""",
R"""(
char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
root ::= "\"" char char char (char)* "\"" space
space ::= " "?
)"""
});
test({
SUCCESS,
"string w/ max length",
R"""({
"type": "string",
"maxLength": 3
})""",
R"""(
char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
root ::= "\"" (char (char (char)?)?)? "\"" space
space ::= " "?
)"""
});
test({
SUCCESS,
"string w/ min & max length",
R"""({
"type": "string",
"minLength": 1,
"maxLength": 4
})""",
R"""(
char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
root ::= "\"" char (char (char (char)?)?)? "\"" space
space ::= " "? space ::= " "?
)""" )"""
}); });
@ -175,7 +233,8 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
"type": "integer" "type": "integer"
})""", })""",
R"""( R"""(
root ::= ("-"? ([0-9] | [1-9] [0-9]*)) space integral-part ::= [0-9] | [1-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
root ::= ("-"? integral-part) space
space ::= " "? space ::= " "?
)""" )"""
}); });
@ -223,12 +282,10 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
"prefixItems": [{ "type": "string" }] "prefixItems": [{ "type": "string" }]
})""", })""",
R"""( R"""(
char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
root ::= "[" space string "]" space root ::= "[" space string "]" space
space ::= " "? space ::= " "?
string ::= "\"" ( string ::= "\"" char* "\"" space
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
)""" )"""
}); });
@ -239,13 +296,13 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
"prefixItems": [{ "type": "string" }, { "type": "number" }] "prefixItems": [{ "type": "string" }, { "type": "number" }]
})""", })""",
R"""( R"""(
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
decimal-part ::= [0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
integral-part ::= [0-9] | [1-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
root ::= "[" space string "," space number "]" space root ::= "[" space string "," space number "]" space
space ::= " "? space ::= " "?
string ::= "\"" ( string ::= "\"" char* "\"" space
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
)""" )"""
}); });
@ -256,7 +313,9 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
"type": "number" "type": "number"
})""", })""",
R"""( R"""(
root ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space decimal-part ::= [0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
integral-part ::= [0-9] | [1-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
root ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
space ::= " "? space ::= " "?
)""" )"""
}); });
@ -272,7 +331,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
})""", })""",
R"""( R"""(
boolean ::= ("true" | "false") space boolean ::= ("true" | "false") space
root ::= "[" space boolean ( "," space boolean )( "," space boolean )* "]" space root ::= "[" space boolean "," space boolean ("," space boolean)* "]" space
space ::= " "? space ::= " "?
)""" )"""
}); });
@ -288,7 +347,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
})""", })""",
R"""( R"""(
boolean ::= ("true" | "false") space boolean ::= ("true" | "false") space
root ::= "[" space ( boolean )? "]" space root ::= "[" space (boolean)? "]" space
space ::= " "? space ::= " "?
)""" )"""
}); });
@ -304,7 +363,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
})""", })""",
R"""( R"""(
boolean ::= ("true" | "false") space boolean ::= ("true" | "false") space
root ::= "[" space ( boolean ( "," space boolean )? )? "]" space root ::= "[" space (boolean ("," space boolean)?)? "]" space
space ::= " "? space ::= " "?
)""" )"""
}); });
@ -320,10 +379,12 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
"maxItems": 5 "maxItems": 5
})""", })""",
R"""( R"""(
integer ::= ("-"? ([0-9] | [1-9] [0-9]*)) space decimal-part ::= [0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
integer ::= ("-"? integral-part) space
integral-part ::= [0-9] | [1-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
item ::= number | integer item ::= number | integer
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
root ::= "[" space item ( "," space item )( "," space item )( "," space item )?( "," space item )? "]" space root ::= "[" space item "," space item "," space item ("," space item ("," space item)?)? "]" space
space ::= " "? space ::= " "?
)""" )"""
}); });
@ -372,11 +433,11 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
"regexp", "regexp",
R"""({ R"""({
"type": "string", "type": "string",
"pattern": "^(\\([0-9]{1,3}\\))?[0-9]{3}-[0-9]{4} and...$" "pattern": "^(\\([0-9]{1,3}\\))?[0-9]{3}-[0-9]{4} a{3,5}nd...$"
})""", })""",
R"""( R"""(
dot ::= [\U00000000-\x09\x0B\x0C\x0E-\U0010FFFF] dot ::= [^\x0A\x0D]
root ::= "\"" ("(" root-1 root-1? root-1? ")")? root-1 root-1 root-1 "-" root-1 root-1 root-1 root-1 " and" dot dot dot "\"" space root ::= "\"" ("(" root-1 (root-1 (root-1)?)? ")")? root-1 root-1 root-1 "-" root-1 root-1 root-1 root-1 " " "aaa" ("a" ("a")?)? "nd" dot dot dot "\"" space
root-1 ::= [0-9] root-1 ::= [0-9]
space ::= " "? space ::= " "?
)""" )"""
@ -404,12 +465,10 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
a-kv ::= "\"a\"" space ":" space string a-kv ::= "\"a\"" space ":" space string
b-kv ::= "\"b\"" space ":" space string b-kv ::= "\"b\"" space ":" space string
c-kv ::= "\"c\"" space ":" space string c-kv ::= "\"c\"" space ":" space string
char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
root ::= "{" space b-kv "," space c-kv "," space a-kv "}" space root ::= "{" space b-kv "," space c-kv "," space a-kv "}" space
space ::= " "? space ::= " "?
string ::= "\"" ( string ::= "\"" char* "\"" space
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
)""" )"""
}); });
@ -426,12 +485,10 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
})""", })""",
R"""( R"""(
a-kv ::= "\"a\"" space ":" space string a-kv ::= "\"a\"" space ":" space string
char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
root ::= "{" space (a-kv )? "}" space root ::= "{" space (a-kv )? "}" space
space ::= " "? space ::= " "?
string ::= "\"" ( string ::= "\"" char* "\"" space
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
)""" )"""
}); });
@ -452,12 +509,10 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
b-kv ::= "\"b\"" space ":" space string b-kv ::= "\"b\"" space ":" space string
b-rest ::= ( "," space c-kv )? b-rest ::= ( "," space c-kv )?
c-kv ::= "\"c\"" space ":" space string c-kv ::= "\"c\"" space ":" space string
char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
root ::= "{" space (a-kv a-rest | b-kv b-rest | c-kv )? "}" space root ::= "{" space (a-kv a-rest | b-kv b-rest | c-kv )? "}" space
space ::= " "? space ::= " "?
string ::= "\"" ( string ::= "\"" char* "\"" space
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
)""" )"""
}); });
@ -478,14 +533,12 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
a-kv ::= "\"a\"" space ":" space string a-kv ::= "\"a\"" space ":" space string
b-kv ::= "\"b\"" space ":" space string b-kv ::= "\"b\"" space ":" space string
c-kv ::= "\"c\"" space ":" space string c-kv ::= "\"c\"" space ":" space string
char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
d-kv ::= "\"d\"" space ":" space string d-kv ::= "\"d\"" space ":" space string
d-rest ::= ( "," space c-kv )? d-rest ::= ( "," space c-kv )?
root ::= "{" space b-kv "," space a-kv ( "," space ( d-kv d-rest | c-kv ) )? "}" space root ::= "{" space b-kv "," space a-kv ( "," space ( d-kv d-rest | c-kv ) )? "}" space
space ::= " "? space ::= " "?
string ::= "\"" ( string ::= "\"" char* "\"" space
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
)""" )"""
}); });
@ -499,14 +552,14 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
R"""( R"""(
additional-kv ::= string ":" space additional-value additional-kv ::= string ":" space additional-value
additional-kvs ::= additional-kv ( "," space additional-kv )* additional-kvs ::= additional-kv ( "," space additional-kv )*
additional-value ::= "[" space ( number ( "," space number )* )? "]" space additional-value ::= "[" space (number ("," space number)*)? "]" space
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
decimal-part ::= [0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
integral-part ::= [0-9] | [1-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
root ::= "{" space (additional-kvs )? "}" space root ::= "{" space (additional-kvs )? "}" space
space ::= " "? space ::= " "?
string ::= "\"" ( string ::= "\"" char* "\"" space
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
)""" )"""
}); });
@ -520,16 +573,16 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
R"""( R"""(
array ::= "[" space ( value ("," space value)* )? "]" space array ::= "[" space ( value ("," space value)* )? "]" space
boolean ::= ("true" | "false") space boolean ::= ("true" | "false") space
char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
decimal-part ::= [0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
integral-part ::= [0-9] | [1-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
null ::= "null" space null ::= "null" space
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
root ::= object root ::= object
space ::= " "? space ::= " "?
string ::= "\"" ( string ::= "\"" char* "\"" space
[^"\\] | value ::= object | array | string | number | boolean | null
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
value ::= object | array | string | number | boolean
)""" )"""
}); });
@ -542,16 +595,16 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
R"""( R"""(
array ::= "[" space ( value ("," space value)* )? "]" space array ::= "[" space ( value ("," space value)* )? "]" space
boolean ::= ("true" | "false") space boolean ::= ("true" | "false") space
char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
decimal-part ::= [0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
integral-part ::= [0-9] | [1-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
null ::= "null" space null ::= "null" space
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
root ::= object root ::= object
space ::= " "? space ::= " "?
string ::= "\"" ( string ::= "\"" char* "\"" space
[^"\\] | value ::= object | array | string | number | boolean | null
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
value ::= object | array | string | number | boolean
)""" )"""
}); });
@ -583,13 +636,13 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
a-kv ::= "\"a\"" space ":" space number a-kv ::= "\"a\"" space ":" space number
additional-kv ::= string ":" space string additional-kv ::= string ":" space string
additional-kvs ::= additional-kv ( "," space additional-kv )* additional-kvs ::= additional-kv ( "," space additional-kv )*
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
decimal-part ::= [0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
integral-part ::= [0-9] | [1-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
root ::= "{" space a-kv ( "," space ( additional-kvs ) )? "}" space root ::= "{" space a-kv ( "," space ( additional-kvs ) )? "}" space
space ::= " "? space ::= " "?
string ::= "\"" ( string ::= "\"" char* "\"" space
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
)""" )"""
}); });
@ -608,13 +661,13 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
a-rest ::= additional-kvs a-rest ::= additional-kvs
additional-kv ::= string ":" space number additional-kv ::= string ":" space number
additional-kvs ::= additional-kv ( "," space additional-kv )* additional-kvs ::= additional-kv ( "," space additional-kv )*
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
decimal-part ::= [0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
integral-part ::= [0-9] | [1-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
root ::= "{" space (a-kv a-rest | additional-kvs )? "}" space root ::= "{" space (a-kv a-rest | additional-kvs )? "}" space
space ::= " "? space ::= " "?
string ::= "\"" ( string ::= "\"" char* "\"" space
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
)""" )"""
}); });
@ -636,13 +689,13 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
additional-kvs ::= additional-kv ( "," space additional-kv )* additional-kvs ::= additional-kv ( "," space additional-kv )*
b-kv ::= "\"b\"" space ":" space number b-kv ::= "\"b\"" space ":" space number
b-rest ::= additional-kvs b-rest ::= additional-kvs
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
decimal-part ::= [0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
integral-part ::= [0-9] | [1-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
root ::= "{" space a-kv ( "," space ( b-kv b-rest | additional-kvs ) )? "}" space root ::= "{" space a-kv ( "," space ( b-kv b-rest | additional-kvs ) )? "}" space
space ::= " "? space ::= " "?
string ::= "\"" ( string ::= "\"" char* "\"" space
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
)""" )"""
}); });
@ -650,9 +703,9 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
SUCCESS, SUCCESS,
"top-level $ref", "top-level $ref",
R"""({ R"""({
"$ref": "#/definitions/MyType", "$ref": "#/definitions/foo",
"definitions": { "definitions": {
"MyType": { "foo": {
"type": "object", "type": "object",
"properties": { "properties": {
"a": { "a": {
@ -667,14 +720,12 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
} }
})""", })""",
R"""( R"""(
MyType ::= "{" space MyType-a-kv "}" space char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
MyType-a-kv ::= "\"a\"" space ":" space string foo ::= "{" space foo-a-kv "}" space
root ::= MyType foo-a-kv ::= "\"a\"" space ":" space string
root ::= foo
space ::= " "? space ::= " "?
string ::= "\"" ( string ::= "\"" char* "\"" space
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space
)""" )"""
}); });
@ -701,9 +752,11 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
alternative-1 ::= bar alternative-1 ::= bar
bar ::= "{" space (bar-b-kv )? "}" space bar ::= "{" space (bar-b-kv )? "}" space
bar-b-kv ::= "\"b\"" space ":" space number bar-b-kv ::= "\"b\"" space ":" space number
decimal-part ::= [0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
foo ::= "{" space (foo-a-kv )? "}" space foo ::= "{" space (foo-a-kv )? "}" space
foo-a-kv ::= "\"a\"" space ":" space number foo-a-kv ::= "\"a\"" space ":" space number
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space integral-part ::= [0-9] | [1-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
root ::= alternative-0 | alternative-1 root ::= alternative-0 | alternative-1
space ::= " "? space ::= " "?
)""" )"""
@ -745,7 +798,9 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
c-kv ::= "\"c\"" space ":" space number c-kv ::= "\"c\"" space ":" space number
d-kv ::= "\"d\"" space ":" space number d-kv ::= "\"d\"" space ":" space number
d-rest ::= ( "," space c-kv )? d-rest ::= ( "," space c-kv )?
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space decimal-part ::= [0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
integral-part ::= [0-9] | [1-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
root ::= "{" space a-kv "," space b-kv ( "," space ( d-kv d-rest | c-kv ) )? "}" space root ::= "{" space a-kv "," space b-kv ( "," space ( d-kv d-rest | c-kv ) )? "}" space
space ::= " "? space ::= " "?
)""" )"""
@ -786,7 +841,9 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
"definitions": {} "definitions": {}
})""", })""",
R"""( R"""(
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space decimal-part ::= [0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
integral-part ::= [0-9] | [1-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9])?)?)?)?)?)?)?)?)?)?)?)?)?)?)?
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
number- ::= "{" space number-number-kv "}" space number- ::= "{" space number-number-kv "}" space
number-kv ::= "\"number\"" space ":" space number- number-kv ::= "\"number\"" space ":" space number-
number-number ::= "{" space number-number-root-kv "}" space number-number ::= "{" space number-number-root-kv "}" space
@ -816,7 +873,7 @@ int main() {
test_all("Python", [](const TestCase & tc) { test_all("Python", [](const TestCase & tc) {
write("test-json-schema-input.tmp", tc.schema); write("test-json-schema-input.tmp", tc.schema);
tc.verify_status(std::system( tc.verify_status(std::system(
"python ./examples/json-schema-to-grammar.py test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE); "python ./examples/json_schema_to_grammar.py test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE);
tc.verify(read("test-grammar-output.tmp")); tc.verify(read("test-grammar-output.tmp"));
}); });
} else { } else {