From b61eb9644d64e90123ac805436d95b94b3b4cc3f Mon Sep 17 00:00:00 2001 From: Olivier Chafik Date: Tue, 11 Jun 2024 02:22:57 +0100 Subject: [PATCH] json: refine constraint for whitespace to avoid runaways yet allow pretty print (#7866) --- common/json-schema-to-grammar.cpp | 2 +- examples/json_schema_to_grammar.py | 5 +- .../server/public/json-schema-to-grammar.mjs | 2 +- grammars/json.gbnf | 2 +- grammars/json_arr.gbnf | 2 +- tests/test-json-schema-to-grammar.cpp | 76 +++++++++---------- 6 files changed, 44 insertions(+), 45 deletions(-) diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index 11221a32f..10b9b3d1d 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -40,7 +40,7 @@ static std::string build_repetition(const std::string & item_rule, int min_items return result; } -const std::string SPACE_RULE = "\" \"?"; +const std::string SPACE_RULE = "| \" \" | \"\\n\" [ \\t]{0,20}"; struct BuiltinRule { std::string content; diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py index cd444d010..ab19e20df 100755 --- a/examples/json_schema_to_grammar.py +++ b/examples/json_schema_to_grammar.py @@ -29,9 +29,8 @@ class BuiltinRule: self.content = content self.deps = deps or [] -# whitespace is constrained to a single space char to prevent model "running away" in -# whitespace. Also maybe improves generation quality? -SPACE_RULE = '" "?' +# Constraining spaces to prevent model "running away". +SPACE_RULE = '| " " | "\\n" [ \\t]{0,20}' PRIMITIVE_RULES = { 'boolean' : BuiltinRule('("true" | "false") space', []), diff --git a/examples/server/public/json-schema-to-grammar.mjs b/examples/server/public/json-schema-to-grammar.mjs index dc2468396..faed6a32c 100644 --- a/examples/server/public/json-schema-to-grammar.mjs +++ b/examples/server/public/json-schema-to-grammar.mjs @@ -1,5 +1,5 @@ // WARNING: This file was ported from json_schema_to_grammar.py, please fix bugs / add features there first. -const SPACE_RULE = '" "?'; +const SPACE_RULE = '| " " | "\\n" [ \\t]{0,20}'; function _buildRepetition(itemRule, minItems, maxItems, opts={}) { if (minItems === 0 && maxItems === 1) { diff --git a/grammars/json.gbnf b/grammars/json.gbnf index 064a53f8a..b6448c87b 100644 --- a/grammars/json.gbnf +++ b/grammars/json.gbnf @@ -22,4 +22,4 @@ string ::= number ::= ("-"? ([0-9] | [1-9] [0-9]{0,15})) ("." [0-9]+)? ([eE] [-+]? [0-9] [1-9]{0,15})? ws # Optional space: by convention, applied in this grammar after literal chars when allowed -ws ::= [ \t\n]{0,20} +ws ::= | " " | "\n" [ \t]{0,20} diff --git a/grammars/json_arr.gbnf b/grammars/json_arr.gbnf index bd1312d96..b3dc6f9b1 100644 --- a/grammars/json_arr.gbnf +++ b/grammars/json_arr.gbnf @@ -31,4 +31,4 @@ string ::= number ::= ("-"? ([0-9] | [1-9] [0-9]{0,15})) ("." [0-9]+)? ([eE] [-+]? [1-9] [0-9]{0,15})? ws # Optional space: by convention, applied in this grammar after literal chars when allowed -ws ::= [ \t\n]{0,20} +ws ::= | " " | "\n" [ \t]{0,20} diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp index bea876bd1..a33104dea 100755 --- a/tests/test-json-schema-to-grammar.cpp +++ b/tests/test-json-schema-to-grammar.cpp @@ -112,7 +112,7 @@ static void test_all(const std::string & lang, std::function