From 057bbdc1f3344e7d675a3c4681731a880d0c22e0 Mon Sep 17 00:00:00 2001 From: ochafik Date: Tue, 30 Apr 2024 23:23:55 +0100 Subject: [PATCH 01/24] json: support minimum for positive integer values --- common/json-schema-to-grammar.cpp | 96 ++++++++++++++++++++++++++- tests/test-json-schema-to-grammar.cpp | 79 +++++++++++++++++++++- 2 files changed, 173 insertions(+), 2 deletions(-) diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index 737bae27c7206..35d261d19ec4f 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -16,7 +16,7 @@ static std::string join(Iterator begin, Iterator end, const std::string & separa static std::string repeat(const std::string & str, size_t n); -static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") { +static std::string build_repetition(const std::string & item_rule, const int min_items, int max_items, const std::string & separator_rule = "") { auto has_max = max_items != std::numeric_limits::max(); if (min_items == 0 && max_items == 1) { @@ -160,6 +160,82 @@ static std::string format_literal(const std::string & literal) { return "\"" + escaped + "\""; } +static void _generate_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) { + auto has_max = max_value != std::numeric_limits::max(); + auto has_min = min_value != std::numeric_limits::min(); + + if (has_min && has_max && min_value > max_value) { + throw std::invalid_argument("min value must be less than or equal to max value"); + } + if (has_min && has_max && min_value == max_value) { + out << "\"" << min_value << "\""; + } else if (has_min && has_max) { + throw std::invalid_argument("min and max values not supported yet"); + } else if (has_min) { + if (min_value < 0) { + throw std::invalid_argument("negative min values not supported yet"); + // out << "\"-\" "; + // _generate_min_max_int(std::nullopt, -min_value, out); + // ... + } + auto less_decimals = std::max(decimals_left - 1, 1); + auto more_digits = [&](int min_digits, int decimals) { + out << "[0-9]{" << min_digits << "," << decimals << "}"; + }; + auto digit_range = [&](char from, char to) { + out << "["; + if (from == to) { + out << from; + } else { + out << from << "-" << to; + } + out << "]"; + }; + + if (min_value == 0) { + if (top_level) { + out << "[1-9] "; + more_digits(0, less_decimals); + } else { + more_digits(1, decimals_left); + } + } else if (min_value <= 9) { + char c = '0' + min_value; + if (min_value > (top_level ? 1 : 0)) { + digit_range('0', c - 1); + out << " "; + more_digits(1, less_decimals); + out << " | "; + } + digit_range(c, '9'); + out << " "; + more_digits(0, less_decimals); + } else { + auto min_s = std::to_string(min_value); + auto len = min_s.length(); + auto c = min_s[0]; + + if (c > '1') { + digit_range(top_level ? '1' : '0', c - 1); + out << " "; + more_digits(len, less_decimals); + out << " | "; + } + digit_range(c, c); + out << " ("; + _generate_min_max_int(std::stoi(min_s.substr(1)), std::numeric_limits::max(), out, less_decimals, /* top_level= */ false); + out << ")"; + if (c < '9') { + out << " | "; + digit_range(c + 1, '9'); + out << " "; + more_digits(len - 1, less_decimals); + } + } + } else { + throw std::invalid_argument("max values not supported yet"); + } +} class SchemaConverter { private: @@ -681,6 +757,24 @@ class SchemaConverter { } else if ((schema_type.is_null() || schema_type == "string") && STRING_FORMAT_RULES.find(schema_format + "-string") != STRING_FORMAT_RULES.end()) { auto prim_name = schema_format + "-string"; return _add_rule(rule_name, _add_primitive(prim_name, STRING_FORMAT_RULES.at(prim_name))); + } else if (schema_type == "integer" && (schema.contains("minimum") || schema.contains("exclusiveMinimum") || schema.contains("maximum") || schema.contains("exclusiveMaximum"))) { + int min_value = std::numeric_limits::min(); + int max_value = std::numeric_limits::max(); + if (schema.contains("minimum")) { + min_value = schema["minimum"].get(); + } else if (schema.contains("exclusiveMinimum")) { + min_value = schema["exclusiveMinimum"].get() + 1; + } + if (schema.contains("maximum")) { + max_value = schema["maximum"].get(); + } else if (schema.contains("exclusiveMaximum")) { + max_value = schema["exclusiveMaximum"].get() - 1; + } + std::stringstream out; + out << "("; + _generate_min_max_int(min_value, max_value, out); + out << ") space"; + return _add_rule(rule_name, out.str()); } else if (schema_type == "string" && (schema.contains("minLength") || schema.contains("maxLength"))) { std::string char_rule = _add_primitive("char", PRIMITIVE_RULES.at("char")); int min_len = schema.contains("minLength") ? schema["minLength"].get() : 0; diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp index 052c0807310ce..e001ef9fba418 100755 --- a/tests/test-json-schema-to-grammar.cpp +++ b/tests/test-json-schema-to-grammar.cpp @@ -80,6 +80,84 @@ static void test_all(const std::string & lang, std::function Date: Tue, 30 Apr 2024 23:35:33 +0100 Subject: [PATCH 02/24] json: fix min 0 --- common/json-schema-to-grammar.cpp | 1 + tests/test-json-schema-to-grammar.cpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index 35d261d19ec4f..294370aa0f6b4 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -195,6 +195,7 @@ static void _generate_min_max_int(int min_value, int max_value, std::stringstrea if (min_value == 0) { if (top_level) { out << "[1-9] "; + out << "[0] | [1-9] "; more_digits(0, less_decimals); } else { more_digits(1, decimals_left); diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp index e001ef9fba418..0fcf25b95e8e6 100755 --- a/tests/test-json-schema-to-grammar.cpp +++ b/tests/test-json-schema-to-grammar.cpp @@ -88,7 +88,7 @@ static void test_all(const std::string & lang, std::function Date: Wed, 1 May 2024 01:35:53 +0100 Subject: [PATCH 03/24] json: min + max integer constraints --- common/json-schema-to-grammar.cpp | 144 ++++++++++++++++++++------ tests/test-json-schema-to-grammar.cpp | 94 +++++++++++++++++ 2 files changed, 209 insertions(+), 29 deletions(-) diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index 294370aa0f6b4..a1a873e8d0b71 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -161,40 +161,109 @@ static std::string format_literal(const std::string & literal) { } static void _generate_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) { - auto has_max = max_value != std::numeric_limits::max(); auto has_min = min_value != std::numeric_limits::min(); + auto has_max = max_value != std::numeric_limits::max(); - if (has_min && has_max && min_value > max_value) { - throw std::invalid_argument("min value must be less than or equal to max value"); - } - if (has_min && has_max && min_value == max_value) { - out << "\"" << min_value << "\""; - } else if (has_min && has_max) { - throw std::invalid_argument("min and max values not supported yet"); - } else if (has_min) { - if (min_value < 0) { - throw std::invalid_argument("negative min values not supported yet"); - // out << "\"-\" "; - // _generate_min_max_int(std::nullopt, -min_value, out); - // ... + auto digit_range = [&](char from, char to) { + out << "["; + if (from == to) { + out << from; + } else { + out << from << "-" << to; } - auto less_decimals = std::max(decimals_left - 1, 1); - auto more_digits = [&](int min_digits, int decimals) { - out << "[0-9]{" << min_digits << "," << decimals << "}"; - }; - auto digit_range = [&](char from, char to) { - out << "["; - if (from == to) { - out << from; + out << "]"; + }; + auto more_digits = [&](int min_digits, int max_digits) {//} = std::numeric_limits::max()) { + out << "[0-9]"; + if (min_digits == max_digits && min_digits == 1) { + return; + } + out << "{"; + out << min_digits; + if (max_digits != min_digits) { + out << ","; + if (max_digits != std::numeric_limits::max()) { + out << max_digits; + } + } + out << "}"; + }; + std::function uniform_range = [&](const std::string_view & from, const std::string_view & to) { + size_t i = 0; + while (from[i] == to[i]) { + i++; + } + if (i > 0) { + out << "\"" << from.substr(0, i) << "\""; + } + if (i < from.length()) { + if (i > 0) { + out << " "; + } + auto sub_len = from.length() - i - 1; + if (sub_len > 0) { + auto from_sub = from.substr(i + 1); + auto to_sub = to.substr(i + 1); + auto sub_zeros = repeat("0", sub_len); + auto sub_nines = repeat("9", sub_len); + + auto to_reached = false; + if (from_sub == sub_zeros) { + digit_range(from[i], to[i] - 1); + out << " "; + more_digits(sub_len, sub_len); + } else { + out << "[" << from[i] << "] "; + uniform_range(from_sub, sub_nines); + if (from[i] < to[i] - 1) { + out << " | "; + if (to_sub == sub_nines) { + digit_range(from[i] + 1, to[i]); + to_reached = true; + } else { + digit_range(from[i] + 1, to[i] - 1); + } + out << " "; + more_digits(sub_len, sub_len); + } + } + if (!to_reached) { + out << " | "; + digit_range(to[i], to[i]); + out << " "; + uniform_range(sub_zeros, to_sub); + } } else { - out << from << "-" << to; + out << "[" << from[i] << "-" << to[i] << "]"; } - out << "]"; - }; + } + }; + + if (has_min && has_max) { + auto min_s = std::to_string(min_value); + auto max_s = std::to_string(max_value); + auto min_digits = min_s.length(); + auto max_digits = max_s.length(); + + for (auto digits = min_digits; digits < max_digits; digits++) { + uniform_range(min_s, repeat("9", digits)); + min_s = "1" + repeat("0", digits); + out << " | "; + } + uniform_range(min_s, max_s); + return; + } + + auto less_decimals = std::max(decimals_left - 1, 1); - if (min_value == 0) { + if (has_min) { + if (min_value < 0) { + out << "\"-\" "; + _generate_min_max_int(std::numeric_limits::min(), -min_value, out, decimals_left, /* top_level= */ false); + out << " | [0] | [1-9] "; + more_digits(0, decimals_left - 1); + } else if (min_value == 0) { if (top_level) { - out << "[1-9] "; out << "[0] | [1-9] "; more_digits(0, less_decimals); } else { @@ -233,9 +302,26 @@ static void _generate_min_max_int(int min_value, int max_value, std::stringstrea more_digits(len - 1, less_decimals); } } - } else { - throw std::invalid_argument("max values not supported yet"); + return; + } + + if (has_max) { + if (max_value >= 0) { + if (top_level) { + out << "\"-\" [1-9] "; + more_digits(0, less_decimals); + out << " | "; + } + _generate_min_max_int(0, max_value, out, decimals_left, /* top_level= */ true); + } else { + out << "\"-\" ("; + _generate_min_max_int(std::numeric_limits::min(), -max_value, out, decimals_left, /* top_level= */ false); + out << ")"; + } + return; } + + assert(false); } class SchemaConverter { diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp index 0fcf25b95e8e6..17d85ba29eed9 100755 --- a/tests/test-json-schema-to-grammar.cpp +++ b/tests/test-json-schema-to-grammar.cpp @@ -158,6 +158,100 @@ static void test_all(const std::string & lang, std::function Date: Wed, 1 May 2024 01:47:35 +0100 Subject: [PATCH 04/24] json: handle negative min / max integer bounds --- common/json-schema-to-grammar.cpp | 14 ++++++++++++++ tests/test-json-schema-to-grammar.cpp | 14 ++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index a1a873e8d0b71..cdc44e0fbfc88 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -240,6 +240,20 @@ static void _generate_min_max_int(int min_value, int max_value, std::stringstrea }; if (has_min && has_max) { + if (min_value < 0 && max_value < 0) { + out << "\"-\" ("; + _generate_min_max_int(-max_value, -min_value, out, decimals_left, /* top_level= */ true); + out << ")"; + return; + } + + if (min_value < 0) { + out << "\"-\" ("; + _generate_min_max_int(0, -min_value, out, decimals_left, /* top_level= */ true); + out << ") | "; + min_value = 0; + } + auto min_s = std::to_string(min_value); auto max_s = std::to_string(max_value); auto min_digits = min_s.length(); diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp index 17d85ba29eed9..8a58db1d1bf28 100755 --- a/tests/test-json-schema-to-grammar.cpp +++ b/tests/test-json-schema-to-grammar.cpp @@ -252,6 +252,20 @@ static void test_all(const std::string & lang, std::function Date: Wed, 1 May 2024 02:23:18 +0100 Subject: [PATCH 05/24] json: fix missing paren min/max bug --- common/json-schema-to-grammar.cpp | 2 ++ tests/test-json-schema-to-grammar.cpp | 40 ++++++++++++++++++++++++++- 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index cdc44e0fbfc88..ec0ebe2bd5fee 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -214,6 +214,7 @@ static void _generate_min_max_int(int min_value, int max_value, std::stringstrea more_digits(sub_len, sub_len); } else { out << "[" << from[i] << "] "; + out << "("; uniform_range(from_sub, sub_nines); if (from[i] < to[i] - 1) { out << " | "; @@ -226,6 +227,7 @@ static void _generate_min_max_int(int min_value, int max_value, std::stringstrea out << " "; more_digits(sub_len, sub_len); } + out << ")"; } if (!to_reached) { out << " | "; diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp index 8a58db1d1bf28..a25cab2eec64b 100755 --- a/tests/test-json-schema-to-grammar.cpp +++ b/tests/test-json-schema-to-grammar.cpp @@ -233,7 +233,7 @@ static void test_all(const std::string & lang, std::function Date: Wed, 1 May 2024 02:32:21 +0100 Subject: [PATCH 06/24] json: proper paren fix --- common/json-schema-to-grammar.cpp | 2 +- tests/test-json-schema-to-grammar.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index ec0ebe2bd5fee..8f5e35d499673 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -216,6 +216,7 @@ static void _generate_min_max_int(int min_value, int max_value, std::stringstrea out << "[" << from[i] << "] "; out << "("; uniform_range(from_sub, sub_nines); + out << ")"; if (from[i] < to[i] - 1) { out << " | "; if (to_sub == sub_nines) { @@ -227,7 +228,6 @@ static void _generate_min_max_int(int min_value, int max_value, std::stringstrea out << " "; more_digits(sub_len, sub_len); } - out << ")"; } if (!to_reached) { out << " | "; diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp index a25cab2eec64b..e2a96d25ce509 100755 --- a/tests/test-json-schema-to-grammar.cpp +++ b/tests/test-json-schema-to-grammar.cpp @@ -233,7 +233,7 @@ static void test_all(const std::string & lang, std::function Date: Sun, 19 May 2024 00:35:01 +0100 Subject: [PATCH 07/24] json: integration test for schemas --- Makefile | 2 +- common/json-schema-to-grammar.cpp | 2 +- tests/test-grammar-integration.cpp | 61 +++++++++++++++++++++++++++++- 3 files changed, 61 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 895c62f84def0..e81dc8240622a 100644 --- a/Makefile +++ b/Makefile @@ -987,7 +987,7 @@ tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar- $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-grammar-integration: tests/test-grammar-integration.cpp ggml.o llama.o grammar-parser.o $(OBJS) +tests/test-grammar-integration: tests/test-grammar-integration.cpp ggml.o llama.o grammar-parser.o json-schema-to-grammar.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index 8f5e35d499673..42faa8adbc7b7 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -337,7 +337,7 @@ static void _generate_min_max_int(int min_value, int max_value, std::stringstrea return; } - assert(false); + throw std::runtime_error("At least one of min_value or max_value must be set"); } class SchemaConverter { diff --git a/tests/test-grammar-integration.cpp b/tests/test-grammar-integration.cpp index 9bdab05af7259..ca9b9c11049a0 100644 --- a/tests/test-grammar-integration.cpp +++ b/tests/test-grammar-integration.cpp @@ -11,6 +11,9 @@ #include #include #include +#include + +using json = nlohmann::ordered_json; static llama_grammar* build_grammar(const std::string & grammar_str) { auto parsed_grammar = grammar_parser::parse(grammar_str.c_str()); @@ -65,8 +68,8 @@ static bool match_string(const std::string & input, llama_grammar* grammar) { return false; } -static void test_grammar(const std::string & test_desc, const std::string & grammar_str, const std::vector & passing_strings, const std::vector & failing_strings) { - fprintf(stderr, "⚫ Testing %s. Grammar: %s\n", test_desc.c_str(), grammar_str.c_str()); +static void test(const std::string & test_desc, const std::string & grammar_str, const std::vector & passing_strings, const std::vector & failing_strings) { + fprintf(stderr, "⚫ Testing %s\n", test_desc.c_str(), grammar_str.c_str()); fflush(stderr); auto grammar = build_grammar(grammar_str); @@ -118,8 +121,62 @@ static void test_grammar(const std::string & test_desc, const std::string & gram // Clean up allocated memory llama_grammar_free(grammar); } +static void test_grammar(const std::string & test_desc, const std::string & grammar_str, const std::vector & passing_strings, const std::vector & failing_strings) { + test(test_desc + ". Grammar: " + grammar_str, grammar_str, passing_strings, failing_strings); +} +static void test_schema(const std::string & test_desc, const std::string & schema_str, const std::vector & passing_strings, const std::vector & failing_strings) { + test(test_desc + ". Schema: " + schema_str, json_schema_to_grammar(json::parse(schema_str)), passing_strings, failing_strings); +} static void test_simple_grammar() { + test_schema( + "simple min 0", + R"""({ + "type": "integer", + "minimum": 0 + })""", + // Passing strings + { + "0", + "10", + "10000", + }, + // Failing strings + { + "-1", + "-10", + "-10000", + "-100000000000000000000000000000000", + "100000000000000000000000000000000", + } + ); + test_schema( + "simple min -123 max 42", + R"""({ + "type": "integer", + "minimum": -123, + "maximum": 42 + })""", + // Passing strings + { + "-123", + "-122", + "-11", + "-1", + "0", + "1", + "10", + "39", + "42", + }, + // Failing strings + { + "-124", + "43", + "123", + } + ); + // Test case for a simple grammar test_grammar( "simple grammar", From 431edb8e7bf2d05e803bf9b6850e6b074eac1531 Mon Sep 17 00:00:00 2001 From: ochafik Date: Sun, 19 May 2024 00:52:34 +0100 Subject: [PATCH 08/24] json: fix bounds tests --- common/json-schema-to-grammar.cpp | 2 ++ tests/test-grammar-integration.cpp | 23 +++++++++++++++++++ tests/test-json-schema-to-grammar.cpp | 33 +++++++++++++++++++-------- 3 files changed, 49 insertions(+), 9 deletions(-) diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index 42faa8adbc7b7..d6346913aa3cf 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -202,6 +202,7 @@ static void _generate_min_max_int(int min_value, int max_value, std::stringstrea } auto sub_len = from.length() - i - 1; if (sub_len > 0) { + out << "("; auto from_sub = from.substr(i + 1); auto to_sub = to.substr(i + 1); auto sub_zeros = repeat("0", sub_len); @@ -235,6 +236,7 @@ static void _generate_min_max_int(int min_value, int max_value, std::stringstrea out << " "; uniform_range(sub_zeros, to_sub); } + out << ")"; } else { out << "[" << from[i] << "-" << to[i] << "]"; } diff --git a/tests/test-grammar-integration.cpp b/tests/test-grammar-integration.cpp index ca9b9c11049a0..6aca32a3938d5 100644 --- a/tests/test-grammar-integration.cpp +++ b/tests/test-grammar-integration.cpp @@ -150,6 +150,29 @@ static void test_simple_grammar() { "100000000000000000000000000000000", } ); + test_schema( + "simple min -1 max 1", + R"""({ + "type": "integer", + "minimum": -1, + "maximum": 1 + })""", + // Passing strings + { + "-1", + "0", + "1", + }, + // Failing strings + { + "-11", + "-10", + "-2", + "2", + "10", + "11", + } + ); test_schema( "simple min -123 max 42", R"""({ diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp index e2a96d25ce509..ba6000ae0b452 100755 --- a/tests/test-json-schema-to-grammar.cpp +++ b/tests/test-json-schema-to-grammar.cpp @@ -166,7 +166,7 @@ static void test_all(const std::string & lang, std::function Date: Sun, 19 May 2024 02:29:31 +0100 Subject: [PATCH 09/24] Update json-schema-to-grammar.cpp --- common/json-schema-to-grammar.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index d6346913aa3cf..b5ead70aa64a9 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -202,13 +202,13 @@ static void _generate_min_max_int(int min_value, int max_value, std::stringstrea } auto sub_len = from.length() - i - 1; if (sub_len > 0) { - out << "("; auto from_sub = from.substr(i + 1); auto to_sub = to.substr(i + 1); auto sub_zeros = repeat("0", sub_len); auto sub_nines = repeat("9", sub_len); auto to_reached = false; + out << "("; if (from_sub == sub_zeros) { digit_range(from[i], to[i] - 1); out << " "; From 931b5436075437061285b6b5db13ded58f653241 Mon Sep 17 00:00:00 2001 From: Olivier Chafik Date: Sat, 8 Jun 2024 20:33:12 +0100 Subject: [PATCH 10/24] json: fix negative max --- common/json-schema-to-grammar.cpp | 2 +- tests/test-json-schema-to-grammar.cpp | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index b5ead70aa64a9..61bdd1bf9bbbe 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -333,7 +333,7 @@ static void _generate_min_max_int(int min_value, int max_value, std::stringstrea _generate_min_max_int(0, max_value, out, decimals_left, /* top_level= */ true); } else { out << "\"-\" ("; - _generate_min_max_int(std::numeric_limits::min(), -max_value, out, decimals_left, /* top_level= */ false); + _generate_min_max_int(-max_value, std::numeric_limits::max(), out, decimals_left, /* top_level= */ false); out << ")"; } return; diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp index ba6000ae0b452..c0c67c9b34b63 100755 --- a/tests/test-json-schema-to-grammar.cpp +++ b/tests/test-json-schema-to-grammar.cpp @@ -180,6 +180,18 @@ static void test_all(const std::string & lang, std::function Date: Sat, 8 Jun 2024 20:33:40 +0100 Subject: [PATCH 11/24] json: fix negative min (w/ more than 1 digit) --- common/json-schema-to-grammar.cpp | 4 ++-- tests/test-grammar-integration.cpp | 24 ++++++++++++++++++++++++ tests/test-json-schema-to-grammar.cpp | 16 +++++++++++++++- 3 files changed, 41 insertions(+), 3 deletions(-) diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index 61bdd1bf9bbbe..79b037f2720ec 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -276,9 +276,9 @@ static void _generate_min_max_int(int min_value, int max_value, std::stringstrea if (has_min) { if (min_value < 0) { - out << "\"-\" "; + out << "\"-\" ("; _generate_min_max_int(std::numeric_limits::min(), -min_value, out, decimals_left, /* top_level= */ false); - out << " | [0] | [1-9] "; + out << ") | [0] | [1-9] "; more_digits(0, decimals_left - 1); } else if (min_value == 0) { if (top_level) { diff --git a/tests/test-grammar-integration.cpp b/tests/test-grammar-integration.cpp index 2d78c61d382e9..da5c5f98d4e0e 100644 --- a/tests/test-grammar-integration.cpp +++ b/tests/test-grammar-integration.cpp @@ -199,6 +199,30 @@ static void test_simple_grammar() { "123", } ); + test_schema( + "simple min -123", + R"""({ + "type": "integer", + "minimum": -123 + })""", + // Passing strings + { + "-123", + "-122", + "-11", + "-1", + "0", + "1", + "123", + "1234", + "2345", + }, + // Failing strings + { + "-1234", + "-124", + } + ); // Test case for a simple grammar test_grammar( diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp index c0c67c9b34b63..2134eac4b0a38 100755 --- a/tests/test-json-schema-to-grammar.cpp +++ b/tests/test-json-schema-to-grammar.cpp @@ -179,7 +179,21 @@ static void test_all(const std::string & lang, std::function Date: Sat, 8 Jun 2024 20:39:42 +0100 Subject: [PATCH 12/24] Update test-grammar-integration.cpp --- tests/test-grammar-integration.cpp | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/tests/test-grammar-integration.cpp b/tests/test-grammar-integration.cpp index da5c5f98d4e0e..0fe6abef152f4 100644 --- a/tests/test-grammar-integration.cpp +++ b/tests/test-grammar-integration.cpp @@ -148,6 +148,33 @@ static void test_simple_grammar() { "-10000", "-100000000000000000000000000000000", "100000000000000000000000000000000", + "00", + "01", + "-0", + } + ); + test_schema( + "simple min 456", + R"""({ + "type": "integer", + "minimum": 456 + })""", + // Passing strings + { + "456", + "4560", + "457", + "460", + "500", + }, + // Failing strings + { + "455", + "356", + "50", + "050", + "-1", + "-456", } ); test_schema( From 3549702da7b7a8d67f66e9c5932f07009776c6db Mon Sep 17 00:00:00 2001 From: Olivier Chafik Date: Sat, 8 Jun 2024 21:21:35 +0100 Subject: [PATCH 13/24] json: nit: move string rules together --- common/json-schema-to-grammar.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index 79b037f2720ec..4c25829bdda7e 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -173,7 +173,7 @@ static void _generate_min_max_int(int min_value, int max_value, std::stringstrea } out << "]"; }; - auto more_digits = [&](int min_digits, int max_digits) {//} = std::numeric_limits::max()) { + auto more_digits = [&](int min_digits, int max_digits) { out << "[0-9]"; if (min_digits == max_digits && min_digits == 1) { return; @@ -862,6 +862,11 @@ class SchemaConverter { } else if ((schema_type.is_null() || schema_type == "string") && STRING_FORMAT_RULES.find(schema_format + "-string") != STRING_FORMAT_RULES.end()) { auto prim_name = schema_format + "-string"; return _add_rule(rule_name, _add_primitive(prim_name, STRING_FORMAT_RULES.at(prim_name))); + } else if (schema_type == "string" && (schema.contains("minLength") || schema.contains("maxLength"))) { + std::string char_rule = _add_primitive("char", PRIMITIVE_RULES.at("char")); + int min_len = schema.contains("minLength") ? schema["minLength"].get() : 0; + int max_len = schema.contains("maxLength") ? schema["maxLength"].get() : std::numeric_limits::max(); + return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space"); } else if (schema_type == "integer" && (schema.contains("minimum") || schema.contains("exclusiveMinimum") || schema.contains("maximum") || schema.contains("exclusiveMaximum"))) { int min_value = std::numeric_limits::min(); int max_value = std::numeric_limits::max(); @@ -880,11 +885,6 @@ class SchemaConverter { _generate_min_max_int(min_value, max_value, out); out << ") space"; return _add_rule(rule_name, out.str()); - } else if (schema_type == "string" && (schema.contains("minLength") || schema.contains("maxLength"))) { - std::string char_rule = _add_primitive("char", PRIMITIVE_RULES.at("char")); - int min_len = schema.contains("minLength") ? schema["minLength"].get() : 0; - int max_len = schema.contains("maxLength") ? schema["maxLength"].get() : std::numeric_limits::max(); - return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space"); } else if (schema.empty() || schema_type == "object") { return _add_rule(rule_name, _add_primitive("object", PRIMITIVE_RULES.at("object"))); } else { From e93368076b3d862d424c033ec960c3e5644c9713 Mon Sep 17 00:00:00 2001 From: Olivier Chafik Date: Sat, 8 Jun 2024 21:33:24 +0100 Subject: [PATCH 14/24] json: port min/max integer support to Python & JS --- examples/json_schema_to_grammar.py | 183 ++++++++++++++- .../server/public/json-schema-to-grammar.mjs | 212 ++++++++++++++++++ 2 files changed, 394 insertions(+), 1 deletion(-) diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py index 7d889c3fe1287..b1b7a974df536 100755 --- a/examples/json_schema_to_grammar.py +++ b/examples/json_schema_to_grammar.py @@ -4,7 +4,7 @@ import json import re import sys -from typing import Any, Dict, List, Set, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union def _build_repetition(item_rule, min_items, max_items, separator_rule=None): @@ -23,6 +23,169 @@ def _build_repetition(item_rule, min_items, max_items, separator_rule=None): result = item_rule + ' ' + _build_repetition(f'({separator_rule} {item_rule})', min_items - 1 if min_items > 0 else 0, max_items - 1 if max_items is not None else None) return f'({result})?' if min_items == 0 else result +def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], out: list, decimals_left: int = 16, top_level: bool = True): + has_min = min_value != None + has_max = max_value != None + + def digit_range(from_char: str, to_char: str): + out.append("[") + if from_char == to_char: + out.append(from_char) + else: + out.append(from_char) + out.append("-") + out.append(to_char) + out.append("]") + + def more_digits(min_digits: int, max_digits: int): + out.append("[0-9]") + if min_digits == max_digits and min_digits == 1: + return + out.append("{") + out.append(str(min_digits)) + if max_digits != min_digits: + out.append(",") + if max_digits != sys.maxsize: + out.append(str(max_digits)) + out.append("}") + + def uniform_range(from_str: str, to_str: str): + i = 0 + while i < len(from_str) and from_str[i] == to_str[i]: + i += 1 + if i > 0: + out.append("\"") + out.append(from_str[:i]) + out.append("\"") + if i < len(from_str): + if i > 0: + out.append(" ") + sub_len = len(from_str) - i - 1 + if sub_len > 0: + from_sub = from_str[i+1:] + to_sub = to_str[i+1:] + sub_zeros = "0" * sub_len + sub_nines = "9" * sub_len + + to_reached = False + out.append("(") + if from_sub == sub_zeros: + digit_range(from_str[i], chr(ord(to_str[i]) - 1)) + out.append(" ") + more_digits(sub_len, sub_len) + else: + out.append("[") + out.append(from_str[i]) + out.append("] ") + out.append("(") + uniform_range(from_sub, sub_nines) + out.append(")") + if ord(from_str[i]) < ord(to_str[i]) - 1: + out.append(" | ") + if to_sub == sub_nines: + digit_range(chr(ord(from_str[i]) + 1), to_str[i]) + to_reached = True + else: + digit_range(chr(ord(from_str[i]) + 1), chr(ord(to_str[i]) - 1)) + out.append(" ") + more_digits(sub_len, sub_len) + if not to_reached: + out.append(" | ") + digit_range(to_str[i], to_str[i]) + out.append(" ") + uniform_range(sub_zeros, to_sub) + out.append(")") + else: + out.append("[") + out.append(from_str[i]) + out.append("-") + out.append(to_str[i]) + out.append("]") + + if has_min and has_max: + if min_value < 0 and max_value < 0: + out.append("\"-\" (") + _generate_min_max_int(-max_value, -min_value, out, decimals_left, top_level=True) + out.append(")") + return + + if min_value < 0: + out.append("\"-\" (") + _generate_min_max_int(0, -min_value, out, decimals_left, top_level=True) + out.append(") | ") + min_value = 0 + + min_s = str(min_value) + max_s = str(max_value) + min_digits = len(min_s) + max_digits = len(max_s) + + for digits in range(min_digits, max_digits): + uniform_range(min_s, "9" * digits) + min_s = "1" + "0" * digits + out.append(" | ") + uniform_range(min_s, max_s) + return + + less_decimals = max(decimals_left - 1, 1) + + if has_min: + if min_value < 0: + out.append("\"-\" (") + _generate_min_max_int(None, -min_value, out, decimals_left, top_level=False) + out.append(") | [0] | [1-9] ") + more_digits(0, decimals_left - 1) + elif min_value == 0: + if top_level: + out.append("[0] | [1-9] ") + more_digits(0, less_decimals) + else: + more_digits(1, decimals_left) + elif min_value <= 9: + c = str(min_value) + if min_value > (1 if top_level else 0): + digit_range("0", chr(ord(c) - 1)) + out.append(" ") + more_digits(1, less_decimals) + out.append(" | ") + digit_range(c, "9") + out.append(" ") + more_digits(0, less_decimals) + else: + min_s = str(min_value) + length = len(min_s) + c = min_s[0] + + if c > "1": + digit_range("1" if top_level else "0", chr(ord(c) - 1)) + out.append(" ") + more_digits(length, less_decimals) + out.append(" | ") + digit_range(c, c) + out.append(" (") + _generate_min_max_int(int(min_s[1:]), None, out, less_decimals, top_level=False) + out.append(")") + if c < "9": + out.append(" | ") + digit_range(chr(ord(c) + 1), "9") + out.append(" ") + more_digits(length - 1, less_decimals) + return + + if has_max: + if max_value >= 0: + if top_level: + out.append("\"-\" [1-9] ") + more_digits(0, less_decimals) + out.append(" | ") + _generate_min_max_int(0, max_value, out, decimals_left, top_level=True) + else: + out.append("\"-\" (") + _generate_min_max_int(-max_value, None, out, decimals_left, top_level=False) + out.append(")") + return + + raise RuntimeError("At least one of min_value or max_value must be set") class BuiltinRule: def __init__(self, content: str, deps: list = None): @@ -433,6 +596,24 @@ def add_component(comp_schema, is_required): return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\"" space') + elif schema_type in (None, 'integer') and \ + ('minimum' in schema or 'exclusiveMinimum' in schema or 'maximum' in schema or 'exclusiveMaximum' in schema): + min_value = None + max_value = None + if 'minimum' in schema: + min_value = schema['minimum'] + elif 'exclusiveMinimum' in schema: + min_value = schema['exclusiveMinimum'] + 1 + if 'maximum' in schema: + max_value = schema['maximum'] + elif 'exclusiveMaximum' in schema: + max_value = schema['exclusiveMaximum'] - 1 + + out = ["("] + _generate_min_max_int(min_value, max_value, out) + out.append(") space") + return self._add_rule(rule_name, ''.join(out)) + elif (schema_type == 'object') or (len(schema) == 0): return self._add_rule(rule_name, self._add_primitive('object', PRIMITIVE_RULES['object'])) diff --git a/examples/server/public/json-schema-to-grammar.mjs b/examples/server/public/json-schema-to-grammar.mjs index cef11eab83a46..7c0f9c7a210c0 100644 --- a/examples/server/public/json-schema-to-grammar.mjs +++ b/examples/server/public/json-schema-to-grammar.mjs @@ -24,6 +24,200 @@ function _buildRepetition(itemRule, minItems, maxItems, opts={}) { return minItems === 0 ? `(${result})?` : result; } +function _generateMinMaxInt(minValue, maxValue, out, decimalsLeft = 16, topLevel = true) { + const hasMin = minValue !== null; + const hasMax = maxValue !== null; + + function digitRange(fromChar, toChar) { + out.push("["); + if (fromChar === toChar) { + out.push(fromChar); + } else { + out.push(fromChar); + out.push("-"); + out.push(toChar); + } + out.push("]"); + } + + function moreDigits(minDigits, maxDigits) { + out.push("[0-9]"); + if (minDigits === maxDigits && minDigits === 1) { + return; + } + out.push("{"); + out.push(minDigits.toString()); + if (maxDigits !== minDigits) { + out.push(","); + if (maxDigits !== Number.MAX_SAFE_INTEGER) { + out.push(maxDigits.toString()); + } + } + out.push("}"); + } + + function uniformRange(fromStr, toStr) { + let i = 0; + while (i < fromStr.length && fromStr[i] === toStr[i]) { + i++; + } + if (i > 0) { + out.push("\""); + out.push(fromStr.slice(0, i)); + out.push("\""); + } + if (i < fromStr.length) { + if (i > 0) { + out.push(" "); + } + const subLen = fromStr.length - i - 1; + if (subLen > 0) { + const fromSub = fromStr.slice(i + 1); + const toSub = toStr.slice(i + 1); + const subZeros = "0".repeat(subLen); + const subNines = "9".repeat(subLen); + + let toReached = false; + out.push("("); + if (fromSub === subZeros) { + digitRange(fromStr[i], String.fromCharCode(toStr.charCodeAt(i) - 1)); + out.push(" "); + moreDigits(subLen, subLen); + } else { + out.push("["); + out.push(fromStr[i]); + out.push("] "); + out.push("("); + uniformRange(fromSub, subNines); + out.push(")"); + if (fromStr.charCodeAt(i) < toStr.charCodeAt(i) - 1) { + out.push(" | "); + if (toSub === subNines) { + digitRange(String.fromCharCode(fromStr.charCodeAt(i) + 1), toStr[i]); + toReached = true; + } else { + digitRange(String.fromCharCode(fromStr.charCodeAt(i) + 1), String.fromCharCode(toStr.charCodeAt(i) - 1)); + } + out.push(" "); + moreDigits(subLen, subLen); + } + } + if (!toReached) { + out.push(" | "); + digitRange(toStr[i], toStr[i]); + out.push(" "); + uniformRange(subZeros, toSub); + } + out.push(")"); + } else { + out.push("["); + out.push(fromStr[i]); + out.push("-"); + out.push(toStr[i]); + out.push("]"); + } + } + } + + if (hasMin && hasMax) { + if (minValue < 0 && maxValue < 0) { + out.push("\"-\" ("); + _generateMinMaxInt(-maxValue, -minValue, out, decimalsLeft, true); + out.push(")"); + return; + } + + if (minValue < 0) { + out.push("\"-\" ("); + _generateMinMaxInt(0, -minValue, out, decimalsLeft, true); + out.push(") | "); + minValue = 0; + } + + let minS = minValue.toString(); + const maxS = maxValue.toString(); + const minDigits = minS.length; + const maxDigits = maxS.length; + + for (let digits = minDigits; digits < maxDigits; digits++) { + uniformRange(minS, "9".repeat(digits)); + minS = "1" + "0".repeat(digits); + out.push(" | "); + } + uniformRange(minS, maxS); + return; + } + + const lessDecimals = Math.max(decimalsLeft - 1, 1); + + if (hasMin) { + if (minValue < 0) { + out.push("\"-\" ("); + _generateMinMaxInt(null, -minValue, out, decimalsLeft, false); + out.push(") | [0] | [1-9] "); + moreDigits(0, decimalsLeft - 1); + } else if (minValue === 0) { + if (topLevel) { + out.push("[0] | [1-9] "); + moreDigits(0, lessDecimals); + } else { + moreDigits(1, decimalsLeft); + } + } else if (minValue <= 9) { + const c = minValue.toString(); + if (minValue > (topLevel ? 1 : 0)) { + digitRange("0", String.fromCharCode(c.charCodeAt(0) - 1)); + out.push(" "); + moreDigits(1, lessDecimals); + out.push(" | "); + } + digitRange(c, "9"); + out.push(" "); + moreDigits(0, lessDecimals); + } else { + const minS = minValue.toString(); + const length = minS.length; + const c = minS[0]; + + if (c > "1") { + digitRange(topLevel ? "1" : "0", String.fromCharCode(c.charCodeAt(0) - 1)); + out.push(" "); + moreDigits(length, lessDecimals); + out.push(" | "); + } + digitRange(c, c); + out.push(" ("); + _generateMinMaxInt(parseInt(minS.slice(1)), null, out, lessDecimals, false); + out.push(")"); + if (c < "9") { + out.push(" | "); + digitRange(String.fromCharCode(c.charCodeAt(0) + 1), "9"); + out.push(" "); + moreDigits(length - 1, lessDecimals); + } + } + return; + } + + if (hasMax) { + if (maxValue >= 0) { + if (topLevel) { + out.push("\"-\" [1-9] "); + moreDigits(0, lessDecimals); + out.push(" | "); + } + _generateMinMaxInt(0, maxValue, out, decimalsLeft, true); + } else { + out.push("\"-\" ("); + _generateMinMaxInt(-maxValue, null, out, decimalsLeft, false); + out.push(")"); + } + return; + } + + throw new Error("At least one of minValue or maxValue must be set"); +} + class BuiltinRule { constructor(content, deps) { this.content = content; @@ -435,6 +629,24 @@ export class SchemaConverter { const minLen = schema.minLength || 0; const maxLen = schema.maxLength; return this._addRule(ruleName, '"\\\"" ' + _buildRepetition(charRuleName, minLen, maxLen) + ' "\\\"" space'); + } else if (schemaType === 'integer' && ('minimum' in schema || 'exclusiveMinimum' in schema || 'maximum' in schema || 'exclusiveMaximum' in schema)) { + let minValue = null; + let maxValue = null; + if ('minimum' in schema) { + minValue = schema.minimum; + } else if ('exclusiveMinimum' in schema) { + minValue = schema.exclusiveMinimum + 1; + } + if ('maximum' in schema) { + maxValue = schema.maximum; + } else if ('exclusiveMaximum' in schema) { + maxValue = schema.exclusiveMaximum - 1; + } + + const out = ["("]; + _generateMinMaxInt(minValue, maxValue, out); + out.push(") space"); + return this._addRule(ruleName, out.join('')); } else if ((schemaType === 'object') || (Object.keys(schema).length === 0)) { return this._addRule(ruleName, this._addPrimitive('object', PRIMITIVE_RULES['object'])); } else { From a0f19047af6749a3a7f963b89b00a61bf4d7e839 Mon Sep 17 00:00:00 2001 From: Olivier Chafik Date: Sat, 8 Jun 2024 21:46:18 +0100 Subject: [PATCH 15/24] nit: move + rename _build_min_max_int --- common/json-schema-to-grammar.cpp | 256 +++++++++++++++--------------- 1 file changed, 128 insertions(+), 128 deletions(-) diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index 4c25829bdda7e..b60966a2c9fb8 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -40,127 +40,7 @@ static std::string build_repetition(const std::string & item_rule, const int min return result; } -const std::string SPACE_RULE = "\" \"?"; - -struct BuiltinRule { - std::string content; - std::vector deps; -}; - -std::unordered_map PRIMITIVE_RULES = { - {"boolean", {"(\"true\" | \"false\") space", {}}}, - {"decimal-part", {"[0-9]{1,16}", {}}}, - {"integral-part", {"[0] | [1-9] [0-9]{0,15}", {}}}, - {"number", {"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)? space", {"integral-part", "decimal-part"}}}, - {"integer", {"(\"-\"? integral-part) space", {"integral-part"}}}, - {"value", {"object | array | string | number | boolean | null", {"object", "array", "string", "number", "boolean", "null"}}}, - {"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}}, - {"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}}, - {"uuid", {"\"\\\"\" [0-9a-fA-F]{8} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{12} \"\\\"\" space", {}}}, - {"char", {"[^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F]{4})", {}}}, - {"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}}, - {"null", {"\"null\" space", {}}}, -}; - -std::unordered_map STRING_FORMAT_RULES = { - {"date", {"[0-9]{4} \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}}, - {"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9]{3} )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}}, - {"date-time", {"date \"T\" time", {"date", "time"}}}, - {"date-string", {"\"\\\"\" date \"\\\"\" space", {"date"}}}, - {"time-string", {"\"\\\"\" time \"\\\"\" space", {"time"}}}, - {"date-time-string", {"\"\\\"\" date-time \"\\\"\" space", {"date-time"}}} -}; - -static bool is_reserved_name(const std::string & name) { - static std::unordered_set RESERVED_NAMES; - if (RESERVED_NAMES.empty()) { - RESERVED_NAMES.insert("root"); - for (const auto &p : PRIMITIVE_RULES) RESERVED_NAMES.insert(p.first); - for (const auto &p : STRING_FORMAT_RULES) RESERVED_NAMES.insert(p.first); - } - return RESERVED_NAMES.find(name) != RESERVED_NAMES.end(); -} - -std::regex INVALID_RULE_CHARS_RE("[^a-zA-Z0-9-]+"); -std::regex GRAMMAR_LITERAL_ESCAPE_RE("[\r\n\"]"); -std::regex GRAMMAR_RANGE_LITERAL_ESCAPE_RE("[\r\n\"\\]\\-\\\\]"); -std::unordered_map GRAMMAR_LITERAL_ESCAPES = { - {'\r', "\\r"}, {'\n', "\\n"}, {'"', "\\\""}, {'-', "\\-"}, {']', "\\]"} -}; - -std::unordered_set NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'}; -std::unordered_set ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'[', ']', '(', ')', '|', '{', '}', '*', '+', '?'}; - -template -std::string join(Iterator begin, Iterator end, const std::string & separator) { - std::ostringstream result; - if (begin != end) { - result << *begin; - for (Iterator it = begin + 1; it != end; ++it) { - result << separator << *it; - } - } - return result.str(); -} - -static std::vector split(const std::string & str, const std::string & delimiter) { - std::vector tokens; - size_t start = 0; - size_t end = str.find(delimiter); - - while (end != std::string::npos) { - tokens.push_back(str.substr(start, end - start)); - start = end + delimiter.length(); - end = str.find(delimiter, start); - } - - tokens.push_back(str.substr(start)); - - return tokens; -} - -static std::string repeat(const std::string & str, size_t n) { - if (n == 0) { - return ""; - } - - std::string result; - result.reserve(str.length() * n); - - for (size_t i = 0; i < n; ++i) { - result += str; - } - - return result; -} - -static std::string replacePattern(const std::string & input, const std::regex & regex, const std::function & replacement) { - std::smatch match; - std::string result; - - std::string::const_iterator searchStart(input.cbegin()); - std::string::const_iterator searchEnd(input.cend()); - - while (std::regex_search(searchStart, searchEnd, match, regex)) { - result.append(searchStart, searchStart + match.position()); - result.append(replacement(match)); - searchStart = match.suffix().first; - } - - result.append(searchStart, searchEnd); - - return result; -} - -static std::string format_literal(const std::string & literal) { - std::string escaped = replacePattern(literal, GRAMMAR_LITERAL_ESCAPE_RE, [&](const std::smatch & match) { - char c = match.str()[0]; - return GRAMMAR_LITERAL_ESCAPES.at(c); - }); - return "\"" + escaped + "\""; -} - -static void _generate_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) { +static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) { auto has_min = min_value != std::numeric_limits::min(); auto has_max = max_value != std::numeric_limits::max(); @@ -246,14 +126,14 @@ static void _generate_min_max_int(int min_value, int max_value, std::stringstrea if (has_min && has_max) { if (min_value < 0 && max_value < 0) { out << "\"-\" ("; - _generate_min_max_int(-max_value, -min_value, out, decimals_left, /* top_level= */ true); + _build_min_max_int(-max_value, -min_value, out, decimals_left, /* top_level= */ true); out << ")"; return; } if (min_value < 0) { out << "\"-\" ("; - _generate_min_max_int(0, -min_value, out, decimals_left, /* top_level= */ true); + _build_min_max_int(0, -min_value, out, decimals_left, /* top_level= */ true); out << ") | "; min_value = 0; } @@ -277,7 +157,7 @@ static void _generate_min_max_int(int min_value, int max_value, std::stringstrea if (has_min) { if (min_value < 0) { out << "\"-\" ("; - _generate_min_max_int(std::numeric_limits::min(), -min_value, out, decimals_left, /* top_level= */ false); + _build_min_max_int(std::numeric_limits::min(), -min_value, out, decimals_left, /* top_level= */ false); out << ") | [0] | [1-9] "; more_digits(0, decimals_left - 1); } else if (min_value == 0) { @@ -311,7 +191,7 @@ static void _generate_min_max_int(int min_value, int max_value, std::stringstrea } digit_range(c, c); out << " ("; - _generate_min_max_int(std::stoi(min_s.substr(1)), std::numeric_limits::max(), out, less_decimals, /* top_level= */ false); + _build_min_max_int(std::stoi(min_s.substr(1)), std::numeric_limits::max(), out, less_decimals, /* top_level= */ false); out << ")"; if (c < '9') { out << " | "; @@ -330,10 +210,10 @@ static void _generate_min_max_int(int min_value, int max_value, std::stringstrea more_digits(0, less_decimals); out << " | "; } - _generate_min_max_int(0, max_value, out, decimals_left, /* top_level= */ true); + _build_min_max_int(0, max_value, out, decimals_left, /* top_level= */ true); } else { out << "\"-\" ("; - _generate_min_max_int(-max_value, std::numeric_limits::max(), out, decimals_left, /* top_level= */ false); + _build_min_max_int(-max_value, std::numeric_limits::max(), out, decimals_left, /* top_level= */ false); out << ")"; } return; @@ -342,6 +222,126 @@ static void _generate_min_max_int(int min_value, int max_value, std::stringstrea throw std::runtime_error("At least one of min_value or max_value must be set"); } +const std::string SPACE_RULE = "\" \"?"; + +struct BuiltinRule { + std::string content; + std::vector deps; +}; + +std::unordered_map PRIMITIVE_RULES = { + {"boolean", {"(\"true\" | \"false\") space", {}}}, + {"decimal-part", {"[0-9]{1,16}", {}}}, + {"integral-part", {"[0] | [1-9] [0-9]{0,15}", {}}}, + {"number", {"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)? space", {"integral-part", "decimal-part"}}}, + {"integer", {"(\"-\"? integral-part) space", {"integral-part"}}}, + {"value", {"object | array | string | number | boolean | null", {"object", "array", "string", "number", "boolean", "null"}}}, + {"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}}, + {"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}}, + {"uuid", {"\"\\\"\" [0-9a-fA-F]{8} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{12} \"\\\"\" space", {}}}, + {"char", {"[^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F]{4})", {}}}, + {"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}}, + {"null", {"\"null\" space", {}}}, +}; + +std::unordered_map STRING_FORMAT_RULES = { + {"date", {"[0-9]{4} \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}}, + {"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9]{3} )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}}, + {"date-time", {"date \"T\" time", {"date", "time"}}}, + {"date-string", {"\"\\\"\" date \"\\\"\" space", {"date"}}}, + {"time-string", {"\"\\\"\" time \"\\\"\" space", {"time"}}}, + {"date-time-string", {"\"\\\"\" date-time \"\\\"\" space", {"date-time"}}} +}; + +static bool is_reserved_name(const std::string & name) { + static std::unordered_set RESERVED_NAMES; + if (RESERVED_NAMES.empty()) { + RESERVED_NAMES.insert("root"); + for (const auto &p : PRIMITIVE_RULES) RESERVED_NAMES.insert(p.first); + for (const auto &p : STRING_FORMAT_RULES) RESERVED_NAMES.insert(p.first); + } + return RESERVED_NAMES.find(name) != RESERVED_NAMES.end(); +} + +std::regex INVALID_RULE_CHARS_RE("[^a-zA-Z0-9-]+"); +std::regex GRAMMAR_LITERAL_ESCAPE_RE("[\r\n\"]"); +std::regex GRAMMAR_RANGE_LITERAL_ESCAPE_RE("[\r\n\"\\]\\-\\\\]"); +std::unordered_map GRAMMAR_LITERAL_ESCAPES = { + {'\r', "\\r"}, {'\n', "\\n"}, {'"', "\\\""}, {'-', "\\-"}, {']', "\\]"} +}; + +std::unordered_set NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'}; +std::unordered_set ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'[', ']', '(', ')', '|', '{', '}', '*', '+', '?'}; + +template +std::string join(Iterator begin, Iterator end, const std::string & separator) { + std::ostringstream result; + if (begin != end) { + result << *begin; + for (Iterator it = begin + 1; it != end; ++it) { + result << separator << *it; + } + } + return result.str(); +} + +static std::vector split(const std::string & str, const std::string & delimiter) { + std::vector tokens; + size_t start = 0; + size_t end = str.find(delimiter); + + while (end != std::string::npos) { + tokens.push_back(str.substr(start, end - start)); + start = end + delimiter.length(); + end = str.find(delimiter, start); + } + + tokens.push_back(str.substr(start)); + + return tokens; +} + +static std::string repeat(const std::string & str, size_t n) { + if (n == 0) { + return ""; + } + + std::string result; + result.reserve(str.length() * n); + + for (size_t i = 0; i < n; ++i) { + result += str; + } + + return result; +} + +static std::string replacePattern(const std::string & input, const std::regex & regex, const std::function & replacement) { + std::smatch match; + std::string result; + + std::string::const_iterator searchStart(input.cbegin()); + std::string::const_iterator searchEnd(input.cend()); + + while (std::regex_search(searchStart, searchEnd, match, regex)) { + result.append(searchStart, searchStart + match.position()); + result.append(replacement(match)); + searchStart = match.suffix().first; + } + + result.append(searchStart, searchEnd); + + return result; +} + +static std::string format_literal(const std::string & literal) { + std::string escaped = replacePattern(literal, GRAMMAR_LITERAL_ESCAPE_RE, [&](const std::smatch & match) { + char c = match.str()[0]; + return GRAMMAR_LITERAL_ESCAPES.at(c); + }); + return "\"" + escaped + "\""; +} + class SchemaConverter { private: std::function _fetch_json; @@ -882,7 +882,7 @@ class SchemaConverter { } std::stringstream out; out << "("; - _generate_min_max_int(min_value, max_value, out); + _build_min_max_int(min_value, max_value, out); out << ") space"; return _add_rule(rule_name, out.str()); } else if (schema.empty() || schema_type == "object") { From dcc27d1a933bb38463a041de3105812b565f569c Mon Sep 17 00:00:00 2001 From: Olivier Chafik Date: Sun, 9 Jun 2024 09:42:19 +0100 Subject: [PATCH 16/24] fix min in [1, 9] --- common/json-schema-to-grammar.cpp | 5 ++-- examples/json_schema_to_grammar.py | 5 ++-- .../server/public/json-schema-to-grammar.mjs | 5 ++-- tests/test-grammar-integration.cpp | 24 +++++++++++++++++++ tests/test-json-schema-to-grammar.cpp | 4 ++-- 5 files changed, 35 insertions(+), 8 deletions(-) diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index b60966a2c9fb8..c2a57461919d6 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -169,8 +169,9 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream & } } else if (min_value <= 9) { char c = '0' + min_value; - if (min_value > (top_level ? 1 : 0)) { - digit_range('0', c - 1); + auto range_start = top_level ? '1' : '0'; + if (c > range_start) { + digit_range(range_start, c - 1); out << " "; more_digits(1, less_decimals); out << " | "; diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py index b1b7a974df536..27a89596af985 100755 --- a/examples/json_schema_to_grammar.py +++ b/examples/json_schema_to_grammar.py @@ -143,8 +143,9 @@ def uniform_range(from_str: str, to_str: str): more_digits(1, decimals_left) elif min_value <= 9: c = str(min_value) - if min_value > (1 if top_level else 0): - digit_range("0", chr(ord(c) - 1)) + range_start = '1' if top_level else '0' + if c > range_start: + digit_range(range_start, chr(ord(c) - 1)) out.append(" ") more_digits(1, less_decimals) out.append(" | ") diff --git a/examples/server/public/json-schema-to-grammar.mjs b/examples/server/public/json-schema-to-grammar.mjs index 7c0f9c7a210c0..3d3d2837c58cb 100644 --- a/examples/server/public/json-schema-to-grammar.mjs +++ b/examples/server/public/json-schema-to-grammar.mjs @@ -165,8 +165,9 @@ function _generateMinMaxInt(minValue, maxValue, out, decimalsLeft = 16, topLevel } } else if (minValue <= 9) { const c = minValue.toString(); - if (minValue > (topLevel ? 1 : 0)) { - digitRange("0", String.fromCharCode(c.charCodeAt(0) - 1)); + const range_start = topLevel ? '1' : '0'; + if (c > range_start) { + digitRange(range_start, String.fromCharCode(c.charCodeAt(0) - 1)); out.push(" "); moreDigits(1, lessDecimals); out.push(" | "); diff --git a/tests/test-grammar-integration.cpp b/tests/test-grammar-integration.cpp index 0fe6abef152f4..51c0f6df03d76 100644 --- a/tests/test-grammar-integration.cpp +++ b/tests/test-grammar-integration.cpp @@ -153,6 +153,30 @@ static void test_simple_grammar() { "-0", } ); + test_schema( + "simple min 3", + R"""({ + "type": "integer", + "minimum": 3 + })""", + // Passing strings + { + "3", + "4", + "10", + "20", + }, + // Failing strings + { + "-1", + "-100", + "0", + "1", + "2", + "01", + "02", + } + ); test_schema( "simple min 456", R"""({ diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp index 2134eac4b0a38..93f74a57d7bbc 100755 --- a/tests/test-json-schema-to-grammar.cpp +++ b/tests/test-json-schema-to-grammar.cpp @@ -114,7 +114,7 @@ static void test_all(const std::string & lang, std::function Date: Sun, 9 Jun 2024 13:22:41 +0100 Subject: [PATCH 17/24] Update test-grammar-integration.cpp --- tests/test-grammar-integration.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test-grammar-integration.cpp b/tests/test-grammar-integration.cpp index 51c0f6df03d76..bf5a7ebcc1fc7 100644 --- a/tests/test-grammar-integration.cpp +++ b/tests/test-grammar-integration.cpp @@ -69,7 +69,7 @@ static bool match_string(const std::string & input, llama_grammar* grammar) { } static void test(const std::string & test_desc, const std::string & grammar_str, const std::vector & passing_strings, const std::vector & failing_strings) { - fprintf(stderr, "⚫ Testing %s\n", test_desc.c_str(), grammar_str.c_str()); + fprintf(stderr, "⚫ Testing %s\n%s\n", test_desc.c_str(), grammar_str.c_str()); fflush(stderr); auto grammar = build_grammar(grammar_str); From cad377d3a1b3bfccfb1097fab738c1abf2aeb3e0 Mon Sep 17 00:00:00 2001 From: ochafik Date: Sun, 9 Jun 2024 19:35:36 +0100 Subject: [PATCH 18/24] add C++11-compatible replacement for std::string_view --- common/json-schema-to-grammar.cpp | 130 +++++++++++++++++++----------- 1 file changed, 85 insertions(+), 45 deletions(-) diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index c2a57461919d6..cb45c81ac4943 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -40,6 +40,45 @@ static std::string build_repetition(const std::string & item_rule, const int min return result; } +/* Minimalistic replacement for std::string_view, which is only available from C++17 onwards */ +class string_view { + const std::string & _str; + const size_t _start; + const size_t _end; +public: + string_view(const std::string & str, size_t start = 0, size_t end = std::string::npos) : _str(str), _start(start), _end(end == std::string::npos ? str.length() : end) {} + + size_t size() const { + return _end - _start; + } + + size_t length() const { + return size(); + } + + operator std::string() const { + return str(); + } + + std::string str() const { + return _str.substr(_start, _end - _start); + } + + string_view substr(size_t pos, size_t len = std::string::npos) const { + return string_view(_str, _start + pos, len == std::string::npos ? _end : _start + pos + len); + } + + char operator[](size_t pos) const { + return _str[_start + pos]; + } + + bool operator==(const string_view & other) const { + std::string this_str = *this; + std::string other_str = other; + return this_str == other_str; + } +}; + static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) { auto has_min = min_value != std::numeric_limits::min(); auto has_max = max_value != std::numeric_limits::max(); @@ -68,60 +107,61 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream & } out << "}"; }; - std::function uniform_range = [&](const std::string_view & from, const std::string_view & to) { - size_t i = 0; - while (from[i] == to[i]) { - i++; - } - if (i > 0) { - out << "\"" << from.substr(0, i) << "\""; - } - if (i < from.length()) { + std::function uniform_range = + [&](const string_view & from, const string_view & to) { + size_t i = 0; + while (from[i] == to[i]) { + i++; + } if (i > 0) { - out << " "; + out << "\"" << from.substr(0, i).str() << "\""; } - auto sub_len = from.length() - i - 1; - if (sub_len > 0) { - auto from_sub = from.substr(i + 1); - auto to_sub = to.substr(i + 1); - auto sub_zeros = repeat("0", sub_len); - auto sub_nines = repeat("9", sub_len); - - auto to_reached = false; - out << "("; - if (from_sub == sub_zeros) { - digit_range(from[i], to[i] - 1); + if (i < from.length()) { + if (i > 0) { out << " "; - more_digits(sub_len, sub_len); - } else { - out << "[" << from[i] << "] "; + } + auto sub_len = from.length() - i - 1; + if (sub_len > 0) { + auto from_sub = from.substr(i + 1); + auto to_sub = to.substr(i + 1); + auto sub_zeros = repeat("0", sub_len); + auto sub_nines = repeat("9", sub_len); + + auto to_reached = false; out << "("; - uniform_range(from_sub, sub_nines); - out << ")"; - if (from[i] < to[i] - 1) { - out << " | "; - if (to_sub == sub_nines) { - digit_range(from[i] + 1, to[i]); - to_reached = true; - } else { - digit_range(from[i] + 1, to[i] - 1); - } + if (from_sub == sub_zeros) { + digit_range(from[i], to[i] - 1); out << " "; more_digits(sub_len, sub_len); + } else { + out << "[" << from[i] << "] "; + out << "("; + uniform_range(from_sub, sub_nines); + out << ")"; + if (from[i] < to[i] - 1) { + out << " | "; + if (to_sub == sub_nines) { + digit_range(from[i] + 1, to[i]); + to_reached = true; + } else { + digit_range(from[i] + 1, to[i] - 1); + } + out << " "; + more_digits(sub_len, sub_len); + } } + if (!to_reached) { + out << " | "; + digit_range(to[i], to[i]); + out << " "; + uniform_range(sub_zeros, to_sub); + } + out << ")"; + } else { + out << "[" << from[i] << "-" << to[i] << "]"; } - if (!to_reached) { - out << " | "; - digit_range(to[i], to[i]); - out << " "; - uniform_range(sub_zeros, to_sub); - } - out << ")"; - } else { - out << "[" << from[i] << "-" << to[i] << "]"; } - } - }; + }; if (has_min && has_max) { if (min_value < 0 && max_value < 0) { From d6483a9c07d5c6d3c0f0db6482d461baaab458e8 Mon Sep 17 00:00:00 2001 From: ochafik Date: Mon, 10 Jun 2024 02:00:04 +0100 Subject: [PATCH 19/24] add min/max constrained int field to pydantic json schema example --- examples/json-schema-pydantic-example.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/json-schema-pydantic-example.py b/examples/json-schema-pydantic-example.py index 69ebfd4093824..3d8dc023043ae 100644 --- a/examples/json-schema-pydantic-example.py +++ b/examples/json-schema-pydantic-example.py @@ -53,6 +53,7 @@ class QAPair(BaseModel): question: str concise_answer: str justification: str + stars: Annotated[int, Field(ge=1, le=5)] class PyramidalSummary(BaseModel): title: str From 948e55e8903769ef0883c2eeba9e6c98eff0a5d6 Mon Sep 17 00:00:00 2001 From: ochafik Date: Sat, 22 Jun 2024 21:10:05 +0100 Subject: [PATCH 20/24] fix merge --- tests/test-json-schema-to-grammar.cpp | 38 +++++++++++++-------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp index 5b1c685c49188..2e591bd71abaa 100755 --- a/tests/test-json-schema-to-grammar.cpp +++ b/tests/test-json-schema-to-grammar.cpp @@ -89,7 +89,7 @@ static void test_all(const std::string & lang, std::function Date: Sat, 22 Jun 2024 21:11:42 +0100 Subject: [PATCH 21/24] json: add integration tests for min/max bounds --- tests/test-grammar-integration.cpp | 160 ++++++++++++++++++++++++++++- 1 file changed, 159 insertions(+), 1 deletion(-) diff --git a/tests/test-grammar-integration.cpp b/tests/test-grammar-integration.cpp index 5d5fe8783c25b..28b1513662243 100644 --- a/tests/test-grammar-integration.cpp +++ b/tests/test-grammar-integration.cpp @@ -922,7 +922,6 @@ static void test_json_schema() { } ); - test_schema( "min+max items", // Schema @@ -949,6 +948,165 @@ static void test_json_schema() { } ); + test_schema( + "min -123 max 42", + // Schema + R"""({ + "type": "integer", + "minimum": -123, + "maximum": 42 + })""", + // Passing strings + { + "-123", + "-13", + "-12", + "-2", + "-1", + "0", + "1", + "5", + "40", + "42", + }, + // Failing strings + { + "-0123", + "-124", + "-1123", + "-200", + "43", + "123", + "0123", + } + ); + + test_schema( + "min 5 max 30", + // Schema + R"""({ + "type": "integer", + "minimum": 5, + "maximum": 30 + })""", + // Passing strings + { + "5", + "10", + "30", + }, + // Failing strings + { + "05", + "4", + "-1", + "31", + "123", + "0123", + } + ); + + test_schema( + "min 2", + // Schema + R"""({ + "type": "integer", + "minimum": 2 + })""", + // Passing strings + { + "2", + "1234567890000000", + }, + // Failing strings + { + "0", + "1", + "12345678900000000", + } + ); + + test_schema( + "min 0", + // Schema + R"""({ + "type": "integer", + "minimum": 0 + })""", + // Passing strings + { + "0", + "12", + }, + // Failing strings + { + "-1", + "01", + } + ); + + test_schema( + "max 9999", + // Schema + R"""({ + "type": "integer", + "maximum": 9999 + })""", + // Passing strings + { + "-99999", + "0", + "9999", + }, + // Failing strings + { + "10000", + "99991", + } + ); + + test_schema( + "max -9999", + // Schema + R"""({ + "type": "integer", + "maximum": -9999 + })""", + // Passing strings + { + "-10000", + "-9999", + }, + // Failing strings + { + "-9998", + "0", + "9999", + } + ); + + test_schema( + "exclusive min / max", + // Schema + R"""({ + "type": "integer", + "exclusiveMinimum": 0, + "exclusiveMaximum": 10000 + })""", + // Passing strings + { + "1", + "9999", + }, + // Failing strings + { + "0", + "01", + "10000", + "99999", + } + ); + // Properties (from: https://json-schema.org/understanding-json-schema/reference/object#properties) test_schema( "object properties", From 3a80d1e1b33284efe99caf41ec31a0ad0521e4a8 Mon Sep 17 00:00:00 2001 From: ochafik Date: Mon, 24 Jun 2024 21:28:58 +0100 Subject: [PATCH 22/24] reshuffle/merge min/max integ test cases --- tests/test-grammar-integration.cpp | 295 ++++++++++++----------------- 1 file changed, 117 insertions(+), 178 deletions(-) diff --git a/tests/test-grammar-integration.cpp b/tests/test-grammar-integration.cpp index 28b1513662243..ff4b146ab4c4d 100644 --- a/tests/test-grammar-integration.cpp +++ b/tests/test-grammar-integration.cpp @@ -152,7 +152,7 @@ static void test_schema(const std::string & test_desc, const std::string & schem static void test_simple_grammar() { test_schema( - "simple min 0", + "min 0", R"""({ "type": "integer", "minimum": 0 @@ -161,6 +161,7 @@ static void test_simple_grammar() { { "0", "10", + "12", "10000", }, // Failing strings @@ -176,31 +177,36 @@ static void test_simple_grammar() { } ); test_schema( - "simple min 3", + "min 2", + // Schema R"""({ "type": "integer", - "minimum": 3 + "minimum": 2 })""", // Passing strings { + "2", "3", "4", "10", "20", + "1234567890000000", }, // Failing strings { + "0", + "1", "-1", "-100", "0", "1", - "2", "01", "02", + "12345678900000000", } ); test_schema( - "simple min 456", + "min 456", R"""({ "type": "integer", "minimum": 456 @@ -224,7 +230,94 @@ static void test_simple_grammar() { } ); test_schema( - "simple min -1 max 1", + "min -123", + R"""({ + "type": "integer", + "minimum": -123 + })""", + // Passing strings + { + "-123", + "-122", + "-11", + "-1", + "0", + "1", + "123", + "1234", + "2345", + }, + // Failing strings + { + "-1234", + "-124", + } + ); + + test_schema( + "max 9999", + // Schema + R"""({ + "type": "integer", + "maximum": 9999 + })""", + // Passing strings + { + "-99999", + "0", + "9999", + }, + // Failing strings + { + "10000", + "99991", + } + ); + test_schema( + "max -9999", + // Schema + R"""({ + "type": "integer", + "maximum": -9999 + })""", + // Passing strings + { + "-10000", + "-9999", + }, + // Failing strings + { + "-9998", + "0", + "9999", + } + ); + test_schema( + "min 5 max 30", + // Schema + R"""({ + "type": "integer", + "minimum": 5, + "maximum": 30 + })""", + // Passing strings + { + "5", + "10", + "30", + }, + // Failing strings + { + "05", + "4", + "-1", + "31", + "123", + "0123", + } + ); + test_schema( + "min -1 max 1", R"""({ "type": "integer", "minimum": -1, @@ -247,7 +340,7 @@ static void test_simple_grammar() { } ); test_schema( - "simple min -123 max 42", + "min -123 max 42", R"""({ "type": "integer", "minimum": -123, @@ -257,43 +350,48 @@ static void test_simple_grammar() { { "-123", "-122", + "-13", "-11", + "-2", "-1", "0", "1", + "5", "10", "39", + "40", "42", }, // Failing strings { + "-0123", "-124", + "-1123", + "-200", "43", "123", + "0123", } ); test_schema( - "simple min -123", + "exclusive min / max", + // Schema R"""({ "type": "integer", - "minimum": -123 + "exclusiveMinimum": 0, + "exclusiveMaximum": 10000 })""", // Passing strings { - "-123", - "-122", - "-11", - "-1", - "0", "1", - "123", - "1234", - "2345", + "9999", }, // Failing strings { - "-1234", - "-124", + "0", + "01", + "10000", + "99999", } ); @@ -948,165 +1046,6 @@ static void test_json_schema() { } ); - test_schema( - "min -123 max 42", - // Schema - R"""({ - "type": "integer", - "minimum": -123, - "maximum": 42 - })""", - // Passing strings - { - "-123", - "-13", - "-12", - "-2", - "-1", - "0", - "1", - "5", - "40", - "42", - }, - // Failing strings - { - "-0123", - "-124", - "-1123", - "-200", - "43", - "123", - "0123", - } - ); - - test_schema( - "min 5 max 30", - // Schema - R"""({ - "type": "integer", - "minimum": 5, - "maximum": 30 - })""", - // Passing strings - { - "5", - "10", - "30", - }, - // Failing strings - { - "05", - "4", - "-1", - "31", - "123", - "0123", - } - ); - - test_schema( - "min 2", - // Schema - R"""({ - "type": "integer", - "minimum": 2 - })""", - // Passing strings - { - "2", - "1234567890000000", - }, - // Failing strings - { - "0", - "1", - "12345678900000000", - } - ); - - test_schema( - "min 0", - // Schema - R"""({ - "type": "integer", - "minimum": 0 - })""", - // Passing strings - { - "0", - "12", - }, - // Failing strings - { - "-1", - "01", - } - ); - - test_schema( - "max 9999", - // Schema - R"""({ - "type": "integer", - "maximum": 9999 - })""", - // Passing strings - { - "-99999", - "0", - "9999", - }, - // Failing strings - { - "10000", - "99991", - } - ); - - test_schema( - "max -9999", - // Schema - R"""({ - "type": "integer", - "maximum": -9999 - })""", - // Passing strings - { - "-10000", - "-9999", - }, - // Failing strings - { - "-9998", - "0", - "9999", - } - ); - - test_schema( - "exclusive min / max", - // Schema - R"""({ - "type": "integer", - "exclusiveMinimum": 0, - "exclusiveMaximum": 10000 - })""", - // Passing strings - { - "1", - "9999", - }, - // Failing strings - { - "0", - "01", - "10000", - "99999", - } - ); - // Properties (from: https://json-schema.org/understanding-json-schema/reference/object#properties) test_schema( "object properties", From 09a9b7565e7e12044071ec1196cfe12d32f72248 Mon Sep 17 00:00:00 2001 From: ochafik Date: Mon, 24 Jun 2024 23:44:02 +0100 Subject: [PATCH 23/24] nits / cleanups --- common/json-schema-to-grammar.cpp | 2 +- tests/test-grammar-integration.cpp | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index ea761fdc3ee04..2c47645cf4913 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -16,7 +16,7 @@ static std::string join(Iterator begin, Iterator end, const std::string & separa static std::string repeat(const std::string & str, size_t n); -static std::string build_repetition(const std::string & item_rule, const int min_items, int max_items, const std::string & separator_rule = "") { +static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") { auto has_max = max_items != std::numeric_limits::max(); if (min_items == 0 && max_items == 1) { diff --git a/tests/test-grammar-integration.cpp b/tests/test-grammar-integration.cpp index ff4b146ab4c4d..5b3992236c26c 100644 --- a/tests/test-grammar-integration.cpp +++ b/tests/test-grammar-integration.cpp @@ -12,9 +12,6 @@ #include #include #include -#include - -using json = nlohmann::ordered_json; using json = nlohmann::ordered_json; From 36bf00369a2736473a878b12d773d363b97103d3 Mon Sep 17 00:00:00 2001 From: Olivier Chafik Date: Tue, 25 Jun 2024 14:09:22 +0100 Subject: [PATCH 24/24] defensive code against string out of bounds (apparently different behaviour of libstdc++ vs. clang's libc++, can't read final NULL char w/ former) --- common/json-schema-to-grammar.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index 2c47645cf4913..07d0e952d74cf 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -69,6 +69,10 @@ class string_view { } char operator[](size_t pos) const { + auto index = _start + pos; + if (index >= _end) { + throw std::out_of_range("string_view index out of range"); + } return _str[_start + pos]; } @@ -110,13 +114,13 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream & std::function uniform_range = [&](const string_view & from, const string_view & to) { size_t i = 0; - while (from[i] == to[i]) { + while (i < from.length() && i < to.length() && from[i] == to[i]) { i++; } if (i > 0) { out << "\"" << from.substr(0, i).str() << "\""; } - if (i < from.length()) { + if (i < from.length() && i < to.length()) { if (i > 0) { out << " "; }