From e6143e5a40e017469a39361c0dd0d0abe3e95744 Mon Sep 17 00:00:00 2001
From: Charles Cooper
Date: Sat, 16 Mar 2024 13:50:20 -0400
Subject: [PATCH] Revert "get rid of utf8_encodable checks"

This reverts commit 3b92c85f8df8cbdc4168d33863eda07ce799f36a.

---
 tests/functional/grammar/test_grammar.py | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/tests/functional/grammar/test_grammar.py b/tests/functional/grammar/test_grammar.py
index fba1137f76..699865b24f 100644
--- a/tests/functional/grammar/test_grammar.py
+++ b/tests/functional/grammar/test_grammar.py
@@ -37,6 +37,20 @@ def test_basic_grammar_empty():
     assert len(tree.children) == 0
 
 
+def utf8_encodable(terminal: str) -> bool:
+    try:
+        if "\x00" not in terminal and "\\ " not in terminal and "\x0c" not in terminal:
+            terminal.encode("utf-8-sig")
+            return True
+        else:
+            return False
+    except UnicodeEncodeError:  # pragma: no cover
+        # Very rarely, a "." in some terminal regex will generate a surrogate
+        # character that cannot be encoded as UTF-8. We apply this filter to
+        # ensure it doesn't happen at runtime, but don't worry about coverage.
+        return False
+
+
 ALLOWED_CHARS = st.characters(codec="utf-8", min_codepoint=1)
 
 
@@ -46,7 +60,7 @@ class GrammarStrategy(LarkStrategy):
     def __init__(self, grammar, start, explicit_strategies):
         super().__init__(grammar, start, explicit_strategies, alphabet=ALLOWED_CHARS)
         self.terminal_strategies = {
-            k: v.map(lambda s: s.replace("\0", ""))
+            k: v.map(lambda s: s.replace("\0", "")).filter(utf8_encodable)
             for k, v in self.terminal_strategies.items()  # type: ignore
         }
 
@@ -91,7 +105,7 @@ def has_no_docstrings(c):
 
 
 @pytest.mark.fuzzing
-@given(code=from_grammar())
+@given(code=from_grammar().filter(lambda c: utf8_encodable(c)))
 @hypothesis.settings(
     max_examples=500, suppress_health_check=[HealthCheck.too_slow, HealthCheck.filter_too_much]
 )