Revert "get rid of utf8_encodable checks"

This reverts commit 3b92c85.
vyperlang · Mar 16, 2024 · e6143e5 · e6143e5
1 parent 1127fc4
commit e6143e5
Showing 1 changed file with 16 additions and 2 deletions.
diff --git a/tests/functional/grammar/test_grammar.py b/tests/functional/grammar/test_grammar.py
@@ -37,6 +37,20 @@ def test_basic_grammar_empty():
     assert len(tree.children) == 0
 
 
+def utf8_encodable(terminal: str) -> bool:
+    try:
+        if "\x00" not in terminal and "\\ " not in terminal and "\x0c" not in terminal:
+            terminal.encode("utf-8-sig")
+            return True
+        else:
+            return False
+    except UnicodeEncodeError:  # pragma: no cover
+        # Very rarely, a "." in some terminal regex will generate a surrogate
+        # character that cannot be encoded as UTF-8.  We apply this filter to
+        # ensure it doesn't happen at runtime, but don't worry about coverage.
+        return False
+
+
 ALLOWED_CHARS = st.characters(codec="utf-8", min_codepoint=1)
 
 
@@ -46,7 +60,7 @@ class GrammarStrategy(LarkStrategy):
     def __init__(self, grammar, start, explicit_strategies):
         super().__init__(grammar, start, explicit_strategies, alphabet=ALLOWED_CHARS)
         self.terminal_strategies = {
-            k: v.map(lambda s: s.replace("\0", ""))
+            k: v.map(lambda s: s.replace("\0", "")).filter(utf8_encodable)
             for k, v in self.terminal_strategies.items()  # type: ignore
         }
 
@@ -91,7 +105,7 @@ def has_no_docstrings(c):
 
 
 @pytest.mark.fuzzing
-@given(code=from_grammar())
+@given(code=from_grammar().filter(lambda c: utf8_encodable(c)))
 @hypothesis.settings(
     max_examples=500, suppress_health_check=[HealthCheck.too_slow, HealthCheck.filter_too_much]
 )