diff --git a/changes/40.fix.rst b/changes/40.fix.rst
new file mode 100644
index 0000000..d26f92b
--- /dev/null
+++ b/changes/40.fix.rst
@@ -0,0 +1 @@
+Fix issue regarding Spacy rules with `IN_LIST` when using case-sensitive mode. It was creating a Regex pattern which is not a valid spaCy pattern.
diff --git a/rita/__init__.py b/rita/__init__.py
index b4c093f..d6d9184 100644
--- a/rita/__init__.py
+++ b/rita/__init__.py
@@ -9,7 +9,7 @@
 
 logger = logging.getLogger(__name__)
 
-__version__ = (0, 3, 3, os.getenv("VERSION_PATCH"))
+__version__ = (0, 3, 4, os.getenv("VERSION_PATCH"))
 
 
 def get_version():
diff --git a/rita/engine/translate_spacy.py b/rita/engine/translate_spacy.py
index 8999f2f..244f357 100644
--- a/rita/engine/translate_spacy.py
+++ b/rita/engine/translate_spacy.py
@@ -11,7 +11,7 @@ def any_of_parse(lst, config, op=None):
                              for item in lst])
         base = {"LOWER": {"REGEX": r"({0})".format("|".join(normalized))}}
     else:
-        base = {"REGEX": r"({0})".format("|".join(sorted(lst)))}
+        base = {"TEXT": {"REGEX": r"({0})".format("|".join(sorted(lst)))}}
 
     if op:
         base["OP"] = op
diff --git a/rita/engine/translate_standalone.py b/rita/engine/translate_standalone.py
index 8f18511..fa17302 100644
--- a/rita/engine/translate_standalone.py
+++ b/rita/engine/translate_standalone.py
@@ -2,6 +2,7 @@
 import re
 
 from functools import partial
+from itertools import groupby
 
 logger = logging.getLogger(__name__)
 
@@ -120,7 +121,7 @@ def compile(self, label, rules):
 
         return re.compile(r"(?P<{0}>{1})".format(label, "".join(rules)), flags)
 
-    def execute(self, text):
+    def _results(self, text):
         for p in self.patterns:
             for match in p.finditer(text):
                 yield {
@@ -130,6 +131,18 @@ def execute(self, text):
                     "label": match.lastgroup,
                 }
 
+    def execute(self, text):
+        """Yield one match per start offset, ordered by start.
+
+        When several matches share a start, only the one with the
+        greatest "end" is kept, so "Bitcoin Cash" beats "Bitcoin".
+        """
+        results = sorted(self._results(text), key=lambda x: x["start"])
+        for _, group in groupby(results, lambda x: x["start"]):
+            # max() returns the first maximal item, the same tie-break as
+            # a stable sort by descending "end" followed by taking [0].
+            yield max(group, key=lambda x: x["end"])
+
 
 def compile_rules(rules, config):
     logger.info("Using standalone rule implementation")
diff --git a/tests/test_examples.py b/tests/test_examples.py
index 65afc8a..b596aca 100644
--- a/tests/test_examples.py
+++ b/tests/test_examples.py
@@ -104,6 +104,29 @@ def test_exclude_word(engine):
     assert len(r2) == 0
 
 
+@pytest.mark.parametrize('engine', [spacy_engine, standalone_engine])
+def test_case_sensitive(engine):
+    parser = engine(
+        """
+        !CONFIG("ignore_case", "N")
+
+        variants = {"Bitcoin", "BTC", "Bitcoin Cash"}
+
+        {IN_LIST(variants)}->MARK("CRYPTO")
+        """
+    )
+
+    text = """
+    A bitcoin mining magnate has proposed a new development fund for Bitcoin Cash.
+    According to BTC.TOP CEO Jiang Zhuoer, the scheme will 'tax' Bitcoin Cash mining rewards
+    in an effort to increase funding for Bitcoin Cash infrastructure.
+    """
+
+    results = parser(text)
+    print(results)
+    assert results[0] == ("Bitcoin Cash", "CRYPTO")
+
+
 @pytest.mark.parametrize('engine', [spacy_engine, standalone_engine])
 def test_benchmark(benchmark, engine, bench_text):
     """