Skip to content

Commit

Permalink
Merge pull request #40 from zaibacu/case-sensitive-spacy-rules-bugfix
Browse files Browse the repository at this point in the history
Cover the case
  • Loading branch information
zaibacu authored Jan 23, 2020
2 parents 82812e7 + 0e9b383 commit 1fdb95e
Show file tree
Hide file tree
Showing 5 changed files with 38 additions and 3 deletions.
1 change: 1 addition & 0 deletions changes/40.fix.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix an issue with spaCy rules that use `IN_LIST` in case-sensitive mode: the generated regex pattern was not a valid spaCy pattern.
2 changes: 1 addition & 1 deletion rita/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

logger = logging.getLogger(__name__)

__version__ = (0, 3, 3, os.getenv("VERSION_PATCH"))
__version__ = (0, 3, 4, os.getenv("VERSION_PATCH"))


def get_version():
Expand Down
2 changes: 1 addition & 1 deletion rita/engine/translate_spacy.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def any_of_parse(lst, config, op=None):
for item in lst])
base = {"LOWER": {"REGEX": r"({0})".format("|".join(normalized))}}
else:
base = {"REGEX": r"({0})".format("|".join(sorted(lst)))}
base = {"TEXT": {"REGEX": r"({0})".format("|".join(sorted(lst)))}}

if op:
base["OP"] = op
Expand Down
13 changes: 12 additions & 1 deletion rita/engine/translate_standalone.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import re

from functools import partial
from itertools import groupby

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -120,7 +121,7 @@ def compile(self, label, rules):

return re.compile(r"(?P<{0}>{1})".format(label, "".join(rules)), flags)

def execute(self, text):
def _results(self, text):
for p in self.patterns:
for match in p.finditer(text):
yield {
Expand All @@ -130,6 +131,16 @@ def execute(self, text):
"label": match.lastgroup,
}

def execute(self, text):
    """Yield matches found in ``text``, keeping one match per start offset.

    The result dicts produced by ``self._results(text)`` are ordered by
    their ``"start"`` value. When several results share the same start,
    only the one with the greatest ``"end"`` (the longest match) is
    yielded, so a longer variant such as "Bitcoin Cash" wins over its
    prefix "Bitcoin". Results that start at different offsets are all
    yielded, even when their spans overlap.

    :param text: the text to search
    :return: generator of result dicts in ascending ``"start"`` order
    """
    def start_of(result):
        return result["start"]

    # sorted() accepts any iterable, so the generator is consumed directly.
    ordered = sorted(self._results(text), key=start_of)
    for _, same_start in groupby(ordered, key=start_of):
        # max() returns the first maximal element, which is the same item
        # a stable sort by descending "end" would place first.
        yield max(same_start, key=lambda result: result["end"])


def compile_rules(rules, config):
logger.info("Using standalone rule implementation")
Expand Down
23 changes: 23 additions & 0 deletions tests/test_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,29 @@ def test_exclude_word(engine):
assert len(r2) == 0


@pytest.mark.parametrize('engine', [spacy_engine, standalone_engine])
def test_case_sensitive(engine):
    """Check that with ``ignore_case`` set to "N" the first match is "Bitcoin Cash".

    The sample text opens with a lower-case "bitcoin", and the variant list
    contains both "Bitcoin" and "Bitcoin Cash"; the first reported result
    is expected to be the full "Bitcoin Cash" labelled "CRYPTO".
    """
    rules = """
!CONFIG("ignore_case", "N")
variants = {"Bitcoin", "BTC", "Bitcoin Cash"}
{IN_LIST(variants)}->MARK("CRYPTO")
"""
    article = """
A bitcoin mining magnate has proposed a new development fund for Bitcoin Cash.
According to BTC.TOP CEO Jiang Zhuoer, the scheme will 'tax' Bitcoin Cash mining rewards
in an effort to increase funding for Bitcoin Cash infrastructure.
"""
    run_rules = engine(rules)

    matches = run_rules(article)
    print(matches)
    first_match = matches[0]
    assert first_match == ("Bitcoin Cash", "CRYPTO")


@pytest.mark.parametrize('engine', [spacy_engine, standalone_engine])
def test_benchmark(benchmark, engine, bench_text):
"""
Expand Down

0 comments on commit 1fdb95e

Please sign in to comment.