diff --git a/CHANGELOG.md b/CHANGELOG.md index b254d4b..e30aa74 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,22 @@ -None 0.3.2 (2019-12-19) +0.4.0 (2020-01-25) +**************************** + +Features +-------- + +- Support for deaccent. In general, if accented version of word is given, both deaccented and accented will be used to match. To turn it off - `!CONFIG("deaccent", "N")` + #38 +- Added shortcuts module to simplify injecting into spaCy + #42 + +Fix +--- + +- Fix issue regarding Spacy rules with `IN_LIST` and using case-sensitive mode. It was creating Regex pattern which is not valid spacy pattern + #40 + + +0.3.2 (2019-12-19) *********************** Features diff --git a/README.md b/README.md index cbb88d2..3e7a113 100644 --- a/README.md +++ b/README.md @@ -20,3 +20,49 @@ This is a language, loosely based on language [Apache UIMA RUTA](https://uima.ap ## Support [![reddit](https://img.shields.io/reddit/subreddit-subscribers/ritaDSL?style=social)](https://www.reddit.com/r/ritaDSL/) + + +## Simple Rules example + +```python +rules = """ +cuts = {"fitted", "wide-cut"} +lengths = {"short", "long", "calf-length", "knee-length"} +fabric_types = {"soft", "airy", "crinkled"} +fabrics = {"velour", "chiffon", "knit", "woven", "stretch"} + +{IN_LIST(cuts)?, IN_LIST(lengths), WORD("dress")}->MARK("DRESS_TYPE") +{IN_LIST(lengths), IN_LIST(cuts), WORD("dress")}->MARK("DRESS_TYPE") +{IN_LIST(fabric_types)?, IN_LIST(fabrics)}->MARK("DRESS_FABRIC") +""" +``` + +### Loading in spaCy +```python +import spacy +from rita.shortcuts import setup_spacy + + +nlp = spacy.load("en") +setup_spacy(nlp, rules_string=rules) +``` + +And using it: +``` +>>> r = nlp("She was wearing a short wide-cut dress") +>>> [{"label": e.label_, "text": e.text} for e in r.ents] +[{'label': 'DRESS_TYPE', 'text': 'short wide-cut dress'}] +``` + +### Loading using Regex (standalone) +``` +import rita + +patterns = rita.compile_string(rules, use_engine="standalone") +``` + +And using it: +``` 
+>>> list(patterns.execute("She was wearing a short wide-cut dress")) +[{'end': 38, 'label': 'DRESS_TYPE', 'start': 18, 'text': 'short wide-cut dress'}] +``` \ No newline at end of file diff --git a/changes/38.feature.rst b/changes/38.feature.rst deleted file mode 100644 index d373184..0000000 --- a/changes/38.feature.rst +++ /dev/null @@ -1 +0,0 @@ -Support for deaccent. In general, if accented version of word is given, both deaccented and accented will be used to match. To turn iit off - `!CONFIG("deaccent", "N")` diff --git a/changes/40.fix.rst b/changes/40.fix.rst deleted file mode 100644 index d26f92b..0000000 --- a/changes/40.fix.rst +++ /dev/null @@ -1 +0,0 @@ -Fix issue regarding Spacy rules with `IN_LIST` and using case-sensitive mode. It was creating Regex pattern which is not valid spacy pattern diff --git a/changes/42.feature.rst b/changes/42.feature.rst new file mode 100644 index 0000000..48e5138 --- /dev/null +++ b/changes/42.feature.rst @@ -0,0 +1 @@ +Added shortcuts module to simplify injecting into spaCy diff --git a/docs/quickstart.md b/docs/quickstart.md index fdabda5..29be667 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -22,6 +22,23 @@ Now you can compile these rules `rita -f .rita output.jsonl` ## spaCy backend +### NEW in 0.4.0: Shortcuts to simplify life: +``` +import spacy +from rita.shortcuts import setup_spacy + +nlp = spacy.load("en") +setup_spacy(nlp, ...) 
+``` + +If compiling rules from string: +`setup_spacy(nlp, rules_string=rules)` +If loading rules from `.rita` file: +`setup_spacy(nlp, rules_path="examples/color-car.rita")` +If loading from spaCy compiled rules: +`setup_spacy(nlp, patterns="rules.jsonl")` + +### Doing it manually ```python import spacy from spacy.pipeline import EntityRuler diff --git a/rita/__init__.py b/rita/__init__.py index d6d9184..c6c2861 100644 --- a/rita/__init__.py +++ b/rita/__init__.py @@ -9,7 +9,7 @@ logger = logging.getLogger(__name__) -__version__ = (0, 3, 4, os.getenv("VERSION_PATCH")) +__version__ = (0, 4, 0, os.getenv("VERSION_PATCH")) def get_version(): @@ -39,8 +39,8 @@ def compile_string(raw, config, use_engine=None): return result -def compile(fname, compile_fn=None): +def compile(fname, use_engine=None): with open(fname, "r") as f: raw = f.read() - return compile_string(raw) + return compile_string(raw, use_engine=use_engine) diff --git a/rita/shortcuts.py b/rita/shortcuts.py new file mode 100644 index 0000000..025ec81 --- /dev/null +++ b/rita/shortcuts.py @@ -0,0 +1,20 @@ +import rita + + +def setup_spacy(model, patterns=None, rules_path=None, rules_string=None, override_ents=True): + from spacy.pipeline import EntityRuler + ruler = EntityRuler(model, overwrite_ents=override_ents) + if not patterns: + if rules_path: + patterns = rita.compile(rules_path, use_engine="spacy") + elif rules_string: + patterns = rita.compile_string(rules_string, use_engine="spacy") + else: + raise RuntimeError("Please provides rules. 
Either `patterns`, `rules_path` or `rules_string`") + + ruler.add_patterns(patterns) + else: + ruler.from_disk(patterns) + + model.add_pipe(ruler) + return model diff --git a/tests/test_run.py b/tests/test_run.py index 7c8c3bc..002d753 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -1,6 +1,12 @@ import sys +import json +import tempfile + +import pytest +import rita from rita.run import main +from rita.shortcuts import setup_spacy def test_simple_compile(mocker): @@ -11,3 +17,35 @@ def test_simple_compile(mocker): "output.jsonl" ] main() + + +def test_shortcuts_spacy_inline(): + spacy = pytest.importorskip("spacy", minversion="2.1") + nlp = spacy.load("en") + rules = """ + {WORD("TEST")}->MARK("TEST") + """ + setup_spacy(nlp, rules_string=rules) + + +def test_shortcuts_spacy_file(): + spacy = pytest.importorskip("spacy", minversion="2.1") + nlp = spacy.load("en") + setup_spacy(nlp, rules_path="examples/color-car.rita") + + +def test_shortcuts_spacy_compiled(): + spacy = pytest.importorskip("spacy", minversion="2.1") + nlp = spacy.load("en") + with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl") as f: + patterns = rita.compile("examples/color-car.rita") + for pattern in patterns: + f.write(json.dumps(pattern) + "\n") + setup_spacy(nlp, patterns=f.name) + + +def test_shortcuts_spacy_giving_no_rules(): + spacy = pytest.importorskip("spacy", minversion="2.1") + nlp = spacy.load("en") + with pytest.raises(RuntimeError): + setup_spacy(nlp)