Skip to content

Commit

Permalink
Update spaCy English models
Browse files Browse the repository at this point in the history
  • Loading branch information
xxyzz committed Jul 25, 2024
1 parent b8b1092 commit c166031
Show file tree
Hide file tree
Showing 5 changed files with 22 additions and 10 deletions.
4 changes: 3 additions & 1 deletion data/deps.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
"spacy": "3.7.5",
"spacy_cpu_model": "3.7.0",
"spacy_trf_model": "3.7.2",
"en_spacy_cpu_model": "3.7.1",
"en_spacy_trf_model": "3.7.3",
"thinc-apple-ops": "0.1.5",
"torch": "2.3.1",
"torch": "2.4.0",
"typing-extensions": "4.12.2"
}
5 changes: 2 additions & 3 deletions deps.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
PROFICIENCY_RELEASE_URL,
Prefs,
get_plugin_path,
get_spacy_model_version,
get_wiktionary_klld_path,
kindle_db_path,
load_plugin_json,
Expand Down Expand Up @@ -44,9 +45,7 @@ def install_deps(pkg: str, notif: Any) -> None:
if pkg == "":
pip_install("spacy", dep_versions["spacy"], notif=notif)
else:
model_version = dep_versions[
"spacy_trf_model" if pkg.endswith("_trf") else "spacy_cpu_model"
]
model_version = get_spacy_model_version(pkg, dep_versions)
url = (
"https://github.com/explosion/spacy-models/releases/download/"
f"{pkg}-{model_version}/{pkg}-{model_version}-py3-none-any.whl"
Expand Down
6 changes: 3 additions & 3 deletions dump_lemmas.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from .utils import (
Prefs,
custom_lemmas_folder,
get_spacy_model_version,
insert_installed_libs,
load_plugin_json,
use_kindle_ww_db,
Expand All @@ -14,6 +15,7 @@
from utils import (
Prefs,
custom_lemmas_folder,
get_spacy_model_version,
insert_installed_libs,
load_plugin_json,
use_kindle_ww_db,
Expand Down Expand Up @@ -74,9 +76,7 @@ def dump_spacy_docs(
save_spacy_docs(
nlp,
spacy_model,
pkg_versions[
"spacy_trf_model" if spacy_model.endswith("_trf") else "spacy_cpu_model"
],
get_spacy_model_version(spacy_model, pkg_versions),
lemma_lang,
is_kindle,
lemmas_conn,
Expand Down
6 changes: 3 additions & 3 deletions parse_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
Prefs,
dump_prefs,
get_plugin_path,
get_spacy_model_version,
get_user_agent,
get_wiktionary_klld_path,
insert_installed_libs,
Expand Down Expand Up @@ -68,6 +69,7 @@
from utils import (
CJK_LANGS,
Prefs,
get_spacy_model_version,
insert_installed_libs,
kindle_db_path,
load_languages_data,
Expand Down Expand Up @@ -795,9 +797,7 @@ def create_spacy_matcher(

disabled_pipes = list(set(["ner", "parser", "senter"]) & set(nlp.pipe_names))
pkg_versions = load_plugin_json(plugin_path, "data/deps.json")
model_version = pkg_versions[
"spacy_trf_model" if model.endswith("_trf") else "spacy_cpu_model"
]
model_version = get_spacy_model_version(model, pkg_versions)
# Chinese words don't have inflection forms, only use phrase matcher
use_lemma_matcher = prefs["use_pos"] and lemma_lang != "zh" and model != ""
phrase_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
Expand Down
11 changes: 11 additions & 0 deletions utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,3 +185,14 @@ def load_languages_data(
supported_languages["zh_cn"] = supported_languages["zh"].copy()
supported_languages["zh_cn"]["name"] = "Simplified Chinese"
return supported_languages


def get_spacy_model_version(
    model_name: str, dependency_versions: dict[str, str]
) -> str:
    """Return the version recorded for the spaCy model *model_name*.

    The lookup goes from most to least specific:

    1. A language-specific key, ``"<lang>_spacy_trf_model"`` or
       ``"<lang>_spacy_cpu_model"``, where ``<lang>`` is the part of
       *model_name* before its first underscore.
    2. The generic ``"spacy_trf_model"`` / ``"spacy_cpu_model"`` key.

    The ``trf`` variant is selected when *model_name* ends with ``"_trf"``;
    every other name uses the ``cpu`` variant.

    Args:
        model_name: spaCy model package name, e.g. ``"en_core_web_sm"``.
        dependency_versions: Mapping of dependency keys to version strings.

    Returns:
        The matching version string, or ``""`` when neither the
        language-specific nor the generic key is present.
    """
    key = "spacy_trf_model" if model_name.endswith("_trf") else "spacy_cpu_model"
    # Use the whole prefix before the first underscore rather than a fixed
    # two-character slice, so language codes longer than two letters still
    # build the right override key. Two-letter codes behave as before.
    lang_code = model_name.partition("_")[0]
    lang_key = f"{lang_code}_{key}"
    if lang_key in dependency_versions:
        return dependency_versions[lang_key]
    return dependency_versions.get(key, "")

0 comments on commit c166031

Please sign in to comment.