Skip to content

Commit

Permalink
Merge #299
Browse files Browse the repository at this point in the history
299: Simplify lang detection r=dureuill a=ManyTheFish

- Change the language `allow_list` from a map of script->language to an array of allowed languages
- Allow the language allow list to be changed dynamically when tokenizing text


Co-authored-by: ManyTheFish <many@meilisearch.com>
Co-authored-by: Many the fish <many@meilisearch.com>
  • Loading branch information
meili-bors[bot] and ManyTheFish authored Jul 25, 2024
2 parents ae07a58 + cbeb12a commit 9f27d85
Show file tree
Hide file tree
Showing 9 changed files with 139 additions and 107 deletions.
6 changes: 1 addition & 5 deletions charabia/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,13 @@ exclude = ["dictionaries/txt/thai/words.txt"]

[dependencies]
aho-corasick = "1.1.3"
cow-utils = "0.1"
csv = "1.3.0"
deunicode = "1.6.0"
either = "1.13.0"
finl_unicode = { version= "1.2.0", optional = true }
fst = "0.4"
jieba-rs = { version = "0.7", optional = true }
once_cell = "1.19.0"
serde = "1.0"
serde = "1.0.192"
slice-group-by = "0.3.1"
whatlang = "0.16.4"
lindera = { version = "=0.32.2", default-features = false, optional = true }
Expand All @@ -31,8 +29,6 @@ pinyin = { version = "0.10", default-features = false, features = [
wana_kana = { version = "3.0.0", optional = true }
unicode-normalization = "0.1.23"
irg-kvariants = { path = "../irg-kvariants", version = "=0.1.1" }
litemap = "0.7.3"
zerovec = "0.10.4"

[features]
default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese"]
Expand Down
35 changes: 13 additions & 22 deletions charabia/src/detection/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
use std::collections::HashMap;

pub use script_language::{Language, Script};
use whatlang::Detector;

Expand All @@ -12,11 +10,11 @@ pub struct StrDetection<'o, 'al> {
inner: &'o str,
pub script: Option<Script>,
pub language: Option<Language>,
allow_list: Option<&'al HashMap<Script, Vec<Language>>>,
allow_list: Option<&'al [Language]>,
}

impl<'o, 'al> StrDetection<'o, 'al> {
pub fn new(inner: &'o str, allow_list: Option<&'al HashMap<Script, Vec<Language>>>) -> Self {
pub fn new(inner: &'o str, allow_list: Option<&'al [Language]>) -> Self {
Self { inner, script: None, language: None, allow_list }
}

Expand All @@ -25,10 +23,14 @@ impl<'o, 'al> StrDetection<'o, 'al> {
*self.script.get_or_insert_with(|| Self::detect_script(inner))
}

pub fn language(&mut self) -> Language {
pub fn language(&mut self) -> Option<Language> {
let inner = self.inner;
let script = self.script();
*self.language.get_or_insert_with(|| Self::detect_lang(inner, script, self.allow_list))
self.language = match self.language.take() {
Some(lang) => Some(lang),
None => Self::detect_lang(inner, self.allow_list),
};

self.language
}

/// detect script with whatlang,
Expand All @@ -39,33 +41,22 @@ impl<'o, 'al> StrDetection<'o, 'al> {

/// detect lang with whatlang
/// if no language is detected, return Language::Other
fn detect_lang(
text: &str,
script: Script,
allow_list: Option<&HashMap<Script, Vec<Language>>>,
) -> Language {
fn detect_lang(text: &str, allow_list: Option<&[Language]>) -> Option<Language> {
let detector = allow_list
.and_then(|allow_list| allow_list.get(&script))
.map(|allow_list| allow_list.iter().map(|lang| (*lang).into()).collect())
.map(Detector::with_allowlist)
.unwrap_or_default();

detector.detect_lang(text).map(Language::from).unwrap_or_default()
detector.detect_lang(text).map(Language::from)
}
}

pub trait Detect<'o, 'al> {
fn detect(
&'o self,
allow_list: Option<&'al HashMap<Script, Vec<Language>>>,
) -> StrDetection<'o, 'al>;
fn detect(&'o self, allow_list: Option<&'al [Language]>) -> StrDetection<'o, 'al>;
}

impl<'o, 'al> Detect<'o, 'al> for &str {
fn detect(
&'o self,
allow_list: Option<&'al HashMap<Script, Vec<Language>>>,
) -> StrDetection<'o, 'al> {
fn detect(&'o self, allow_list: Option<&'al [Language]>) -> StrDetection<'o, 'al> {
StrDetection::new(self, allow_list)
}
}
32 changes: 12 additions & 20 deletions charabia/src/detection/script_language.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,15 @@ use core::str::FromStr;

#[cfg(test)]
use quickcheck::{Arbitrary, Gen};
use serde::{Deserialize, Serialize};

use super::chars;

macro_rules! make_language {
($($language:tt), +) => {
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy)]
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize, PartialOrd, Ord)]
pub enum Language {
$($language),+,
Other,
}
impl From<whatlang::Lang> for Language {
fn from(other: whatlang::Lang) -> Language {
Expand All @@ -24,27 +24,19 @@ macro_rules! make_language {
fn from(other: Language) -> whatlang::Lang {
match other {
$(Language::$language => whatlang::Lang::$language), +,
_other => whatlang::Lang::Eng,
}
}
}

impl Default for Language {
fn default() -> Self {
Self::Other
}
}

impl Language {
pub fn name(&self) -> &'static str {
pub fn code(&self) -> &'static str {
match self {
$(Language::$language => whatlang::Lang::$language.code()), +,
_other => "other",
}
}

pub fn from_name<S: AsRef<str>>(code: S) -> Language {
whatlang::Lang::from_code(code.as_ref()).map(Language::from).unwrap_or_default()
pub fn from_code<S: AsRef<str>>(code: S) -> Option<Language> {
whatlang::Lang::from_code(code.as_ref()).map(Language::from)
}
}
};
Expand Down Expand Up @@ -124,7 +116,7 @@ make_language! {

macro_rules! make_script {
($($script:tt), +) => {
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy)]
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize, PartialOrd, Ord)]
pub enum Script {
$($script),+,
Cj,
Expand Down Expand Up @@ -361,12 +353,12 @@ mod test {

#[test]
fn from_into_language() {
assert_eq!(Language::Eng.name(), "eng");
assert_eq!(Language::from_name("eng"), Language::Eng);
assert_eq!(Language::Jpn.name(), "jpn");
assert_eq!(Language::from_name("jpn"), Language::Jpn);
assert_eq!(Language::Cmn.name(), "cmn");
assert_eq!(Language::from_name("cmn"), Language::Cmn);
assert_eq!(Language::Eng.code(), "eng");
assert_eq!(Language::from_code("eng"), Some(Language::Eng));
assert_eq!(Language::Jpn.code(), "jpn");
assert_eq!(Language::from_code("jpn"), Some(Language::Jpn));
assert_eq!(Language::Cmn.code(), "cmn");
assert_eq!(Language::from_code("cmn"), Some(Language::Cmn));
}

#[test]
Expand Down
2 changes: 1 addition & 1 deletion charabia/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ mod detection;
mod token;
mod tokenizer;

pub use detection::{Language, Script};
pub use detection::{Language, Script, StrDetection};
pub use normalizer::Normalize;
pub use segmenter::Segment;
pub use token::{SeparatorKind, Token, TokenKind};
Expand Down
13 changes: 8 additions & 5 deletions charabia/src/normalizer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -81,12 +81,12 @@ pub(crate) const DEFAULT_NORMALIZER_OPTION: NormalizerOption = NormalizerOption
};

/// Iterator over Normalized [`Token`]s.
pub struct NormalizedTokenIter<'o, 'tb> {
token_iter: SegmentedTokenIter<'o, 'tb>,
pub struct NormalizedTokenIter<'o, 'aho, 'lang, 'tb> {
token_iter: SegmentedTokenIter<'o, 'aho, 'lang>,
options: &'tb NormalizerOption<'tb>,
}

impl<'o> Iterator for NormalizedTokenIter<'o, '_> {
impl<'o> Iterator for NormalizedTokenIter<'o, '_, '_, '_> {
type Item = Token<'o>;

fn next(&mut self) -> Option<Self::Item> {
Expand Down Expand Up @@ -232,11 +232,14 @@ impl From<String> for CharOrStr {
}
}

impl<'o, 'tb> SegmentedTokenIter<'o, 'tb> {
impl<'o, 'aho, 'lang> SegmentedTokenIter<'o, 'aho, 'lang> {
/// Normalize [`Token`]s using all the compatible Normalizers.
///
/// A Latin `Token` would not be normalized the same as a Chinese `Token`.
pub fn normalize(self, options: &'tb NormalizerOption<'tb>) -> NormalizedTokenIter<'o, 'tb> {
pub fn normalize<'tb>(
self,
options: &'tb NormalizerOption<'tb>,
) -> NormalizedTokenIter<'o, 'aho, 'lang, 'tb> {
NormalizedTokenIter { token_iter: self, options }
}
}
Expand Down
8 changes: 6 additions & 2 deletions charabia/src/normalizer/swedish_recomposition.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use once_cell::sync::Lazy;

use super::Normalizer;
use crate::normalizer::NormalizerOption;
use crate::{Script, Token};
use crate::{Language, Token};

static MATCHING_STR: Lazy<AhoCorasick> = Lazy::new(|| {
AhoCorasick::new(&["A\u{30a}", "a\u{30a}", "A\u{308}", "a\u{308}", "O\u{308}", "o\u{308}"])
Expand Down Expand Up @@ -77,7 +77,7 @@ impl Normalizer for SwedishRecompositionNormalizer {

// Returns `true` if the Normalizer should be used.
fn should_normalize(&self, token: &Token) -> bool {
token.script == Script::Latin && MATCHING_STR.is_match(token.lemma())
token.language == Some(Language::Swe) && MATCHING_STR.is_match(token.lemma())
}
}

Expand All @@ -101,6 +101,7 @@ mod test {
use crate::normalizer::test::test_normalizer;
use crate::normalizer::Normalizer;
use crate::token::TokenKind;
use crate::Script;

// base tokens to normalize.
fn tokens() -> Vec<Token<'static>> {
Expand All @@ -109,6 +110,7 @@ mod test {
char_end: 13,
byte_end: 19,
script: Script::Latin,
language: Some(Language::Swe),
..Default::default()
}]
}
Expand All @@ -121,6 +123,7 @@ mod test {
char_end: 13,
byte_end: 19,
script: Script::Latin,
language: Some(Language::Swe),
..Default::default()
}]
}
Expand Down Expand Up @@ -148,6 +151,7 @@ mod test {
]),
script: Script::Latin,
kind: TokenKind::Word,
language: Some(Language::Swe),
..Default::default()
}]
}
Expand Down
2 changes: 1 addition & 1 deletion charabia/src/segmenter/latin/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,5 +40,5 @@ mod test {
"snake", "_", "case",
];

test_segmenter!(LatinSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Latin, Language::Other);
test_segmenter!(LatinSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Latin, Language::Eng);
}
Loading

0 comments on commit 9f27d85

Please sign in to comment.