Skip to content

Commit

Permalink
Merge #299
Browse files Browse the repository at this point in the history
299: Simplify lang detection r=dureuill a=ManyTheFish

- Change the language `allow_list` from a map of script->language to an array of allowed languages
- Allow the language allow list to be changed dynamically when tokenizing text


Co-authored-by: ManyTheFish <many@meilisearch.com>
Co-authored-by: Many the fish <many@meilisearch.com>
  • Loading branch information
meili-bors[bot] and ManyTheFish authored Jul 25, 2024
2 parents ae07a58 + cbeb12a commit 9f27d85
Show file tree
Hide file tree
Showing 9 changed files with 139 additions and 107 deletions.
6 changes: 1 addition & 5 deletions charabia/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,13 @@ exclude = ["dictionaries/txt/thai/words.txt"]

[dependencies]
aho-corasick = "1.1.3"
cow-utils = "0.1"
csv = "1.3.0"
deunicode = "1.6.0"
either = "1.13.0"
finl_unicode = { version= "1.2.0", optional = true }
fst = "0.4"
jieba-rs = { version = "0.7", optional = true }
once_cell = "1.19.0"
serde = "1.0"
serde = "1.0.192"
slice-group-by = "0.3.1"
whatlang = "0.16.4"
lindera = { version = "=0.32.2", default-features = false, optional = true }
Expand All @@ -31,8 +29,6 @@ pinyin = { version = "0.10", default-features = false, features = [
wana_kana = { version = "3.0.0", optional = true }
unicode-normalization = "0.1.23"
irg-kvariants = { path = "../irg-kvariants", version = "=0.1.1" }
litemap = "0.7.3"
zerovec = "0.10.4"

[features]
default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese"]
Expand Down
35 changes: 13 additions & 22 deletions charabia/src/detection/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
use std::collections::HashMap;

pub use script_language::{Language, Script};
use whatlang::Detector;

Expand All @@ -12,11 +10,11 @@ pub struct StrDetection<'o, 'al> {
inner: &'o str,
pub script: Option<Script>,
pub language: Option<Language>,
allow_list: Option<&'al HashMap<Script, Vec<Language>>>,
allow_list: Option<&'al [Language]>,
}

impl<'o, 'al> StrDetection<'o, 'al> {
pub fn new(inner: &'o str, allow_list: Option<&'al HashMap<Script, Vec<Language>>>) -> Self {
pub fn new(inner: &'o str, allow_list: Option<&'al [Language]>) -> Self {
Self { inner, script: None, language: None, allow_list }
}

Expand All @@ -25,10 +23,14 @@ impl<'o, 'al> StrDetection<'o, 'al> {
*self.script.get_or_insert_with(|| Self::detect_script(inner))
}

pub fn language(&mut self) -> Language {
pub fn language(&mut self) -> Option<Language> {
let inner = self.inner;
let script = self.script();
*self.language.get_or_insert_with(|| Self::detect_lang(inner, script, self.allow_list))
self.language = match self.language.take() {
Some(lang) => Some(lang),
None => Self::detect_lang(inner, self.allow_list),
};

self.language
}

/// detect script with whatlang,
Expand All @@ -39,33 +41,22 @@ impl<'o, 'al> StrDetection<'o, 'al> {

/// detect lang with whatlang
/// if no language is detected, return Language::Other
fn detect_lang(
text: &str,
script: Script,
allow_list: Option<&HashMap<Script, Vec<Language>>>,
) -> Language {
fn detect_lang(text: &str, allow_list: Option<&[Language]>) -> Option<Language> {
let detector = allow_list
.and_then(|allow_list| allow_list.get(&script))
.map(|allow_list| allow_list.iter().map(|lang| (*lang).into()).collect())
.map(Detector::with_allowlist)
.unwrap_or_default();

detector.detect_lang(text).map(Language::from).unwrap_or_default()
detector.detect_lang(text).map(Language::from)
}
}

pub trait Detect<'o, 'al> {
fn detect(
&'o self,
allow_list: Option<&'al HashMap<Script, Vec<Language>>>,
) -> StrDetection<'o, 'al>;
fn detect(&'o self, allow_list: Option<&'al [Language]>) -> StrDetection<'o, 'al>;
}

impl<'o, 'al> Detect<'o, 'al> for &str {
fn detect(
&'o self,
allow_list: Option<&'al HashMap<Script, Vec<Language>>>,
) -> StrDetection<'o, 'al> {
fn detect(&'o self, allow_list: Option<&'al [Language]>) -> StrDetection<'o, 'al> {
StrDetection::new(self, allow_list)
}
}
32 changes: 12 additions & 20 deletions charabia/src/detection/script_language.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,15 @@ use core::str::FromStr;

#[cfg(test)]
use quickcheck::{Arbitrary, Gen};
use serde::{Deserialize, Serialize};

use super::chars;

macro_rules! make_language {
($($language:tt), +) => {
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy)]
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize, PartialOrd, Ord)]
pub enum Language {
$($language),+,
Other,
}
impl From<whatlang::Lang> for Language {
fn from(other: whatlang::Lang) -> Language {
Expand All @@ -24,27 +24,19 @@ macro_rules! make_language {
fn from(other: Language) -> whatlang::Lang {
match other {
$(Language::$language => whatlang::Lang::$language), +,
_other => whatlang::Lang::Eng,
}
}
}

impl Default for Language {
fn default() -> Self {
Self::Other
}
}

impl Language {
pub fn name(&self) -> &'static str {
pub fn code(&self) -> &'static str {
match self {
$(Language::$language => whatlang::Lang::$language.code()), +,
_other => "other",
}
}

pub fn from_name<S: AsRef<str>>(code: S) -> Language {
whatlang::Lang::from_code(code.as_ref()).map(Language::from).unwrap_or_default()
pub fn from_code<S: AsRef<str>>(code: S) -> Option<Language> {
whatlang::Lang::from_code(code.as_ref()).map(Language::from)
}
}
};
Expand Down Expand Up @@ -124,7 +116,7 @@ make_language! {

macro_rules! make_script {
($($script:tt), +) => {
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy)]
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize, PartialOrd, Ord)]
pub enum Script {
$($script),+,
Cj,
Expand Down Expand Up @@ -361,12 +353,12 @@ mod test {

#[test]
fn from_into_language() {
assert_eq!(Language::Eng.name(), "eng");
assert_eq!(Language::from_name("eng"), Language::Eng);
assert_eq!(Language::Jpn.name(), "jpn");
assert_eq!(Language::from_name("jpn"), Language::Jpn);
assert_eq!(Language::Cmn.name(), "cmn");
assert_eq!(Language::from_name("cmn"), Language::Cmn);
assert_eq!(Language::Eng.code(), "eng");
assert_eq!(Language::from_code("eng"), Some(Language::Eng));
assert_eq!(Language::Jpn.code(), "jpn");
assert_eq!(Language::from_code("jpn"), Some(Language::Jpn));
assert_eq!(Language::Cmn.code(), "cmn");
assert_eq!(Language::from_code("cmn"), Some(Language::Cmn));
}

#[test]
Expand Down
2 changes: 1 addition & 1 deletion charabia/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ mod detection;
mod token;
mod tokenizer;

pub use detection::{Language, Script};
pub use detection::{Language, Script, StrDetection};
pub use normalizer::Normalize;
pub use segmenter::Segment;
pub use token::{SeparatorKind, Token, TokenKind};
Expand Down
13 changes: 8 additions & 5 deletions charabia/src/normalizer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -81,12 +81,12 @@ pub(crate) const DEFAULT_NORMALIZER_OPTION: NormalizerOption = NormalizerOption
};

/// Iterator over Normalized [`Token`]s.
pub struct NormalizedTokenIter<'o, 'tb> {
token_iter: SegmentedTokenIter<'o, 'tb>,
pub struct NormalizedTokenIter<'o, 'aho, 'lang, 'tb> {
token_iter: SegmentedTokenIter<'o, 'aho, 'lang>,
options: &'tb NormalizerOption<'tb>,
}

impl<'o> Iterator for NormalizedTokenIter<'o, '_> {
impl<'o> Iterator for NormalizedTokenIter<'o, '_, '_, '_> {
type Item = Token<'o>;

fn next(&mut self) -> Option<Self::Item> {
Expand Down Expand Up @@ -232,11 +232,14 @@ impl From<String> for CharOrStr {
}
}

impl<'o, 'tb> SegmentedTokenIter<'o, 'tb> {
impl<'o, 'aho, 'lang> SegmentedTokenIter<'o, 'aho, 'lang> {
/// Normalize [`Token`]s using all the compatible Normalizers.
///
/// A Latin `Token` would not be normalized the same as a Chinese `Token`.
pub fn normalize(self, options: &'tb NormalizerOption<'tb>) -> NormalizedTokenIter<'o, 'tb> {
pub fn normalize<'tb>(
self,
options: &'tb NormalizerOption<'tb>,
) -> NormalizedTokenIter<'o, 'aho, 'lang, 'tb> {
NormalizedTokenIter { token_iter: self, options }
}
}
Expand Down
8 changes: 6 additions & 2 deletions charabia/src/normalizer/swedish_recomposition.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use once_cell::sync::Lazy;

use super::Normalizer;
use crate::normalizer::NormalizerOption;
use crate::{Script, Token};
use crate::{Language, Token};

static MATCHING_STR: Lazy<AhoCorasick> = Lazy::new(|| {
AhoCorasick::new(&["A\u{30a}", "a\u{30a}", "A\u{308}", "a\u{308}", "O\u{308}", "o\u{308}"])
Expand Down Expand Up @@ -77,7 +77,7 @@ impl Normalizer for SwedishRecompositionNormalizer {

// Returns `true` if the Normalizer should be used.
fn should_normalize(&self, token: &Token) -> bool {
token.script == Script::Latin && MATCHING_STR.is_match(token.lemma())
token.language == Some(Language::Swe) && MATCHING_STR.is_match(token.lemma())
}
}

Expand All @@ -101,6 +101,7 @@ mod test {
use crate::normalizer::test::test_normalizer;
use crate::normalizer::Normalizer;
use crate::token::TokenKind;
use crate::Script;

// base tokens to normalize.
fn tokens() -> Vec<Token<'static>> {
Expand All @@ -109,6 +110,7 @@ mod test {
char_end: 13,
byte_end: 19,
script: Script::Latin,
language: Some(Language::Swe),
..Default::default()
}]
}
Expand All @@ -121,6 +123,7 @@ mod test {
char_end: 13,
byte_end: 19,
script: Script::Latin,
language: Some(Language::Swe),
..Default::default()
}]
}
Expand Down Expand Up @@ -148,6 +151,7 @@ mod test {
]),
script: Script::Latin,
kind: TokenKind::Word,
language: Some(Language::Swe),
..Default::default()
}]
}
Expand Down
2 changes: 1 addition & 1 deletion charabia/src/segmenter/latin/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,5 +40,5 @@ mod test {
"snake", "_", "case",
];

test_segmenter!(LatinSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Latin, Language::Other);
test_segmenter!(LatinSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Latin, Language::Eng);
}
Loading

0 comments on commit 9f27d85

Please sign in to comment.