Merge #308

308: Prepare v0.9.1 r=Kerollmops a=ManyTheFish # Pull Request Make some modifications to prepare v0.9.1 Co-authored-by: ManyTheFish <many@meilisearch.com> Co-authored-by: Many the fish <many@meilisearch.com>
meilisearch · Sep 19, 2024 · 2d90e4c · 2d90e4c
2 parents 2dc8ac8 + 30692ec
commit 2d90e4c
Show file tree

Hide file tree

Showing 7 changed files with 50 additions and 12 deletions.
diff --git a/charabia/Cargo.toml b/charabia/Cargo.toml
@@ -31,7 +31,7 @@ unicode-normalization = "0.1.23"
 irg-kvariants = { path = "../irg-kvariants", version = "=0.1.1" }
 
 [features]
-default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese", "swedish-recomposition", "turkish", "german-segmentation"]
+default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "khmer", "vietnamese", "swedish-recomposition", "turkish", "german-segmentation"]
 
 # allow chinese specialized tokenization
 chinese = ["chinese-segmentation", "chinese-normalization"]

diff --git a/charabia/README.md b/charabia/README.md
@@ -17,6 +17,7 @@ Charabia provides a simple API to segment, normalize, or tokenize (segment + nor
 |  Script / Language  |                           specialized segmentation                            | specialized normalization | Segmentation Performance level | Tokenization Performance level |
 |---------------------|-------------------------------------------------------------------------------|---------------------------|-------------------|---|
 | **Latin** | ✅ CamelCase segmentation | ✅ [compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase + [nonspacing-marks](https://www.compart.com/en/unicode/category/Mn) removal + `Ð vs Đ` spoofing normalization + `ı` normalization | 🟩 ~23MiB/sec    | 🟨 ~9MiB/sec    |
+| **Latin** - **German** | ✅ CamelCase segmentation + German word segmentation | ✅ [compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase + [nonspacing-marks](https://www.compart.com/en/unicode/category/Mn) removal + `Ð vs Đ` spoofing normalization + `ı` normalization | 🟩 ~23MiB/sec    | 🟨 ~9MiB/sec    |
 | **Greek** | ❌ | ✅ [compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase + final sigma normalization         | 🟩 ~27MiB/sec    | 🟨 ~8MiB/sec    |
 | **Cyrillic** - **Georgian** | ❌ | ✅ [compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase          | 🟩 ~27MiB/sec    | 🟨 ~9MiB/sec    |
 | **Chinese** **CMN** 🇨🇳 | ✅ [jieba](https://github.com/messense/jieba-rs) | ✅ [compatibility decomposition](https://unicode.org/reports/tr15/) + kvariant conversion | 🟨 ~10MiB/sec    | 🟧 ~5MiB/sec    |

diff --git a/charabia/src/detection/mod.rs b/charabia/src/detection/mod.rs
@@ -27,7 +27,11 @@ impl<'o, 'al> StrDetection<'o, 'al> {
         let inner = self.inner;
         self.language = match self.language.take() {
             Some(lang) => Some(lang),
-            None => Self::detect_lang(inner, self.allow_list),
+            None => match self.allow_list {
+                Some([unique_language]) => Some(*unique_language),
+                None if Self::detect_script(inner) == Script::Latin => None,
+                _otherwise => Self::detect_lang(inner, self.allow_list),
+            },
         };
 
         self.language

diff --git a/charabia/src/detection/script_language.rs b/charabia/src/detection/script_language.rs
@@ -10,6 +10,7 @@ macro_rules! make_language {
     ($($language:tt), +) => {
         #[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize, PartialOrd, Ord)]
         pub enum Language {
+            Zho,
             $($language),+,
         }
         impl From<whatlang::Lang> for Language {
@@ -23,6 +24,7 @@ macro_rules! make_language {
         impl From<Language> for whatlang::Lang {
             fn from(other: Language) -> whatlang::Lang {
                 match other {
+                    Language::Zho => whatlang::Lang::Cmn,
                     $(Language::$language => whatlang::Lang::$language), +,
                 }
             }
@@ -31,12 +33,16 @@ macro_rules! make_language {
         impl Language {
             pub fn code(&self) -> &'static str {
                 match self {
+                    Language::Zho => "zho",
                     $(Language::$language => whatlang::Lang::$language.code()), +,
                 }
             }
 
             pub fn from_code<S: AsRef<str>>(code: S) -> Option<Language> {
-                whatlang::Lang::from_code(code.as_ref()).map(Language::from)
+                match code.as_ref() {
+                    "zho" => Some(Language::Zho),
+                    _ => whatlang::Lang::from_code(code.as_ref()).map(Language::from),
+                }
             }
         }
     };

diff --git a/charabia/src/normalizer/chinese.rs b/charabia/src/normalizer/chinese.rs
@@ -38,7 +38,8 @@ impl CharNormalizer for ChineseNormalizer {
     }
 
     fn should_normalize(&self, token: &Token) -> bool {
-        token.script == Script::Cj && matches!(token.language, None | Some(Language::Cmn))
+        token.script == Script::Cj
+            && matches!(token.language, None | Some(Language::Cmn) | Some(Language::Zho))
     }
 }
 
@@ -74,7 +75,7 @@ mod test {
                 char_end: 5,
                 byte_end: 15,
                 script: Script::Cj,
-                language: Some(Language::Cmn),
+                language: Some(Language::Zho),
                 ..Default::default()
             },
         ]
@@ -111,7 +112,7 @@ mod test {
                 byte_end: 15,
                 char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 4), (3, 4), (3, 4)]),
                 script: Script::Cj,
-                language: Some(Language::Cmn),
+                language: Some(Language::Zho),
                 ..Default::default()
             },
         ]
@@ -147,7 +148,7 @@ mod test {
                 byte_end: 15,
                 char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 4), (3, 4), (3, 4)]),
                 script: Script::Cj,
-                language: Some(Language::Cmn),
+                language: Some(Language::Zho),
                 kind: TokenKind::Word,
                 ..Default::default()
             },
@@ -182,7 +183,7 @@ mod test {
                 byte_end: 15,
                 char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 4)]),
                 script: Script::Cj,
-                language: Some(Language::Cmn),
+                language: Some(Language::Zho),
                 ..Default::default()
             },
         ]
@@ -223,7 +224,7 @@ mod test {
                 byte_end: 15,
                 char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 4)]),
                 script: Script::Cj,
-                language: Some(Language::Cmn),
+                language: Some(Language::Zho),
             },
         ]
     }

diff --git a/charabia/src/segmenter/latin/mod.rs b/charabia/src/segmenter/latin/mod.rs
@@ -27,18 +27,42 @@ mod test {
 
     const TEXT: &str =
         "The quick (\"brown\") fox can’t jump 32.3 feet, right? Brr, it's 29.3°F! camelCase kebab-case snake_case";
+
+    #[rustfmt::skip]
+    #[cfg(feature = "latin-camelcase")]
     const SEGMENTED: &[&str] = &[
         "The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox", " ", "can", "’", "t",
         " ", "jump", " ", "32", ".", "3", " ", "feet", ", ", "right", "?", " ", "Brr", ", ", "it",
         "'", "s", " ", "29", ".", "3°F", "!", " ", "camel", "Case", " ", "kebab", "-", "case", " ",
         "snake", "_", "case",
     ];
+
+    #[rustfmt::skip]
+    #[cfg(feature = "latin-camelcase")]
     const TOKENIZED: &[&str] = &[
         "the", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox", " ", "can", "'", "t",
         " ", "jump", " ", "32", ".", "3", " ", "feet", ", ", "right", "?", " ", "brr", ", ", "it",
         "'", "s", " ", "29", ".", "3°f", "!", " ", "camel", "case", " ", "kebab", "-", "case", " ",
         "snake", "_", "case",
     ];
 
+    #[rustfmt::skip]
+    #[cfg(not(feature = "latin-camelcase"))]
+    const SEGMENTED: &[&str] = &[
+        "The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox", " ", "can", "’", "t",
+        " ", "jump", " ", "32", ".", "3", " ", "feet", ", ", "right", "?", " ", "Brr", ", ", "it",
+        "'", "s", " ", "29", ".", "3°F", "!", " ", "camelCase", " ", "kebab", "-", "case", " ",
+        "snake", "_", "case",
+    ];
+
+    #[rustfmt::skip]
+    #[cfg(not(feature = "latin-camelcase"))]
+    const TOKENIZED: &[&str] = &[
+        "the", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox", " ", "can", "'", "t",
+        " ", "jump", " ", "32", ".", "3", " ", "feet", ", ", "right", "?", " ", "brr", ", ", "it",
+        "'", "s", " ", "29", ".", "3°f", "!", " ", "camelcase", " ", "kebab", "-", "case", " ",
+        "snake", "_", "case",
+    ];
+
     test_segmenter!(LatinSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Latin, Language::Eng);
 }
diff --git a/charabia/src/segmenter/mod.rs b/charabia/src/segmenter/mod.rs
@@ -63,6 +63,8 @@ pub static SEGMENTERS: Lazy<SegmenterMap> = Lazy::new(|| {
         // chinese segmenter
         #[cfg(feature = "chinese-segmentation")]
         ((Script::Cj, Some(Language::Cmn)), Box::new(ChineseSegmenter) as Box<dyn Segmenter>),
+        #[cfg(feature = "chinese-segmentation")]
+        ((Script::Cj, Some(Language::Zho)), Box::new(ChineseSegmenter) as Box<dyn Segmenter>),
         // japanese segmenter
         #[cfg(feature = "japanese")]
         ((Script::Cj, Some(Language::Jpn)), Box::new(JapaneseSegmenter) as Box<dyn Segmenter>),
@@ -395,7 +397,6 @@ mod test {
     ($segmenter:expr, $text:expr, $segmented:expr, $tokenized:expr, $script:expr, $language:expr) => {
             use crate::{Token, Language, Script};
             use crate::segmenter::{Segment, AhoSegmentedStrIter, MatchType, DEFAULT_SEPARATOR_AHO};
-            use crate::tokenizer::Tokenize;
             use super::*;
 
             #[test]
@@ -425,7 +426,7 @@ Check if the expected Script/Language corresponds to the detected Script/Languag
 
             #[test]
             fn segment() {
-                let segmented_text: Vec<_> = $text.segment_str().collect();
+                let segmented_text: Vec<_> = $text.segment_str_with_option(None, Some(&[$language])).collect();
                 assert_eq!(&segmented_text[..], $segmented, r#"
 Segmenter chosen by global segment() function, didn't segment the text as expected.
 
@@ -436,7 +437,8 @@ Check if the tested segmenter is assigned to the good Script/Language in `SEGMEN
 
             #[test]
             fn tokenize() {
-                let tokens: Vec<_> = $text.tokenize().collect();
+                let tokenizer = crate::TokenizerBuilder::default().into_tokenizer();
+                let tokens: Vec<_> = tokenizer.tokenize_with_allow_list($text, Some(&[$language])).collect();
                 let tokenized_text: Vec<_> = tokens.iter().map(|t| t.lemma()).collect();
 
                 assert_eq!(&tokenized_text[..], $tokenized, r#"