From b726e84b490eff8aa7c3a4a68a722f65f580f82f Mon Sep 17 00:00:00 2001
From: ManyTheFish
Date: Tue, 30 Apr 2024 11:02:48 +0200
Subject: [PATCH] Add swedish recomposition normalizer and link it to a feature

---
 .github/workflows/rust.yml                      |   2 +
 charabia/Cargo.toml                             |   3 +
 charabia/src/normalizer/mod.rs                  |   6 +
 .../src/normalizer/swedish_recomposition.rs    | 161 ++++++++++++++++++
 4 files changed, 172 insertions(+)
 create mode 100644 charabia/src/normalizer/swedish_recomposition.rs

diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index c0fb766..e36292f 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -28,6 +28,8 @@ jobs:
         run: cargo test --verbose --features japanese-transliteration
       - name: Run tests with chinese-normalization-pinyin on
         run: cargo test --verbose --features chinese chinese-normalization-pinyin
+      - name: Run tests with swedish-recomposition on
+        run: cargo test --verbose --features swedish-recomposition
       - name: Run irg-kvariants tests
         run: cargo test -p irg-kvariants --verbose

diff --git a/charabia/Cargo.toml b/charabia/Cargo.toml
index 0b6628e..aba2627 100644
--- a/charabia/Cargo.toml
+++ b/charabia/Cargo.toml
@@ -72,6 +72,9 @@ vietnamese = []
 # allow splitting snake_case latin words
 latin-snakecase = ["dep:finl_unicode"]

+# force Charabia to recompose Swedish characters
+swedish-recomposition = []
+
 [dev-dependencies]
 criterion = "0.5"
 jemallocator = "0.5.4"

diff --git a/charabia/src/normalizer/mod.rs b/charabia/src/normalizer/mod.rs
index 31c783b..05ac83f 100644
--- a/charabia/src/normalizer/mod.rs
+++ b/charabia/src/normalizer/mod.rs
@@ -15,6 +15,8 @@ pub use self::japanese::JapaneseNormalizer;
 pub use self::lowercase::LowercaseNormalizer;
 use self::nonspacing_mark::NonspacingMarkNormalizer;
 use self::quote::QuoteNormalizer;
+#[cfg(feature = "swedish-recomposition")]
+use self::swedish_recomposition::SwedishRecompositionNormalizer;
 #[cfg(feature = "vietnamese")]
 pub use self::vietnamese::VietnameseNormalizer;
 use crate::segmenter::SegmentedTokenIter;
@@ -33,6 +35,8 @@ mod japanese;
 mod lowercase;
 mod nonspacing_mark;
 mod quote;
+#[cfg(feature = "swedish-recomposition")]
+mod swedish_recomposition;
 #[cfg(feature = "vietnamese")]
 mod vietnamese;
@@ -40,6 +44,8 @@ mod vietnamese;
 pub static NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
     vec![
         Box::new(CompatibilityDecompositionNormalizer),
+        #[cfg(feature = "swedish-recomposition")]
+        Box::new(SwedishRecompositionNormalizer),
         Box::new(ControlCharNormalizer),
         Box::new(Classifier),
     ]
diff --git a/charabia/src/normalizer/swedish_recomposition.rs b/charabia/src/normalizer/swedish_recomposition.rs
new file mode 100644
index 0000000..298ab60
--- /dev/null
+++ b/charabia/src/normalizer/swedish_recomposition.rs
@@ -0,0 +1,161 @@
+use std::borrow::Cow;
+
+use aho_corasick::AhoCorasick;
+use once_cell::sync::Lazy;
+
+use super::Normalizer;
+use crate::normalizer::NormalizerOption;
+use crate::{Script, Token};
+
+static MATCHING_STR: Lazy<AhoCorasick> = Lazy::new(|| {
+    AhoCorasick::new(&["A\u{30a}", "a\u{30a}", "A\u{308}", "a\u{308}", "O\u{308}", "o\u{308}"])
+        .unwrap()
+});
+
+/// Swedish specialized [`Normalizer`].
+///
+/// This Normalizer recomposes Swedish characters containing diacritics.
+///
+/// This avoids the diacritic removal from the letter and preserves the expected Swedish character ordering.
+pub struct SwedishRecompositionNormalizer;
+
+impl Normalizer for SwedishRecompositionNormalizer {
+    fn normalize<'o>(&self, mut token: Token<'o>, options: &NormalizerOption) -> Token<'o> {
+        match token.char_map.take() {
+            Some(mut char_map) => {
+                // if a char_map already exists, iterate over it to reconstruct sub-strings.
+                let mut lemma = String::new();
+                let mut tail = token.lemma.as_ref();
+                let mut normalized = String::new();
+                for (_, normalized_len) in char_map.iter_mut() {
+                    let (head, t) = tail.split_at(*normalized_len as usize);
+                    tail = t;
+                    normalized.clear();
+                    // then normalize each sub-string, recomputing the size in the char_map.
+                    let mut peekable = head.chars().peekable();
+                    while let Some(c) = peekable.next() {
+                        let (c, peek_consumed) = recompose_swedish(c, peekable.peek());
+                        if peek_consumed {
+                            peekable.next();
+                        }
+
+                        normalized.push(c);
+                    }
+
+                    *normalized_len = normalized.len() as u8;
+                    lemma.push_str(normalized.as_ref());
+                }
+
+                token.lemma = Cow::Owned(lemma);
+                token.char_map = Some(char_map);
+            }
+            None => {
+                // if no char_map exists, iterate over the lemma recomposing characters.
+                let mut char_map = Vec::new();
+                let mut lemma = String::new();
+                let mut peekable = token.lemma.chars().peekable();
+                while let Some(c) = peekable.next() {
+                    let (normalized, peek_consumed) = recompose_swedish(c, peekable.peek());
+                    if peek_consumed {
+                        peekable.next();
+                    }
+
+                    if options.create_char_map {
+                        char_map.push((c.len_utf8() as u8, normalized.len_utf8() as u8));
+                    }
+                    lemma.push(normalized);
+                }
+                token.lemma = Cow::Owned(lemma);
+                if options.create_char_map {
+                    token.char_map = Some(char_map);
+                }
+            }
+        }
+
+        token
+    }
+
+    // Returns `true` if the Normalizer should be used.
+    fn should_normalize(&self, token: &Token) -> bool {
+        token.script == Script::Latin && MATCHING_STR.is_match(token.lemma())
+    }
+}
+
+fn recompose_swedish(current: char, next: Option<&char>) -> (char, bool) {
+    match (current, next) {
+        ('A', Some('\u{30a}')) => ('Å', true),
+        ('a', Some('\u{30a}')) => ('å', true),
+        ('A', Some('\u{308}')) => ('Ä', true),
+        ('a', Some('\u{308}')) => ('ä', true),
+        ('O', Some('\u{308}')) => ('Ö', true),
+        ('o', Some('\u{308}')) => ('ö', true),
+        (c, _) => (c, false),
+    }
+}
+
+// Test the normalizer:
+#[cfg(test)]
+mod test {
+    use std::borrow::Cow::Owned;
+
+    use crate::normalizer::test::test_normalizer;
+    use crate::normalizer::Normalizer;
+    use crate::token::TokenKind;
+
+    // base tokens to normalize.
+    fn tokens() -> Vec<Token<'static>> {
+        vec![Token {
+            lemma: Owned("öpÅscålcäsÄÖs".to_string()),
+            char_end: 13,
+            byte_end: 19,
+            script: Script::Latin,
+            ..Default::default()
+        }]
+    }
+
+    // expected result of the current Normalizer.
+    fn normalizer_result() -> Vec<Token<'static>> {
+        vec![Token {
+            // recomposed
+            lemma: Owned("öpÅscålcäsÄÖs".to_string()),
+            char_end: 13,
+            byte_end: 19,
+            script: Script::Latin,
+            ..Default::default()
+        }]
+    }
+
+    // expected result of the complete Normalizer pipeline.
+    fn normalized_tokens() -> Vec<Token<'static>> {
+        vec![Token {
+            lemma: Owned("öpåscålcäsäös".to_string()),
+            char_end: 13,
+            byte_end: 19,
+            char_map: Some(vec![
+                (2, 2),
+                (1, 1),
+                (2, 2),
+                (1, 1),
+                (1, 1),
+                (2, 2),
+                (1, 1),
+                (1, 1),
+                (2, 2),
+                (1, 1),
+                (2, 2),
+                (2, 2),
+                (1, 1),
+            ]),
+            script: Script::Latin,
+            kind: TokenKind::Word,
+            ..Default::default()
+        }]
+    }
+
+    test_normalizer!(
+        SwedishRecompositionNormalizer,
+        tokens(),
+        normalizer_result(),
+        normalized_tokens()
+    );
+}