Skip to content

Commit

Permalink
make tokenizer customizable
Browse files Browse the repository at this point in the history
  • Loading branch information
Nickersoft committed Apr 14, 2024
1 parent caceb88 commit cd92b9f
Show file tree
Hide file tree
Showing 8 changed files with 62 additions and 33 deletions.
1 change: 1 addition & 0 deletions lib/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ config = ["dep:dirs"]
charabia = ["dep:charabia"]
search = ["dep:tantivy", "dep:tantivy-tokenizer-api"]
serve = ["dep:actix-web"]
default = ["json", "sql", "config"]


[dependencies]
Expand Down
2 changes: 1 addition & 1 deletion lib/src/search/tokenizer.rs → lib/src/search/charabia.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ impl Tokenizer for CharabiaTokenizer {

#[cfg(test)]
mod tests {
use crate::search::tokenizer::CharabiaTokenizer;
use crate::search::charabia::CharabiaTokenizer;
use tantivy::tokenizer::*;

#[test]
Expand Down
22 changes: 21 additions & 1 deletion lib/src/search/constants.rs
Original file line number Diff line number Diff line change
@@ -1 +1,21 @@
pub const CHARABIA: &str = "CHARABIA";
use once_cell::sync::Lazy;
use tantivy::tokenizer::TextAnalyzer;

#[cfg(feature = "charabia")]
use super::charabia::CharabiaTokenizer;

#[cfg(not(feature = "charabia"))]
use tantivy::tokenizer::{LowerCaser, RemoveLongFilter, SimpleTokenizer};

/// Name under which the active tokenizer (user-supplied or default) is
/// registered on the tantivy index; the schema's text fields reference
/// the tokenizer by this name.
pub const CUSTOM_TOKENIZER: &str = "CUSTOM_TOKENIZER";

/// Analyzer used when the caller does not supply a tokenizer via
/// `IndexOptions::tokenizer` / `SearchOptions::tokenizer`.
///
/// NOTE: this must be a `static`, not a `const`. A `const` item with
/// interior mutability (such as `Lazy`) is inlined — copied — at every
/// use site, so each access would get a fresh, uninitialized `Lazy`
/// and the closure would run again every time instead of once
/// (clippy: `declare_interior_mutable_const`).
pub static DEFAULT_TOKENIZER: Lazy<TextAnalyzer> = Lazy::new(|| {
    // Without the `charabia` feature: tantivy's simple tokenizer with
    // long-token removal (limit 40) and lowercasing.
    #[cfg(not(feature = "charabia"))]
    return TextAnalyzer::builder(SimpleTokenizer::default())
        .filter(RemoveLongFilter::limit(40))
        .filter(LowerCaser)
        .build();

    // With `charabia`: the language-aware Charabia tokenizer.
    #[cfg(feature = "charabia")]
    return CharabiaTokenizer::default().into();
});
21 changes: 13 additions & 8 deletions lib/src/search/index.rs
Original file line number Diff line number Diff line change
@@ -1,22 +1,19 @@
use std::{error::Error, ffi::OsStr, fs::create_dir_all, fs::remove_dir_all, path::PathBuf};

use tantivy::tokenizer::TextAnalyzer;
use tantivy::{doc, Index};

use crate::config::get_config_dir;
use crate::{Dictionary, PreviewOptions};

#[cfg(feature = "charabia")]
use super::constants::CHARABIA;

use super::constants::{CUSTOM_TOKENIZER, DEFAULT_TOKENIZER};
use super::schema::{FIELD_BUFFER, FIELD_DEFINITIONS, FIELD_TERM, SCHEMA};

#[cfg(feature = "charabia")]
use super::tokenizer::CharabiaTokenizer;

/// Options controlling how a dictionary's search index is built.
pub struct IndexOptions {
// Memory budget (in bytes) handed to the tantivy index writer.
pub memory: usize,
// Directory the index is created in; defaults to the result of
// `get_default_index_dir()`.
pub dir: PathBuf,
// Whether an existing index directory may be replaced.
pub overwrite: bool,
// Analyzer registered on the index under `CUSTOM_TOKENIZER`.
pub tokenizer: TextAnalyzer,
// Progress callback invoked per indexed item.
// NOTE(review): presumably (item_index, term) — confirm against the
// indexing loop, which is not fully visible here.
pub cb_on_item: Box<dyn Fn(usize, &str) + Send + Sync>,
}

Expand All @@ -29,11 +26,20 @@ impl IndexOptions {
Self {
memory: 50_000_000,
overwrite: false,
tokenizer: DEFAULT_TOKENIZER.to_owned(),
dir: get_default_index_dir(),
cb_on_item: Box::new(|_, _| {}),
}
}

/// Sets the tokenizer used when building the index, accepting anything
/// convertible into a tantivy `TextAnalyzer`.
///
/// The bound `T: Into<TextAnalyzer>` replaces the stricter, less
/// idiomatic `where TextAnalyzer: From<T>`: the standard blanket impl
/// derives `Into` from `From`, so every type the old bound accepted is
/// still accepted, plus types that only implement `Into`.
pub fn tokenizer<T>(mut self, tokenizer: T) -> Self
where
    T: Into<TextAnalyzer>,
{
    self.tokenizer = tokenizer.into();
    self
}

pub fn overwrite(mut self, overwrite: bool) -> Self {
self.overwrite = overwrite;
self
Expand Down Expand Up @@ -82,10 +88,9 @@ impl Dictionary {

let index = Index::create_in_dir(&index_path, SCHEMA.to_owned())?;

#[cfg(feature = "charabia")]
index
.tokenizers()
.register(CHARABIA, CharabiaTokenizer::default());
.register(CUSTOM_TOKENIZER, opts.tokenizer.clone());

let mut index_writer = index.writer(opts.memory)?;

Expand Down
5 changes: 3 additions & 2 deletions lib/src/search/mod.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
mod constants;
mod index;
mod schema;
mod search;

#[cfg(feature = "charabia")]
mod constants;
mod charabia;

#[cfg(feature = "charabia")]
mod tokenizer;
pub use self::charabia::*;

pub use self::index::*;
pub use self::search::*;
22 changes: 7 additions & 15 deletions lib/src/search/schema.rs
Original file line number Diff line number Diff line change
@@ -1,25 +1,17 @@
use once_cell::sync::Lazy;
use tantivy::schema::{Field, Schema, TextOptions, STORED};

#[cfg(feature = "charabia")]
use tantivy::schema::{IndexRecordOption, TextFieldIndexing};
use tantivy::schema::{Field, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, STORED};

#[cfg(feature = "charabia")]
use super::constants::CHARABIA;
use super::constants::CUSTOM_TOKENIZER;

pub(super) const SCHEMA: Lazy<Schema> = Lazy::new(|| {
let mut schema_builder = Schema::builder();

#[cfg(feature = "charabia")]
let text_indexing = TextFieldIndexing::default()
.set_tokenizer(CHARABIA) // Set custom tokenizer
.set_index_option(IndexRecordOption::WithFreqsAndPositions);

#[cfg(feature = "charabia")]
let text_options = TextOptions::default().set_indexing_options(text_indexing);

#[cfg(not(feature = "charabia"))]
let text_options = TextOptions::default();
let text_options = TextOptions::default().set_indexing_options(
TextFieldIndexing::default()
.set_tokenizer(CUSTOM_TOKENIZER)
.set_index_option(IndexRecordOption::WithFreqsAndPositions),
);

schema_builder.add_text_field("term", text_options.clone().set_stored());
schema_builder.add_text_field("definitions", text_options);
Expand Down
20 changes: 15 additions & 5 deletions lib/src/search/search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,13 @@ use std::{error::Error, ffi::OsStr, path::PathBuf};

use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
use rkyv::{archived_root, Deserialize, Infallible};
use tantivy::{collector::TopDocs, query::QueryParser, Index, ReloadPolicy};
use tantivy::{
collector::TopDocs, query::QueryParser, tokenizer::TextAnalyzer, Index, ReloadPolicy,
};

use crate::{Dictionary, Entry};

#[cfg(feature = "charabia")]
use super::{constants::CHARABIA, tokenizer::CharabiaTokenizer};
use super::constants::{CUSTOM_TOKENIZER, DEFAULT_TOKENIZER};

use super::{
get_default_index_dir,
Expand All @@ -18,6 +19,7 @@ pub struct SearchOptions {
pub dir: PathBuf,
pub threshold: u32,
pub limit: usize,
pub tokenizer: TextAnalyzer,
}

impl SearchOptions {
Expand All @@ -26,6 +28,7 @@ impl SearchOptions {
dir: get_default_index_dir(),
threshold: 1,
limit: 10,
tokenizer: DEFAULT_TOKENIZER.to_owned(),
}
}

Expand All @@ -34,6 +37,14 @@ impl SearchOptions {
self
}

/// Sets the tokenizer used when querying the index, accepting anything
/// convertible into a tantivy `TextAnalyzer`. It should match the
/// tokenizer the index was built with.
///
/// The bound `T: Into<TextAnalyzer>` replaces the stricter, less
/// idiomatic `where TextAnalyzer: From<T>`: the standard blanket impl
/// derives `Into` from `From`, so every type the old bound accepted is
/// still accepted, plus types that only implement `Into`.
pub fn tokenizer<T>(mut self, tokenizer: T) -> Self
where
    T: Into<TextAnalyzer>,
{
    self.tokenizer = tokenizer.into();
    self
}

pub fn threshold(mut self, threshold: u32) -> Self {
self.threshold = threshold;
self
Expand Down Expand Up @@ -61,10 +72,9 @@ impl Dictionary {
let index_path = opts.dir.join(self.id.as_str());
let index = Index::open_in_dir(&index_path)?;

#[cfg(feature = "charabia")]
index
.tokenizers()
.register(CHARABIA, CharabiaTokenizer::default());
.register(CUSTOM_TOKENIZER, opts.tokenizer.to_owned());

let reader = index
.reader_builder()
Expand Down
2 changes: 1 addition & 1 deletion lib/tests/sql.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ mod helpers;
mod index_tests {
use super::helpers::EXAMPLE_DICTIONARY_1;
use insta::assert_snapshot;
use odict::{SQLDialect, ToSQL, ID};
use odict::{SQLDialect, ToSQL};
use regex::Regex;

#[test]
Expand Down

0 comments on commit cd92b9f

Please sign in to comment.