diff --git a/lib/Cargo.toml b/lib/Cargo.toml
index 622cff117..543ae91e2 100644
--- a/lib/Cargo.toml
+++ b/lib/Cargo.toml
@@ -10,6 +10,7 @@ config = ["dep:dirs"]
 charabia = ["dep:charabia"]
 search = ["dep:tantivy", "dep:tantivy-tokenizer-api"]
 serve = ["dep:actix-web"]
+default = ["json", "sql", "config"]
 
 [dependencies]
 
diff --git a/lib/src/search/tokenizer.rs b/lib/src/search/charabia.rs
similarity index 98%
rename from lib/src/search/tokenizer.rs
rename to lib/src/search/charabia.rs
index e49d15f8a..711ffc901 100644
--- a/lib/src/search/tokenizer.rs
+++ b/lib/src/search/charabia.rs
@@ -59,7 +59,7 @@ impl Tokenizer for CharabiaTokenizer {
 #[cfg(test)]
 mod tests {
-    use crate::search::tokenizer::CharabiaTokenizer;
+    use crate::search::charabia::CharabiaTokenizer;
 
     use tantivy::tokenizer::*;
 
     #[test]
diff --git a/lib/src/search/constants.rs b/lib/src/search/constants.rs
index 2d0197bcf..a799ef430 100644
--- a/lib/src/search/constants.rs
+++ b/lib/src/search/constants.rs
@@ -1 +1,21 @@
-pub const CHARABIA: &str = "CHARABIA";
+use once_cell::sync::Lazy;
+use tantivy::tokenizer::TextAnalyzer;
+
+#[cfg(feature = "charabia")]
+use super::charabia::CharabiaTokenizer;
+
+#[cfg(not(feature = "charabia"))]
+use tantivy::tokenizer::{LowerCaser, RemoveLongFilter, SimpleTokenizer};
+
+pub const CUSTOM_TOKENIZER: &str = "CUSTOM_TOKENIZER";
+
+pub const DEFAULT_TOKENIZER: Lazy<TextAnalyzer> = Lazy::new(|| {
+    #[cfg(not(feature = "charabia"))]
+    return TextAnalyzer::builder(SimpleTokenizer::default())
+        .filter(RemoveLongFilter::limit(40))
+        .filter(LowerCaser)
+        .build();
+
+    #[cfg(feature = "charabia")]
+    return CharabiaTokenizer::default().into();
+});
diff --git a/lib/src/search/index.rs b/lib/src/search/index.rs
index 57565c2d6..e0cb7e3bc 100644
--- a/lib/src/search/index.rs
+++ b/lib/src/search/index.rs
@@ -1,22 +1,19 @@
 use std::{error::Error, ffi::OsStr, fs::create_dir_all, fs::remove_dir_all, path::PathBuf};
 
+use tantivy::tokenizer::TextAnalyzer;
 use tantivy::{doc, Index};
 
 use crate::config::get_config_dir;
 use crate::{Dictionary, PreviewOptions};
 
-#[cfg(feature = "charabia")]
-use super::constants::CHARABIA;
-
+use super::constants::{CUSTOM_TOKENIZER, DEFAULT_TOKENIZER};
 use super::schema::{FIELD_BUFFER, FIELD_DEFINITIONS, FIELD_TERM, SCHEMA};
 
-#[cfg(feature = "charabia")]
-use super::tokenizer::CharabiaTokenizer;
-
 pub struct IndexOptions {
     pub memory: usize,
     pub dir: PathBuf,
     pub overwrite: bool,
+    pub tokenizer: TextAnalyzer,
     pub cb_on_item: Box<dyn Fn(usize, &str)>,
 }
@@ -29,11 +26,20 @@ impl IndexOptions {
         Self {
             memory: 50_000_000,
             overwrite: false,
+            tokenizer: DEFAULT_TOKENIZER.to_owned(),
             dir: get_default_index_dir(),
             cb_on_item: Box::new(|_, _| {}),
         }
     }
 
+    pub fn tokenizer<T>(mut self, tokenizer: T) -> Self
+    where
+        TextAnalyzer: From<T>,
+    {
+        self.tokenizer = tokenizer.into();
+        self
+    }
+
     pub fn overwrite(mut self, overwrite: bool) -> Self {
         self.overwrite = overwrite;
         self
@@ -82,10 +88,9 @@
 
         let index = Index::create_in_dir(&index_path, SCHEMA.to_owned())?;
 
-        #[cfg(feature = "charabia")]
         index
             .tokenizers()
-            .register(CHARABIA, CharabiaTokenizer::default());
+            .register(CUSTOM_TOKENIZER, opts.tokenizer.clone());
 
         let mut index_writer = index.writer(opts.memory)?;
 
diff --git a/lib/src/search/mod.rs b/lib/src/search/mod.rs
index 983ec3899..84108212c 100644
--- a/lib/src/search/mod.rs
+++ b/lib/src/search/mod.rs
@@ -1,12 +1,13 @@
+mod constants;
 mod index;
 mod schema;
 mod search;
 
 #[cfg(feature = "charabia")]
-mod constants;
+mod charabia;
 
 #[cfg(feature = "charabia")]
-mod tokenizer;
+pub use self::charabia::*;
 
 pub use self::index::*;
 pub use self::search::*;
diff --git a/lib/src/search/schema.rs b/lib/src/search/schema.rs
index ed0a9a6f1..2cc57072f 100644
--- a/lib/src/search/schema.rs
+++ b/lib/src/search/schema.rs
@@ -1,25 +1,17 @@
 use once_cell::sync::Lazy;
-use tantivy::schema::{Field, Schema, TextOptions, STORED};
-#[cfg(feature = "charabia")]
-use tantivy::schema::{IndexRecordOption, TextFieldIndexing};
+use tantivy::schema::{Field, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, STORED};
 
-#[cfg(feature = "charabia")]
-use super::constants::CHARABIA;
+use super::constants::CUSTOM_TOKENIZER;
 
 pub(super) const SCHEMA: Lazy<Schema> = Lazy::new(|| {
     let mut schema_builder = Schema::builder();
 
-    #[cfg(feature = "charabia")]
-    let text_indexing = TextFieldIndexing::default()
-        .set_tokenizer(CHARABIA) // Set custom tokenizer
-        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
-
-    #[cfg(feature = "charabia")]
-    let text_options = TextOptions::default().set_indexing_options(text_indexing);
-
-    #[cfg(not(feature = "charabia"))]
-    let text_options = TextOptions::default();
+    let text_options = TextOptions::default().set_indexing_options(
+        TextFieldIndexing::default()
+            .set_tokenizer(CUSTOM_TOKENIZER)
+            .set_index_option(IndexRecordOption::WithFreqsAndPositions),
+    );
 
     schema_builder.add_text_field("term", text_options.clone().set_stored());
     schema_builder.add_text_field("definitions", text_options);
 
diff --git a/lib/src/search/search.rs b/lib/src/search/search.rs
index 278249f23..2d4eb250a 100644
--- a/lib/src/search/search.rs
+++ b/lib/src/search/search.rs
@@ -2,12 +2,13 @@ use std::{error::Error, ffi::OsStr, path::PathBuf};
 
 use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
 use rkyv::{archived_root, Deserialize, Infallible};
-use tantivy::{collector::TopDocs, query::QueryParser, Index, ReloadPolicy};
+use tantivy::{
+    collector::TopDocs, query::QueryParser, tokenizer::TextAnalyzer, Index, ReloadPolicy,
+};
 
 use crate::{Dictionary, Entry};
 
-#[cfg(feature = "charabia")]
-use super::{constants::CHARABIA, tokenizer::CharabiaTokenizer};
+use super::constants::{CUSTOM_TOKENIZER, DEFAULT_TOKENIZER};
 
 use super::{
     get_default_index_dir,
@@ -18,6 +19,7 @@ pub struct SearchOptions {
     pub dir: PathBuf,
     pub threshold: u32,
     pub limit: usize,
+    pub tokenizer: TextAnalyzer,
 }
 
 impl SearchOptions {
@@ -26,6 +28,7 @@ pub fn new() -> Self {
             dir: get_default_index_dir(),
             threshold: 1,
             limit: 10,
+            tokenizer: DEFAULT_TOKENIZER.to_owned(),
         }
     }
@@ -34,6 +37,14 @@
         self
     }
 
+    pub fn tokenizer<T>(mut self, tokenizer: T) -> Self
+    where
+        TextAnalyzer: From<T>,
+    {
+        self.tokenizer = tokenizer.into();
+        self
+    }
+
     pub fn threshold(mut self, threshold: u32) -> Self {
         self.threshold = threshold;
         self
@@ -61,10 +72,9 @@
         let index_path = opts.dir.join(self.id.as_str());
         let index = Index::open_in_dir(&index_path)?;
 
-        #[cfg(feature = "charabia")]
         index
             .tokenizers()
-            .register(CHARABIA, CharabiaTokenizer::default());
+            .register(CUSTOM_TOKENIZER, opts.tokenizer.to_owned());
 
         let reader = index
             .reader_builder()
diff --git a/lib/tests/sql.rs b/lib/tests/sql.rs
index 07daff190..6f4b6c616 100644
--- a/lib/tests/sql.rs
+++ b/lib/tests/sql.rs
@@ -4,7 +4,7 @@ mod helpers;
 mod index_tests {
     use super::helpers::EXAMPLE_DICTIONARY_1;
     use insta::assert_snapshot;
-    use odict::{SQLDialect, ToSQL, ID};
+    use odict::{SQLDialect, ToSQL};
     use regex::Regex;
 
     #[test]