Fix small chunk behavior (#84)
* Make fewer tests fail without features

* Fix for regression not being fixed for tiny chunk sizes

For very small chunk sizes (e.g. 5 tokens), chunking wasn't completely brought back to pre-v0.5.0 behavior. While sizes of 10 or higher seemed to be unaffected, smaller sizes had a higher chance of hitting this occasional bug. While an edge case, the behavior is fixed now (a small illustration follows the changed-files summary below).

* Exclude tokenizers from packaging

* Readable tokenizer file
benbrandt authored Jan 20, 2024
1 parent d28f8c0 commit fb21920
Showing 6 changed files with 255,859 additions and 3 deletions.
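
For context, here is a minimal sketch of what a "very small chunk size" means in practice. It uses the crate's character-based `Characters` sizer and the same `chunks(text, max)` API exercised by the tests in this commit; the input text and resulting chunks are illustrative only, not part of the change.

```rust
use text_splitter::{Characters, TextSplitter};

fn main() {
    // A capacity of 5 is small enough to exercise the edge case described
    // in the commit message; capacities of 10 or more seemed unaffected.
    let splitter = TextSplitter::new(Characters);
    let chunks: Vec<_> = splitter.chunks("Some text to split.", 5).collect();

    // Every yielded chunk fits within the 5-character capacity.
    for chunk in &chunks {
        assert!(chunk.chars().count() <= 5);
    }
    println!("{chunks:?}");
}
```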
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,11 @@
# Changelog

## v0.6.1

### Fixes

- Fix error in section filtering that left the chunk behavior regression from v0.5.0 unfixed at very tiny chunk capacities. For most commonly used chunk sizes, this shouldn't have been an issue.

## v0.6.0

### Breaking Changes
4 changes: 2 additions & 2 deletions Cargo.toml
@@ -1,14 +1,14 @@
[package]
name = "text-splitter"
version = "0.6.0"
version = "0.6.1"
authors = ["Ben Brandt <benjamin.j.brandt@gmail.com>"]
edition = "2021"
description = "Split text into semantic chunks, up to a desired chunk size. Supports calculating length by characters and tokens (when used with large language models)."
repository = "https://github.com/benbrandt/text-splitter"
license = "MIT"
keywords = ["text", "split", "tokenizer", "nlp", "ai"]
categories = ["text-processing"]
exclude = ["/tests/snapshots/**", "/tests/inputs/**", "/bindings/**"]
exclude = ["/tests/snapshots/**", "/tests/inputs/**", "/bindings/**", "/tests/tokenizers/**"]
rust-version = "1.65.0"

[package.metadata.docs.rs]
2 changes: 1 addition & 1 deletion src/lib.rs
@@ -787,7 +787,7 @@ where
// likely a meaningful breakpoint we want to preserve. We already know that the next highest doesn't fit anyway,
// so we should be safe to break once we reach it.
.take_while_inclusive(move |(offset, _)| {
max_encoded_offset.map_or(true, |max| offset < &max)
max_encoded_offset.map_or(true, |max| offset <= &max)
})
.filter(|(_, str)| !str.is_empty()),
)
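
The one-character change above is the actual fix. `take_while_inclusive` yields items while the predicate holds, plus the first item for which it fails, so switching the comparison from `<` to `<=` lets one more break candidate (the first offset strictly past the maximum encoded offset) through to the later filtering steps. A standalone sketch of the difference, using `itertools` directly with made-up offsets rather than the splitter's internal state:

```rust
use itertools::Itertools;

fn main() {
    // Stand-in for candidate break offsets; `max` plays the role of
    // `max_encoded_offset` in the diff above.
    let offsets = [0usize, 3, 7, 12];
    let max = 7usize;

    // With `offset < max`, iteration stops at the first offset >= max,
    // so the candidate just past `max` (12 here) is never yielded.
    let old: Vec<_> = offsets
        .iter()
        .copied()
        .take_while_inclusive(|offset| *offset < max)
        .collect();
    assert_eq!(old, [0, 3, 7]);

    // With `offset <= max`, the first offset strictly past `max` is also
    // yielded, keeping one extra break candidate in play.
    let new: Vec<_> = offsets
        .iter()
        .copied()
        .take_while_inclusive(|offset| *offset <= max)
        .collect();
    assert_eq!(new, [0, 3, 7, 12]);
}
```

At typical capacities there are plenty of candidates either way, which is presumably why only very small chunk sizes surfaced the issue.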
13 changes: 13 additions & 0 deletions tests/text_splitter.rs
@@ -128,3 +128,16 @@ fn random_chunk_range() {
}
}
}

#[cfg(feature = "tokenizers")]
#[test]
fn huggingface_small_chunk_behavior() {
let tokenizer =
tokenizers::Tokenizer::from_file("./tests/tokenizers/huggingface.json").unwrap();
let splitter = TextSplitter::new(tokenizer);

let text = "notokenexistsforthisword";
let chunks = splitter.chunks(text, 5).collect::<Vec<_>>();

assert_eq!(chunks, ["notokenexistsforth", "isword"]);
}
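
Because the new test is gated behind the `tokenizers` feature, it only runs when that feature is enabled, e.g. `cargo test --features tokenizers`. The input is a made-up word with no single matching token, so the tokenizer has to fall back to sub-word pieces at a capacity of 5 tokens.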
8 changes: 8 additions & 0 deletions tests/text_splitter_snapshots.rs
@@ -2,7 +2,9 @@ use std::fs

use once_cell::sync::Lazy;
use text_splitter::{Characters, ChunkSizer, TextSplitter};
#[cfg(feature = "tiktoken-rs")]
use tiktoken_rs::{cl100k_base, CoreBPE};
#[cfg(feature = "tokenizers")]
use tokenizers::Tokenizer;

#[test]
@@ -75,9 +77,11 @@ fn characters_range_trim() {
});
}

#[cfg(feature = "tokenizers")]
static HUGGINGFACE_TOKENIZER: Lazy<Tokenizer> =
Lazy::new(|| Tokenizer::from_pretrained("bert-base-cased", None).unwrap());

#[cfg(feature = "tokenizers")]
#[test]
fn huggingface_default() {
insta::glob!("inputs/text/*.txt", |path| {
@@ -99,6 +103,7 @@ fn huggingface_default() {
});
}

#[cfg(feature = "tokenizers")]
#[test]
fn huggingface_trim() {
insta::glob!("inputs/text/*.txt", |path| {
@@ -119,8 +124,10 @@ fn huggingface_trim() {
});
}

#[cfg(feature = "tiktoken-rs")]
static TIKTOKEN_TOKENIZER: Lazy<CoreBPE> = Lazy::new(|| cl100k_base().unwrap());

#[cfg(feature = "tiktoken-rs")]
#[test]
fn tiktoken_default() {
insta::glob!("inputs/text/*.txt", |path| {
@@ -142,6 +149,7 @@ fn tiktoken_default() {
});
}

#[cfg(feature = "tiktoken-rs")]
#[test]
fn tiktoken_trim() {
insta::glob!("inputs/text/*.txt", |path| {
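
With these gates in place, running the suite with no features enabled only compiles the character-based snapshot tests; the HuggingFace and tiktoken snapshots require the `tokenizers` and `tiktoken-rs` features respectively, which is what the "make fewer tests fail without features" item in the commit message refers to.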