Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Advanced Chunking"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this notebook, we demonstrate an advanced chunking example, showcasing how a user can:\n", | ||
"- serialize and include some parts of the metadata (as per application logic) into the final chunk text, and\n", | ||
"- leverage a tokenizer to build specialized chunking logic, e.g. to impose a maximum token length and futher split chunks beyond that." | ||
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We first convert an example document:"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from docling.document_converter import DocumentConverter\n",
"\n",
"source = \"https://arxiv.org/pdf/2408.09869\"\n",
"converter = DocumentConverter()\n",
"doc = converter.convert(source=source).document"
]
},
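{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check (not part of the original flow, and assuming the `export_to_markdown()` method is available on the converted document), we can peek at the beginning of the document's Markdown export:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative only: preview the first few hundred characters of the converted document.\n",
"print(doc.export_to_markdown()[:500])"
]
},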
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We set up the tokenizer we want to use:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoTokenizer\n",
"\n",
"model_id = \"BAAI/bge-small-en-v1.5\"\n",
"tokenizer = AutoTokenizer.from_pretrained(model_id)"
]
},
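{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a brief illustrative aside (not part of the original notebook), the cell below shows what the tokenizer returns for an arbitrary sample string: `offset_mapping` holds one `(start, end)` character span per token, which the chunker defined further down uses both to count tokens and to map token positions back to character positions:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sample = \"Docling converts PDF documents.\"  # arbitrary example string\n",
"encoding = tokenizer(sample, return_offsets_mapping=True, add_special_tokens=False)\n",
"# Pair each token with its (start, end) character span in the sample string:\n",
"list(zip(tokenizer.convert_ids_to_tokens(encoding[\"input_ids\"]), encoding[\"offset_mapping\"]))"
]
},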
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Below we define the metadata serialization logic as well as the way the tokenizer is used to enforce the token limit.\n",
"\n",
"The whole process is wrapped as a `BaseChunker` implementation that internally uses a `HierarchicalChunker` and applies the logic above on top of the latter's results."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from copy import deepcopy\n",
"from typing import Iterable, Iterator\n",
"\n",
"from docling_core.transforms.chunker import (\n",
"    BaseChunk,\n",
"    BaseChunker,\n",
"    DocMeta,\n",
"    HierarchicalChunker,\n",
")\n",
"from docling_core.types.doc import DoclingDocument as DLDocument\n",
"from pydantic import PositiveInt\n",
"\n",
"\n",
"class MaxTokenLimitingChunker(BaseChunker):\n",
"    inner_chunker: BaseChunker = HierarchicalChunker()\n",
"    max_tokens: PositiveInt = 512\n",
"    delim: str = \"\\n\"\n",
"\n",
"    def _serialize_meta_to_include(self, meta: DocMeta) -> str:\n",
"        meta_parts = []\n",
"        headings_part = self.delim.join(meta.headings or [])\n",
"        if headings_part:\n",
"            meta_parts.append(headings_part)\n",
"        captions_part = self.delim.join(meta.captions or [])\n",
"        if captions_part:\n",
"            meta_parts.append(captions_part)\n",
"        return self.delim.join(meta_parts)\n",
"\n",
"    def _split_above_max_tokens(self, chunk_iter: Iterable[BaseChunk]):\n",
"        for chunk in chunk_iter:\n",
"            meta = DocMeta.model_validate(chunk.meta)\n",
"            meta_text = self._serialize_meta_to_include(meta=meta)\n",
"            meta_list = [meta_text] if meta_text else []\n",
"            full_ser = self.delim.join(meta_list + ([chunk.text] if chunk.text else []))\n",
"\n",
" meta_tokens = tokenizer(\n", | ||
" meta_text, return_offsets_mapping=True, add_special_tokens=False\n", | ||
" )[\"offset_mapping\"]\n", | ||
" delim_tokens = (\n", | ||
" tokenizer(\n", | ||
" self.delim, return_offsets_mapping=True, add_special_tokens=False\n", | ||
" )[\"offset_mapping\"]\n", | ||
" if meta_text\n", | ||
" else []\n", | ||
" )\n", | ||
" num_tokens_avail_for_text = self.max_tokens - (\n", | ||
" len(meta_tokens) + len(delim_tokens)\n", | ||
" )\n", | ||
"\n", | ||
" text_tokens = tokenizer(\n", | ||
" chunk.text, return_offsets_mapping=True, add_special_tokens=False\n", | ||
" )[\"offset_mapping\"]\n", | ||
" num_text_tokens = len(text_tokens)\n", | ||
"\n", | ||
" if (\n", | ||
" num_text_tokens <= num_tokens_avail_for_text\n", | ||
" ): # chunk already within token limit\n", | ||
" c = deepcopy(chunk)\n", | ||
" c.text = full_ser\n", | ||
" yield c\n", | ||
" else: # chunk requires further splitting to meet token limit\n", | ||
" fitting_texts = [\n", | ||
" chunk.text[\n", | ||
" text_tokens[base][0] : text_tokens[\n", | ||
" min(base + num_tokens_avail_for_text, num_text_tokens) - 1\n", | ||
" ][1]\n", | ||
" ]\n", | ||
" for base in range(0, num_text_tokens, num_tokens_avail_for_text)\n", | ||
" ]\n", | ||
" for text in fitting_texts:\n", | ||
" c = deepcopy(chunk)\n", | ||
" c.text = self.delim.join(meta_list + [text])\n", | ||
" yield c\n", | ||
"\n", | ||
" def chunk(self, dl_doc: DLDocument, **kwargs) -> Iterator[BaseChunk]:\n", | ||
" chunk_iter = self.inner_chunker.chunk(dl_doc=dl_doc, **kwargs)\n", | ||
" yield from self._split_above_max_tokens(chunk_iter=chunk_iter)" | ||
] | ||
}, | ||
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In the example invocation shown below, one can see how a single original chunk (`self_ref == \"#/texts/8\"`) is split into multiple ones:"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'len=64 text=1 Introduction\\nConverting PDF documents back into a machine-processable format has been a major challenge for decades due to their huge variability in formats, weak standardization and printing-optimized characteristic, which discards most structural features and metadata. With the advent of LLMs and popular application patterns such as retrieval-augmented generation ('"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'len=64 text=1 Introduction\\nRAG), leveraging the rich content embedded in PDFs has become ever more relevant. In the past decade, several powerful document understanding solutions have emerged on the market, most of which are commercial software, cloud offerings [3] and most recently, multi-modal vision-language models. As of today'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'len=26 text=1 Introduction\\n, only a handful of open-source tools cover PDF conversion, leaving a significant feature and quality gap to proprietary solutions.'"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"chunker = MaxTokenLimitingChunker(max_tokens=64)\n",
"chunk_iter = chunker.chunk(dl_doc=doc)\n",
"\n",
"for chunk in chunk_iter:\n",
"    meta = DocMeta.model_validate(chunk.meta)\n",
"    if meta.doc_items[0].self_ref == \"#/texts/8\":\n",
"        display(\n",
"            f\"len={len(tokenizer(chunk.text, return_offsets_mapping=True, add_special_tokens=False)['offset_mapping'])} text={chunk.text}\"\n",
"        )"
]
},
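{
"cell_type": "markdown",
"metadata": {},
"source": [
"As an additional illustrative check (not part of the original notebook), we can count the tokens of every produced chunk and look at the largest value, to see the effect of the configured limit:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Token count per chunk, using the same tokenizer as the chunker above:\n",
"token_counts = [\n",
"    len(tokenizer(chunk.text, add_special_tokens=False)[\"input_ids\"])\n",
"    for chunk in chunker.chunk(dl_doc=doc)\n",
"]\n",
"max(token_counts), len(token_counts)"
]
},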
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}