docs: add Haystack RAG example (#615)

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
DS4SD · Dec 17, 2024 · 3e599c7 · 3e599c7
1 parent 3b53bd3
commit 3e599c7
Show file tree

Hide file tree

Showing 2 changed files with 387 additions and 0 deletions.
diff --git a/docs/examples/rag_haystack.ipynb b/docs/examples/rag_haystack.ipynb
@@ -0,0 +1,386 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<a href=\"https://colab.research.google.com/github/DS4SD/docling/blob/main/docs/examples/rag_haystack.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# RAG with Haystack"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Overview"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This example leverages the\n",
+    "[Haystack Docling extension](https://github.com/DS4SD/docling-haystack), along with\n",
+    "Milvus-based document store and retriever instances, as well as sentence-transformers\n",
+    "embeddings.\n",
+    "\n",
+    "The presented `DoclingConverter` component enables you to:\n",
+    "- use various document types in your LLM applications with ease and speed, and\n",
+    "- leverage Docling's rich format for advanced, document-native grounding.\n",
+    "\n",
+    "`DoclingConverter` supports two different export modes:\n",
+    "- `ExportType.MARKDOWN`: if you want to capture each input document as a separate\n",
+    "  Haystack document, or\n",
+    "- `ExportType.DOC_CHUNKS` (default): if you want to have each input document chunked and\n",
+    "  to then capture each individual chunk as a separate Haystack document downstream.\n",
+    "\n",
+    "The example allows to explore both modes via parameter `EXPORT_TYPE`; depending on the\n",
+    "value set, the ingestion and RAG pipelines are then set up accordingly."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Setup"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "- 👉 For best conversion speed, use GPU acceleration whenever available; e.g. if running on Colab, use GPU-enabled runtime.\n",
+    "- Notebook uses HuggingFace's Inference API; for increased LLM quota, token can be provided via env var `HF_TOKEN`.\n",
+    "- Requirements can be installed as shown below (`--no-warn-conflicts` meant for Colab's pre-populated Python env; feel free to remove for stricter usage):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Note: you may need to restart the kernel to use updated packages.\n"
+     ]
+    }
+   ],
+   "source": [
+    "%pip install -q --progress-bar off --no-warn-conflicts docling-haystack haystack-ai docling pymilvus milvus-haystack sentence-transformers python-dotenv"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "from pathlib import Path\n",
+    "from tempfile import mkdtemp\n",
+    "\n",
+    "from docling_haystack.converter import ExportType\n",
+    "from dotenv import load_dotenv\n",
+    "\n",
+    "def _get_env_from_colab_or_os(key):\n",
+    "    try:\n",
+    "        from google.colab import userdata\n",
+    "\n",
+    "        try:\n",
+    "            return userdata.get(key)\n",
+    "        except userdata.SecretNotFoundError:\n",
+    "            pass\n",
+    "    except ImportError:\n",
+    "        pass\n",
+    "    return os.getenv(key)\n",
+    "\n",
+    "load_dotenv()\n",
+    "HF_TOKEN = _get_env_from_colab_or_os(\"HF_TOKEN\")\n",
+    "PATHS = [\"https://arxiv.org/pdf/2408.09869\"]  # Docling Technical Report\n",
+    "EMBED_MODEL_ID = \"sentence-transformers/all-MiniLM-L6-v2\"\n",
+    "GENERATION_MODEL_ID = \"mistralai/Mixtral-8x7B-Instruct-v0.1\"\n",
+    "EXPORT_TYPE = ExportType.DOC_CHUNKS\n",
+    "QUESTION = \"Which are the main AI models in Docling?\"\n",
+    "TOP_K = 3\n",
+    "MILVUS_URI = str(Path(mkdtemp()) / \"docling.db\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Indexing pipeline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Token indices sequence length is longer than the specified maximum sequence length for this model (1041 > 512). Running this sequence through the model will result in indexing errors\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "80beca8762c34095a21467fb7f056059",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "{'writer': {'documents_written': 54}}"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from docling_haystack.converter import DoclingConverter\n",
+    "from haystack import Pipeline\n",
+    "from haystack.components.embedders import (\n",
+    "    SentenceTransformersDocumentEmbedder,\n",
+    "    SentenceTransformersTextEmbedder,\n",
+    ")\n",
+    "from haystack.components.preprocessors import DocumentSplitter\n",
+    "from haystack.components.writers import DocumentWriter\n",
+    "from milvus_haystack import MilvusDocumentStore, MilvusEmbeddingRetriever\n",
+    "\n",
+    "from docling.chunking import HybridChunker\n",
+    "\n",
+    "document_store = MilvusDocumentStore(\n",
+    "    connection_args={\"uri\": MILVUS_URI},\n",
+    "    drop_old=True,\n",
+    "    text_field=\"txt\",  # set for preventing conflict with same-name metadata field\n",
+    ")\n",
+    "\n",
+    "idx_pipe = Pipeline()\n",
+    "idx_pipe.add_component(\n",
+    "    \"converter\",\n",
+    "    DoclingConverter(\n",
+    "        export_type=EXPORT_TYPE,\n",
+    "        chunker=HybridChunker(tokenizer=EMBED_MODEL_ID),\n",
+    "    ),\n",
+    ")\n",
+    "idx_pipe.add_component(\n",
+    "    \"embedder\",\n",
+    "    SentenceTransformersDocumentEmbedder(model=EMBED_MODEL_ID),\n",
+    ")\n",
+    "idx_pipe.add_component(\"writer\", DocumentWriter(document_store=document_store))\n",
+    "if EXPORT_TYPE == ExportType.DOC_CHUNKS:\n",
+    "    idx_pipe.connect(\"converter\", \"embedder\")\n",
+    "elif EXPORT_TYPE == ExportType.MARKDOWN:\n",
+    "    idx_pipe.add_component(\n",
+    "        \"splitter\",\n",
+    "        DocumentSplitter(split_by=\"sentence\", split_length=1),\n",
+    "    )\n",
+    "    idx_pipe.connect(\"converter.documents\", \"splitter.documents\")\n",
+    "    idx_pipe.connect(\"splitter.documents\", \"embedder.documents\")\n",
+    "else:\n",
+    "    raise ValueError(f\"Unexpected export type: {EXPORT_TYPE}\")\n",
+    "idx_pipe.connect(\"embedder\", \"writer\")\n",
+    "idx_pipe.run({\"converter\": {\"paths\": PATHS}})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## RAG pipeline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d753748e2b624896ad2caf5e8368b041",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/pva/work/github.com/DS4SD/docling/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_client.py:2232: FutureWarning: `stop_sequences` is a deprecated argument for `text_generation` task and will be removed in version '0.28.0'. Use `stop` instead.\n",
+      "  warnings.warn(\n"
+     ]
+    }
+   ],
+   "source": [
+    "from haystack.components.builders import AnswerBuilder\n",
+    "from haystack.components.builders.prompt_builder import PromptBuilder\n",
+    "from haystack.components.generators import HuggingFaceAPIGenerator\n",
+    "from haystack.utils import Secret\n",
+    "\n",
+    "prompt_template = \"\"\"\n",
+    "    Given these documents, answer the question.\n",
+    "    Documents:\n",
+    "    {% for doc in documents %}\n",
+    "        {{ doc.content }}\n",
+    "    {% endfor %}\n",
+    "    Question: {{query}}\n",
+    "    Answer:\n",
+    "    \"\"\"\n",
+    "\n",
+    "rag_pipe = Pipeline()\n",
+    "rag_pipe.add_component(\n",
+    "    \"embedder\",\n",
+    "    SentenceTransformersTextEmbedder(model=EMBED_MODEL_ID),\n",
+    ")\n",
+    "rag_pipe.add_component(\n",
+    "    \"retriever\",\n",
+    "    MilvusEmbeddingRetriever(document_store=document_store, top_k=TOP_K),\n",
+    ")\n",
+    "rag_pipe.add_component(\"prompt_builder\", PromptBuilder(template=prompt_template))\n",
+    "rag_pipe.add_component(\n",
+    "    \"llm\",\n",
+    "    HuggingFaceAPIGenerator(\n",
+    "        api_type=\"serverless_inference_api\",\n",
+    "        api_params={\"model\": GENERATION_MODEL_ID},\n",
+    "        token=Secret.from_token(HF_TOKEN) if HF_TOKEN else None,\n",
+    "    ),\n",
+    ")\n",
+    "rag_pipe.add_component(\"answer_builder\", AnswerBuilder())\n",
+    "rag_pipe.connect(\"embedder.embedding\", \"retriever\")\n",
+    "rag_pipe.connect(\"retriever\", \"prompt_builder.documents\")\n",
+    "rag_pipe.connect(\"prompt_builder\", \"llm\")\n",
+    "rag_pipe.connect(\"llm.replies\", \"answer_builder.replies\")\n",
+    "rag_pipe.connect(\"llm.meta\", \"answer_builder.meta\")\n",
+    "rag_pipe.connect(\"retriever\", \"answer_builder.documents\")\n",
+    "rag_res = rag_pipe.run(\n",
+    "    {\n",
+    "        \"embedder\": {\"text\": QUESTION},\n",
+    "        \"prompt_builder\": {\"query\": QUESTION},\n",
+    "        \"answer_builder\": {\"query\": QUESTION},\n",
+    "    }\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Below we print out the RAG results. If you have used `ExportType.DOC_CHUNKS`, notice how\n",
+    "the sources contain document-level grounding (e.g. page number or bounding box\n",
+    "information):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Question:\n",
+      "Which are the main AI models in Docling?\n",
+      "\n",
+      "Answer:\n",
+      "The main AI models in Docling are a layout analysis model and TableFormer. The layout analysis model is an accurate object-detector for page elements, while TableFormer is a state-of-the-art table structure recognition model. These models are provided with pre-trained weights and a separate package for the inference code as docling-ibm-models. They are also used in the open-access deepsearch-experience, a cloud-native service for knowledge exploration tasks. Additionally, Docling plans to extend its model library with a figure-classifier model, an equation-recognition model, a code-recognition model, and more in the future.\n",
+      "\n",
+      "Sources:\n",
+      "- text: 'As part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. The first model is a layout analysis model, an accurate object-detector for page elements [13]. The second model is TableFormer [12, 9], a state-of-the-art table structure recognition model. We provide the pre-trained weights (hosted on huggingface) and a separate package for the inference code as docling-ibm-models . Both models are also powering the open-access deepsearch-experience, our cloud-native service for knowledge exploration tasks.'\n",
+      "  file: 2408.09869v5.pdf\n",
+      "  section: 3.2 AI models\n",
+      "  page: 3, bounding box: [107, 406, 504, 330]\n",
+      "- text: 'Docling implements a linear pipeline of operations, which execute sequentially on each given document (see Fig. 1). Each document is first parsed by a PDF backend, which retrieves the programmatic text tokens, consisting of string content and its coordinates on the page, and also renders a bitmap image of each page to support downstream operations. Then, the standard model pipeline applies a sequence of AI models independently on every page in the document to extract features and content, such as layout and table structures. Finally, the results from all pages are aggregated and passed through a post-processing stage, which augments metadata, detects the document language, infers reading-order and eventually assembles a typed document object which can be serialized to JSON or Markdown.'\n",
+      "  file: 2408.09869v5.pdf\n",
+      "  section: 3 Processing pipeline\n",
+      "  page: 2, bounding box: [107, 273, 504, 176]\n",
+      "- text: 'Docling is designed to allow easy extension of the model library and pipelines. In the future, we plan to extend Docling with several more models, such as a figure-classifier model, an equationrecognition model, a code-recognition model and more. This will help improve the quality of conversion for specific types of content, as well as augment extracted document metadata with additional information. Further investment into testing and optimizing GPU acceleration as well as improving the Docling-native PDF backend are on our roadmap, too.\\nWe encourage everyone to propose or implement additional features and models, and will gladly take your inputs and contributions under review . The codebase of Docling is open for use and contribution, under the MIT license agreement and in alignment with our contributing guidelines included in the Docling repository. If you use Docling in your projects, please consider citing this technical report.'\n",
+      "  section: 6 Future work and contributions\n",
+      "  page: 5, bounding box: [106, 323, 504, 258]\n"
+     ]
+    }
+   ],
+   "source": [
+    "from docling.chunking import DocChunk\n",
+    "\n",
+    "print(f\"Question:\\n{QUESTION}\\n\")\n",
+    "print(f\"Answer:\\n{rag_res['answer_builder']['answers'][0].data.strip()}\\n\")\n",
+    "print(\"Sources:\")\n",
+    "sources = rag_res[\"answer_builder\"][\"answers\"][0].documents\n",
+    "for source in sources:\n",
+    "    if EXPORT_TYPE == ExportType.DOC_CHUNKS:\n",
+    "        doc_chunk = DocChunk.model_validate(source.meta[\"dl_meta\"])\n",
+    "        print(f\"- text: {repr(doc_chunk.text)}\")\n",
+    "        if doc_chunk.meta.origin:\n",
+    "            print(f\"  file: {doc_chunk.meta.origin.filename}\")\n",
+    "        if doc_chunk.meta.headings:\n",
+    "            print(f\"  section: {' / '.join(doc_chunk.meta.headings)}\")\n",
+    "        bbox = doc_chunk.meta.doc_items[0].prov[0].bbox\n",
+    "        print(\n",
+    "            f\"  page: {doc_chunk.meta.doc_items[0].prov[0].page_no}, \"\n",
+    "            f\"bounding box: [{int(bbox.l)}, {int(bbox.t)}, {int(bbox.r)}, {int(bbox.b)}]\"\n",
+    "        )\n",
+    "    elif EXPORT_TYPE == ExportType.MARKDOWN:\n",
+    "        print(repr(source.content))\n",
+    "    else:\n",
+    "        raise ValueError(f\"Unexpected export type: {EXPORT_TYPE}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -79,6 +79,7 @@ nav:
     - Chunking:
       - "Hybrid chunking": examples/hybrid_chunking.ipynb
     - RAG / QA:
+      - "RAG with Haystack": examples/rag_haystack.ipynb
       - "RAG with LlamaIndex 🦙": examples/rag_llamaindex.ipynb
       - "RAG with LangChain 🦜🔗": examples/rag_langchain.ipynb
       - "Hybrid RAG with Qdrant": examples/hybrid_rag_qdrant.ipynb