diff --git a/docs/docs/how_to/markdown_header_metadata_splitter.ipynb b/docs/docs/how_to/markdown_header_metadata_splitter.ipynb index 24ae5e421f6d1..2f4abafd00340 100644 --- a/docs/docs/how_to/markdown_header_metadata_splitter.ipynb +++ b/docs/docs/how_to/markdown_header_metadata_splitter.ipynb @@ -261,6 +261,50 @@ "splits = text_splitter.split_documents(md_header_splits)\n", "splits" ] + }, + { + "cell_type": "markdown", + "id": "b7b557f7", + "metadata": {}, + "source": [ + "### How to preserve whitespace from the original document:\n", + "\n", + "By default, `MarkdownHeaderTextSplitter` strips whitespace and newlines from the resulting documents, which can sometimes cause issues with markdown sections like code blocks or nested lists. Use the `ExperimentalMarkdownSyntaxTextSplitter` to preserve whitespace in these instances." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "ba48193b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(metadata={'Header 1': 'Foo'}, page_content='# Foo \\nThis is Jim'),\n", + " Document(metadata={'Header 1': 'Foo', 'Header 2': 'Bar'}, page_content='## Bar \\n* Bullet 1\\n* Sub-bullet a')]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from langchain_text_splitters import ExperimentalMarkdownSyntaxTextSplitter\n", + "\n", + "markdown_document = \"# Foo\\n\\n This is Jim \\n\\n## Bar\\n\\n* Bullet 1\\n * Sub-bullet a\"\n", + "\n", + "headers_to_split_on = [\n", + " (\"#\", \"Header 1\"),\n", + " (\"##\", \"Header 2\"),\n", + " (\"###\", \"Header 3\"),\n", + "]\n", + "\n", + "markdown_splitter = ExperimentalMarkdownSyntaxTextSplitter(headers_to_split_on, strip_headers=False)\n", + "md_header_splits = markdown_splitter.split_text(markdown_document)\n", + "md_header_splits" + ] } ], "metadata": { @@ -279,7 +323,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.4" + 
"version": "3.11.6" } }, "nbformat": 4,