From d22ae88c7a52e4974cc0e77c6148cf19bb108918 Mon Sep 17 00:00:00 2001 From: ian_Cin Date: Thu, 21 Mar 2024 22:48:20 +0700 Subject: [PATCH] make default installation faster (#2) * remove cohere as default * refactor dependencies * use llama-index pdf reader as default (pypdf) * fix some lazy docstring * update install scripts * minor fix --- .github/workflows/unit-test.yaml | 2 +- .../kotaemon/indices/ingests/files.py | 3 +- libs/kotaemon/kotaemon/loaders/base.py | 2 +- libs/kotaemon/kotaemon/loaders/docx_loader.py | 2 +- libs/kotaemon/kotaemon/loaders/html_loader.py | 2 +- libs/kotaemon/pyproject.toml | 32 +++++++++++-------- libs/ktem/ktem/index/file/pipelines.py | 6 ++-- libs/ktem/pyproject.toml | 4 --- scripts/run_linux.sh | 2 +- scripts/run_macos.sh | 2 +- scripts/run_windows.bat | 2 +- 11 files changed, 28 insertions(+), 31 deletions(-) diff --git a/.github/workflows/unit-test.yaml b/.github/workflows/unit-test.yaml index 82b9154d..c94ad194 100644 --- a/.github/workflows/unit-test.yaml +++ b/.github/workflows/unit-test.yaml @@ -89,7 +89,7 @@ jobs: run: | python -m pip install --upgrade pip cd libs/kotaemon - pip install -U --upgrade-strategy eager -e .[dev] + pip install -U --upgrade-strategy eager -e .[all] - name: New dependencies cache for key ${{ steps.restore-dependencies.outputs.cache-primary-key }} if: | diff --git a/libs/kotaemon/kotaemon/indices/ingests/files.py b/libs/kotaemon/kotaemon/indices/ingests/files.py index 80044afa..ed00e5cb 100644 --- a/libs/kotaemon/kotaemon/indices/ingests/files.py +++ b/libs/kotaemon/kotaemon/indices/ingests/files.py @@ -7,7 +7,6 @@ from kotaemon.indices.extractors import BaseDocParser from kotaemon.indices.splitters import BaseSplitter, TokenSplitter from kotaemon.loaders import ( - AutoReader, DirectoryReader, MathpixPDFReader, OCRReader, @@ -59,7 +58,7 @@ def _get_reader(self, input_files: list[str | Path]): file_extractors[ext] = cls() if self.pdf_mode == "normal": - file_extractors[".pdf"] = 
AutoReader("UnstructuredReader") # type: ignore + pass # use default loader of llama-index which is pypdf elif self.pdf_mode == "ocr": file_extractors[".pdf"] = OCRReader() else: diff --git a/libs/kotaemon/kotaemon/loaders/base.py b/libs/kotaemon/kotaemon/loaders/base.py index ca27e491..2e52f729 100644 --- a/libs/kotaemon/kotaemon/loaders/base.py +++ b/libs/kotaemon/kotaemon/loaders/base.py @@ -55,7 +55,7 @@ def _get_wrapped_class(self) -> Type["BaseReader"]: def _get_wrapped_class(self) -> Type["LIBaseReader"]: raise NotImplementedError( - "Please return the relevant Langchain class in in _get_lc_class" + "Please return the relevant llama-index class in in _get_wrapped_class" ) def __init__(self, *args, **kwargs): diff --git a/libs/kotaemon/kotaemon/loaders/docx_loader.py b/libs/kotaemon/kotaemon/loaders/docx_loader.py index b8f77b8f..dcec5398 100644 --- a/libs/kotaemon/kotaemon/loaders/docx_loader.py +++ b/libs/kotaemon/kotaemon/loaders/docx_loader.py @@ -33,7 +33,7 @@ def load_data( """Load data using Docx reader Args: - file_path (Path): Path to PDF file + file_path (Path): Path to .docx file Returns: List[Document]: list of documents extracted from the HTML file diff --git a/libs/kotaemon/kotaemon/loaders/html_loader.py b/libs/kotaemon/kotaemon/loaders/html_loader.py index fd0eddd6..1295cfca 100644 --- a/libs/kotaemon/kotaemon/loaders/html_loader.py +++ b/libs/kotaemon/kotaemon/loaders/html_loader.py @@ -37,7 +37,7 @@ def load_data( """Load data using Html reader Args: - file_path: path to pdf file + file_path: path to HTML file extra_info: extra information passed to this reader during extracting data Returns: diff --git a/libs/kotaemon/pyproject.toml b/libs/kotaemon/pyproject.toml index 5abfb98f..1ba69636 100644 --- a/libs/kotaemon/pyproject.toml +++ b/libs/kotaemon/pyproject.toml @@ -17,6 +17,8 @@ description = "Kotaemon core library for AI development." 
dependencies = [ "langchain", "langchain-community", + "langchain-openai", + "openai", "theflow", "llama-index>=0.9.0,<0.10.0", "llama-hub", @@ -27,6 +29,11 @@ dependencies = [ "pandas", "trogon", "tenacity", + "python-dotenv", # currently used to read configs from file, should be removed in the future + "chromadb", + "unstructured", + "pypdf", + "html2text", ] readme = "README.md" license = { text = "MIT License" } @@ -42,31 +49,28 @@ classifiers = [ ] [project.optional-dependencies] -dev = [ - "ipython", - "pytest", - "pre-commit", - "black", - "flake8", - "sphinx", - "coverage", - "openai", - "langchain-openai", - "chromadb", +adv = [ "wikipedia", "duckduckgo-search", "googlesearch-python", "python-docx", - "python-dotenv", "pytest-mock", "unstructured[pdf]", "sentence_transformers", "cohere", "elasticsearch", - "pypdf", - "html2text", "llama-cpp-python", ] +dev = [ + "ipython", + "pytest", + "pre-commit", + "black", + "flake8", + "sphinx", + "coverage", +] +all = ["kotaemon[adv,dev]"] [project.scripts] kh = "kotaemon.cli:main" diff --git a/libs/ktem/ktem/index/file/pipelines.py b/libs/ktem/ktem/index/file/pipelines.py index ff7c8955..d15d2fb4 100644 --- a/libs/ktem/ktem/index/file/pipelines.py +++ b/libs/ktem/ktem/index/file/pipelines.py @@ -25,7 +25,7 @@ from kotaemon.base import RetrievedDocument from kotaemon.indices import VectorIndexing, VectorRetrieval from kotaemon.indices.ingests import DocumentIngestor -from kotaemon.indices.rankings import BaseReranking, CohereReranking, LLMReranking +from kotaemon.indices.rankings import BaseReranking, LLMReranking from .base import BaseFileIndexIndexing, BaseFileIndexRetriever @@ -67,9 +67,7 @@ class DocumentRetrievalPipeline(BaseFileIndexRetriever): vector_retrieval: VectorRetrieval = VectorRetrieval.withx( embedding=embeddings.get_default(), ) - reranker: BaseReranking = CohereReranking.withx( - cohere_api_key=getattr(settings, "COHERE_API_KEY", "") - ) >> LLMReranking.withx(llm=llms.get_lowest_cost()) + reranker: 
BaseReranking = LLMReranking.withx(llm=llms.get_lowest_cost()) get_extra_table: bool = False def run( diff --git a/libs/ktem/pyproject.toml b/libs/ktem/pyproject.toml index c3160f19..6fee8a04 100644 --- a/libs/ktem/pyproject.toml +++ b/libs/ktem/pyproject.toml @@ -13,18 +13,14 @@ version = "0.2.0" requires-python = ">= 3.10" description = "RAG-based Question and Answering Application" dependencies = [ - "chromadb", "click", - "cohere", "platformdirs", "pluggy", "python-decouple", - "python-dotenv", "python-pptx", "sqlalchemy", "sqlmodel", "tiktoken", - "unstructured[pdf]", ] readme = "README.md" license = { text = "MIT License" } diff --git a/scripts/run_linux.sh b/scripts/run_linux.sh index 7298b87b..8e2ea056 100755 --- a/scripts/run_linux.sh +++ b/scripts/run_linux.sh @@ -92,7 +92,7 @@ function install_dependencies() { if pip list 2>/dev/null | grep -q "kotaemon"; then echo "Requirements are already installed" else - local kotaemon_root="$(pwd)/libs/kotaemon/.[dev]" + local kotaemon_root="$(pwd)/libs/kotaemon" local ktem_root="$(pwd)/libs/ktem/" echo "" && echo "Install kotaemon's requirements" diff --git a/scripts/run_macos.sh b/scripts/run_macos.sh index 6ad9901d..de71f75f 100755 --- a/scripts/run_macos.sh +++ b/scripts/run_macos.sh @@ -92,7 +92,7 @@ function install_dependencies() { if pip list 2>/dev/null | grep -q "kotaemon"; then echo "Requirements are already installed" else - local kotaemon_root="$(pwd)/libs/kotaemon/.[dev]" + local kotaemon_root="$(pwd)/libs/kotaemon" local ktem_root="$(pwd)/libs/ktem/" echo "" && echo "Install kotaemon's requirements" diff --git a/scripts/run_windows.bat b/scripts/run_windows.bat index 4e5db862..e365686a 100644 --- a/scripts/run_windows.bat +++ b/scripts/run_windows.bat @@ -114,7 +114,7 @@ IF %ERRORLEVEL% == 0 ( ECHO Dependencies are already installed ) ELSE ( ECHO Install kotaemon's requirements - CALL python -m pip install -e "%CD%\libs\kotaemon\.[dev]" + CALL python -m pip install -e "%CD%\libs\kotaemon" ECHO Install 
ktem's requirements CALL python -m pip install -e "%CD%\libs\ktem"