Skip to content

Commit

Permalink
make default installation faster (#2)
Browse files Browse the repository at this point in the history
* remove cohere as default

* refactor dependencies

* use llama-index pdf reader as default (pypdf)

* fix some lazy docstring

* update install scripts

* minor fix
  • Loading branch information
lone17 authored Mar 21, 2024
1 parent a8f92b3 commit d22ae88
Show file tree
Hide file tree
Showing 11 changed files with 28 additions and 31 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/unit-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ jobs:
run: |
python -m pip install --upgrade pip
cd libs/kotaemon
pip install -U --upgrade-strategy eager -e .[dev]
pip install -U --upgrade-strategy eager -e .[all]
- name: New dependencies cache for key ${{ steps.restore-dependencies.outputs.cache-primary-key }}
if: |
Expand Down
3 changes: 1 addition & 2 deletions libs/kotaemon/kotaemon/indices/ingests/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from kotaemon.indices.extractors import BaseDocParser
from kotaemon.indices.splitters import BaseSplitter, TokenSplitter
from kotaemon.loaders import (
AutoReader,
DirectoryReader,
MathpixPDFReader,
OCRReader,
Expand Down Expand Up @@ -59,7 +58,7 @@ def _get_reader(self, input_files: list[str | Path]):
file_extractors[ext] = cls()

if self.pdf_mode == "normal":
file_extractors[".pdf"] = AutoReader("UnstructuredReader") # type: ignore
pass # use default loader of llama-index which is pypdf
elif self.pdf_mode == "ocr":
file_extractors[".pdf"] = OCRReader()
else:
Expand Down
2 changes: 1 addition & 1 deletion libs/kotaemon/kotaemon/loaders/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def _get_wrapped_class(self) -> Type["BaseReader"]:

def _get_wrapped_class(self) -> Type["LIBaseReader"]:
raise NotImplementedError(
"Please return the relevant Langchain class in in _get_lc_class"
"Please return the relevant llama-index class in in _get_wrapped_class"
)

def __init__(self, *args, **kwargs):
Expand Down
2 changes: 1 addition & 1 deletion libs/kotaemon/kotaemon/loaders/docx_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def load_data(
"""Load data using Docx reader
Args:
file_path (Path): Path to PDF file
file_path (Path): Path to .docx file
Returns:
List[Document]: list of documents extracted from the HTML file
Expand Down
2 changes: 1 addition & 1 deletion libs/kotaemon/kotaemon/loaders/html_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def load_data(
"""Load data using Html reader
Args:
file_path: path to pdf file
file_path: path to HTML file
extra_info: extra information passed to this reader during extracting data
Returns:
Expand Down
32 changes: 18 additions & 14 deletions libs/kotaemon/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ description = "Kotaemon core library for AI development."
dependencies = [
"langchain",
"langchain-community",
"langchain-openai",
"openai",
"theflow",
"llama-index>=0.9.0,<0.10.0",
"llama-hub",
Expand All @@ -27,6 +29,11 @@ dependencies = [
"pandas",
"trogon",
"tenacity",
"python-dotenv", # currently used to read configs from file; should be removed in the future
"chromadb",
"unstructured",
"pypdf",
"html2text",
]
readme = "README.md"
license = { text = "MIT License" }
Expand All @@ -42,31 +49,28 @@ classifiers = [
]

[project.optional-dependencies]
dev = [
"ipython",
"pytest",
"pre-commit",
"black",
"flake8",
"sphinx",
"coverage",
"openai",
"langchain-openai",
"chromadb",
adv = [
"wikipedia",
"duckduckgo-search",
"googlesearch-python",
"python-docx",
"python-dotenv",
"pytest-mock",
"unstructured[pdf]",
"sentence_transformers",
"cohere",
"elasticsearch",
"pypdf",
"html2text",
"llama-cpp-python",
]
dev = [
"ipython",
"pytest",
"pre-commit",
"black",
"flake8",
"sphinx",
"coverage",
]
all = ["kotaemon[adv,dev]"]

[project.scripts]
kh = "kotaemon.cli:main"
Expand Down
6 changes: 2 additions & 4 deletions libs/ktem/ktem/index/file/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from kotaemon.base import RetrievedDocument
from kotaemon.indices import VectorIndexing, VectorRetrieval
from kotaemon.indices.ingests import DocumentIngestor
from kotaemon.indices.rankings import BaseReranking, CohereReranking, LLMReranking
from kotaemon.indices.rankings import BaseReranking, LLMReranking

from .base import BaseFileIndexIndexing, BaseFileIndexRetriever

Expand Down Expand Up @@ -67,9 +67,7 @@ class DocumentRetrievalPipeline(BaseFileIndexRetriever):
vector_retrieval: VectorRetrieval = VectorRetrieval.withx(
embedding=embeddings.get_default(),
)
reranker: BaseReranking = CohereReranking.withx(
cohere_api_key=getattr(settings, "COHERE_API_KEY", "")
) >> LLMReranking.withx(llm=llms.get_lowest_cost())
reranker: BaseReranking = LLMReranking.withx(llm=llms.get_lowest_cost())
get_extra_table: bool = False

def run(
Expand Down
4 changes: 0 additions & 4 deletions libs/ktem/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,14 @@ version = "0.2.0"
requires-python = ">= 3.10"
description = "RAG-based Question and Answering Application"
dependencies = [
"chromadb",
"click",
"cohere",
"platformdirs",
"pluggy",
"python-decouple",
"python-dotenv",
"python-pptx",
"sqlalchemy",
"sqlmodel",
"tiktoken",
"unstructured[pdf]",
]
readme = "README.md"
license = { text = "MIT License" }
Expand Down
2 changes: 1 addition & 1 deletion scripts/run_linux.sh
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ function install_dependencies() {
if pip list 2>/dev/null | grep -q "kotaemon"; then
echo "Requirements are already installed"
else
local kotaemon_root="$(pwd)/libs/kotaemon/.[dev]"
local kotaemon_root="$(pwd)/libs/kotaemon"
local ktem_root="$(pwd)/libs/ktem/"

echo "" && echo "Install kotaemon's requirements"
Expand Down
2 changes: 1 addition & 1 deletion scripts/run_macos.sh
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ function install_dependencies() {
if pip list 2>/dev/null | grep -q "kotaemon"; then
echo "Requirements are already installed"
else
local kotaemon_root="$(pwd)/libs/kotaemon/.[dev]"
local kotaemon_root="$(pwd)/libs/kotaemon"
local ktem_root="$(pwd)/libs/ktem/"

echo "" && echo "Install kotaemon's requirements"
Expand Down
2 changes: 1 addition & 1 deletion scripts/run_windows.bat
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ IF %ERRORLEVEL% == 0 (
ECHO Dependencies are already installed
) ELSE (
ECHO Install kotaemon's requirements
CALL python -m pip install -e "%CD%\libs\kotaemon\.[dev]"
CALL python -m pip install -e "%CD%\libs\kotaemon"

ECHO Install ktem's requirements
CALL python -m pip install -e "%CD%\libs\ktem"
Expand Down

0 comments on commit d22ae88

Please sign in to comment.