Skip to content
This repository has been archived by the owner on Sep 12, 2024. It is now read-only.

Commit

Permalink
add qdrant and inmemory support (#6)
Browse files Browse the repository at this point in the history
* add qdrant-client to requirements

* add qdrant module

* add minor TODO

* fix qdrant vectorparams

* minor fix

* update process_and_get_documents via making path_or_files union

* minor fix in markdown_processing module

* add in_memory module

* minor fix

* Fix bug in setup_db module

* add todo to inmemory module
  • Loading branch information
SeeknnDestroy authored Oct 10, 2023
1 parent 7bc2abf commit b5fb999
Show file tree
Hide file tree
Showing 8 changed files with 143 additions and 14 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ SIMILARITY_TOP_K = 4

## Run FastAPI Server
```bash
uvicorn fastapi_app:app --host 0.0.0.0 --port 8000 --reload
uvicorn app.main:app --host 0.0.0.0 --port 8000 --reload
```

### Access the API Documentation
Expand Down
3 changes: 1 addition & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,4 @@ llama-index==0.8.41
gitpython==3.1.37
uvicorn==0.23.2
fastapi==0.103.2
python-dotenv
pinecone-client==2.2.4
python-dotenv
8 changes: 3 additions & 5 deletions setup_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,14 @@
logger = logging.getLogger(__name__)


def setup_database(index_name: str = "quickstart", read_as_single_doc: bool = True) -> None:
def setup_database(read_as_single_doc: bool = True) -> None:
"""
Perform a one-time setup to initialize the vector database with documents.
This function should be executed once to populate the vector database with initial documents.
It clones or pulls a Git repository to access the latest markdown files, then initializes the database.
Parameters:
index_name (str): The name of the Pinecone index to use. Default is 'quickstart'.
read_as_single_doc (bool): Whether to treat each markdown file as a single document. Default is True.
"""
required_env_variables = ["DOCS_PATH"]
Expand All @@ -29,15 +28,14 @@ def setup_database(index_name: str = "quickstart", read_as_single_doc: bool = Tr
# Get environment variables
git_repo_url = env_utils.read_env_variable("GIT_REPO_URL", "https://github.com/ultralytics/ultralytics.git")
git_repo_path = Path(env_utils.read_env_variable("GIT_REPO_PATH", "./ultralytics"))
docs_path = env_utils.read_env_variable("DOCS_PATH").lstrip('/') # Remove leading slash if present
full_path = git_repo_path.joinpath(docs_path) # Concatenate paths
relative_docs_path = env_utils.read_env_variable("DOCS_PATH").lstrip('/') # Path to get the documents from (default is 'docs')

# Clone or pull the git repository to get the latest markdown files
git_utils.clone_or_pull_repository(git_repo_url, git_repo_path)

# Setup the database
logger.info("Starting database setup.")
llm_utils.initialize_database(index_name, full_path, read_as_single_doc=read_as_single_doc)
llm_utils.initialize_database(git_repo_url, git_repo_path, read_as_single_doc=read_as_single_doc, relative_docs_path=relative_docs_path)
logger.info("Database setup completed successfully.")


Expand Down
3 changes: 2 additions & 1 deletion utils/llm_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,8 @@ def initialize_database(
# Step 3: Connect to the existing vector store database
pinecone_vs = PineconeVS(index_name=PINECONE_INDEX_NAME) # TODO: utilize vector store factory for generic use
pinecone_vs.initialize_vectorindex()

pinecone_vs.connect_vectorstore()

logger.info("Updating vector store with documents")

# Step 4: Update the index with the documents
Expand Down
24 changes: 20 additions & 4 deletions utils/markdown_processing.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import logging
from pathlib import Path
from typing import Dict, List, Optional
from typing import Dict, List, Optional, Union

from llama_index.schema import Document

Expand All @@ -9,7 +9,11 @@
logger = logging.getLogger(__name__)


def process_and_get_documents(
        path_or_files: Union[Path, List[Path]],
        read_as_single_doc: bool = False,
        extra_info: Optional[Dict] = None
) -> List[Document]:
    """
    Process markdown files to extract documents.

    This function has two modes of operation:
    1. If `read_as_single_doc=False`, it parses markdown files into sub-documents per header.
    2. If `read_as_single_doc=True`, it treats each markdown file as a single Document.
    Parameters:
        path_or_files (Union[Path, List[Path]]): Path to the folder or list of file paths containing markdown files.
        read_as_single_doc (bool): Flag to read the entire markdown file as a single Document.
        extra_info (Optional[Dict]): Additional metadata to include.
    Returns:
        list: List of processed Documents.
    Raises:
        ValueError: If `path_or_files` is neither a Path nor a list, or is a
            Path that is not an existing directory or regular file.
    """
    multi_markdown_reader = MultiMarkdownReader(read_as_single_doc=read_as_single_doc)

    # If path_or_files is a Path, check if it is a folder or a file.
    if isinstance(path_or_files, Path):
        if path_or_files.is_dir():
            documents = multi_markdown_reader.load_data_from_folder_or_files(folder_path=path_or_files, extra_info=extra_info)
        elif path_or_files.is_file():
            documents = multi_markdown_reader.load_data_from_folder_or_files(files=[path_or_files], extra_info=extra_info)
        else:
            # Bug fix: a nonexistent path previously fell through both branches,
            # leaving `documents` unbound and raising UnboundLocalError below.
            raise ValueError(f"Invalid input: {path_or_files} is not an existing directory or file.")
    # If path_or_files is a list of Paths, read all files.
    elif isinstance(path_or_files, list):
        documents = multi_markdown_reader.load_data_from_folder_or_files(files=path_or_files, extra_info=extra_info)
    else:
        raise ValueError("Invalid input: path_or_files must be either a Path or a List[Path].")

    logger.info(f"Found {len(documents)} {'header-documents' if not read_as_single_doc else 'documents'}.")
    return documents
40 changes: 40 additions & 0 deletions vectorstores/in_memory.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from pathlib import Path
from typing import Union, List

from llama_index import VectorStoreIndex
from llama_index.storage.storage_context import StorageContext

from .base import BaseVS
from utils.markdown_processing import process_and_get_documents

class InMemoryVectorStore(BaseVS):
    """Vector store that lives entirely in process memory (no external DB).

    Parameters:
        path_or_files (Union[Path, List[Path]]): Folder of markdown files, a
            single markdown file, or a list of markdown file paths to index.
        read_as_single_doc (bool): Treat each markdown file as one Document.
            Default is True.
        show_progress (bool): Display a progress bar while building the index.
            Default is True.
    """

    def __init__(self, path_or_files: Union[Path, List[Path]], read_as_single_doc: bool = True, show_progress: bool = True):
        # Stash configuration before BaseVS.__init__ runs any setup hooks.
        self._path_or_files = path_or_files
        self._read_as_single_doc = read_as_single_doc
        self._show_progress = show_progress
        super().__init__()

    def _validate_requirements(self):
        """
        For in-memory, no special requirements to validate.
        """
        pass

    def initialize_vectorindex(self):
        """
        Create a new vector store index.
        """
        # TODO: Add support for other file formats. (pdf, docx, etc.)
        docs = process_and_get_documents(
            path_or_files=self._path_or_files,
            read_as_single_doc=self._read_as_single_doc,
        )
        self._vectorstore = VectorStoreIndex.from_documents(documents=docs, show_progress=self._show_progress)

    def connect_vectorstore(self):
        """
        Connect to an existing vector store index. Sets self._vectorstore.
        """
        # For in-memory, the initialization and connection can be the same.
        self.initialize_vectorindex()

2 changes: 1 addition & 1 deletion vectorstores/pinecone.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def _validate_requirements(self):
try:
import pinecone
except ImportError:
raise ImportError("pip install pinecone to use this feature")
raise ImportError("`pinecone` package not found, please run `pip install pinecone-client==2.2.4`")

def initialize_vectorindex(self):
"""
Expand Down
75 changes: 75 additions & 0 deletions vectorstores/qdrant.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
from llama_index.vector_stores.qdrant import QdrantVectorStore

from utils.env_utils import read_env_variable, validate_environment_variables
from .base import BaseVS

class QdrantVS(BaseVS):
    """Qdrant-backed vector store.

    Parameters:
        collection_name (str): Name of the Qdrant collection to create/use.
        size (int): Dimensionality of the stored vectors. Default is 1536
            (presumably the OpenAI ada-002 embedding size — TODO confirm).
        distance (str): Name of a ``qdrant_client.models.Distance`` enum member
            (e.g. "EUCLID", "COSINE", "DOT"). Default is "EUCLID".

    Requires the ``QDRANT_URL`` and ``QDRANT_API_KEY`` environment variables.
    """

    def __init__(self, collection_name: str, size: int = 1536, distance: str = "EUCLID"):
        self._collection_name = collection_name
        self._size = size
        self._distance = distance
        self._client = None  # created lazily by _initialize_client()
        super().__init__()

    def _validate_requirements(self):
        """
        Validate all required env variables are present, and all required packages are installed.
        """
        required_env_variables = ["QDRANT_API_KEY", "QDRANT_URL"]

        validate_environment_variables(required_env_variables)

        try:
            import qdrant_client  # noqa: F401 — import check only
        except ImportError:
            raise ImportError("`qdrant-client` package not found, please run `pip install qdrant-client==1.5.4`")

    def _initialize_client(self):
        """
        Initialize the Qdrant client if not already initialized.
        """
        from qdrant_client import QdrantClient

        # If client already initialized, return
        if self._client is not None:
            return

        # Read environment variables for Qdrant initialization
        url = read_env_variable("QDRANT_URL")
        api_key = read_env_variable("QDRANT_API_KEY")

        # Bug fix: use the locals just read above. The original passed
        # `self._url` / `self._api_key`, which are never set anywhere and
        # raised AttributeError on first use.
        self._client = QdrantClient(
            url=url,
            api_key=api_key
        )

    def initialize_vectorindex(self):
        """
        Create a new vector store index.
        """
        from qdrant_client.models import Distance, VectorParams

        # Initialize client
        self._initialize_client()

        # Convert string distance measure to its Distance enum member
        # (e.g. "EUCLID" -> Distance.EUCLID); raises KeyError for unknown names.
        distance = Distance[self._distance]

        # Create index (drops and recreates the collection if it already exists)
        self._client.recreate_collection(
            collection_name=self._collection_name,
            vectors_config=VectorParams(size=self._size, distance=distance)
        )

    def connect_vectorstore(self):
        """
        Connect to an existing vector store index. Sets self._vectorstore.
        """
        # Initialize client
        self._initialize_client()

        # Construct vector store
        self._vectorstore = QdrantVectorStore(
            collection_name=self._collection_name,
            client=self._client
        )

0 comments on commit b5fb999

Please sign in to comment.