Skip to content
This repository has been archived by the owner on Sep 12, 2024. It is now read-only.

Commit

Permalink
add qdrant and inmemory support (#6)
Browse files Browse the repository at this point in the history
* add qdrant-client to requirements

* add qdrant module

* add minor TODO

* fix qdrant vectorparams

* minor fix

* update process_and_get_documents via making path_or_files union

* minor fix in markdown_processing module

* add in_memory module

* minor fix

* Fix bug in setup_db module

* add todo to inmemory module
  • Loading branch information
SeeknnDestroy authored Oct 10, 2023
1 parent 7bc2abf commit b5fb999
Show file tree
Hide file tree
Showing 8 changed files with 143 additions and 14 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ SIMILARITY_TOP_K = 4

## Run FastAPI Server
```bash
uvicorn fastapi_app:app --host 0.0.0.0 --port 8000 --reload
uvicorn app.main:app --host 0.0.0.0 --port 8000 --reload
```

### Access the API Documentation
Expand Down
3 changes: 1 addition & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,4 @@ llama-index==0.8.41
gitpython==3.1.37
uvicorn==0.23.2
fastapi==0.103.2
python-dotenv
pinecone-client==2.2.4
python-dotenv
8 changes: 3 additions & 5 deletions setup_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,14 @@
logger = logging.getLogger(__name__)


def setup_database(index_name: str = "quickstart", read_as_single_doc: bool = True) -> None:
def setup_database(read_as_single_doc: bool = True) -> None:
"""
Perform a one-time setup to initialize the vector database with documents.
This function should be executed once to populate the vector database with initial documents.
It clones or pulls a Git repository to access the latest markdown files, then initializes the database.
Parameters:
index_name (str): The name of the Pinecone index to use. Default is 'quickstart'.
read_as_single_doc (bool): Whether to treat each markdown file as a single document. Default is True.
"""
required_env_variables = ["DOCS_PATH"]
Expand All @@ -29,15 +28,14 @@ def setup_database(index_name: str = "quickstart", read_as_single_doc: bool = Tr
# Get environment variables
git_repo_url = env_utils.read_env_variable("GIT_REPO_URL", "https://github.com/ultralytics/ultralytics.git")
git_repo_path = Path(env_utils.read_env_variable("GIT_REPO_PATH", "./ultralytics"))
docs_path = env_utils.read_env_variable("DOCS_PATH").lstrip('/') # Remove leading slash if present
full_path = git_repo_path.joinpath(docs_path) # Concatenate paths
relative_docs_path = env_utils.read_env_variable("DOCS_PATH").lstrip('/') # Path to get the documents from (default is 'docs')

# Clone or pull the git repository to get the latest markdown files
git_utils.clone_or_pull_repository(git_repo_url, git_repo_path)

# Setup the database
logger.info("Starting database setup.")
llm_utils.initialize_database(index_name, full_path, read_as_single_doc=read_as_single_doc)
llm_utils.initialize_database(git_repo_url, git_repo_path, read_as_single_doc=read_as_single_doc, relative_docs_path=relative_docs_path)
logger.info("Database setup completed successfully.")


Expand Down
3 changes: 2 additions & 1 deletion utils/llm_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,8 @@ def initialize_database(
# Step 3: Connect to the existing vector store database
pinecone_vs = PineconeVS(index_name=PINECONE_INDEX_NAME) # TODO: utilize vector store factory for generic use
pinecone_vs.initialize_vectorindex()

pinecone_vs.connect_vectorstore()

logger.info("Updating vector store with documents")

# Step 4: Update the index with the documents
Expand Down
24 changes: 20 additions & 4 deletions utils/markdown_processing.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import logging
from pathlib import Path
from typing import Dict, List, Optional
from typing import Dict, List, Optional, Union

from llama_index.schema import Document

Expand All @@ -9,7 +9,11 @@
logger = logging.getLogger(__name__)


def process_and_get_documents(
        path_or_files: Union[Path, List[Path]],
        read_as_single_doc: bool = False,
        extra_info: Optional[Dict] = None
) -> List[Document]:
    """
    Process markdown files to extract documents.

    This function has two modes of operation:
    1. If `read_as_single_doc=False`, it parses markdown files into sub-documents per header.
    2. If `read_as_single_doc=True`, it treats each markdown file as a single Document.
    Parameters:
        path_or_files (Union[Path, List[Path]]): Path to the folder or list of file paths containing markdown files.
        read_as_single_doc (bool): Flag to read the entire markdown file as a single Document.
        extra_info (Optional[Dict]): Additional metadata to include.
    Returns:
        list: List of processed Documents.
    Raises:
        ValueError: If `path_or_files` is neither a Path nor a list, or is a
            Path that is not an existing directory or regular file.
    """
    multi_markdown_reader = MultiMarkdownReader(read_as_single_doc=read_as_single_doc)

    # If path_or_files is a Path, check if it is a folder or a file.
    if isinstance(path_or_files, Path):
        if path_or_files.is_dir():
            documents = multi_markdown_reader.load_data_from_folder_or_files(folder_path=path_or_files, extra_info=extra_info)
        elif path_or_files.is_file():
            documents = multi_markdown_reader.load_data_from_folder_or_files(files=[path_or_files], extra_info=extra_info)
        else:
            # Bug fix: a nonexistent path previously fell through both branches,
            # leaving `documents` unbound and raising UnboundLocalError below.
            raise ValueError(f"Invalid input: {path_or_files} is not an existing directory or file.")
    # If path_or_files is a list of Paths, read all files.
    elif isinstance(path_or_files, list):
        documents = multi_markdown_reader.load_data_from_folder_or_files(files=path_or_files, extra_info=extra_info)
    else:
        raise ValueError("Invalid input: path_or_files must be either a Path or a List[Path].")

    logger.info(f"Found {len(documents)} {'header-documents' if not read_as_single_doc else 'documents'}.")
    return documents
40 changes: 40 additions & 0 deletions vectorstores/in_memory.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from pathlib import Path
from typing import Union, List

from llama_index import VectorStoreIndex
from llama_index.storage.storage_context import StorageContext

from .base import BaseVS
from utils.markdown_processing import process_and_get_documents

class InMemoryVectorStore(BaseVS):
    """Vector store that lives entirely in process memory (no external DB).

    Parameters:
        path_or_files (Union[Path, List[Path]]): Folder of markdown files, a
            single markdown file, or a list of markdown file paths to index.
        read_as_single_doc (bool): Treat each markdown file as one Document.
            Default is True.
        show_progress (bool): Display a progress bar while building the index.
            Default is True.
    """

    def __init__(self, path_or_files: Union[Path, List[Path]], read_as_single_doc: bool = True, show_progress: bool = True):
        # Stash configuration before BaseVS.__init__ runs any setup hooks.
        self._path_or_files = path_or_files
        self._read_as_single_doc = read_as_single_doc
        self._show_progress = show_progress
        super().__init__()

    def _validate_requirements(self):
        """
        For in-memory, no special requirements to validate.
        """
        pass

    def initialize_vectorindex(self):
        """
        Create a new vector store index.
        """
        # TODO: Add support for other file formats. (pdf, docx, etc.)
        docs = process_and_get_documents(
            path_or_files=self._path_or_files,
            read_as_single_doc=self._read_as_single_doc,
        )
        self._vectorstore = VectorStoreIndex.from_documents(documents=docs, show_progress=self._show_progress)

    def connect_vectorstore(self):
        """
        Connect to an existing vector store index. Sets self._vectorstore.
        """
        # For in-memory, the initialization and connection can be the same.
        self.initialize_vectorindex()

2 changes: 1 addition & 1 deletion vectorstores/pinecone.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def _validate_requirements(self):
try:
import pinecone
except ImportError:
raise ImportError("pip install pinecone to use this feature")
raise ImportError("`pinecone` package not found, please run `pip install pinecone-client==2.2.4`")

def initialize_vectorindex(self):
"""
Expand Down
75 changes: 75 additions & 0 deletions vectorstores/qdrant.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
from llama_index.vector_stores.qdrant import QdrantVectorStore

from utils.env_utils import read_env_variable, validate_environment_variables
from .base import BaseVS

class QdrantVS(BaseVS):
    """Qdrant-backed vector store.

    Parameters:
        collection_name (str): Name of the Qdrant collection to create/use.
        size (int): Dimensionality of the stored vectors. Default is 1536
            (presumably the OpenAI ada-002 embedding size — TODO confirm).
        distance (str): Name of a ``qdrant_client.models.Distance`` enum member
            (e.g. "EUCLID", "COSINE", "DOT"). Default is "EUCLID".

    Requires the ``QDRANT_URL`` and ``QDRANT_API_KEY`` environment variables.
    """

    def __init__(self, collection_name: str, size: int = 1536, distance: str = "EUCLID"):
        self._collection_name = collection_name
        self._size = size
        self._distance = distance
        self._client = None  # created lazily by _initialize_client()
        super().__init__()

    def _validate_requirements(self):
        """
        Validate all required env variables are present, and all required packages are installed.
        """
        required_env_variables = ["QDRANT_API_KEY", "QDRANT_URL"]

        validate_environment_variables(required_env_variables)

        try:
            import qdrant_client  # noqa: F401 — import check only
        except ImportError:
            raise ImportError("`qdrant-client` package not found, please run `pip install qdrant-client==1.5.4`")

    def _initialize_client(self):
        """
        Initialize the Qdrant client if not already initialized.
        """
        from qdrant_client import QdrantClient

        # If client already initialized, return
        if self._client is not None:
            return

        # Read environment variables for Qdrant initialization
        url = read_env_variable("QDRANT_URL")
        api_key = read_env_variable("QDRANT_API_KEY")

        # Bug fix: use the locals just read above. The original passed
        # `self._url` / `self._api_key`, which are never set anywhere and
        # raised AttributeError on first use.
        self._client = QdrantClient(
            url=url,
            api_key=api_key
        )

    def initialize_vectorindex(self):
        """
        Create a new vector store index.
        """
        from qdrant_client.models import Distance, VectorParams

        # Initialize client
        self._initialize_client()

        # Convert string distance measure to its Distance enum member
        # (e.g. "EUCLID" -> Distance.EUCLID); raises KeyError for unknown names.
        distance = Distance[self._distance]

        # Create index (drops and recreates the collection if it already exists)
        self._client.recreate_collection(
            collection_name=self._collection_name,
            vectors_config=VectorParams(size=self._size, distance=distance)
        )

    def connect_vectorstore(self):
        """
        Connect to an existing vector store index. Sets self._vectorstore.
        """
        # Initialize client
        self._initialize_client()

        # Construct vector store
        self._vectorstore = QdrantVectorStore(
            collection_name=self._collection_name,
            client=self._client
        )

0 comments on commit b5fb999

Please sign in to comment.