Fix library to work with new names

The index was renamed from Timescale Vector to diskann within vectorscale.
timescale · Jul 25, 2024 · 1250a76 · 1250a76
1 parent 659956d
commit 1250a76
Show file tree

Hide file tree

Showing 7 changed files with 307 additions and 188 deletions.
diff --git a/README.md b/README.md
diff --git a/nbs/00_vector.ipynb b/nbs/00_vector.ipynb
diff --git a/nbs/01_pgvectorizer.ipynb b/nbs/01_pgvectorizer.ipynb
@@ -226,7 +226,7 @@
     "from langchain.text_splitter import CharacterTextSplitter\n",
     "from timescale_vector import client\n",
     "from langchain_openai import OpenAIEmbeddings\n",
-    "from langchain.vectorstores.timescalevector import TimescaleVector\n",
+    "from langchain_community.vectorstores.timescalevector import TimescaleVector\n",
     "from datetime import timedelta"
    ]
   },

diff --git a/nbs/index.ipynb b/nbs/index.ipynb
diff --git a/nbs/tsv_python_getting_started_tutorial.ipynb b/nbs/tsv_python_getting_started_tutorial.ipynb
@@ -13,7 +13,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "This notebook shows how to use the PostgreSQL vector database `Timescale Vector` via the [Timescale Vector python client library](https://github.com/timescale/python-vector). You'll learn how to use TimescaleVector for (1) semantic search, (2) time-based vector search, (3) and how to create indexes to speed up queries.\n",
+    "This notebook shows how to use PostgreSQL as a vector database via the [Python Vector client library](https://github.com/timescale/python-vector). You'll learn how to use the client for (1) semantic search, (2) time-based vector search, and (3) how to create indexes to speed up queries.\n",
     "\n",
     "Follow along by downloading the [Jupyter notebook version of this tutorial here](https://github.com/timescale/python-vector/blob/main/nbs/tsv_python_getting_started_tutorial.ipynb).\n",
     "\n",
@@ -914,7 +914,7 @@
     "\n",
     "Important note: In PostgreSQL, each table can only have one index on a particular column. So if you'd like to test the performance of different index types, you can do so either by (1) creating multiple tables with different indexes, (2) creating multiple vector columns in the same table and creating different indexes on each column, or (3) by dropping and recreating the index on the same column and comparing results.\n",
     "\n",
-    "Let's look at how to create each type of index in Timescale Vector, starting with the TimescaleVector (DiskANN) index."
+    "Let's look at how to create each type of index, starting with the StreamingDiskANN index."
    ]
   },
   {
@@ -924,7 +924,7 @@
    "outputs": [],
    "source": [
     "# Create a timescale vector (DiskANN) search index on the embedding column\n",
-    "await vec.create_embedding_index(client.TimescaleVectorIndex())"
+    "await vec.create_embedding_index(client.DiskAnnIndex())"
    ]
   },
   {
@@ -974,7 +974,7 @@
    "outputs": [],
    "source": [
     "await vec.drop_embedding_index()\n",
-    "await vec.create_embedding_index(client.TimescaleVectorIndex())"
+    "await vec.create_embedding_index(client.DiskAnnIndex())"
    ]
   },
   {
@@ -993,13 +993,13 @@
     "- Finding the most recent embeddings that are similar to a query vector (e.g recent news).\n",
     "- Constraining similarity search to a relevant time range (e.g asking time-based questions about a knowledge base)\n",
     "\n",
-    "Let's look at how to run similarity searches with time range filters using the TimescaleVector client.\n",
+    "Let's look at how to run similarity searches with time range filters using the client.\n",
     "\n",
     "- The first step to using time filtering with Timescale Vector is to create a table with the `time_partition_interval` argument set to the desired time interval. This will automatically partition the table into time-based chunks to speed up queries. We completed this step in Part 1 above.\n",
     "\n",
     "- Next, we ensure the `id` of our row is a `uuid` with a datetime portion that reflects the date and time we want to associated with the embedding. We completed this step in Part 2 above, where we used the `uuid_from_time()` method provided by the Timescale Vector library.\n",
     "\n",
-    "- Finally, we can run similarity searches with time range filters using the TimescaleVector client. We'll illustrate this below."
+    "- Finally, we can run similarity searches with time range filters using the client. We'll illustrate this below."
    ]
   },
   {

diff --git a/timescale_vector/_modidx.py b/timescale_vector/_modidx.py
@@ -42,6 +42,15 @@
                                                                                                    'timescale_vector/client.py'),
                                          'timescale_vector.client.BaseIndex.get_index_method': ( 'vector.html#baseindex.get_index_method',
                                                                                                  'timescale_vector/client.py'),
+                                         'timescale_vector.client.DiskAnnIndex': ('vector.html#diskannindex', 'timescale_vector/client.py'),
+                                         'timescale_vector.client.DiskAnnIndex.__init__': ( 'vector.html#diskannindex.__init__',
+                                                                                            'timescale_vector/client.py'),
+                                         'timescale_vector.client.DiskAnnIndex.create_index_query': ( 'vector.html#diskannindex.create_index_query',
+                                                                                                      'timescale_vector/client.py'),
+                                         'timescale_vector.client.DiskAnnIndexParams': ( 'vector.html#diskannindexparams',
+                                                                                         'timescale_vector/client.py'),
+                                         'timescale_vector.client.DiskAnnIndexParams.__init__': ( 'vector.html#diskannindexparams.__init__',
+                                                                                                  'timescale_vector/client.py'),
                                          'timescale_vector.client.HNSWIndex': ('vector.html#hnswindex', 'timescale_vector/client.py'),
                                          'timescale_vector.client.HNSWIndex.__init__': ( 'vector.html#hnswindex.__init__',
                                                                                          'timescale_vector/client.py'),
@@ -152,16 +161,6 @@
                                          'timescale_vector.client.Sync.table_is_empty': ( 'vector.html#sync.table_is_empty',
                                                                                           'timescale_vector/client.py'),
                                          'timescale_vector.client.Sync.upsert': ('vector.html#sync.upsert', 'timescale_vector/client.py'),
-                                         'timescale_vector.client.TimescaleVectorIndex': ( 'vector.html#timescalevectorindex',
-                                                                                           'timescale_vector/client.py'),
-                                         'timescale_vector.client.TimescaleVectorIndex.__init__': ( 'vector.html#timescalevectorindex.__init__',
-                                                                                                    'timescale_vector/client.py'),
-                                         'timescale_vector.client.TimescaleVectorIndex.create_index_query': ( 'vector.html#timescalevectorindex.create_index_query',
-                                                                                                              'timescale_vector/client.py'),
-                                         'timescale_vector.client.TimescaleVectorIndexParams': ( 'vector.html#timescalevectorindexparams',
-                                                                                                 'timescale_vector/client.py'),
-                                         'timescale_vector.client.TimescaleVectorIndexParams.__init__': ( 'vector.html#timescalevectorindexparams.__init__',
-                                                                                                          'timescale_vector/client.py'),
                                          'timescale_vector.client.UUIDTimeRange': ( 'vector.html#uuidtimerange',
                                                                                     'timescale_vector/client.py'),
                                          'timescale_vector.client.UUIDTimeRange.__init__': ( 'vector.html#uuidtimerange.__init__',

diff --git a/timescale_vector/client.py b/timescale_vector/client.py
@@ -2,9 +2,9 @@
 
 # %% auto 0
 __all__ = ['SEARCH_RESULT_ID_IDX', 'SEARCH_RESULT_METADATA_IDX', 'SEARCH_RESULT_CONTENTS_IDX', 'SEARCH_RESULT_EMBEDDING_IDX',
-           'SEARCH_RESULT_DISTANCE_IDX', 'uuid_from_time', 'BaseIndex', 'IvfflatIndex', 'HNSWIndex',
-           'TimescaleVectorIndex', 'QueryParams', 'TimescaleVectorIndexParams', 'IvfflatIndexParams', 'HNSWIndexParams',
-           'UUIDTimeRange', 'Predicates', 'QueryBuilder', 'Async', 'Sync']
+           'SEARCH_RESULT_DISTANCE_IDX', 'uuid_from_time', 'BaseIndex', 'IvfflatIndex', 'HNSWIndex', 'DiskAnnIndex',
+           'QueryParams', 'DiskAnnIndexParams', 'IvfflatIndexParams', 'HNSWIndexParams', 'UUIDTimeRange', 'Predicates',
+           'QueryBuilder', 'Async', 'Sync']
 
 # %% ../nbs/00_vector.ipynb 5
 import asyncpg
@@ -153,44 +153,48 @@ def create_index_query(self, table_name_quoted:str, column_name_quoted: str, ind
         return "CREATE INDEX {index_name} ON {table_name} USING hnsw ({column_name} {index_method}) {with_clause};"\
             .format(index_name=index_name_quoted, table_name=table_name_quoted, column_name=column_name_quoted, index_method=index_method, with_clause=with_clause)
 
-class TimescaleVectorIndex(BaseIndex):
+class DiskAnnIndex(BaseIndex):
     def __init__(self, 
-                 use_pq: Optional[bool] = None, 
-                 num_neighbors: Optional[int] = None, 
                  search_list_size: Optional[int] = None, 
+                 num_neighbors: Optional[int] = None, 
                  max_alpha: Optional[float] = None,
-                 pq_vector_length: Optional[int] = None,
+                 storage_layout: Optional[str] = None,
+                 num_dimensions: Optional[int] = None,
+                 num_bits_per_dimension: Optional[int] = None,
                  ) -> None:
         """
         Timescale's vector index.
         """
-        self.use_pq = use_pq
-        self.num_neighbors = num_neighbors
         self.search_list_size = search_list_size
+        self.num_neighbors = num_neighbors
         self.max_alpha = max_alpha
-        self.pq_vector_length = pq_vector_length
+        self.storage_layout = storage_layout
+        self.num_dimensions = num_dimensions
+        self.num_bits_per_dimension = num_bits_per_dimension
 
     def create_index_query(self, table_name_quoted:str, column_name_quoted: str, index_name_quoted: str, distance_type: str, num_records_callback: Callable[[], int]) -> str:
         if distance_type != "<=>":
             raise ValueError(f"Timescale's vector index only supports cosine distance, but distance_type was {distance_type}")
 
         with_clauses = []
-        if self.use_pq is not None:
-            with_clauses.append(f"use_pq = {self.use_pq}")
-        if self.num_neighbors is not None:
-            with_clauses.append(f"num_neighbors = {self.num_neighbors}")
         if self.search_list_size is not None:
             with_clauses.append(f"search_list_size = {self.search_list_size}")
+        if self.num_neighbors is not None:
+            with_clauses.append(f"num_neighbors = {self.num_neighbors}")
         if self.max_alpha is not None:
             with_clauses.append(f"max_alpha = {self.max_alpha}")
-        if self.pq_vector_length is not None:
-            with_clauses.append(f"pq_vector_length = {self.pq_vector_length}")
+        if self.storage_layout is not None:
+            with_clauses.append(f"storage_layout = {self.storage_layout}")
+        if self.num_dimensions is not None:
+            with_clauses.append(f"num_dimensions = {self.num_dimensions}")
+        if self.num_bits_per_dimension is not None:
+            with_clauses.append(f"num_bits_per_dimension = {self.num_bits_per_dimension}")
 
         with_clause = ""
         if len(with_clauses) > 0:
             with_clause = "WITH (" + ", ".join(with_clauses) + ")"
 
-        return "CREATE INDEX {index_name} ON {table_name} USING tsv ({column_name}) {with_clause};"\
+        return "CREATE INDEX {index_name} ON {table_name} USING diskann ({column_name}) {with_clause};"\
             .format(index_name=index_name_quoted, table_name=table_name_quoted, column_name=column_name_quoted, with_clause=with_clause)
 
 
@@ -202,9 +206,14 @@ def __init__(self, params: dict[str, Any]) -> None:
     def get_statements(self) -> List[str]:
         return ["SET LOCAL " + key + " = " + str(value) for key, value in self.params.items()]
 
-class TimescaleVectorIndexParams(QueryParams):
-    def __init__(self, search_list_size: int) -> None:
-        super().__init__({"tsv.query_search_list_size": search_list_size})
+class DiskAnnIndexParams(QueryParams):
+    def __init__(self, search_list_size: Optional[int] = None, rescore: Optional[int] = None) -> None:
+        params = {}
+        if search_list_size is not None:
+            params["diskann.query_search_list_size"] = search_list_size
+        if rescore is not None:
+            params["diskann.query_rescore"] = rescore
+        super().__init__(params)
 
 class IvfflatIndexParams(QueryParams):
     def __init__(self, probes: int) -> None:
@@ -602,7 +611,7 @@ def get_create_query(self):
                 )
         return '''
 CREATE EXTENSION IF NOT EXISTS vector;
-CREATE EXTENSION IF NOT EXISTS timescale_vector;
+CREATE EXTENSION IF NOT EXISTS vectorscale;
 
 
 CREATE TABLE IF NOT EXISTS {table_name} (