Skip to content

Commit

Permalink
Fix library to work with new names
Browse files Browse the repository at this point in the history
The index was renamed from Timescale Vector to
diskann within vectorscale.
  • Loading branch information
cevian committed Jul 25, 2024
1 parent 659956d commit 1250a76
Show file tree
Hide file tree
Showing 7 changed files with 307 additions and 188 deletions.
135 changes: 86 additions & 49 deletions README.md

Large diffs are not rendered by default.

117 changes: 63 additions & 54 deletions nbs/00_vector.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion nbs/01_pgvectorizer.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@
"from langchain.text_splitter import CharacterTextSplitter\n",
"from timescale_vector import client\n",
"from langchain_openai import OpenAIEmbeddings\n",
"from langchain.vectorstores.timescalevector import TimescaleVector\n",
"from langchain_community.vectorstores.timescalevector import TimescaleVector\n",
"from datetime import timedelta"
]
},
Expand Down
159 changes: 112 additions & 47 deletions nbs/index.ipynb

Large diffs are not rendered by default.

12 changes: 6 additions & 6 deletions nbs/tsv_python_getting_started_tutorial.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook shows how to use the PostgreSQL vector database `Timescale Vector` via the [Timescale Vector python client library](https://github.com/timescale/python-vector). You'll learn how to use TimescaleVector for (1) semantic search, (2) time-based vector search, (3) and how to create indexes to speed up queries.\n",
"This notebook shows how to use the PostgreSQL as vector database via the [Python Vector python client library](https://github.com/timescale/python-vector). You'll learn how to use the client for (1) semantic search, (2) time-based vector search, (3) and how to create indexes to speed up queries.\n",
"\n",
"Follow along by downloading the [Jupyter notebook version of this tutorial here](https://github.com/timescale/python-vector/blob/main/nbs/tsv_python_getting_started_tutorial.ipynb).\n",
"\n",
Expand Down Expand Up @@ -914,7 +914,7 @@
"\n",
"Important note: In PostgreSQL, each table can only have one index on a particular column. So if you'd like to test the performance of different index types, you can do so either by (1) creating multiple tables with different indexes, (2) creating multiple vector columns in the same table and creating different indexes on each column, or (3) by dropping and recreating the index on the same column and comparing results.\n",
"\n",
"Let's look at how to create each type of index in Timescale Vector, starting with the TimescaleVector (DiskANN) index."
"Let's look at how to create each type of index, starting with the StreamingDiskANN index."
]
},
{
Expand All @@ -924,7 +924,7 @@
"outputs": [],
"source": [
"# Create a timescale vector (DiskANN) search index on the embedding column\n",
"await vec.create_embedding_index(client.TimescaleVectorIndex())"
"await vec.create_embedding_index(client.DiskAnnIndex())"
]
},
{
Expand Down Expand Up @@ -974,7 +974,7 @@
"outputs": [],
"source": [
"await vec.drop_embedding_index()\n",
"await vec.create_embedding_index(client.TimescaleVectorIndex())"
"await vec.create_embedding_index(client.DiskAnnIndex())"
]
},
{
Expand All @@ -993,13 +993,13 @@
"- Finding the most recent embeddings that are similar to a query vector (e.g recent news).\n",
"- Constraining similarity search to a relevant time range (e.g asking time-based questions about a knowledge base)\n",
"\n",
"Let's look at how to run similarity searches with time range filters using the TimescaleVector client.\n",
"Let's look at how to run similarity searches with time range filters using the client.\n",
"\n",
"- The first step to using time filtering with Timescale Vector is to create a table with the `time_partition_interval` argument set to the desired time interval. This will automatically partition the table into time-based chunks to speed up queries. We completed this step in Part 1 above.\n",
"\n",
"- Next, we ensure the `id` of our row is a `uuid` with a datetime portion that reflects the date and time we want to associated with the embedding. We completed this step in Part 2 above, where we used the `uuid_from_time()` method provided by the Timescale Vector library.\n",
"\n",
"- Finally, we can run similarity searches with time range filters using the TimescaleVector client. We'll illustrate this below."
"- Finally, we can run similarity searches with time range filters using the client. We'll illustrate this below."
]
},
{
Expand Down
19 changes: 9 additions & 10 deletions timescale_vector/_modidx.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,15 @@
'timescale_vector/client.py'),
'timescale_vector.client.BaseIndex.get_index_method': ( 'vector.html#baseindex.get_index_method',
'timescale_vector/client.py'),
'timescale_vector.client.DiskAnnIndex': ('vector.html#diskannindex', 'timescale_vector/client.py'),
'timescale_vector.client.DiskAnnIndex.__init__': ( 'vector.html#diskannindex.__init__',
'timescale_vector/client.py'),
'timescale_vector.client.DiskAnnIndex.create_index_query': ( 'vector.html#diskannindex.create_index_query',
'timescale_vector/client.py'),
'timescale_vector.client.DiskAnnIndexParams': ( 'vector.html#diskannindexparams',
'timescale_vector/client.py'),
'timescale_vector.client.DiskAnnIndexParams.__init__': ( 'vector.html#diskannindexparams.__init__',
'timescale_vector/client.py'),
'timescale_vector.client.HNSWIndex': ('vector.html#hnswindex', 'timescale_vector/client.py'),
'timescale_vector.client.HNSWIndex.__init__': ( 'vector.html#hnswindex.__init__',
'timescale_vector/client.py'),
Expand Down Expand Up @@ -152,16 +161,6 @@
'timescale_vector.client.Sync.table_is_empty': ( 'vector.html#sync.table_is_empty',
'timescale_vector/client.py'),
'timescale_vector.client.Sync.upsert': ('vector.html#sync.upsert', 'timescale_vector/client.py'),
'timescale_vector.client.TimescaleVectorIndex': ( 'vector.html#timescalevectorindex',
'timescale_vector/client.py'),
'timescale_vector.client.TimescaleVectorIndex.__init__': ( 'vector.html#timescalevectorindex.__init__',
'timescale_vector/client.py'),
'timescale_vector.client.TimescaleVectorIndex.create_index_query': ( 'vector.html#timescalevectorindex.create_index_query',
'timescale_vector/client.py'),
'timescale_vector.client.TimescaleVectorIndexParams': ( 'vector.html#timescalevectorindexparams',
'timescale_vector/client.py'),
'timescale_vector.client.TimescaleVectorIndexParams.__init__': ( 'vector.html#timescalevectorindexparams.__init__',
'timescale_vector/client.py'),
'timescale_vector.client.UUIDTimeRange': ( 'vector.html#uuidtimerange',
'timescale_vector/client.py'),
'timescale_vector.client.UUIDTimeRange.__init__': ( 'vector.html#uuidtimerange.__init__',
Expand Down
51 changes: 30 additions & 21 deletions timescale_vector/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@

# %% auto 0
__all__ = ['SEARCH_RESULT_ID_IDX', 'SEARCH_RESULT_METADATA_IDX', 'SEARCH_RESULT_CONTENTS_IDX', 'SEARCH_RESULT_EMBEDDING_IDX',
'SEARCH_RESULT_DISTANCE_IDX', 'uuid_from_time', 'BaseIndex', 'IvfflatIndex', 'HNSWIndex',
'TimescaleVectorIndex', 'QueryParams', 'TimescaleVectorIndexParams', 'IvfflatIndexParams', 'HNSWIndexParams',
'UUIDTimeRange', 'Predicates', 'QueryBuilder', 'Async', 'Sync']
'SEARCH_RESULT_DISTANCE_IDX', 'uuid_from_time', 'BaseIndex', 'IvfflatIndex', 'HNSWIndex', 'DiskAnnIndex',
'QueryParams', 'DiskAnnIndexParams', 'IvfflatIndexParams', 'HNSWIndexParams', 'UUIDTimeRange', 'Predicates',
'QueryBuilder', 'Async', 'Sync']

# %% ../nbs/00_vector.ipynb 5
import asyncpg
Expand Down Expand Up @@ -153,44 +153,48 @@ def create_index_query(self, table_name_quoted:str, column_name_quoted: str, ind
return "CREATE INDEX {index_name} ON {table_name} USING hnsw ({column_name} {index_method}) {with_clause};"\
.format(index_name=index_name_quoted, table_name=table_name_quoted, column_name=column_name_quoted, index_method=index_method, with_clause=with_clause)

class TimescaleVectorIndex(BaseIndex):
class DiskAnnIndex(BaseIndex):
def __init__(self,
use_pq: Optional[bool] = None,
num_neighbors: Optional[int] = None,
search_list_size: Optional[int] = None,
num_neighbors: Optional[int] = None,
max_alpha: Optional[float] = None,
pq_vector_length: Optional[int] = None,
storage_layout: Optional[str] = None,
num_dimensions: Optional[int] = None,
num_bits_per_dimension: Optional[int] = None,
) -> None:
"""
Timescale's vector index.
"""
self.use_pq = use_pq
self.num_neighbors = num_neighbors
self.search_list_size = search_list_size
self.num_neighbors = num_neighbors
self.max_alpha = max_alpha
self.pq_vector_length = pq_vector_length
self.storage_layout = storage_layout
self.num_dimensions = num_dimensions
self.num_bits_per_dimension = num_bits_per_dimension

def create_index_query(self, table_name_quoted:str, column_name_quoted: str, index_name_quoted: str, distance_type: str, num_records_callback: Callable[[], int]) -> str:
if distance_type != "<=>":
raise ValueError(f"Timescale's vector index only supports cosine distance, but distance_type was {distance_type}")

with_clauses = []
if self.use_pq is not None:
with_clauses.append(f"use_pq = {self.use_pq}")
if self.num_neighbors is not None:
with_clauses.append(f"num_neighbors = {self.num_neighbors}")
if self.search_list_size is not None:
with_clauses.append(f"search_list_size = {self.search_list_size}")
if self.num_neighbors is not None:
with_clauses.append(f"num_neighbors = {self.num_neighbors}")
if self.max_alpha is not None:
with_clauses.append(f"max_alpha = {self.max_alpha}")
if self.pq_vector_length is not None:
with_clauses.append(f"pq_vector_length = {self.pq_vector_length}")
if self.storage_layout is not None:
with_clauses.append(f"storage_layout = {self.storage_layout}")
if self.num_dimensions is not None:
with_clauses.append(f"num_dimensions = {self.num_dimensions}")
if self.num_bits_per_dimension is not None:
with_clauses.append(f"num_bits_per_dimension = {self.num_bits_per_dimension}")

with_clause = ""
if len(with_clauses) > 0:
with_clause = "WITH (" + ", ".join(with_clauses) + ")"

return "CREATE INDEX {index_name} ON {table_name} USING tsv ({column_name}) {with_clause};"\
return "CREATE INDEX {index_name} ON {table_name} USING diskann ({column_name}) {with_clause};"\
.format(index_name=index_name_quoted, table_name=table_name_quoted, column_name=column_name_quoted, with_clause=with_clause)


Expand All @@ -202,9 +206,14 @@ def __init__(self, params: dict[str, Any]) -> None:
def get_statements(self) -> List[str]:
return ["SET LOCAL " + key + " = " + str(value) for key, value in self.params.items()]

class TimescaleVectorIndexParams(QueryParams):
def __init__(self, search_list_size: int) -> None:
super().__init__({"tsv.query_search_list_size": search_list_size})
class DiskAnnIndexParams(QueryParams):
    """Query-time settings for searches that use a diskann index.

    Only the arguments that are not ``None`` are recorded. The parameter
    dict is handed to ``QueryParams.__init__``; ``get_statements`` in this
    file renders each entry as ``SET LOCAL <key> = <value>``.
    """

    def __init__(self, search_list_size: Optional[int] = None, rescore: Optional[int] = None) -> None:
        """Collect the diskann settings that were explicitly provided.

        Args:
            search_list_size: Value stored under
                ``diskann.query_search_list_size``; left out when ``None``.
            rescore: Value stored under ``diskann.query_rescore``; left out
                when ``None``.
        """
        # Start empty so that omitted arguments produce no statement at all
        # instead of overriding whatever the session already has configured.
        params: dict[str, Any] = {}
        if search_list_size is not None:
            params["diskann.query_search_list_size"] = search_list_size
        if rescore is not None:
            params["diskann.query_rescore"] = rescore
        super().__init__(params)

class IvfflatIndexParams(QueryParams):
def __init__(self, probes: int) -> None:
Expand Down Expand Up @@ -602,7 +611,7 @@ def get_create_query(self):
)
return '''
CREATE EXTENSION IF NOT EXISTS vector;
CREATE EXTENSION IF NOT EXISTS timescale_vector;
CREATE EXTENSION IF NOT EXISTS vectorscale;
CREATE TABLE IF NOT EXISTS {table_name} (
Expand Down

0 comments on commit 1250a76

Please sign in to comment.