-
Notifications
You must be signed in to change notification settings - Fork 15.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Community][fix] Fix Azure cosmos db no SQL similarity search with score and mmr #28479
base: master
Are you sure you want to change the base?
Changes from 11 commits
205300a
296f1c3
f001717
24f39c5
477a79e
d1e32cb
d6179dd
6ac62bc
292e64b
9bbbd79
a417523
2aa26ee
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,9 @@ | ||
from __future__ import annotations | ||
|
||
import logging | ||
import uuid | ||
import warnings | ||
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple | ||
from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Optional, Tuple | ||
|
||
import numpy as np | ||
from langchain_core.documents import Document | ||
|
@@ -14,6 +15,8 @@ | |
if TYPE_CHECKING: | ||
from azure.cosmos.cosmos_client import CosmosClient | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class AzureCosmosDBNoSqlVectorSearch(VectorStore): | ||
"""`Azure Cosmos DB for NoSQL` vector store. | ||
|
@@ -30,7 +33,7 @@ def __init__( | |
*, | ||
cosmos_client: CosmosClient, | ||
embedding: Embeddings, | ||
vector_embedding_policy: Dict[str, Any], | ||
vector_embedding_policy: Optional[Dict[str, Any]], | ||
indexing_policy: Dict[str, Any], | ||
cosmos_container_properties: Dict[str, Any], | ||
cosmos_database_properties: Dict[str, Any], | ||
|
@@ -50,17 +53,27 @@ def __init__( | |
indexing_policy: Indexing Policy for the container. | ||
cosmos_container_properties: Container Properties for the container. | ||
cosmos_database_properties: Database Properties for the container. | ||
create_container: If True validates Properties for container creation. | ||
""" | ||
self._cosmos_client = cosmos_client | ||
self._database_name = database_name | ||
self._container_name = container_name | ||
self._embedding = embedding | ||
self._vector_embedding_policy = vector_embedding_policy | ||
self._indexing_policy = indexing_policy | ||
self._cosmos_container_properties = cosmos_container_properties | ||
self._cosmos_database_properties = cosmos_database_properties | ||
self._create_container = create_container | ||
|
||
# validate vector_embedding_policy if specified | ||
if (vector_embedding_policy is not None) and ( | ||
"vectorEmbeddings" not in vector_embedding_policy | ||
or len(vector_embedding_policy["vectorEmbeddings"]) == 0 | ||
): | ||
raise ValueError( | ||
"vectorEmbeddings must be present and cannot be null or empty" | ||
" in the vector_embedding_policy if specified." | ||
) | ||
|
||
if self._create_container: | ||
if ( | ||
indexing_policy["vectorIndexes"] is None | ||
|
@@ -69,13 +82,9 @@ def __init__( | |
raise ValueError( | ||
"vectorIndexes cannot be null or empty in the indexing_policy." | ||
) | ||
if ( | ||
vector_embedding_policy is None | ||
or len(vector_embedding_policy["vectorEmbeddings"]) == 0 | ||
): | ||
if vector_embedding_policy is None: | ||
raise ValueError( | ||
"vectorEmbeddings cannot be null " | ||
"or empty in the vector_embedding_policy." | ||
"vector_embedding_policy cannot be null when creating a container." | ||
) | ||
if self._cosmos_container_properties["partition_key"] is None: | ||
raise ValueError( | ||
|
@@ -115,12 +124,44 @@ def __init__( | |
match_condition=self._cosmos_container_properties.get("match_condition"), | ||
session_token=self._cosmos_container_properties.get("session_token"), | ||
initial_headers=self._cosmos_container_properties.get("initial_headers"), | ||
vector_embedding_policy=self._vector_embedding_policy, | ||
vector_embedding_policy=vector_embedding_policy, | ||
) | ||
|
||
# Validate that the created container has the correct vector embedding policy | ||
# properties | ||
container_vector_embedding_policy = self._container.read().get( | ||
"vector_embedding_policy" | ||
) | ||
if container_vector_embedding_policy is not None: | ||
# Container already has vector search exposed, verify it matches if | ||
# specified | ||
if ( | ||
vector_embedding_policy is not None | ||
and container_vector_embedding_policy != vector_embedding_policy | ||
): | ||
logger.warning( | ||
"The specified container's vector embedding policy" | ||
f" '{self._container_name}'" | ||
" does not match the specified configuration." | ||
) | ||
else: | ||
# Container doesn't have vector search exposed | ||
# (may be available but not exposed), use specified policy | ||
if vector_embedding_policy is None: | ||
raise ValueError( | ||
"The created container does not have vector search exposed" | ||
" and no vector_embedding_policy was specified." | ||
) | ||
container_vector_embedding_policy = vector_embedding_policy | ||
|
||
# Set vector embedding policy fields | ||
self._vector_embedding_policy = container_vector_embedding_policy | ||
self._embedding_key = self._vector_embedding_policy["vectorEmbeddings"][0][ | ||
"path" | ||
][1:] | ||
self._distance_strategy = self._vector_embedding_policy["vectorEmbeddings"][0][ | ||
"distanceFunction" | ||
] | ||
|
||
def add_texts( | ||
self, | ||
|
@@ -260,6 +301,28 @@ def delete_document_by_id(self, document_id: Optional[str] = None) -> None: | |
raise ValueError("No document ids provided to delete.") | ||
self._container.delete_item(document_id, partition_key=document_id) | ||
|
||
def _select_relevance_score_fn(self) -> Callable[[float], float]: | ||
""" | ||
The 'correct' relevance function | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. values are fetched from here : https://learn.microsoft.com/en-us/azure/cosmos-db/nosql/vector-search#container-vector-policies |
||
may differ depending on a few things, including: | ||
- the distance / similarity metric used by the VectorStore | ||
- the scale of your embeddings (OpenAI's are unit normed. Many others are not!) | ||
- embedding dimensionality | ||
- etc. | ||
""" | ||
if self._distance_strategy == "cosine": | ||
return self._cosine_relevance_score_fn | ||
elif self._distance_strategy == "euclidean": | ||
# Default behavior is to use euclidean distance relevancy | ||
return self._euclidean_relevance_score_fn | ||
elif self._distance_strategy == "dot product": | ||
return self._max_inner_product_relevance_score_fn | ||
else: | ||
raise ValueError( | ||
"Unknown distance strategy, must be cosine, dot product," | ||
" or euclidean" | ||
) | ||
|
||
def _similarity_search_with_score( | ||
self, | ||
embeddings: List[float], | ||
|
@@ -273,8 +336,12 @@ def _similarity_search_with_score( | |
if pre_filter is None or pre_filter.get("limit_offset_clause") is None: | ||
query += "TOP @limit " | ||
|
||
embedding_field = "" | ||
if with_embedding: | ||
embedding_field = "c[@embeddingKey] as embeddingKey, " | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is a minor optimization but its impact can be substantial: embeddings needed only when the When we ran our tests, even with chunk of size 4K tokens, the retrieval is still so much faster when we don't query for the embeddings (we go from 0.3s/chunk on a test to retrieve 100 chunks to ~0.02s/chunk) |
||
|
||
query += ( | ||
"c.id, c[@embeddingKey], c.text, c.metadata, " | ||
f"c.id, {embedding_field}c.text, c.metadata, " | ||
"VectorDistance(c[@embeddingKey], @embeddings) AS SimilarityScore FROM c" | ||
) | ||
|
||
|
@@ -305,7 +372,7 @@ def _similarity_search_with_score( | |
metadata = item["metadata"] | ||
score = item["SimilarityScore"] | ||
if with_embedding: | ||
metadata[self._embedding_key] = item[self._embedding_key] | ||
metadata[self._embedding_key] = item["embeddingKey"] | ||
docs_and_scores.append( | ||
(Document(page_content=text, metadata=metadata), score) | ||
) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
is this guaranteed to always be specified? seems like we only check that self._vector_embedding_policy isn't empty if self._create_container is True
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not really sure on what's the intended logic of this code, when I wrote it I followed the
embedding_key
up above.But you are right.
vector_embedding_policy
in this case is a dict. Nothing enforces what this dict needs to be ifcreate_container
is not specified. I think there's a minor issue in the code here that can be addressed.In essence, if we want to use an azure cosmos db no sql vector store, it must have the vector search feature enabled.
To enable and specify it, when we create the container, we must specify (among other things) a Vector Policy (the dict in question). microsoft link
Once created, it's possible to query for these info in the properties of the container. (through(vector search may be activated but not exposed, see comment down below)container.read()
)Additionally, once created,
_database.create_container_if_not_exists
(the method we use here to create our connection to the container) will not be overriden by the new specified policy (and it's totally valid also in this case to give it None policy).So here are my suggested changes:
vector_embedding_policy
optional since we only force the checks if create container is trueself._vector_embedding_policy shouldn't be acquired fromself._vector_embedding_policy should be acquired from container if it is exposed otherwise from user specified optionsvector_embedding_policy
but from the container itselfvector_embedding_policy
and used specifiedvector_embedding_policy
LMK if this is okay with you ! (I added these changes)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Actually I was testing and this assumption is not always valid
It seems there is a bug in azure cosmos db no sql but if the container was the result of a partition_key migration then it seems the information on the vector policy is lost (so not possible to read through
container.read()
). However, the container still fully supports vector search !I'll readapt the implementation to account for this case. It seems no longer possible to use container.read() as the one source of truth.
I'm sorry for the long post I wanted to be thorooughly descriptive and argumentative in my implementation choices!