From d6e34ee754f6189b176255a261cc3e4c0e80298a Mon Sep 17 00:00:00 2001
From: Tom Aarsen
Date: Wed, 11 Sep 2024 16:56:06 +0200
Subject: [PATCH 1/4] Increment dev version to v3.2.0.dev0

---
 pyproject.toml                    | 2 +-
 sentence_transformers/__init__.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 8862e17f4..99d35cd51 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "sentence-transformers"
-version = "3.1.0.dev0"
+version = "3.2.0.dev0"
 description = "Multilingual text embeddings"
 license = { file = "LICENSE" }
 readme = "README.md"
diff --git a/sentence_transformers/__init__.py b/sentence_transformers/__init__.py
index 79a390ca4..1d6c5f0f5 100644
--- a/sentence_transformers/__init__.py
+++ b/sentence_transformers/__init__.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-__version__ = "3.1.0.dev0"
+__version__ = "3.2.0.dev0"
 __MODEL_HUB_ORGANIZATION__ = "sentence-transformers"
 
 import importlib

From dafe2b6c88c6940dffffb0c5faf2932f1461e94c Mon Sep 17 00:00:00 2001
From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com>
Date: Tue, 17 Sep 2024 18:40:36 +0200
Subject: [PATCH 2/4] [`deps`] Attempt to remove numpy restrictions (#2937)

---
 pyproject.toml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 99d35cd51..d74079871 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -33,7 +33,6 @@ dependencies = [
     "transformers>=4.38.0,<5.0.0",
     "tqdm",
     "torch>=1.11.0",
-    "numpy<2.0.0",
     "scikit-learn",
     "scipy",
     "huggingface-hub>=0.19.3",

From a201c6d5eff22a79ce5335bf14f9113246c09ea7 Mon Sep 17 00:00:00 2001
From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com>
Date: Thu, 19 Sep 2024 14:11:55 +0200
Subject: [PATCH 3/4] [`metadata`] Extend pyproject.toml metadata (#2943)

This correctly puts Nils back as one of the Authors on pypi.org
---
 pyproject.toml | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index d74079871..092f334c2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,12 +1,15 @@
 [project]
 name = "sentence-transformers"
 version = "3.2.0.dev0"
-description = "Multilingual text embeddings"
-license = { file = "LICENSE" }
+description = "State-of-the-Art Text Embeddings"
+license = { text = "Apache 2.0" }
 readme = "README.md"
 authors = [
     { name = "Nils Reimers", email = "info@nils-reimers.de" },
-    { name = "Tom Aarsen" },
+    { name = "Tom Aarsen", email = "tom.aarsen@huggingface.co" },
+]
+maintainers = [
+    { name = "Tom Aarsen", email = "tom.aarsen@huggingface.co" }
 ]
 requires-python = ">=3.8"
 keywords = [

From 7290448809cb73f08f63c955550815775434beb4 Mon Sep 17 00:00:00 2001
From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com>
Date: Thu, 19 Sep 2024 14:59:28 +0200
Subject: [PATCH 4/4] [`fix`] Ensure that the embeddings from hard negative mining are normalized (#2944)

---
 sentence_transformers/util.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/sentence_transformers/util.py b/sentence_transformers/util.py
index 59e7bf0c4..5b8baa5f7 100644
--- a/sentence_transformers/util.py
+++ b/sentence_transformers/util.py
@@ -714,8 +714,12 @@ def mine_hard_negatives(
         except Exception:
             pass
 
-        corpus_embeddings = model.encode(corpus, batch_size=batch_size, convert_to_numpy=True, show_progress_bar=True)
-        query_embeddings = model.encode(queries, batch_size=batch_size, convert_to_numpy=True, show_progress_bar=True)
+        corpus_embeddings = model.encode(
+            corpus, batch_size=batch_size, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=True
+        )
+        query_embeddings = model.encode(
+            queries, batch_size=batch_size, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=True
+        )
         index.add(corpus_embeddings)
 
         scores_list = []
@@ -731,8 +735,12 @@
 
     else:
         # Embed the corpus and the queries
-        corpus_embeddings = model.encode(corpus, batch_size=batch_size, convert_to_numpy=True, show_progress_bar=True)
-        query_embeddings = model.encode(queries, batch_size=batch_size, convert_to_numpy=True, show_progress_bar=True)
+        corpus_embeddings = model.encode(
+            corpus, batch_size=batch_size, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=True
+        )
+        query_embeddings = model.encode(
+            queries, batch_size=batch_size, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=True
+        )
         scores = model.similarity(query_embeddings, corpus_embeddings).to(device)
 
         # Keep only the range_max + max_positives highest scores. We offset by 1 to potentially include the positive pair
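
Background on PATCH 4/4: with unit-length embeddings, a plain dot product equals cosine similarity, so the candidate scores produced by the search index and by model.similarity in mine_hard_negatives stay on a consistent, bounded scale. Below is a minimal sketch of that equivalence, not part of the patch itself; the checkpoint name is an arbitrary example, and the NumPy math stands in for whatever index (e.g. a FAISS IndexFlatIP) actually performs the search.

    import numpy as np
    from sentence_transformers import SentenceTransformer

    # Arbitrary example checkpoint; any SentenceTransformer model works here.
    model = SentenceTransformer("all-MiniLM-L6-v2")

    sentences = ["A man is eating food.", "A man is eating a piece of bread."]

    # Raw embeddings: dot products depend on vector magnitudes.
    raw = model.encode(sentences, convert_to_numpy=True)
    # Normalized embeddings: every vector has unit L2 norm.
    unit = model.encode(sentences, convert_to_numpy=True, normalize_embeddings=True)

    # Cosine similarity computed from the raw vectors ...
    norms = np.linalg.norm(raw, axis=1)
    cosine = (raw @ raw.T) / np.outer(norms, norms)
    # ... matches the plain inner product of the normalized vectors,
    # which is what an inner-product index returns.
    inner = unit @ unit.T
    assert np.allclose(cosine, inner, atol=1e-5)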