Commit aaec753
Merge branch 'master' into v2.5-release
tomaarsen committed Mar 1, 2024
2 parents c4b32c2 + 66e0ee3 commit aaec753
Showing 77 changed files with 138 additions and 39 deletions.
6 changes: 3 additions & 3 deletions examples/applications/clustering/agglomerative.py
@@ -3,9 +3,9 @@
Sentences are mapped to sentence embeddings and then agglomerative clustering with a threshold is applied.
"""

from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
import numpy as np

embedder = SentenceTransformer("all-MiniLM-L6-v2")

@@ -25,8 +25,8 @@
]
corpus_embeddings = embedder.encode(corpus)

-# Normalize the embeddings to unit length
-corpus_embeddings = corpus_embeddings / np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)
+# Some models don't automatically normalize the embeddings, in which case you should normalize the embeddings:
+# corpus_embeddings = corpus_embeddings / np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)

# Perform kmean clustering
clustering_model = AgglomerativeClustering(
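For readers skimming the diff, a minimal runnable sketch of what agglomerative.py does after this change. The corpus and the distance threshold below are illustrative assumptions, not values taken from the file:

import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sentence_transformers import SentenceTransformer

embedder = SentenceTransformer("all-MiniLM-L6-v2")
corpus = ["A man is eating food.", "A man is eating a piece of bread.", "The girl is carrying a baby."]
corpus_embeddings = embedder.encode(corpus)

# If the model does not normalize its output, normalize manually so euclidean distance tracks cosine distance
corpus_embeddings = corpus_embeddings / np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)

# n_clusters=None together with distance_threshold lets the algorithm pick the number of clusters
clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=1.5)
clustering.fit(corpus_embeddings)
print(clustering.labels_)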
1 change: 1 addition & 0 deletions examples/applications/clustering/fast_clustering.py
@@ -11,6 +11,7 @@
In this example, we download a large set of questions from Quora and then find similar questions in this set.
"""

from sentence_transformers import SentenceTransformer, util
import os
import csv
1 change: 1 addition & 0 deletions examples/applications/clustering/kmeans.py
@@ -3,6 +3,7 @@
Sentences are mapped to sentence embeddings and then k-mean clustering is applied.
"""

from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

@@ -4,7 +4,7 @@
when encoding large text collections.
It also demonstrates how to stream data which is helpful in case you don't
want to wait for an extremely large dataset to download, or if you want to
limit the amount of memory used. More info about dataset streaming:
https://huggingface.co/docs/datasets/stream
"""

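A minimal sketch of the streaming pattern this example describes. The dataset name and batch size are placeholders, not necessarily what the script uses:

from datasets import load_dataset
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")
# streaming=True returns an iterable dataset, so nothing is downloaded up front
dataset = load_dataset("ag_news", split="train", streaming=True)

batch = []
for example in dataset:
    batch.append(example["text"])
    if len(batch) == 32:
        embeddings = model.encode(batch)
        # ... index or store the embeddings here ...
        batch = []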
@@ -6,6 +6,7 @@
Then, we re-rank the hits from the Bi-Encoder using a Cross-Encoder.
"""

from sentence_transformers import SentenceTransformer, util
from sentence_transformers import CrossEncoder
import os
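The retrieve-and-re-rank flow this script implements, as a small hedged sketch (the two checkpoint names are common defaults, assumed for illustration rather than taken from the script):

from sentence_transformers import SentenceTransformer, CrossEncoder, util

bi_encoder = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

corpus = ["Python is a programming language.", "The capital of France is Paris.", "Pandas is a data analysis library."]
corpus_embeddings = bi_encoder.encode(corpus, convert_to_tensor=True)

query = "What is the capital of France?"
# Step 1: fast bi-encoder retrieval
hits = util.semantic_search(bi_encoder.encode(query, convert_to_tensor=True), corpus_embeddings, top_k=3)[0]
# Step 2: re-rank the retrieved candidates with the slower but more accurate cross-encoder
cross_scores = cross_encoder.predict([(query, corpus[hit["corpus_id"]]) for hit in hits])
reranked = sorted(zip(hits, cross_scores), key=lambda pair: pair[1], reverse=True)
print([(corpus[hit["corpus_id"]], float(score)) for hit, score in reranked])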
1 change: 1 addition & 0 deletions examples/applications/cross-encoder/cross-encoder_usage.py
@@ -3,6 +3,7 @@
sentences in a corpus using a Cross-Encoder for semantic textual similarity (STS).
It output then the most similar sentences for the given query.
"""

from sentence_transformers.cross_encoder import CrossEncoder
import numpy as np

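A minimal sketch of cross-encoder STS scoring as described above. The checkpoint is a commonly used STS cross-encoder, assumed here for illustration:

import numpy as np
from sentence_transformers.cross_encoder import CrossEncoder

model = CrossEncoder("cross-encoder/stsb-distilroberta-base")
query = "A man is eating pasta."
corpus = ["A man is eating food.", "A cheetah is running behind its prey.", "The girl is carrying a baby."]

# Score each (query, sentence) pair; higher means more similar
scores = model.predict([(query, sentence) for sentence in corpus])
for idx in np.argsort(-scores):
    print(f"{scores[idx]:.3f}\t{corpus[idx]}")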
@@ -12,6 +12,7 @@
This script requires that you have FAISS installed:
https://github.com/facebookresearch/faiss
"""

from sentence_transformers import SentenceTransformer, models
import numpy as np
from bitext_mining_utils import score_candidates, kNN, file_open
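As context for the FAISS requirement, a bare-bones sketch of exact nearest-neighbour search over normalized embeddings. This is a simplification of the margin-based candidate scoring the actual script performs, and the multilingual model name is an assumption:

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("LaBSE")  # assumption: any multilingual sentence model works for illustration
source = ["Hello world", "How are you?"]
target = ["Hallo Welt", "Wie geht es dir?"]

src_emb = model.encode(source, normalize_embeddings=True)
tgt_emb = model.encode(target, normalize_embeddings=True)

index = faiss.IndexFlatIP(tgt_emb.shape[1])  # inner product equals cosine similarity on normalized vectors
index.add(np.asarray(tgt_emb, dtype="float32"))
scores, neighbors = index.search(np.asarray(src_emb, dtype="float32"), k=2)
print(neighbors)  # candidate target sentences for each source sentence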
1 change: 1 addition & 0 deletions examples/applications/parallel-sentence-mining/bucc2018.py
@@ -9,6 +9,7 @@
This script requires that you have FAISS installed:
https://github.com/facebookresearch/faiss
"""

from sentence_transformers import SentenceTransformer, models
from collections import defaultdict
import os
1 change: 1 addition & 0 deletions examples/applications/semantic-search/semantic_search.py
@@ -6,6 +6,7 @@
This script outputs for various queries the top 5 most similar sentences in the corpus.
"""

from sentence_transformers import SentenceTransformer, util
import torch

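The core of "top 5 most similar sentences", sketched with cosine similarity and torch.topk (the corpus and query are toy placeholders):

import torch
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-MiniLM-L6-v2")
corpus = ["A man is eating food.", "A man is riding a horse.", "A woman is playing violin."]
corpus_embeddings = model.encode(corpus, convert_to_tensor=True)

query_embedding = model.encode("Someone is eating a meal.", convert_to_tensor=True)
cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
top_results = torch.topk(cos_scores, k=min(5, len(corpus)))
for score, idx in zip(top_results.values, top_results.indices):
    print(f"{score:.3f}\t{corpus[idx]}")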
@@ -1,13 +1,14 @@
"""
This example demonstrates how we can perform semantic search for scientific publications.
As model, we use SPECTER (https://github.com/allenai/specter), which encodes paper titles and abstracts
into a vector space.
When can then use util.semantic_search() to find the most similar papers.
Colab example: https://colab.research.google.com/drive/12hfBveGHRsxhPIUMmJYrll2lFU4fOX06
"""

import json
import os
from sentence_transformers import SentenceTransformer, util
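A hedged sketch of SPECTER-style publication search. The "allenai-specter" checkpoint name and the title + "[SEP]" + abstract concatenation are assumptions based on how SPECTER is commonly used, not verbatim from the script:

from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("allenai-specter")
papers = [
    {"title": "BERT", "abstract": "We introduce a new language representation model ..."},
    {"title": "Attention Is All You Need", "abstract": "We propose the Transformer ..."},
]
corpus_embeddings = model.encode([p["title"] + "[SEP]" + p["abstract"] for p in papers], convert_to_tensor=True)

query_paper = {"title": "RoBERTa", "abstract": "A robustly optimized BERT pretraining approach ..."}
query_embedding = model.encode(query_paper["title"] + "[SEP]" + query_paper["abstract"], convert_to_tensor=True)
hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=2)[0]
print([papers[hit["corpus_id"]]["title"] for hit in hits])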
@@ -25,6 +25,7 @@
that it aligned for 100 languages. I.e., you can type in a question in various languages and it will
return the closest questions in the corpus (questions in the corpus are mainly in English).
"""

from sentence_transformers import SentenceTransformer, util
import os
import csv
@@ -22,6 +22,7 @@
that it aligned for 100 languages. I.e., you can type in a question in various languages and it will
return the closest questions in the corpus (questions in the corpus are mainly in English).
"""

from sentence_transformers import SentenceTransformer, util
import os
import csv
@@ -20,6 +20,7 @@
that it aligned for 100 languages. I.e., you can type in a question in various languages and it will
return the closest questions in the corpus (questions in the corpus are mainly in English).
"""

from sentence_transformers import SentenceTransformer, util
import os
import csv
@@ -12,6 +12,7 @@
Google Colab example: https://colab.research.google.com/drive/12cn5Oo0v3HfQQ8Tv6-ukgxXSmT3zl35A?usp=sharing
"""

from sentence_transformers import SentenceTransformer, util
import os
import csv
@@ -12,6 +12,7 @@
Google Colab Example: https://colab.research.google.com/drive/11GunvCqJuebfeTlgbJWkIMT0xJH6PWF1?usp=sharing
"""

import json
from sentence_transformers import SentenceTransformer, util
import time
@@ -17,6 +17,7 @@
Note: Requires NLTK: `pip install nltk`
"""

import nltk
from sentence_transformers import SentenceTransformer, util
import numpy as np
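The NLTK requirement boils down to sentence splitting before encoding; a minimal sketch (the document text is a placeholder):

import nltk
from sentence_transformers import SentenceTransformer

nltk.download("punkt", quiet=True)
model = SentenceTransformer("all-MiniLM-L6-v2")

document = "Sentence embeddings work per sentence. So long documents are split first. Then each sentence is encoded."
sentences = nltk.sent_tokenize(document)
embeddings = model.encode(sentences)
print(len(sentences), embeddings.shape)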
1 change: 1 addition & 0 deletions examples/evaluation/evaluation_inference_speed.py
@@ -6,6 +6,7 @@
OR
python evaluation_inference_speed.py model_name
"""

from sentence_transformers import SentenceTransformer, util
import sys
import os
1 change: 1 addition & 0 deletions examples/evaluation/evaluation_stsbenchmark.py
@@ -6,6 +6,7 @@
OR
python evaluation_stsbenchmark.py model_name
"""

from sentence_transformers import SentenceTransformer, util, LoggingHandler, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import logging
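A small sketch of STSbenchmark-style evaluation with EmbeddingSimilarityEvaluator. The samples here are toy pairs; the real script reads the STSbenchmark file:

from sentence_transformers import SentenceTransformer, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

model = SentenceTransformer("all-MiniLM-L6-v2")
test_samples = [
    InputExample(texts=["A man is eating food.", "A man is eating a meal."], label=0.9),
    InputExample(texts=["A man is eating food.", "A plane is taking off."], label=0.05),
]
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name="sts-test")
score = evaluator(model)  # correlation between cosine similarity and the gold labels
print(score)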
1 change: 1 addition & 0 deletions examples/training/adaptive_layer/adaptive_layer_nli.py
@@ -10,6 +10,7 @@
OR
python adaptive_layer_nli.py pretrained_transformer_model_name
"""

import math
from datasets import load_dataset
from sentence_transformers import models, losses, datasets
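For orientation, adaptive-layer training wraps a regular loss so the earlier transformer layers also produce useful embeddings. A hedged sketch, assuming AdaptiveLayerLoss as introduced in v2.5; the checkpoint and the tiny dataset are placeholders:

from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses

model = SentenceTransformer("microsoft/mpnet-base")  # assumption: any transformer checkpoint
train_examples = [
    InputExample(texts=["A man is eating food.", "A man is eating a meal.", "A plane is taking off."]),
    InputExample(texts=["A woman is playing violin.", "A woman plays an instrument.", "A dog is running."]),
]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=2)

# Wrap a standard loss; AdaptiveLayerLoss additionally trains on the outputs of truncated layer stacks
base_loss = losses.MultipleNegativesRankingLoss(model)
train_loss = losses.AdaptiveLayerLoss(model, base_loss)
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=10)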
1 change: 1 addition & 0 deletions examples/training/adaptive_layer/adaptive_layer_sts.py
@@ -9,6 +9,7 @@
OR
python adaptive_layer_sts.py pretrained_transformer_model_name
"""

from torch.utils.data import DataLoader
import math
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, models, util
@@ -6,6 +6,7 @@
See https://public.ukp.informatik.tu-darmstadt.de/reimers/embeddings/
for available word embeddings files
"""

from torch.utils.data import DataLoader
import math
from sentence_transformers import models, losses, util
@@ -4,6 +4,7 @@
Note, you can also pass BERT embeddings to the BiLSTM.
"""

from torch.utils.data import DataLoader
import math
from sentence_transformers import models, losses, util
@@ -4,6 +4,7 @@
To make the model trainable, we add multiple dense layers to create a Deep Averaging Network (DAN).
"""

from torch.utils.data import DataLoader
import math
from sentence_transformers import models, losses, util
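A hedged sketch of the Deep Averaging Network architecture described above: static word embeddings, mean pooling, and a couple of dense layers. The GloVe file path and layer sizes are assumptions; the embeddings file must be available locally:

import torch.nn as nn
from sentence_transformers import SentenceTransformer, models

# Static word embeddings (e.g. GloVe) instead of a transformer; the file must be downloaded beforehand
word_embedding_model = models.WordEmbeddings.from_text_file("glove.6B.300d.txt.gz")
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())

# Dense layers on top of the averaged word vectors form the DAN
dan1 = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(), out_features=256, activation_function=nn.Tanh())
dan2 = models.Dense(in_features=256, out_features=256, activation_function=nn.Tanh())

model = SentenceTransformer(modules=[word_embedding_model, pooling_model, dan1, dan2])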
@@ -4,6 +4,7 @@
"""

from torch.utils.data import DataLoader
import math
from sentence_transformers import models, losses, util
@@ -8,6 +8,7 @@
You can get term-document frequencies from here:
https://public.ukp.informatik.tu-darmstadt.de/reimers/embeddings/wikipedia_doc_frequencies.txt
"""

from torch.utils.data import DataLoader
import math
from sentence_transformers import models, losses, util
1 change: 1 addition & 0 deletions examples/training/cross-encoder/training_nli.py
@@ -7,6 +7,7 @@
Usage:
python training_nli.py
"""

from torch.utils.data import DataLoader
import math
from sentence_transformers import LoggingHandler, util
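A compact sketch of cross-encoder training on NLI as this script does it: three labels, a DataLoader of InputExamples, and CrossEncoder.fit. The label mapping and hyperparameters are illustrative:

from torch.utils.data import DataLoader
from sentence_transformers import InputExample
from sentence_transformers.cross_encoder import CrossEncoder

label2int = {"contradiction": 0, "entailment": 1, "neutral": 2}
train_samples = [
    InputExample(texts=["A man inspects a uniform.", "The man is sleeping."], label=label2int["contradiction"]),
    InputExample(texts=["A soccer game with multiple males playing.", "Some men are playing a sport."], label=label2int["entailment"]),
]
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=2)

model = CrossEncoder("distilroberta-base", num_labels=len(label2int))
model.fit(train_dataloader=train_dataloader, epochs=1, warmup_steps=10)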
@@ -8,6 +8,7 @@
python training_quora_duplicate_questions.py
"""

from torch.utils.data import DataLoader
import math
from sentence_transformers import LoggingHandler, util
1 change: 1 addition & 0 deletions examples/training/cross-encoder/training_stsbenchmark.py
@@ -7,6 +7,7 @@
Usage:
python training_stsbenchmark.py
"""

from torch.utils.data import DataLoader
import math
from sentence_transformers import LoggingHandler, util
@@ -9,9 +9,9 @@
Or to run it with Docker: https://www.elastic.co/guide/en/elasticsearch/reference/current/docker.html
Methodology:
Three steps are followed for AugSBERT data-augmentation with BM25 Sampling -
1. Fine-tune cross-encoder (BERT) on gold STSb dataset
2. Fine-tuned Cross-encoder is used to label on BM25 sampled unlabeled pairs (silver STSb dataset)
3. Bi-encoder (SBERT) is finally fine-tuned on both gold + silver STSb dataset
Citation: https://arxiv.org/abs/2010.08240
@@ -25,6 +25,7 @@
python train_sts_indomain_bm25.py bert-base-uncased 3
"""

from torch.utils.data import DataLoader
from sentence_transformers import models, losses, util
from sentence_transformers.cross_encoder import CrossEncoder
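The silver-labeling step (step 2 above), sketched in isolation. The candidate pairs would come from BM25 sampling; here they are toy placeholders, and the cross-encoder checkpoint stands in for the one fine-tuned in step 1:

from sentence_transformers import InputExample
from sentence_transformers.cross_encoder import CrossEncoder

# Step 1 output: a cross-encoder trained on the gold STSb pairs
cross_encoder = CrossEncoder("cross-encoder/stsb-distilroberta-base")

# Unlabeled sentence pairs sampled with BM25 (placeholders)
silver_pairs = [
    ("A man is playing a guitar.", "A person plays a stringed instrument."),
    ("A dog runs through the field.", "The stock market fell sharply today."),
]

# Step 2: let the cross-encoder provide soft labels for the silver dataset
silver_scores = cross_encoder.predict(silver_pairs)
silver_samples = [InputExample(texts=[s1, s2], label=float(score)) for (s1, s2), score in zip(silver_pairs, silver_scores)]
# Step 3 would fine-tune the bi-encoder (SBERT) on gold + silver samples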
@@ -3,12 +3,12 @@
We utilise nlpaug (https://github.com/makcedward/nlpaug) for data augmentation strategies over a single sentence.
We chose synonym replacement for our example with (can be extended to other techniques) -
1. Word-embeddings (word2vec)
2. WordNet
3. Contextual word-embeddings (BERT)
Methodology:
Take a gold STSb pair, like (A, B, 0.6) Then replace synonyms in A and B, which gives you (A', B', 0.6)
These are the silver data and SBERT is finally trained on (gold + silver) STSb data.
Additional requirements:
@@ -28,6 +28,7 @@
Usage:
python train_sts_indomain_nlpaug.py
"""

from torch.utils.data import DataLoader
import torch
import math
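A minimal sketch of the synonym-replacement augmentation described above, using nlpaug's WordNet augmenter. Note that augment() may return a list in newer nlpaug versions, hence the normalization below:

import nlpaug.augmenter.word as naw
# Requires: pip install nlpaug, plus NLTK's wordnet data

aug = naw.SynonymAug(aug_src="wordnet")

gold_a, gold_b, gold_score = "A man is eating food.", "A man is eating a piece of bread.", 0.6
aug_a = aug.augment(gold_a)
aug_b = aug.augment(gold_b)
aug_a = aug_a[0] if isinstance(aug_a, list) else aug_a
aug_b = aug_b[0] if isinstance(aug_b, list) else aug_b

silver_pair = (aug_a, aug_b, gold_score)  # (A', B', 0.6) keeps the gold similarity score
print(silver_pair)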
@@ -3,9 +3,9 @@
Methodology:
Three steps are followed for AugSBERT data-augmentation strategy with Semantic Search -
1. Fine-tune cross-encoder (BERT) on gold STSb dataset
2. Fine-tuned Cross-encoder is used to label on Sem. Search sampled unlabeled pairs (silver STSb dataset)
3. Bi-encoder (SBERT) is finally fine-tuned on both gold + silver STSb dataset
Citation: https://arxiv.org/abs/2010.08240
@@ -18,6 +18,7 @@
python train_sts_indomain_semantic.py bert-base-uncased 3
"""

from torch.utils.data import DataLoader
from sentence_transformers import models, losses, util
from sentence_transformers import LoggingHandler, SentenceTransformer
@@ -3,7 +3,7 @@
For our example below we consider STSb (source) and QQP (target) datasets respectively.
Methodology:
Three steps are followed for AugSBERT data-augmentation strategy with Domain Transfer / Cross-Domain -
1. Cross-Encoder aka BERT is trained over STSb (source) dataset.
2. Cross-Encoder is used to label QQP training (target) dataset (Assume no labels/no annotations are provided).
3. Bi-encoder aka SBERT is trained over the labeled QQP (target) dataset.
@@ -16,6 +16,7 @@
OR
python train_sts_qqp_crossdomain.py pretrained_transformer_model_name
"""

from torch.utils.data import DataLoader
from sentence_transformers import models, losses, util, LoggingHandler, SentenceTransformer
from sentence_transformers.cross_encoder import CrossEncoder
@@ -4,13 +4,13 @@
For more details refer to -
Fine-Tuning Pretrained Language Models:
Weight Initializations, Data Orders, and Early Stopping by Dodge et al. 2020
https://arxiv.org/pdf/2002.06305.pdf
Why Seed Optimization?
Dodge et al. (2020) show a high dependence on the random seed for transformer based models like BERT,
as it converges to different minima that generalize differently to unseen data. This is especially the
case for small training datasets.
Citation: https://arxiv.org/abs/2010.08240
@@ -22,6 +22,7 @@
python train_sts_seed_optimization.py bert-base-uncased 10 0.3
"""

from torch.utils.data import DataLoader
import math
import torch
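The seed-optimization idea in miniature: train briefly with several random seeds, evaluate each run, and keep the best seed. A hedged sketch with a placeholder training/evaluation helper:

import random
import numpy as np
import torch

def set_seed(seed: int) -> None:
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

def train_and_evaluate(seed: int) -> float:
    # Placeholder: in the real script this builds the model, trains on a slice of STSb,
    # and returns the dev-set score from an evaluator.
    return random.random()

best_seed, best_score = None, float("-inf")
for seed in range(5):  # the real script takes the number of seeds from the command line
    set_seed(seed)
    score = train_and_evaluate(seed)
    if score > best_score:
        best_seed, best_score = seed, score
print(best_seed, best_score)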
1 change: 1 addition & 0 deletions examples/training/distillation/dimensionality_reduction.py
@@ -14,6 +14,7 @@
the new SentenceTransformer model will produce directly embeddings with 128 dimensions
without further changes needed.
"""

from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer, LoggingHandler, util, evaluation, models, InputExample
import logging
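The PCA-to-Dense trick described above, as a hedged sketch: fit PCA on a sample of embeddings, then append a Dense layer whose weights are the PCA components so the model emits 128-dimensional vectors directly. The model name and the sample sentences are placeholders:

import torch
import torch.nn as nn
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer, models

model = SentenceTransformer("all-MiniLM-L6-v2")
sentences = ["A man is eating food.", "A man is riding a horse.", "A woman is playing violin.", "Two men pushed carts.", "A monkey is playing drums."] * 30
embeddings = model.encode(sentences, convert_to_numpy=True)

new_dim = 128
pca = PCA(n_components=new_dim)
pca.fit(embeddings)

dense = models.Dense(
    in_features=model.get_sentence_embedding_dimension(),
    out_features=new_dim,
    bias=False,
    activation_function=nn.Identity(),
)
dense.linear.weight = torch.nn.Parameter(torch.tensor(pca.components_, dtype=torch.float32))
model.add_module("dense", dense)  # the model now outputs 128-dimensional embeddings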
1 change: 1 addition & 0 deletions examples/training/distillation/model_distillation.py
@@ -19,6 +19,7 @@
There is a performance - speed trade-off. However, we found that a student with 4 instead of 12 layers keeps about 99.4%
of the teacher performance, while being 2.3 times faster.
"""

from torch.utils.data import DataLoader
from sentence_transformers import models, losses, evaluation
from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
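A hedged sketch of how a shallower student can be carved out of the teacher before distillation, by keeping only a subset of transformer layers. The layer indices are illustrative, and _first_module() is an internal accessor of sentence-transformers, used here as an assumption about the implementation:

import torch
from sentence_transformers import SentenceTransformer

teacher = SentenceTransformer("all-MiniLM-L6-v2")
student = SentenceTransformer("all-MiniLM-L6-v2")  # start the student from the teacher's weights

# Keep only a few transformer layers in the student
auto_model = student._first_module().auto_model
layers_to_keep = [1, 3, 5]
auto_model.encoder.layer = torch.nn.ModuleList(
    [layer for i, layer in enumerate(auto_model.encoder.layer) if i in layers_to_keep]
)
auto_model.config.num_hidden_layers = len(layers_to_keep)

# Distillation then trains the student to reproduce the teacher's embeddings, e.g. with losses.MSELoss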
1 change: 1 addition & 0 deletions examples/training/distillation/model_quantization.py
@@ -8,6 +8,7 @@
For more details:
https://pytorch.org/docs/stable/quantization.html
"""

import logging
import os
import torch
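The core call for dynamic quantization on CPU, as described above; a minimal sketch:

import torch
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")

# Replace Linear layers with dynamically quantized int8 versions (CPU inference only)
quantized_model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)

embeddings = quantized_model.encode(["Dynamic quantization keeps the encode() API intact."])
print(embeddings.shape)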
5 changes: 3 additions & 2 deletions examples/training/matryoshka/2d_matryoshka_nli.py
@@ -10,6 +10,7 @@
OR
python 2d_matryoshka_nli.py pretrained_transformer_model_name
"""

import math
from datasets import load_dataset
from sentence_transformers import models, losses, datasets
@@ -147,10 +148,10 @@ def add_to_samples(sent1, sent2, label):
# It is recommended to run `huggingface-cli login` to log into your Hugging Face account first
model_name = model_name if "/" not in model_name else model_name.split("/")[-1]
try:
-model.save_to_hub(f"{model_name}-nli-matryoshka")
+model.save_to_hub(f"{model_name}-nli-2d-matryoshka")
except Exception:
logging.error(
"Error uploading model to the Hugging Face Hub. To upload it manually, you can run "
f"`huggingface-cli login`, followed by loading the model using `model = SentenceTransformer({model_save_path!r})` "
f"and saving it using `model.save_to_hub('{model_name}-nli-matryoshka')`."
f"and saving it using `model.save_to_hub('{model_name}-nli-2d-matryoshka')`."
)
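For context on this example, a hedged sketch of 2D Matryoshka training, which wraps a base loss so both truncated embedding dimensions and truncated layer counts are trained. It assumes the Matryoshka2dLoss introduced in v2.5; the checkpoint, data, and dimensions are illustrative:

from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses

model = SentenceTransformer("microsoft/mpnet-base")  # assumption: any transformer checkpoint
train_examples = [
    InputExample(texts=["A man is eating food.", "A man is eating a meal.", "A plane is taking off."]),
    InputExample(texts=["A woman is playing violin.", "A woman plays an instrument.", "A dog is running."]),
]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=2)

base_loss = losses.MultipleNegativesRankingLoss(model)
train_loss = losses.Matryoshka2dLoss(model, base_loss, matryoshka_dims=[768, 512, 256, 128, 64])
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=10)

# model.save_to_hub("mpnet-base-nli-2d-matryoshka")  # as in the diff above; requires `huggingface-cli login`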