fix: Added way more training dataset annotations (#1765)
* fix: Leaderboard: `K` instead of `M`
Fixes #1752
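
A minimal sketch of the kind of suffix formatting this fix concerns (illustrative only; not the leaderboard's actual helper):

```py
def format_n_parameters(n: int) -> str:
    """Render a raw parameter count with a K/M/B suffix."""
    for threshold, suffix in ((1e9, "B"), (1e6, "M"), (1e3, "K")):
        if n >= threshold:
            return f"{n / threshold:.0f}{suffix}"
    return str(n)

assert format_n_parameters(109_000_000) == "109M"  # the bug rendered counts like this with a `K` suffix
```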

* format

* fixed existing annotations to refer to the MTEB task name instead of the HF dataset id (see the sketch below)
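
For illustration, the annotations now key on the MTEB task name rather than the underlying Hugging Face dataset id (a sketch; `mteb/nq` is a hypothetical id, not taken from the diff):

```py
# before: keyed by the HF dataset id
training_datasets = {"mteb/nq": ["train"]}
# after: keyed by the MTEB task name
training_datasets = {"NQ": ["train"]}
```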

* added annotation to nvidia

* added voyage

* added uae annotations

* Added stella annotations

* sentence-transformers models

* added salesforce and e5

* jina

* bge + model2vec

* added llm2vec annotations

* add jasper

* format

* format

* Updated annotations and moved jina models

* fix: add even more training dataset annotations (#1793)

* fix: update max tokens for OpenAI (#1772)

update max tokens

* ci: skip AfriSentiLID for now (#1785)

* skip AfriSentiLID for now

* skip relevant test case instead

---------

Co-authored-by: Isaac Chung <isaac.chung@team.wrike.com>

* 1.28.7

Automatically generated by python-semantic-release

* ci: fix model loading test (#1775)

* pass base branch into the make command as an arg

* test a file that has custom wrapper

* what about overview

* just dont check overview

* revert instance check

* explicitly omit overview and init

* remove test change

* try on a lot of models

* revert test model file

---------

Co-authored-by: Isaac Chung <isaac.chung@team.wrike.com>

* feat: Update task filtering, fixing bug which included cross-lingual tasks in overly many benchmarks (#1787)

* feat: Update task filtering, fixing a bug in MTEB

- Updated task filtering, adding `exclusive_language_filter` and `hf_subset`
- fixed a bug in MTEB where cross-lingual splits were included
- added missing language filtering to MTEB(europe, beta) and MTEB(indic, beta)

The following code outlines the problems:

```py
import mteb
from mteb.benchmarks import MTEB_ENG_CLASSIC

task = [t for t in MTEB_ENG_CLASSIC.tasks if t.metadata.name == "STS22"][0]
# was eq. to:
task = mteb.get_task("STS22", languages=["eng"])
task.hf_subsets
# current filtering to English subsets yields:
# ['en', 'de-en', 'es-en', 'pl-en', 'zh-en']
# however, it should be just:
# ['en']

# with the changes it is:
task = [t for t in MTEB_ENG_CLASSIC.tasks if t.metadata.name == "STS22"][0]
task.hf_subsets
# ['en']
# eq. to
task = mteb.get_task("STS22", hf_subsets=["en"])
# which you can also obtain using the exclusive_language_filter (though not if there were multiple English splits):
task = mteb.get_task("STS22", languages=["eng"], exclusive_language_filter=True)
```
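
In short, the default language filter keeps any subset that contains a requested language (which is how cross-lingual pairs such as `de-en` slipped in), while `exclusive_language_filter=True` keeps only subsets made up entirely of the requested languages.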

* format

* remove "en-ext" from AmazonCounterfactualClassification

* fixed MTEB(deu)

* fix: simplify in a few areas

* fix: Add gritlm

* 1.29.0

Automatically generated by python-semantic-release

* fix: Added more annotations!

* fix: Added C-MTEB (#1786)

Added C-MTEB

* 1.29.1

Automatically generated by python-semantic-release

* docs: Add contact to MMTEB benchmarks (#1796)

* Add myself to MMTEB benchmarks
* lint

* fix: loading pre 11 (#1798)

* fix loading pre 11

* add similarity

* lint

* run all task types

* 1.29.2

Automatically generated by python-semantic-release

* fix: allow to load no revision available (#1801)

* fix: allow loading when no revision is available (see the sketch below)

* lint

* add require_model_meta to leaderboard

* lint
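
A hedged sketch of the resulting usage (assuming `mteb.get_model_meta` accepts an unpinned revision; exact fallback behavior may differ):

```py
import mteb

# Fetch model metadata without pinning a revision; before this fix,
# models without a resolvable revision could fail to load.
meta = mteb.get_model_meta("intfloat/multilingual-e5-large", revision=None)
print(meta.name, meta.revision)
```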

* 1.29.3

Automatically generated by python-semantic-release

---------

Co-authored-by: Roman Solomatin <samoed.roman@gmail.com>
Co-authored-by: Isaac Chung <chungisaac1217@gmail.com>
Co-authored-by: Isaac Chung <isaac.chung@team.wrike.com>
Co-authored-by: github-actions <github-actions@github.com>
Co-authored-by: Márton Kardos <power.up1163@gmail.com>

---------

Co-authored-by: Roman Solomatin <samoed.roman@gmail.com>
Co-authored-by: Isaac Chung <chungisaac1217@gmail.com>
Co-authored-by: Isaac Chung <isaac.chung@team.wrike.com>
Co-authored-by: github-actions <github-actions@github.com>
Co-authored-by: Márton Kardos <power.up1163@gmail.com>
6 people authored Jan 17, 2025
1 parent 96420a2 commit 3b2d074
Showing 16 changed files with 609 additions and 672 deletions.
144 changes: 57 additions & 87 deletions mteb/models/bge_models.py
@@ -298,6 +298,60 @@
"zho_Hans", # zh
]

bge_m_training_data = {
# source: https://arxiv.org/pdf/2402.03216
"MIRACLRetrieval": ["train"],
"MIRACLRetrievalHardNegatives": ["train"],
"MIRACLReranking": ["train"],
"LeCaRDv2": ["train"],
"CMedQAv1-reranking": ["train"],
"CMedQAv2-reranking": ["train"],
"MrTidyRetrieval": ["train"],
"T2Reranking": ["train"],
"MSMARCO": ["train"],
"MSMARCOHardNegatives": ["train"],
"NanoMSMARCORetrieval": ["train"],
"MSMARCO-PL": ["train"], # translation not trained on
"NQ": ["train"],
"NQHardNegatives": ["train"],
"NanoNQRetrieval": ["train"],
"NQ-PL": ["train"], # translation not trained on
"HotpotQA": ["train"],
"HotpotQA-PL": ["train"], # translation not trained on
"HotpotQAHardNegatives": ["train"],
# + synthetic data
}

bge_training_data = {
# source: https://data.baai.ac.cn/details/BAAI-MTP
"NQ": ["test"],
"NQHardNegatives": ["test"],
"AmazonReviewsClassification": [
"validation",
"test",
], # assumed from: amazon_reviews_multi
"MLQARetrieval": [
"validation",
"test",
], # assumed from mlqa (question, context)
# not in mteb
# Dataset Pairs
# wudao (title, passage)
# cmrc2018 (query, context)
# dureader (query, context)
# simclue (sentence_a, sentence_b)
# csl (title, abstract)
# amazon_reviews_multi (title, body)
# wiki_atomic_edits (base_sentence, edited_sentence)
# mlqa (question, context)
# xlsum (title, summary) (title, text)
# "sentence-transformers data": [], # https://huggingface.co/datasets/sentence-transformers/embedding-training-data # TODO check this further
# "wikipedia": [], # title + section title, passage
# "reddit": [], # title, body
# "stackexchange": [], # (title, upvoted answer) (title+body, upvoted answer)
# "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract)
}

bge_small_en_v1_5 = ModelMeta(
loader=partial( # type: ignore
sentence_transformers_loader,
@@ -321,35 +375,7 @@
use_instructions=True,
public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP
public_training_code=None, # seemingly released (at least for some models), but the link is broken
training_datasets={
# source: https://data.baai.ac.cn/details/BAAI-MTP
"NQ": ["test"],
"NQHardNegatives": ["test"],
"AmazonReviewsClassification": [
"validation",
"test",
], # assumed from: amazon_reviews_multi
"MLQARetrieval": [
"validation",
"test",
], # assumed from mlqa (question, context)
# not in mteb
# Dataset Pairs
# wudao (title, passage)
# cmrc2018 (query, context)
# dureader (query, context)
# simclue (sentence_a, sentence_b)
# csl (title, abstract)
# amazon_reviews_multi (title, body)
# wiki_atomic_edits (base_sentence, edited_sentence)
# mlqa (question, context)
# xlsum (title, summary) (title, text)
# "sentence-transformers data": [], # https://huggingface.co/datasets/sentence-transformers/embedding-training-data # TODO check this further
# "wikipedia": [], # title + section title, passage
# "reddit": [], # title, body
# "stackexchange": [], # (title, upvoted answer) (title+body, upvoted answer)
# "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract)
},
training_datasets=bge_training_data,
)

bge_base_en_v1_5 = ModelMeta(
@@ -375,35 +401,7 @@
use_instructions=True,
public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP
public_training_code=None, # seemingly released (at least for some models), but the link is broken
training_datasets={
# source: https://data.baai.ac.cn/details/BAAI-MTP
"NQ": ["test"],
"NQHardNegatives": ["test"],
"AmazonReviewsClassification": [
"validation",
"test",
], # assumed from: amazon_reviews_multi
"MLQARetrieval": [
"validation",
"test",
], # assumed from mlqa (question, context)
# not in mteb
# Dataset Pairs
# wudao (title, passage)
# cmrc2018 (query, context)
# dureader (query, context)
# simclue (sentence_a, sentence_b)
# csl (title, abstract)
# amazon_reviews_multi (title, body)
# wiki_atomic_edits (base_sentence, edited_sentence)
# mlqa (question, context)
# xlsum (title, summary) (title, text)
# "sentence-transformers data": [], # https://huggingface.co/datasets/sentence-transformers/embedding-training-data # TODO check this further
# "wikipedia": [], # title + section title, passage
# "reddit": [], # title, body
# "stackexchange": [], # (title, upvoted answer) (title+body, upvoted answer)
# "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract)
},
training_datasets=bge_training_data,
)

bge_large_en_v1_5 = ModelMeta(
@@ -429,35 +427,7 @@
use_instructions=True,
public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP
public_training_code=None, # seemingly released (at least for some models), but the link is broken
training_datasets={
# source: https://data.baai.ac.cn/details/BAAI-MTP
"NQ": ["test"],
"NQHardNegatives": ["test"],
"AmazonReviewsClassification": [
"validation",
"test",
], # assumed from: amazon_reviews_multi
"MLQARetrieval": [
"validation",
"test",
], # assumed from mlqa (question, context)
# not in mteb
# Dataset Pairs
# wudao (title, passage)
# cmrc2018 (query, context)
# dureader (query, context)
# simclue (sentence_a, sentence_b)
# csl (title, abstract)
# amazon_reviews_multi (title, body)
# wiki_atomic_edits (base_sentence, edited_sentence)
# mlqa (question, context)
# xlsum (title, summary) (title, text)
# "sentence-transformers data": [], # https://huggingface.co/datasets/sentence-transformers/embedding-training-data # TODO check this further
# "wikipedia": [], # title + section title, passage
# "reddit": [], # title, body
# "stackexchange": [], # (title, upvoted answer) (title+body, upvoted answer)
# "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract)
},
training_datasets=bge_training_data,
)

bge_small_zh_v1_5 = ModelMeta(
8 changes: 7 additions & 1 deletion mteb/models/e5_instruct.py
@@ -6,7 +6,7 @@

from mteb.model_meta import ModelMeta

from .e5_models import E5_PAPER_RELEASE_DATE, XLMR_LANGUAGES
from .e5_models import E5_PAPER_RELEASE_DATE, E5_TRAINING_DATA, XLMR_LANGUAGES
from .instruct_wrapper import instruct_wrapper

MISTRAL_LANGUAGES = ["eng_Latn", "fra_Latn", "deu_Latn", "ita_Latn", "spa_Latn"]
@@ -40,6 +40,9 @@
embed_dim=1024,
license="mit",
max_tokens=514,
public_training_data=False,
public_training_code=False,
training_datasets=E5_TRAINING_DATA,
)

e5_mistral = ModelMeta(
@@ -69,4 +72,7 @@
embed_dim=4096,
license="mit",
max_tokens=32768,
public_training_data=False,
public_training_code=False,
training_datasets=E5_TRAINING_DATA,
)
