Skip to content

refactor: update TaskMetadata #1076

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Jul 12, 2024
Merged
  •  
  •  
  •  
28 changes: 28 additions & 0 deletions _add_stats.grit
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
engine marzano(0.1)
language python

// Codemod for the TaskMetadata refactor: rewrites every `TaskMetadata(...)`
// call site to the new schema. Clauses below share metavariables, so order
// matters: `$ns_val` captured in the n_samples clause is reused when the
// avg_character_length kwarg is rewritten into `stats=...`, and `$form_val`
// captured from `form=` is folded into `domains=[...]`.
`TaskMetadata($args)` where {
    $args <: any {
        // Drop the `n_samples=` kwarg but remember its value in $ns_val.
        contains keyword_argument(name="n_samples") as $ns_kwarg where {
            $ns_kwarg <: `n_samples = $ns_val` => .
        },
        // Replace `avg_character_length=` with the combined `stats=` dict,
        // pulling in the $ns_val captured above.
        contains keyword_argument(name="avg_character_length") as $avg_kwarg where {
            $avg_kwarg <: `avg_character_length = $avg_val` => `stats={"n_samples": $ns_val, "avg_character_length": $avg_val}`
        },
        // Drop `form=` (list or scalar form), keeping its value in $form_val
        // so the domains clause can append it.
        contains keyword_argument(name="form") as $form_kwarg where {
            $form_kwarg <: or {`form = [$form_val]`, `form = $form_val`} => .
        },
        // Append the captured form value to the `domains=` list
        // (e.g. domains=["News"] + form=["written"] -> domains=["News", "written"]).
        contains keyword_argument(name="domains") as $domains_kwarg where {
            $domains_kwarg <: `domains = [$domains_val]` => `domains=[$domains_val, $form_val]`
        },
        // Rename `text_creation=` to `sample_creation=`, value unchanged.
        contains keyword_argument(name="text_creation") as $text_creation_kwarg where {
            $text_creation_kwarg <: `text_creation = $text_creation_val` => `sample_creation=$text_creation_val`
        },
        // Remove the retired `socioeconomic_status=` kwarg entirely.
        contains keyword_argument(name="socioeconomic_status") as $socio where {
            $socio <: `socioeconomic_status = $val` => .
        },
        // If the call has `category=` but no `modalities=` yet, insert a
        // default `modalities=["text"]` right after the category kwarg.
        and {contains keyword_argument(name="category") as $category, !contains keyword_argument(name="modalities")} where {
            $category <: `category = $c_val` => `category=$c_val,\nmodalities=["text"]`
        },
    },
}
4 changes: 2 additions & 2 deletions docs/adding_a_dataset.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ class SciDocsReranking(AbsTaskReranking):
"revision": "d3c5e1fc0b855ab6097bf1cda04dd73947d7caab",
}
date=("2000-01-01", "2020-12-31"), # best guess
form="written",
form="Written",
domains=["Academic", "Non-fiction"],
task_subtypes=["Scientific Reranking"],
license="cc-by-4.0",
Expand Down Expand Up @@ -102,7 +102,7 @@ class VGClustering(AbsTaskClustering):
"revision": "d4c5a8ba10ae71224752c727094ac4c46947fa29",
},
date=("2012-01-01", "2020-01-01"),
form="written",
form="Written",
domains=["Academic", "Non-fiction"],
task_subtypes=["Scientific Reranking"],
license="cc-by-nc",
Expand Down
1 change: 0 additions & 1 deletion mteb/abstasks/AbsTaskPairClassification.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from __future__ import annotations

import logging
from collections import defaultdict

from datasets import Dataset

Expand Down
38 changes: 19 additions & 19 deletions mteb/abstasks/TaskMetadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import logging
from datetime import date
from typing import Any, List, Mapping, Union
from typing import Any, Dict, List, Mapping, Optional, Union

from pydantic import AnyUrl, BaseModel, BeforeValidator, TypeAdapter, field_validator
from typing_extensions import Annotated, Literal
Expand Down Expand Up @@ -57,10 +57,12 @@
"Spoken",
"Subtitles",
"Web",
"Written",
"Programming",
None,
]

TEXT_CREATION_METHOD = Literal[
SAMPLE_CREATION_METHOD = Literal[
"found",
"created",
"machine-translated",
Expand All @@ -71,13 +73,6 @@
"LM-generated and verified",
]

SOCIOECONOMIC_STATUS = Literal[
"high",
"medium",
"low",
"mixed",
]

TASK_TYPE = Literal[
"BitextMining",
"Classification",
Expand Down Expand Up @@ -134,9 +129,19 @@
"shell",
]

METRIC_NAME = str
METRIC_VALUE = Union[int, float, Dict[str, Any]]

logger = logging.getLogger(__name__)


class GeneralDescriptiveStats(BaseModel):
    """General descriptive statistics for a dataset.

    Both fields map a split name to a per-split statistic
    (presumably the evaluated splits only — SPLIT_NAME is declared
    elsewhere in this module; verify against its definition).
    """

    # Number of samples per split, e.g. {"test": 500}.
    n_samples: dict[SPLIT_NAME, int]
    # Average character length of the samples per split, e.g. {"test": 89.7}.
    avg_character_length: dict[SPLIT_NAME, float]


class TaskMetadata(BaseModel):
"""Metadata for a task.

Expand All @@ -155,16 +160,15 @@ class TaskMetadata(BaseModel):
huggingface dataset contain different languages).
main_score: The main score used for evaluation.
date: The date when the data was collected. Specified as a tuple of two dates.
form: The form of the data. Either "spoken", "written".
        domains: The domains of the data. These include "Non-fiction", "Social", "Fiction", "News", "Academic", "Blog", "Encyclopaedic",
"Government", "Legal", "Medical", "Poetry", "Religious", "Reviews", "Web", "Spoken". A dataset can belong to multiple domains.
"Government", "Legal", "Medical", "Poetry", "Religious", "Reviews", "Web", "Spoken", "Written". A dataset can belong to multiple domains.
task_subtypes: The subtypes of the task. E.g. includes "Sentiment/Hate speech", "Thematic Clustering". Feel free to update the list as needed.
license: The license of the data.
socioeconomic_status: The socioeconomic status of the data. Includes "high", "medium", "low", "mixed".
annotations_creators: The type of the annotators. Includes "expert-annotated" (annotated by experts), "human-annotated" (annotated e.g. by
mturkers), "derived" (derived from structure in the data).
dialect: The dialect of the data, if applicable. Ideally specified as a BCP-47 language tag. Empty list if no dialects are present.
text_creation: The method of text creation. Includes "found", "created", "machine-translated", "machine-translated and verified", and
sample_creation: The method of text creation. Includes "found", "created", "machine-translated", "machine-translated and verified", and
"machine-translated and localized".
bibtex_citation: The BibTeX citation for the dataset. Should be an empty string if no citation is available.
n_samples: The number of samples in the dataset. This should only be for the splits evaluated on. For retrieval tasks, this should be the
Expand All @@ -178,6 +182,7 @@ class TaskMetadata(BaseModel):
name: str
description: str
type: TASK_TYPE
modalities: list[Literal["text"]]
category: TASK_CATEGORY
reference: STR_URL | None # URL to documentation, e.g. published paper

Expand All @@ -186,22 +191,17 @@ class TaskMetadata(BaseModel):
main_score: str # Might want a literal here

date: tuple[STR_DATE, STR_DATE] | None # When the data was collected
form: list[Literal["spoken", "written"]] | None
domains: list[TASK_DOMAIN] | None
task_subtypes: list[TASK_SUBTYPE] | None
license: str | None

socioeconomic_status: SOCIOECONOMIC_STATUS | None
annotations_creators: ANNOTATOR_TYPE | None
dialect: list[str] | None

text_creation: TEXT_CREATION_METHOD | None
sample_creation: SAMPLE_CREATION_METHOD | None
bibtex_citation: str | None

n_samples: dict[SPLIT_NAME, int] | None
avg_character_length: (
Union[dict[SPLIT_NAME, float], dict[SPLIT_NAME, dict[str, Any]]] | None
)
stats: dict[METRIC_NAME, Optional[dict[SPLIT_NAME, METRIC_VALUE]]]

@field_validator("dataset")
def _check_dataset_path_is_specified(cls, dataset):
Expand Down
10 changes: 4 additions & 6 deletions mteb/tasks/BitextMining/dan/BornholmskBitextMining.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,17 @@ class BornholmBitextMining(AbsTaskBitextMining):
reference="https://aclanthology.org/W19-6138/",
type="BitextMining",
category="s2s",
modalities=["text"],
eval_splits=["test"],
eval_langs=["dan-Latn"],
main_score="f1",
date=("2019-01-01", "2019-12-31"),
form=["written"],
domains=["Web", "Social", "Fiction"],
domains=["Web", "Social", "Fiction", "Written"],
license="CC-BY-4.0",
task_subtypes=["Dialect pairing"],
socioeconomic_status="mixed",
annotations_creators="expert-annotated",
dialect=["da-dan-bornholm"],
text_creation="created",
sample_creation="created",
bibtex_citation="""
@inproceedings{derczynskiBornholmskNaturalLanguage2019,
title = {Bornholmsk natural language processing: Resources and tools},
Expand All @@ -42,8 +41,7 @@ class BornholmBitextMining(AbsTaskBitextMining):
file = {Available Version (via Google Scholar):/Users/au554730/Zotero/storage/FBQ73ZYN/Derczynski and Kjeldsen - 2019 - Bornholmsk natural language processing Resources .pdf:application/pdf},
}
""",
avg_character_length={"test": 89.7},
n_samples={"test": 500},
stats={"n_samples": {"test": 500}, "avg_character_length": {"test": 89.7}},
)

def dataset_transform(self):
Expand Down
13 changes: 7 additions & 6 deletions mteb/tasks/BitextMining/kat/TbilisiCityHallBitextMining.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,22 +28,23 @@ class TbilisiCityHallBitextMining(AbsTaskBitextMining, MultilingualTask):
description="Parallel news titles from the Tbilisi City Hall website (https://tbilisi.gov.ge/).",
type="BitextMining",
category="s2s",
modalities=["text"],
eval_splits=[_EVAL_SPLIT],
eval_langs=_EVAL_LANGS,
main_score="f1",
domains=["News"],
text_creation="created",
n_samples={_EVAL_SPLIT: 1820},
domains=["News", "Written"],
sample_creation="created",
reference="https://huggingface.co/datasets/jupyterjazz/tbilisi-city-hall-titles",
date=("2024-05-02", "2024-05-03"),
form=["written"],
task_subtypes=[],
license="Not specified",
socioeconomic_status="mixed",
annotations_creators="derived",
dialect=[],
bibtex_citation="",
avg_character_length={_EVAL_SPLIT: 78},
stats={
"n_samples": {_EVAL_SPLIT: 1820},
"avg_character_length": {_EVAL_SPLIT: 78},
},
)

def load_data(self, **kwargs) -> None:
Expand Down
10 changes: 4 additions & 6 deletions mteb/tasks/BitextMining/multilingual/BUCCBitextMining.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,18 +27,17 @@ class BUCCBitextMining(AbsTaskBitextMining, MultilingualTask):
reference="https://comparable.limsi.fr/bucc2018/bucc2018-task.html",
type="BitextMining",
category="s2s",
modalities=["text"],
eval_splits=_SPLITS,
eval_langs=_LANGUAGES,
main_score="f1",
date=("2017-01-01", "2018-12-31"),
form=["written"],
domains=[],
domains=["Written"],
task_subtypes=[],
license="unknown",
socioeconomic_status="mixed",
annotations_creators="human-annotated",
dialect=[],
text_creation="human-translated",
sample_creation="human-translated",
bibtex_citation="""@inproceedings{zweigenbaum-etal-2017-overview,
title = "Overview of the Second {BUCC} Shared Task: Spotting Parallel Sentences in Comparable Corpora",
author = "Zweigenbaum, Pierre and
Expand All @@ -57,8 +56,7 @@ class BUCCBitextMining(AbsTaskBitextMining, MultilingualTask):
pages = "60--67",
abstract = "This paper presents the BUCC 2017 shared task on parallel sentence extraction from comparable corpora. It recalls the design of the datasets, presents their final construction and statistics and the methods used to evaluate system results. 13 runs were submitted to the shared task by 4 teams, covering three of the four proposed language pairs: French-English (7 runs), German-English (3 runs), and Chinese-English (3 runs). The best F-scores as measured against the gold standard were 0.84 (German-English), 0.80 (French-English), and 0.43 (Chinese-English). Because of the design of the dataset, in which not all gold parallel sentence pairs are known, these are only minimum values. We examined manually a small sample of the false negative sentence pairs for the most precise French-English runs and estimated the number of parallel sentence pairs not yet in the provided gold standard. Adding them to the gold standard leads to revised estimates for the French-English F-scores of at most +1.5pt. This suggests that the BUCC 2017 datasets provide a reasonable approximate evaluation of the parallel sentence spotting task.",
}""",
n_samples={"test": 641684},
avg_character_length={"test": 101.3},
stats={"n_samples": {"test": 641684}, "avg_character_length": {"test": 101.3}},
)

def dataset_transform(self):
Expand Down
10 changes: 4 additions & 6 deletions mteb/tasks/BitextMining/multilingual/BUCCBitextMiningFast.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,18 +27,17 @@ class BUCCBitextMiningFast(AbsTaskBitextMining, MultilingualTask):
reference="https://comparable.limsi.fr/bucc2018/bucc2018-task.html",
type="BitextMining",
category="s2s",
modalities=["text"],
eval_splits=_SPLITS,
eval_langs=_LANGUAGES,
main_score="f1",
date=("2017-01-01", "2018-12-31"),
form=["written"],
domains=[],
domains=["Written"],
task_subtypes=[],
license="unknown",
socioeconomic_status="mixed",
annotations_creators="human-annotated",
dialect=[],
text_creation="human-translated",
sample_creation="human-translated",
bibtex_citation="""@inproceedings{zweigenbaum-etal-2017-overview,
title = "Overview of the Second {BUCC} Shared Task: Spotting Parallel Sentences in Comparable Corpora",
author = "Zweigenbaum, Pierre and
Expand All @@ -57,6 +56,5 @@ class BUCCBitextMiningFast(AbsTaskBitextMining, MultilingualTask):
pages = "60--67",
abstract = "This paper presents the BUCC 2017 shared task on parallel sentence extraction from comparable corpora. It recalls the design of the datasets, presents their final construction and statistics and the methods used to evaluate system results. 13 runs were submitted to the shared task by 4 teams, covering three of the four proposed language pairs: French-English (7 runs), German-English (3 runs), and Chinese-English (3 runs). The best F-scores as measured against the gold standard were 0.84 (German-English), 0.80 (French-English), and 0.43 (Chinese-English). Because of the design of the dataset, in which not all gold parallel sentence pairs are known, these are only minimum values. We examined manually a small sample of the false negative sentence pairs for the most precise French-English runs and estimated the number of parallel sentence pairs not yet in the provided gold standard. Adding them to the gold standard leads to revised estimates for the French-English F-scores of at most +1.5pt. This suggests that the BUCC 2017 datasets provide a reasonable approximate evaluation of the parallel sentence spotting task.",
}""",
n_samples={"test": 641684},
avg_character_length={"test": 101.3},
stats={"n_samples": {"test": 641684}, "avg_character_length": {"test": 101.3}},
)
10 changes: 4 additions & 6 deletions mteb/tasks/BitextMining/multilingual/BibleNLPBitextMining.py
Original file line number Diff line number Diff line change
Expand Up @@ -872,21 +872,19 @@ class BibleNLPBitextMining(AbsTaskBitextMining, MultilingualTask):
reference="https://arxiv.org/abs/2304.09919",
type="BitextMining",
category="s2s",
modalities=["text"],
eval_splits=_SPLIT,
eval_langs=_LANGUAGES_MAPPING,
main_score="f1",
# World English Bible (WEB) first draft 1997, finished 2020
date=("1997-01-01", "2020-12-31"),
form=["written"],
domains=["Religious"],
domains=["Religious", "Written"],
task_subtypes=[],
license="CC-BY-SA-4.0",
socioeconomic_status="medium",
annotations_creators="expert-annotated",
dialect=[],
text_creation="created",
n_samples={"train": _N},
avg_character_length={"train": 120},
sample_creation="created",
stats={"n_samples": {"train": _N}, "avg_character_length": {"train": 120}},
bibtex_citation="""@article{akerman2023ebible,
title={The eBible Corpus: Data and Model Benchmarks for Bible Translation for Low-Resource Languages},
author={Akerman, Vesa and Baines, David and Daspit, Damien and Hermjakob, Ulf and Jang, Taeho and Leong, Colin and Martin, Michael and Mathew, Joel and Robie, Jonathan and Schwarting, Marcus},
Expand Down
10 changes: 4 additions & 6 deletions mteb/tasks/BitextMining/multilingual/DiaBLaBitextMining.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,21 +19,20 @@ class DiaBLaBitextMining(AbsTaskBitextMining, MultilingualTask):
reference="https://inria.hal.science/hal-03021633",
type="BitextMining",
category="s2s",
modalities=["text"],
eval_splits=["test"],
eval_langs={
"fr-en": ["fra-Latn", "eng-Latn"],
"en-fr": ["eng-Latn", "fra-Latn"],
},
main_score="f1",
date=("2016-01-01", "2017-12-31"),
form=["written"],
domains=["Social"],
domains=["Social", "Written"],
task_subtypes=[],
license="CC BY-NC-SA 4.0",
socioeconomic_status="mixed",
annotations_creators="human-annotated",
dialect=[],
text_creation="created",
sample_creation="created",
bibtex_citation="""
@inproceedings{gonzalez2019diabla,
title={DiaBLa: A Corpus of Bilingual Spontaneous Written Dialogues for Machine Translation},
Expand All @@ -43,8 +42,7 @@ class DiaBLaBitextMining(AbsTaskBitextMining, MultilingualTask):
year={2019}
}
""",
n_samples={},
avg_character_length={},
stats={"n_samples": {}, "avg_character_length": {}},
)

def load_data(self, **kwargs):
Expand Down
10 changes: 4 additions & 6 deletions mteb/tasks/BitextMining/multilingual/FloresBitextMining.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,18 +248,17 @@ class FloresBitextMining(AbsTaskBitextMining, MultilingualTask):
reference="https://huggingface.co/datasets/facebook/flores",
type="BitextMining",
category="s2s",
modalities=["text"],
eval_splits=_SPLIT,
eval_langs=_LANGUAGES_MAPPING,
main_score="f1",
date=("2022-01-01", "2022-12-31"),
form=["written"],
domains=["Non-fiction", "Encyclopaedic"],
domains=["Non-fiction", "Encyclopaedic", "Written"],
task_subtypes=[],
license="CC BY-SA 4.0",
socioeconomic_status="mixed",
annotations_creators="human-annotated",
dialect=[],
text_creation="created",
sample_creation="created",
bibtex_citation="""
@inproceedings{goyal2022flores,
title={The FLORES-101 Evaluation Benchmark for Low-Resource and Multilingual Machine Translation},
Expand All @@ -269,8 +268,7 @@ class FloresBitextMining(AbsTaskBitextMining, MultilingualTask):
year={2022}
}
""",
n_samples={"dev": 997, "devtest": 1012},
avg_character_length={},
stats={"n_samples": {"dev": 997, "devtest": 1012}, "avg_character_length": {}},
)

def load_data(self, **kwargs: Any) -> None:
Expand Down
Loading
Loading