diff --git a/FlagEmbedding/evaluation/air_bench/arguments.py b/FlagEmbedding/evaluation/air_bench/arguments.py index 1dce8238..465fe58e 100644 --- a/FlagEmbedding/evaluation/air_bench/arguments.py +++ b/FlagEmbedding/evaluation/air_bench/arguments.py @@ -4,6 +4,9 @@ @dataclass class AIRBenchEvalModelArgs: + """ + Evaluation Model arguments for AIR Bench. + """ embedder_name_or_path: str = field( metadata={"help": "The embedder name or path.", "required": True} ) diff --git a/FlagEmbedding/evaluation/air_bench/runner.py b/FlagEmbedding/evaluation/air_bench/runner.py index b2adff1f..cb9a8416 100644 --- a/FlagEmbedding/evaluation/air_bench/runner.py +++ b/FlagEmbedding/evaluation/air_bench/runner.py @@ -10,6 +10,13 @@ class AIRBenchEvalRunner: + """ + Evaluation runner for AIR Bench. + + Args: + eval_args (AIRBenchEvalArgs): :class:AIRBenchEvalArgs object with the evaluation arguments. + model_args (AIRBenchEvalModelArgs): :class:AIRBenchEvalModelArgs object with the model arguments. + """ def __init__( self, eval_args: AIRBenchEvalArgs, @@ -22,6 +29,12 @@ def __init__( self.retriever, self.reranker = self.load_retriever_and_reranker() def load_retriever_and_reranker(self) -> Tuple[EvalDenseRetriever, Union[EvalReranker, None]]: + """Load retriever and reranker for evaluation + + Returns: + Tuple[EvalDenseRetriever, Union[EvalReranker, None]]: A :class:EvalDenseRetriever object for retrieval, and a + :class:EvalReranker object if reranker provided. + """ embedder, reranker = AbsEvalRunner.get_models(self.model_args) retriever = EvalDenseRetriever( embedder, @@ -33,6 +46,9 @@ def load_retriever_and_reranker(self) -> Tuple[EvalDenseRetriever, Union[EvalRer return retriever, reranker def run(self): + """ + Run the whole evaluation. + """ evaluation = AIRBench( benchmark_version=self.eval_args.benchmark_version, task_types=self.eval_args.task_types, diff --git a/FlagEmbedding/evaluation/beir/arguments.py b/FlagEmbedding/evaluation/beir/arguments.py index 5fe42e7b..e51eba0a 100644 --- a/FlagEmbedding/evaluation/beir/arguments.py +++ b/FlagEmbedding/evaluation/beir/arguments.py @@ -5,6 +5,9 @@ @dataclass class BEIREvalArgs(AbsEvalArgs): + """ + Argument class for BEIR evaluation. + """ use_special_instructions: bool = field( default=False, metadata={"help": "Whether to use specific instructions in `prompts.py` for evaluation. Default: False"} ) diff --git a/FlagEmbedding/evaluation/beir/data_loader.py b/FlagEmbedding/evaluation/beir/data_loader.py index 8d137dd7..96f0135c 100644 --- a/FlagEmbedding/evaluation/beir/data_loader.py +++ b/FlagEmbedding/evaluation/beir/data_loader.py @@ -13,15 +13,42 @@ class BEIREvalDataLoader(AbsEvalDataLoader): + """ + Data loader class for BEIR. + """ def available_dataset_names(self) -> List[str]: + """ + Get the available dataset names. + + Returns: + List[str]: All the available dataset names. + """ return ['arguana', 'climate-fever', 'cqadupstack', 'dbpedia-entity', 'fever', 'fiqa', 'hotpotqa', 'msmarco', 'nfcorpus', 'nq', 'quora', 'scidocs', 'scifact', 'trec-covid', 'webis-touche2020'] def available_sub_dataset_names(self, dataset_name: Optional[str] = None) -> List[str]: + """ + Get the available sub-dataset names. + + Args: + dataset_name (Optional[str], optional): All the available sub-dataset names. Defaults to ``None``. + + Returns: + List[str]: All the available sub-dataset names. 
+ """ if dataset_name == 'cqadupstack': return ['android', 'english', 'gaming', 'gis', 'mathematica', 'physics', 'programmers', 'stats', 'tex', 'unix', 'webmasters', 'wordpress'] return None def available_splits(self, dataset_name: Optional[str] = None) -> List[str]: + """ + Get the avaialble splits. + + Args: + dataset_name (str): Dataset name. + + Returns: + List[str]: All the available splits for the dataset. + """ if dataset_name == 'msmarco': return ['dev'] return ['test'] @@ -32,6 +59,16 @@ def _load_remote_corpus( sub_dataset_name: Optional[str] = None, save_dir: Optional[str] = None ) -> datasets.DatasetDict: + """Load the corpus dataset from HF. + + Args: + dataset_name (str): Name of the dataset. + sub_dataset_name (Optional[str]): Name of the sub-dataset. Defaults to ``None``. + save_dir (Optional[str], optional): Directory to save the dataset. Defaults to ``None``. + + Returns: + datasets.DatasetDict: Loaded datasets instance of corpus. + """ if dataset_name != 'cqadupstack': corpus = datasets.load_dataset( 'BeIR/{d}'.format(d=dataset_name), @@ -94,6 +131,17 @@ def _load_remote_qrels( split: str = 'dev', save_dir: Optional[str] = None ) -> datasets.DatasetDict: + """Load the qrels from HF. + + Args: + dataset_name (str): Name of the dataset. + sub_dataset_name (Optional[str]): Name of the sub-dataset. Defaults to ``None``. + split (str, optional): Split of the dataset. Defaults to ``'dev'``. + save_dir (Optional[str], optional): Directory to save the dataset. Defaults to ``None``. + + Returns: + datasets.DatasetDict: Loaded datasets instance of qrel. + """ if dataset_name != 'cqadupstack': qrels = datasets.load_dataset( 'BeIR/{d}-qrels'.format(d=dataset_name), @@ -168,6 +216,17 @@ def _load_remote_queries( split: str = 'test', save_dir: Optional[str] = None ) -> datasets.DatasetDict: + """Load the queries from HF. + + Args: + dataset_name (str): Name of the dataset. + sub_dataset_name (Optional[str]): Name of the sub-dataset. Defaults to ``None``. + split (str, optional): Split of the dataset. Defaults to ``'dev'``. + save_dir (Optional[str], optional): Directory to save the dataset. Defaults to ``None``. + + Returns: + datasets.DatasetDict: Loaded datasets instance of queries. + """ qrels = self.load_qrels(dataset_name=dataset_name, sub_dataset_name=sub_dataset_name, split=split) if dataset_name != 'cqadupstack': @@ -230,6 +289,15 @@ def _load_remote_queries( return datasets.DatasetDict(queries_dict) def load_corpus(self, dataset_name: Optional[str] = None, sub_dataset_name: Optional[str] = None) -> datasets.DatasetDict: + """Load the corpus from the dataset. + + Args: + dataset_name (Optional[str], optional): Name of the dataset. Defaults to ``None``. + sub_dataset_name (Optional[str], optional): Name of the sub-dataset. Defaults to ``None``. + + Returns: + datasets.DatasetDict: A dict of corpus with id as key, title and text as value. + """ if self.dataset_dir is not None: if dataset_name is None: save_dir = self.dataset_dir @@ -240,6 +308,19 @@ def load_corpus(self, dataset_name: Optional[str] = None, sub_dataset_name: Opti return self._load_remote_corpus(dataset_name=dataset_name, sub_dataset_name=sub_dataset_name) def load_qrels(self, dataset_name: Optional[str] = None, sub_dataset_name: Optional[str] = None, split: str = 'test') -> datasets.DatasetDict: + """Load the qrels from the dataset. + + Args: + dataset_name (Optional[str], optional): Name of the dataset. Defaults to ``None``. + sub_dataset_name (Optional[str], optional): Name of the sub-dataset. 
Defaults to ``None``. + split (str, optional): The split to load relevance from. Defaults to ``'test'``. + + Raises: + ValueError + + Returns: + datasets.DatasetDict: A dict of relevance of query and document. + """ if self.dataset_dir is not None: if dataset_name is None: save_dir = self.dataset_dir @@ -256,6 +337,19 @@ def load_qrels(self, dataset_name: Optional[str] = None, sub_dataset_name: Optio return self._load_remote_qrels(dataset_name=dataset_name, sub_dataset_name=sub_dataset_name, split=split) def load_queries(self, dataset_name: Optional[str] = None, sub_dataset_name: Optional[str] = None, split: str = 'test') -> datasets.DatasetDict: + """Load the queries from the dataset. + + Args: + dataset_name (Optional[str], optional): Name of the dataset. Defaults to ``None``. + sub_dataset_name (Optional[str], optional): Name of the sub-dataset. Defaults to ``None``. + split (str, optional): The split to load queries from. Defaults to ``'test'``. + + Raises: + ValueError + + Returns: + datasets.DatasetDict: A dict of queries with id as key, query text as value. + """ if self.dataset_dir is not None: if dataset_name is None: save_dir = self.dataset_dir @@ -272,6 +366,16 @@ def load_queries(self, dataset_name: Optional[str] = None, sub_dataset_name: Opt return self._load_remote_queries(dataset_name=dataset_name, sub_dataset_name=sub_dataset_name, split=split) def _load_local_corpus(self, save_dir: str, dataset_name: Optional[str] = None, sub_dataset_name: Optional[str] = None) -> datasets.DatasetDict: + """Load corpus from local dataset. + + Args: + save_dir (str): Path to save the loaded corpus. + dataset_name (Optional[str], optional): Name of the dataset. Defaults to ``None``. + sub_dataset_name (Optional[str], optional): Name of the sub-dataset. Defaults to ``None``. + + Returns: + datasets.DatasetDict: A dict of corpus with id as key, title and text as value. + """ if sub_dataset_name is None: corpus_path = os.path.join(save_dir, 'corpus.jsonl') else: @@ -291,6 +395,20 @@ def _load_local_corpus(self, save_dir: str, dataset_name: Optional[str] = None, return datasets.DatasetDict(corpus) def _load_local_qrels(self, save_dir: str, dataset_name: Optional[str] = None, sub_dataset_name: Optional[str] = None, split: str = 'test') -> datasets.DatasetDict: + """Load relevance from local dataset. + + Args: + save_dir (str): Path to save the loaded relevance. + dataset_name (Optional[str], optional): Name of the dataset. Defaults to ``None``. + sub_dataset_name (Optional[str], optional): Name of the sub-dataset. Defaults to ``None``. + split (str, optional): Split to load from the local dataset. Defaults to ``'test'``. + + Raises: + ValueError + + Returns: + datasets.DatasetDict: A dict of relevance of query and document. + """ checked_split = self.check_splits(split) if len(checked_split) == 0: raise ValueError(f"Split {split} not found in the dataset.") @@ -318,6 +436,20 @@ def _load_local_qrels(self, save_dir: str, dataset_name: Optional[str] = None, s return datasets.DatasetDict(qrels) def _load_local_queries(self, save_dir: str, dataset_name: Optional[str] = None, sub_dataset_name: Optional[str] = None, split: str = 'test') -> datasets.DatasetDict: + """Load queries from local dataset. + + Args: + save_dir (str): Path to save the loaded queries. + dataset_name (Optional[str], optional): Name of the dataset. Defaults to ``None``. + sub_dataset_name (Optional[str], optional): Name of the sub-dataset. Defaults to ``None``. + split (str, optional): Split to load from the local dataset. 
Defaults to ``'test'``. + + Raises: + ValueError + + Returns: + datasets.DatasetDict: A dict of queries with id as key, query text as value. + """ checked_split = self.check_splits(split) if len(checked_split) == 0: raise ValueError(f"Split {split} not found in the dataset.") diff --git a/FlagEmbedding/evaluation/beir/evaluator.py b/FlagEmbedding/evaluation/beir/evaluator.py index ba19ec38..b960ba3e 100644 --- a/FlagEmbedding/evaluation/beir/evaluator.py +++ b/FlagEmbedding/evaluation/beir/evaluator.py @@ -10,6 +10,9 @@ class BEIREvaluator(AbsEvaluator): + """ + Evaluator class of BEIR + """ def check_data_info( self, data_info: Dict[str, str], @@ -19,6 +22,23 @@ def check_data_info( dataset_name: Optional[str] = None, sub_dataset_name: Optional[str] = None, ): + """Check the validity of data info. + + Args: + data_info (Dict[str, str]): The loaded data info to be check. + model_name (str): Name of model used. + reranker_name (str): Name of reranker used. + split (str): Split used in searching. + dataset_name (Optional[str], optional): Name of dataset used. Defaults to None. + sub_dataset_name (Optional[str], optional): Name of the sub-dataset. Defaults to ``None``. + + Raises: + ValueError: eval_name mismatch + ValueError: model_name or reranker_name mismatch + ValueError: split mismatch + ValueError: dataset_name mismatch + ValueError: sub_dataset_name mismatch + """ if data_info["eval_name"] != self.eval_name: raise ValueError( f'eval_name mismatch: {data_info["eval_name"]} vs {self.eval_name}' @@ -317,11 +337,21 @@ def __call__( self.output_eval_results_to_json(reranker_eval_results, eval_results_save_path) if reranker is not None: reranker.stop_multi_process_pool() + def evaluate_results( self, search_results_save_dir: str, k_values: List[int] = [1, 3, 5, 10, 100, 1000] ): + """Compute metrics according to the results in the directory. + + Args: + search_results_save_dir (str): Path to the search results. + k_values (List[int], optional): Cutoffs. Defaults to :data:`[1, 3, 5, 10, 100, 1000]`. + + Returns: + dict: Evaluation results. + """ eval_results_dict = {} cqadupstack_results = None cqadupstack_num = 0 @@ -386,6 +416,18 @@ def save_search_results( dataset_name: Optional[str] = None, sub_dataset_name: Optional[str] = None, ): + """Save the metadata and search results into a file. + + Args: + eval_name (str): The experiment name of current evaluation. + model_name (str): Name of model used. + reranker_name (str): Name of reranker used. + search_results (Dict[str, Dict[str, float]]): Dictionary of search results. + output_path (str): Output path to write the results. + split (str): Split used in searching. + dataset_name (Optional[str], optional): Name of dataset used. Defaults to ``None``. + sub_dataset_name (Optional[str], optional): Name of the sub-dataset. Defaults to ``None``. + """ data = { "eval_name": eval_name, "model_name": model_name, diff --git a/FlagEmbedding/evaluation/beir/runner.py b/FlagEmbedding/evaluation/beir/runner.py index d7e77c1a..d0d6e63f 100644 --- a/FlagEmbedding/evaluation/beir/runner.py +++ b/FlagEmbedding/evaluation/beir/runner.py @@ -9,7 +9,13 @@ class BEIREvalRunner(AbsEvalRunner): + """ + Runner class of BEIR evaluation. + """ def run(self): + """ + Run the whole evaluation. 
+ """ if self.eval_args.dataset_names is None: dataset_names = self.data_loader.available_dataset_names() else: @@ -54,6 +60,11 @@ def run(self): ) def load_data_loader(self) -> BEIREvalDataLoader: + """Load the data loader + + Returns: + BEIREvalDataLoader: BEIR data loader object. + """ data_loader = BEIREvalDataLoader( eval_name=self.eval_args.eval_name, dataset_dir=self.eval_args.dataset_dir, @@ -64,6 +75,11 @@ def load_data_loader(self) -> BEIREvalDataLoader: return data_loader def load_evaluator(self) -> BEIREvaluator: + """Load the evaluator for evaluation + + Returns: + BEIREvaluator: The BEIR evaluator to run the evaluation. + """ evaluator = BEIREvaluator( eval_name=self.eval_args.eval_name, data_loader=self.data_loader, diff --git a/FlagEmbedding/evaluation/mldr/data_loader.py b/FlagEmbedding/evaluation/mldr/data_loader.py index 0b3070d7..8c5357b3 100644 --- a/FlagEmbedding/evaluation/mldr/data_loader.py +++ b/FlagEmbedding/evaluation/mldr/data_loader.py @@ -11,10 +11,28 @@ class MLDREvalDataLoader(AbsEvalDataLoader): + """ + Data loader class for MLDR. + """ def available_dataset_names(self) -> List[str]: + """ + Get the available dataset names. + + Returns: + List[str]: All the available dataset names. + """ return ["ar", "de", "en", "es", "fr", "hi", "it", "ja", "ko", "pt", "ru", "th", "zh"] def available_splits(self, dataset_name: Optional[str] = None) -> List[str]: + """ + Get the avaialble splits. + + Args: + dataset_name (Optional[str], optional): Dataset name. Defaults to ``None``. + + Returns: + List[str]: All the available splits for the dataset. + """ return ["train", "dev", "test"] def _load_remote_corpus( @@ -22,6 +40,15 @@ def _load_remote_corpus( dataset_name: str, save_dir: Optional[str] = None ) -> datasets.DatasetDict: + """Load the corpus dataset from HF. + + Args: + dataset_name (str): Name of the dataset. + save_dir (Optional[str], optional): Directory to save the dataset. Defaults to ``None``. + + Returns: + datasets.DatasetDict: Loaded datasets instance of corpus. + """ corpus = datasets.load_dataset( "Shitao/MLDR", f"corpus-{dataset_name}", cache_dir=self.cache_dir, @@ -53,6 +80,16 @@ def _load_remote_qrels( split: str = "test", save_dir: Optional[str] = None ) -> datasets.DatasetDict: + """Load the qrels from HF. + + Args: + dataset_name (str): Name of the dataset. + split (str, optional): Split of the dataset. Defaults to ``'test'``. + save_dir (Optional[str], optional): Directory to save the dataset. Defaults to ``None``. + + Returns: + datasets.DatasetDict: Loaded datasets instance of qrel. + """ qrels_data = datasets.load_dataset( "Shitao/MLDR", dataset_name, cache_dir=self.cache_dir, @@ -108,6 +145,16 @@ def _load_remote_queries( split: str = "test", save_dir: Optional[str] = None ) -> datasets.DatasetDict: + """Load the queries from HF. + + Args: + dataset_name (str): Name of the dataset. + split (str, optional): Split of the dataset. Defaults to ``'test'``. + save_dir (Optional[str], optional): Directory to save the dataset. Defaults to ``None``. + + Returns: + datasets.DatasetDict: Loaded datasets instance of queries. + """ queries_data = datasets.load_dataset( "Shitao/MLDR", dataset_name, cache_dir=self.cache_dir, diff --git a/FlagEmbedding/evaluation/mldr/runner.py b/FlagEmbedding/evaluation/mldr/runner.py index d80d2676..f191d036 100644 --- a/FlagEmbedding/evaluation/mldr/runner.py +++ b/FlagEmbedding/evaluation/mldr/runner.py @@ -4,7 +4,15 @@ class MLDREvalRunner(AbsEvalRunner): + """ + Evaluation runner of MIRACL. 
+ """ def load_data_loader(self) -> MLDREvalDataLoader: + """Load the data loader instance by args. + + Returns: + MLDREvalDataLoader: The MLDR data loader instance. + """ data_loader = MLDREvalDataLoader( eval_name=self.eval_args.eval_name, dataset_dir=self.eval_args.dataset_dir, diff --git a/FlagEmbedding/evaluation/msmarco/data_loader.py b/FlagEmbedding/evaluation/msmarco/data_loader.py index 50cd2d8b..8f90191c 100644 --- a/FlagEmbedding/evaluation/msmarco/data_loader.py +++ b/FlagEmbedding/evaluation/msmarco/data_loader.py @@ -11,10 +11,28 @@ class MSMARCOEvalDataLoader(AbsEvalDataLoader): + """ + Data loader class for MSMARCO. + """ def available_dataset_names(self) -> List[str]: + """ + Get the available dataset names. + + Returns: + List[str]: All the available dataset names. + """ return ["passage", "document"] def available_splits(self, dataset_name: Optional[str] = None) -> List[str]: + """ + Get the avaialble splits. + + Args: + dataset_name (Optional[str], optional): Dataset name. Defaults to ``None``. + + Returns: + List[str]: All the available splits for the dataset. + """ return ["dev", "dl19", "dl20"] def _load_remote_corpus( @@ -22,6 +40,15 @@ def _load_remote_corpus( dataset_name: str, save_dir: Optional[str] = None ) -> datasets.DatasetDict: + """Load the corpus dataset from HF. + + Args: + dataset_name (str): Name of the dataset. + save_dir (Optional[str], optional): Directory to save the dataset. Defaults to ``None``. + + Returns: + datasets.DatasetDict: Loaded datasets instance of corpus. + """ if dataset_name == 'passage': corpus = datasets.load_dataset( 'Tevatron/msmarco-passage-corpus', @@ -80,6 +107,16 @@ def _load_remote_qrels( split: str = 'dev', save_dir: Optional[str] = None ) -> datasets.DatasetDict: + """Load the qrels from HF. + + Args: + dataset_name (str): Name of the dataset. + split (str, optional): Split of the dataset. Defaults to ``'dev'``. + save_dir (Optional[str], optional): Directory to save the dataset. Defaults to ``None``. + + Returns: + datasets.DatasetDict: Loaded datasets instance of qrel. + """ if dataset_name == 'passage': if split == 'dev': qrels = datasets.load_dataset( @@ -164,6 +201,16 @@ def _load_remote_queries( split: str = 'test', save_dir: Optional[str] = None ) -> datasets.DatasetDict: + """Load the queries from HF. + + Args: + dataset_name (str): Name of the dataset. + split (str, optional): Split of the dataset. Defaults to ``'test'``. + save_dir (Optional[str], optional): Directory to save the dataset. Defaults to ``None``. + + Returns: + datasets.DatasetDict: Loaded datasets instance of queries. + """ if split == 'dev': if dataset_name == 'passage': queries = datasets.load_dataset( diff --git a/FlagEmbedding/evaluation/msmarco/runner.py b/FlagEmbedding/evaluation/msmarco/runner.py index 98876456..094bf2e1 100644 --- a/FlagEmbedding/evaluation/msmarco/runner.py +++ b/FlagEmbedding/evaluation/msmarco/runner.py @@ -4,7 +4,15 @@ class MSMARCOEvalRunner(AbsEvalRunner): + """ + Evaluation runner of MSMARCO. + """ def load_data_loader(self) -> MSMARCOEvalDataLoader: + """Load the data loader instance by args. + + Returns: + MSMARCOEvalDataLoader: The MSMARCO data loader instance. 
+ """ data_loader = MSMARCOEvalDataLoader( eval_name=self.eval_args.eval_name, dataset_dir=self.eval_args.dataset_dir, diff --git a/FlagEmbedding/evaluation/mteb/arguments.py b/FlagEmbedding/evaluation/mteb/arguments.py index bdeb884e..ea370dc2 100644 --- a/FlagEmbedding/evaluation/mteb/arguments.py +++ b/FlagEmbedding/evaluation/mteb/arguments.py @@ -6,6 +6,9 @@ @dataclass class MTEBEvalArgs(AbsEvalArgs): + """ + Argument class for MTEB evaluation. + """ languages: List[str] = field( default=None, metadata={"help": "Languages to evaluate. Default: eng"} ) diff --git a/FlagEmbedding/evaluation/mteb/runner.py b/FlagEmbedding/evaluation/mteb/runner.py index cb1d23ae..4d63bc3f 100644 --- a/FlagEmbedding/evaluation/mteb/runner.py +++ b/FlagEmbedding/evaluation/mteb/runner.py @@ -20,6 +20,9 @@ def ensure_dir(file_path): os.makedirs(directory) class MTEBEvalRunner(AbsEvalRunner): + """ + Evaluation runner of MTEB. + """ def __init__( self, eval_args: MTEBEvalArgs, @@ -31,6 +34,11 @@ def __init__( self.retriever, self.reranker = self.load_retriever_and_reranker() def load_retriever_and_reranker(self) -> Tuple[MTEBEvalDenseRetriever, Union[MTEBEvalReranker, None]]: + """Load the retriever and reranker + + Returns: + Tuple[MTEBEvalDenseRetriever, Union[MTEBEvalReranker, None]]: The retriever and reranker instances. + """ embedder, reranker = self.get_models(self.model_args) retriever = MTEBEvalDenseRetriever( embedder, @@ -42,6 +50,15 @@ def load_retriever_and_reranker(self) -> Tuple[MTEBEvalDenseRetriever, Union[MTE return retriever, reranker def read_results(self, output_folder, tasks): + """Read the evaluation results from directory. + + Args: + output_folder (str): Path to the directory with results. + tasks (list): List of MTEB tasks. + + Returns: + dict: The results of all the tasks. + """ tasks_results = {} task_types = list(set([t.metadata.type for t in tasks])) for t_type in task_types: @@ -77,6 +94,12 @@ def read_results(self, output_folder, tasks): return tasks_results def output_json(self, tasks_results, save_file): + """Save the tasks results into a json file. + + Args: + tasks_results (dict): The task results. + save_file (str): Path to a file to save the results. + """ all_results = 0 all_results_num = 0 cqa_results = 0 @@ -110,6 +133,9 @@ def output_json(self, tasks_results, save_file): json.dump(new_results, f) def run(self): + """ + Run the evaluation. + """ task_types = self.eval_args.task_types tasks = self.eval_args.tasks languages = self.eval_args.languages diff --git a/FlagEmbedding/evaluation/mteb/searcher.py b/FlagEmbedding/evaluation/mteb/searcher.py index 04233188..d0f1776b 100644 --- a/FlagEmbedding/evaluation/mteb/searcher.py +++ b/FlagEmbedding/evaluation/mteb/searcher.py @@ -3,28 +3,67 @@ class MTEBEvalDenseRetriever(EvalDenseRetriever): + """ + Child class of :class:EvalRetriever for MTEB dense retrieval. + """ def __init__(self, embedder, **kwargs): super().__init__(embedder, **kwargs) def set_examples(self, examples_for_task: Optional[List[dict]] = None): + """Set examples for the model. + + Args: + examples_for_task (Optional[List[dict]], optional): Examples for the task. Defaults to None. + """ self.embedder.set_examples(examples_for_task) def set_instruction(self, instruction: Optional[str] = None): + """Set the instruction to use for the embedding model. + + Args: + instruction (Optional[str], optional): _description_. Defaults to None. 
+ """ self.embedder.query_instruction_for_retrieval = instruction def get_instruction(self): + """Get the instruction of embedding model. + + Returns: + str: Instruction + """ return self.embedder.query_instruction_for_retrieval def set_normalize_embeddings(self, normalize_embeddings: bool = True): + """Set whether normalize the output embeddings + + Args: + normalize_embeddings (bool, optional): Boolean to control whether or not normalize the embeddings. Defaults to ``True``. + """ self.embedder.normalize_embeddings = normalize_embeddings def encode_queries(self, queries: List[str], **kwargs): + """Encode input queries. + + Args: + queries (List[str]): Input queries. + + Returns: + Union[np.ndarray, torch.Tensor]: Query embeddings. + """ emb = self.embedder.encode_queries(queries) if isinstance(emb, dict): emb = emb["dense_vecs"] return emb def encode_corpus(self, corpus: List[Dict[str, str]], **kwargs): + """Encode input corpus. + + Args: + corpus (List[Dict[str, str]]): Input corpus. + + Returns: + Union[np.ndarray, torch.Tensor]: Corpus embeddings. + """ if isinstance(corpus[0], dict): input_texts = ['{} {}'.format(doc.get('title', ''), doc['text']).strip() for doc in corpus] else: @@ -35,6 +74,14 @@ def encode_corpus(self, corpus: List[Dict[str, str]], **kwargs): return emb def encode(self, corpus: List[Dict[str, str]], **kwargs): + """Encode the imput. + + Args: + corpus (List[Dict[str, str]]): Input corpus or sentences. + + Returns: + Union[np.ndarray, torch.Tensor]: Corpus embeddings. + """ if isinstance(corpus[0], dict): input_texts = ['{} {}'.format(doc.get('title', ''), doc['text']).strip() for doc in corpus] else: @@ -45,5 +92,8 @@ def encode(self, corpus: List[Dict[str, str]], **kwargs): return emb class MTEBEvalReranker(EvalReranker): + """ + Child class of :class:EvalReranker for reranker in MTEB. + """ def __init__(self, reranker, **kwargs): super().__init__(reranker, **kwargs) diff --git a/docs/source/API/evaluation/airbench.rst b/docs/source/API/evaluation/airbench.rst new file mode 100644 index 00000000..6080554f --- /dev/null +++ b/docs/source/API/evaluation/airbench.rst @@ -0,0 +1,42 @@ +AIR-Bench +========= + +`AIR-Bench `_ (Automated heterogeneous Information Retrieval Benchmark) is a dynamic (actively being updated) benchmark for information retrieval. +Now the benchmark contains two versions. Notice that the testing data is generated by LLMs with out human intervention. +This helps the evaluation of new domains easier and faster to be updated. It also makes it impossible for any models to have the test data covered in their training sets. + +You can evaluate model's performance on AIR-Bench by running our provided shell script: + +.. code:: bash + + chmod +x /examples/evaluation/air_bench/eval_air_bench.sh + ./examples/evaluation/air_bench/eval_air_bench.sh + +Or by running: + +.. code:: bash + + python -m FlagEmbedding.evaluation.air_bench \ + --benchmark_version AIR-Bench_24.05 \ + --task_types qa long-doc \ + --domains arxiv \ + --languages en \ + --splits dev test \ + --output_dir ./air_bench/search_results \ + --search_top_k 1000 \ + --rerank_top_k 100 \ + --cache_dir /root/.cache/huggingface/hub \ + --overwrite False \ + --embedder_name_or_path BAAI/bge-m3 \ + --reranker_name_or_path BAAI/bge-reranker-v2-m3 \ + --devices cuda:0 cuda:1 \ + --model_cache_dir /root/.cache/huggingface/hub \ + --reranker_max_length 1024 + +change the embedder, reranker, devices and cache directory to your preference. + +.. 
toctree:: + :hidden: + + airbench/arguments + airbench/runner \ No newline at end of file diff --git a/docs/source/API/evaluation/airbench/arguments.rst b/docs/source/API/evaluation/airbench/arguments.rst new file mode 100644 index 00000000..48c2d61c --- /dev/null +++ b/docs/source/API/evaluation/airbench/arguments.rst @@ -0,0 +1,4 @@ +arguments +========= + +.. autoclass:: FlagEmbedding.evaluation.air_bench.AIRBenchEvalModelArgs \ No newline at end of file diff --git a/docs/source/API/evaluation/airbench/runner.rst b/docs/source/API/evaluation/airbench/runner.rst new file mode 100644 index 00000000..0f3b36d8 --- /dev/null +++ b/docs/source/API/evaluation/airbench/runner.rst @@ -0,0 +1,4 @@ +runner +====== + +.. autoclass:: FlagEmbedding.evaluation.air_bench.AIRBenchEvalRunner \ No newline at end of file diff --git a/docs/source/API/evaluation/beir.rst b/docs/source/API/evaluation/beir.rst new file mode 100644 index 00000000..c5aa5a51 --- /dev/null +++ b/docs/source/API/evaluation/beir.rst @@ -0,0 +1,48 @@ +BEIR +==== + +`BEIR `_ (Benchmarking-IR) is a heterogeneous evaluation benchmark for information retrieval. +It is designed for evaluating the performance of NLP-based retrieval models and widely used by research of modern embedding models. + +You can evaluate model's performance on the BEIR benchmark by running our provided shell script: + +.. code:: bash + + chmod +x /examples/evaluation/beir/eval_beir.sh + ./examples/evaluation/beir/eval_beir.sh + +Or by running: + +.. code:: bash + + python -m FlagEmbedding.evaluation.beir \ + --eval_name beir \ + --dataset_dir ./beir/data \ + --dataset_names fiqa arguana cqadupstack \ + --splits test dev \ + --corpus_embd_save_dir ./beir/corpus_embd \ + --output_dir ./beir/search_results \ + --search_top_k 1000 \ + --rerank_top_k 100 \ + --cache_path /root/.cache/huggingface/hub \ + --overwrite False \ + --k_values 10 100 \ + --eval_output_method markdown \ + --eval_output_path ./beir/beir_eval_results.md \ + --eval_metrics ndcg_at_10 recall_at_100 \ + --ignore_identical_ids True \ + --embedder_name_or_path BAAI/bge-large-en-v1.5 \ + --reranker_name_or_path BAAI/bge-reranker-v2-m3 \ + --devices cuda:0 cuda:1 \ + --cache_dir \ + --reranker_max_length 1024 \ + +change the embedder, devices and cache directory to your preference. + +.. toctree:: + :hidden: + + beir/arguments + beir/data_loader + beir/evaluator + beir/runner \ No newline at end of file diff --git a/docs/source/API/evaluation/beir/arguments.rst b/docs/source/API/evaluation/beir/arguments.rst new file mode 100644 index 00000000..71593837 --- /dev/null +++ b/docs/source/API/evaluation/beir/arguments.rst @@ -0,0 +1,4 @@ +arguments +========= + +.. autoclass:: FlagEmbedding.evaluation.bier.BEIREvalArgs \ No newline at end of file diff --git a/docs/source/API/evaluation/beir/data_loader.rst b/docs/source/API/evaluation/beir/data_loader.rst new file mode 100644 index 00000000..de224fa1 --- /dev/null +++ b/docs/source/API/evaluation/beir/data_loader.rst @@ -0,0 +1,4 @@ +data loader +=========== + +.. autoclass:: FlagEmbedding.abc.evaluation.BEIREvalDataLoader \ No newline at end of file diff --git a/docs/source/API/evaluation/beir/evaluator.rst b/docs/source/API/evaluation/beir/evaluator.rst new file mode 100644 index 00000000..cc752f6a --- /dev/null +++ b/docs/source/API/evaluation/beir/evaluator.rst @@ -0,0 +1,4 @@ +evaluator +========= + +.. 
autoclass:: FlagEmbedding.evaluation.beir.BEIREvaluator \ No newline at end of file diff --git a/docs/source/API/evaluation/beir/runner.rst b/docs/source/API/evaluation/beir/runner.rst new file mode 100644 index 00000000..e2866159 --- /dev/null +++ b/docs/source/API/evaluation/beir/runner.rst @@ -0,0 +1,4 @@ +runner +====== + +.. autoclass:: FlagEmbedding.evaluation.beir.BEIREvalRunner \ No newline at end of file diff --git a/docs/source/API/evaluation/mkqa.rst b/docs/source/API/evaluation/mkqa.rst index 0f242362..0c8bd222 100644 --- a/docs/source/API/evaluation/mkqa.rst +++ b/docs/source/API/evaluation/mkqa.rst @@ -2,6 +2,7 @@ MKQA ==== `MKQA `_ is an open-domain question answering evaluation set comprising 10k question-answer pairs aligned across 26 typologically diverse languages. + Each example in the dataset has the following structure: .. code:: bash diff --git a/docs/source/API/evaluation/mldr.rst b/docs/source/API/evaluation/mldr.rst new file mode 100644 index 00000000..7865536c --- /dev/null +++ b/docs/source/API/evaluation/mldr.rst @@ -0,0 +1,95 @@ +MLDR +==== + +`MLDR `_ is a Multilingual Long-Document Retrieval dataset built on Wikipeida, Wudao and mC4, covering 13 typologically diverse languages. +Specifically, we sample lengthy articles from Wikipedia, Wudao and mC4 datasets and randomly choose paragraphs from them. +Then we use GPT-3.5 to generate questions based on these paragraphs. +The generated question and the sampled article constitute a new text pair to the dataset. + +An example of ``train`` set looks like: + +.. code:: bash + + { + 'query_id': 'q-zh-<...>', + 'query': '...', + 'positive_passages': [ + { + 'docid': 'doc-zh-<...>', + 'text': '...' + } + ], + 'negative_passages': [ + { + 'docid': 'doc-zh-<...>', + 'text': '...' + }, + ... + ] + } + +An example of ``dev`` and ``test`` set looks like: + +.. code:: bash + + { + 'query_id': 'q-zh-<...>', + 'query': '...', + 'positive_passages': [ + { + 'docid': 'doc-zh-<...>', + 'text': '...' + } + ], + 'negative_passages': [] + } + +An example of ``corpus`` looks like: + +.. code:: bash + + { + 'docid': 'doc-zh-<...>', + 'text': '...' + } + +You can evaluate model's performance on MLDR simply by running our provided shell script: + +.. code:: bash + + chmod +x /examples/evaluation/mldr/eval_mldr.sh + ./examples/evaluation/mldr/eval_mldr.sh + +Or by running: + +.. code:: bash + + python -m FlagEmbedding.evaluation.mldr \ + --eval_name mldr \ + --dataset_dir ./mldr/data \ + --dataset_names hi \ + --splits test \ + --corpus_embd_save_dir ./mldr/corpus_embd \ + --output_dir ./mldr/search_results \ + --search_top_k 1000 \ + --rerank_top_k 100 \ + --cache_path /root/.cache/huggingface/hub \ + --overwrite False \ + --k_values 10 100 \ + --eval_output_method markdown \ + --eval_output_path ./mldr/mldr_eval_results.md \ + --eval_metrics ndcg_at_10 \ + --embedder_name_or_path BAAI/bge-m3 \ + --reranker_name_or_path BAAI/bge-reranker-v2-m3 \ + --devices cuda:0 cuda:1 \ + --cache_dir /root/.cache/huggingface/hub \ + --embedder_passage_max_length 8192 \ + --reranker_max_length 8192 + +change the args of embedder, reranker, devices and cache directory to your preference. + +.. toctree:: + :hidden: + + mldr/data_loader + mldr/runner \ No newline at end of file diff --git a/docs/source/API/evaluation/mldr/data_loader.rst b/docs/source/API/evaluation/mldr/data_loader.rst new file mode 100644 index 00000000..f0fe313e --- /dev/null +++ b/docs/source/API/evaluation/mldr/data_loader.rst @@ -0,0 +1,13 @@ +data_loader +=========== + +.. 
autoclass:: FlagEmbedding.evaluation.mldr.MLDREvalDataLoader + +Methods +------- + +.. automethod:: FlagEmbedding.evaluation.mldr.MLDREvalDataLoader.available_dataset_names +.. automethod:: FlagEmbedding.evaluation.mldr.MLDREvalDataLoader.available_splits +.. automethod:: FlagEmbedding.evaluation.mldr.MLDREvalDataLoader._load_remote_corpus +.. automethod:: FlagEmbedding.evaluation.mldr.MLDREvalDataLoader._load_remote_qrels +.. automethod:: FlagEmbedding.evaluation.mldr.MLDREvalDataLoader._load_remote_queries \ No newline at end of file diff --git a/docs/source/API/evaluation/mldr/runner.rst b/docs/source/API/evaluation/mldr/runner.rst new file mode 100644 index 00000000..d6c1ee6d --- /dev/null +++ b/docs/source/API/evaluation/mldr/runner.rst @@ -0,0 +1,5 @@ +runner +====== + +.. autoclass:: FlagEmbedding.evaluation.mldr.MLDREvalRunner + :members: \ No newline at end of file diff --git a/docs/source/API/evaluation/msmarco.rst b/docs/source/API/evaluation/msmarco.rst new file mode 100644 index 00000000..db8ff200 --- /dev/null +++ b/docs/source/API/evaluation/msmarco.rst @@ -0,0 +1,46 @@ +MSMARCO +======= + +`MS Marco `_ (Microsoft MAchine Reading Comprehension) is a large scale real-world reading comprehension dataset. +It is widely used in information retrieval, question answering, and natural language processing research. + + +You can evaluate model's performance on MS MARCO simply by running our provided shell script: + +.. code:: bash + + chmod +x /examples/evaluation/msmarco/eval_msmarco.sh + ./examples/evaluation/msmarco/eval_msmarco.sh + +Or by running: + +.. code:: bash + + python -m FlagEmbedding.evaluation.msmarco \ + --eval_name msmarco \ + --dataset_dir ./msmarco/data \ + --dataset_names passage \ + --splits dev \ + --corpus_embd_save_dir ./msmarco/corpus_embd \ + --output_dir ./msmarco/search_results \ + --search_top_k 1000 \ + --rerank_top_k 100 \ + --cache_path /root/.cache/huggingface/hub \ + --overwrite True \ + --k_values 10 100 \ + --eval_output_method markdown \ + --eval_output_path ./msmarco/msmarco_eval_results.md \ + --eval_metrics ndcg_at_10 recall_at_100 \ + --embedder_name_or_path BAAI/bge-large-en-v1.5 \ + --reranker_name_or_path BAAI/bge-reranker-v2-m3 \ + --devices cuda:0 cuda:1 cuda:2 cuda:3 cuda:4 cuda:5 cuda:6 cuda:7 \ + --cache_dir /root/.cache/huggingface/hub \ + --reranker_max_length 1024 + +change the embedder, reranker, devices and cache directory to your preference. + +.. toctree:: + :hidden: + + msmarco/data_loader + msmarco/runner \ No newline at end of file diff --git a/docs/source/API/evaluation/msmarco/data_loader.rst b/docs/source/API/evaluation/msmarco/data_loader.rst new file mode 100644 index 00000000..f886eda5 --- /dev/null +++ b/docs/source/API/evaluation/msmarco/data_loader.rst @@ -0,0 +1,13 @@ +data_loader +=========== + +.. autoclass:: FlagEmbedding.evaluation.msmarco.MSMARCOEvalDataLoader + +Methods +------- + +.. automethod:: FlagEmbedding.evaluation.msmarco.MSMARCOEvalDataLoader.available_dataset_names +.. automethod:: FlagEmbedding.evaluation.msmarco.MSMARCOEvalDataLoader.available_splits +.. automethod:: FlagEmbedding.evaluation.msmarco.MSMARCOEvalDataLoader._load_remote_corpus +.. automethod:: FlagEmbedding.evaluation.msmarco.MSMARCOEvalDataLoader._load_remote_qrels +.. 
automethod:: FlagEmbedding.evaluation.msmarco.MSMARCOEvalDataLoader._load_remote_queries \ No newline at end of file diff --git a/docs/source/API/evaluation/msmarco/runner.rst b/docs/source/API/evaluation/msmarco/runner.rst new file mode 100644 index 00000000..ae56a455 --- /dev/null +++ b/docs/source/API/evaluation/msmarco/runner.rst @@ -0,0 +1,5 @@ +runner +====== + +.. autoclass:: FlagEmbedding.evaluation.msmarco.MSMARCOEvalRunner + :members: \ No newline at end of file diff --git a/docs/source/API/evaluation/mteb.rst b/docs/source/API/evaluation/mteb.rst new file mode 100644 index 00000000..044b5e68 --- /dev/null +++ b/docs/source/API/evaluation/mteb.rst @@ -0,0 +1,37 @@ +MTEB +==== + +`MTEB `_ (The Massive Text Embedding Benchmark) is a large-scale evaluation framework designed to assess the performance of text embedding models across a wide variety of NLP tasks. +Introduced to standardize and improve the evaluation of text embeddings, MTEB is crucial for assessing how well these models generalize across various real-world applications. +It contains a wide range of datasets covering eight main NLP task types and many languages, and provides an easy evaluation pipeline. +It also hosts the well-known MTEB `leaderboard `_, which ranks the latest state-of-the-art embedding models. + +You can evaluate a model's performance on the whole MTEB benchmark by running our provided shell script: + +.. code:: bash + + chmod +x /examples/evaluation/mteb/eval_mteb.sh + ./examples/evaluation/mteb/eval_mteb.sh + +Or by running: + +.. code:: bash + + python -m FlagEmbedding.evaluation.mteb \ + --eval_name mteb \ + --output_dir ./mteb/search_results \ + --languages eng \ + --tasks NFCorpus BiorxivClusteringS2S SciDocsRR \ + --eval_output_path ./mteb/mteb_eval_results.json \ + --embedder_name_or_path BAAI/bge-large-en-v1.5 \ + --devices cuda:7 \ + --cache_dir /root/.cache/huggingface/hub + +Change the embedder, devices and cache directory to your preference. + +.. toctree:: + :hidden: + + mteb/arguments + mteb/searcher + mteb/runner \ No newline at end of file diff --git a/docs/source/API/evaluation/mteb/arguments.rst b/docs/source/API/evaluation/mteb/arguments.rst new file mode 100644 index 00000000..b07f3a97 --- /dev/null +++ b/docs/source/API/evaluation/mteb/arguments.rst @@ -0,0 +1,4 @@ +arguments +========= + +.. autoclass:: FlagEmbedding.evaluation.mteb.MTEBEvalArgs \ No newline at end of file diff --git a/docs/source/API/evaluation/mteb/runner.rst b/docs/source/API/evaluation/mteb/runner.rst new file mode 100644 index 00000000..495a929c --- /dev/null +++ b/docs/source/API/evaluation/mteb/runner.rst @@ -0,0 +1,4 @@ +runner +====== + +.. autoclass:: FlagEmbedding.evaluation.mteb.MTEBEvalRunner \ No newline at end of file diff --git a/docs/source/API/evaluation/mteb/searcher.rst b/docs/source/API/evaluation/mteb/searcher.rst new file mode 100644 index 00000000..f51873a3 --- /dev/null +++ b/docs/source/API/evaluation/mteb/searcher.rst @@ -0,0 +1,6 @@ +searcher +======== + +.. autoclass:: FlagEmbedding.evaluation.mteb.MTEBEvalDenseRetriever + +.. autoclass:: FlagEmbedding.evaluation.mteb.MTEBEvalReranker \ No newline at end of file
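The runner classes documented in the pages above can also be driven directly from Python rather than through the ``python -m`` commands shown in the .rst examples. The following is a minimal sketch for AIR-Bench only; it assumes that ``AIRBenchEvalArgs`` is importable from ``FlagEmbedding.evaluation.air_bench`` alongside the two classes the new .rst pages document, and that the dataclass field names mirror the CLI flags used above (both are assumptions, not verified against the package here).

.. code:: python

    # Sketch: programmatic AIR-Bench evaluation. Field names below are
    # assumed to mirror the CLI flags (--benchmark_version, --task_types, ...).
    from FlagEmbedding.evaluation.air_bench import (
        AIRBenchEvalArgs,       # assumed export; documented in runner.py docstrings
        AIRBenchEvalModelArgs,
        AIRBenchEvalRunner,
    )

    eval_args = AIRBenchEvalArgs(
        benchmark_version="AIR-Bench_24.05",
        task_types=["qa", "long-doc"],
        domains=["arxiv"],
        languages=["en"],
        splits=["dev", "test"],
        output_dir="./air_bench/search_results",
        search_top_k=1000,
        rerank_top_k=100,
    )
    model_args = AIRBenchEvalModelArgs(
        embedder_name_or_path="BAAI/bge-m3",
        reranker_name_or_path="BAAI/bge-reranker-v2-m3",
        devices=["cuda:0", "cuda:1"],
    )

    # The runner builds the retriever (and optional reranker) from model_args
    # and run() executes the whole evaluation, as described in its docstring.
    runner = AIRBenchEvalRunner(eval_args=eval_args, model_args=model_args)
    runner.run()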