Commit d1c3b3f: evaluation docs
ZiyiXia committed Nov 14, 2024
1 parent 7caa598
Showing 32 changed files with 744 additions and 0 deletions.
3 changes: 3 additions & 0 deletions FlagEmbedding/evaluation/air_bench/arguments.py
@@ -4,6 +4,9 @@

@dataclass
class AIRBenchEvalModelArgs:
"""
Evaluation model arguments for AIR-Bench.
"""
embedder_name_or_path: str = field(
metadata={"help": "The embedder name or path.", "required": True}
)
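A minimal sketch of constructing this dataclass directly; the model path below is only a placeholder, and the remaining fields of AIRBenchEvalModelArgs (not shown in this diff) are assumed to have usable defaults.

from FlagEmbedding.evaluation.air_bench.arguments import AIRBenchEvalModelArgs

# Placeholder embedder; any embedder name or local path could be used here.
model_args = AIRBenchEvalModelArgs(embedder_name_or_path="BAAI/bge-m3")
print(model_args.embedder_name_or_path)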
16 changes: 16 additions & 0 deletions FlagEmbedding/evaluation/air_bench/runner.py
@@ -10,6 +10,13 @@


class AIRBenchEvalRunner:
"""
Evaluation runner for AIR-Bench.

Args:
    eval_args (AIRBenchEvalArgs): :class:`AIRBenchEvalArgs` object with the evaluation arguments.
    model_args (AIRBenchEvalModelArgs): :class:`AIRBenchEvalModelArgs` object with the model arguments.
"""
def __init__(
self,
eval_args: AIRBenchEvalArgs,
@@ -22,6 +29,12 @@ def __init__(
self.retriever, self.reranker = self.load_retriever_and_reranker()

def load_retriever_and_reranker(self) -> Tuple[EvalDenseRetriever, Union[EvalReranker, None]]:
"""Load retriever and reranker for evaluation
Returns:
Tuple[EvalDenseRetriever, Union[EvalReranker, None]]: A :class:EvalDenseRetriever object for retrieval, and a
:class:EvalReranker object if reranker provided.
"""
embedder, reranker = AbsEvalRunner.get_models(self.model_args)
retriever = EvalDenseRetriever(
embedder,
@@ -33,6 +46,9 @@ def __init__(
return retriever, reranker

def run(self):
"""
Run the whole evaluation.
"""
evaluation = AIRBench(
benchmark_version=self.eval_args.benchmark_version,
task_types=self.eval_args.task_types,
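The docstrings above describe a runner that is built from the two argument objects and then driven by run(). A minimal usage sketch, assuming AIRBenchEvalArgs is importable from the same arguments module and that its defaults are usable as-is:

from FlagEmbedding.evaluation.air_bench.arguments import AIRBenchEvalArgs, AIRBenchEvalModelArgs
from FlagEmbedding.evaluation.air_bench.runner import AIRBenchEvalRunner

# Placeholder values; only embedder_name_or_path is shown as required in this diff.
eval_args = AIRBenchEvalArgs()
model_args = AIRBenchEvalModelArgs(embedder_name_or_path="BAAI/bge-m3")

runner = AIRBenchEvalRunner(eval_args=eval_args, model_args=model_args)
runner.run()  # runs the whole AIR-Bench evaluation (retriever and optional reranker are loaded in __init__)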
3 changes: 3 additions & 0 deletions FlagEmbedding/evaluation/beir/arguments.py
@@ -5,6 +5,9 @@

@dataclass
class BEIREvalArgs(AbsEvalArgs):
"""
Argument class for BEIR evaluation.
"""
use_special_instructions: bool = field(
default=False, metadata={"help": "Whether to use specific instructions in `prompts.py` for evaluation. Default: False"}
)
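A short sketch of toggling the new flag; the remaining fields inherited from AbsEvalArgs are not shown in this diff and are assumed to have defaults.

from FlagEmbedding.evaluation.beir.arguments import BEIREvalArgs

# Enable the task-specific instructions defined in prompts.py; all other fields keep their (assumed) defaults.
eval_args = BEIREvalArgs(use_special_instructions=True)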
132 changes: 132 additions & 0 deletions FlagEmbedding/evaluation/beir/data_loader.py
@@ -13,15 +13,42 @@


class BEIREvalDataLoader(AbsEvalDataLoader):
"""
Data loader class for BEIR.
"""
def available_dataset_names(self) -> List[str]:
"""
Get the available dataset names.

Returns:
    List[str]: All the available dataset names.
"""
return ['arguana', 'climate-fever', 'cqadupstack', 'dbpedia-entity', 'fever', 'fiqa', 'hotpotqa', 'msmarco', 'nfcorpus', 'nq', 'quora', 'scidocs', 'scifact', 'trec-covid', 'webis-touche2020']

def available_sub_dataset_names(self, dataset_name: Optional[str] = None) -> List[str]:
"""
Get the available sub-dataset names.

Args:
    dataset_name (Optional[str], optional): Name of the dataset. Defaults to ``None``.

Returns:
    List[str]: All the available sub-dataset names of the dataset, or ``None`` if it has no sub-datasets.
"""
if dataset_name == 'cqadupstack':
return ['android', 'english', 'gaming', 'gis', 'mathematica', 'physics', 'programmers', 'stats', 'tex', 'unix', 'webmasters', 'wordpress']
return None

def available_splits(self, dataset_name: Optional[str] = None) -> List[str]:
"""
Get the available splits.

Args:
    dataset_name (Optional[str], optional): Name of the dataset. Defaults to ``None``.

Returns:
    List[str]: All the available splits for the dataset.
"""
if dataset_name == 'msmarco':
return ['dev']
return ['test']
@@ -32,6 +59,16 @@ def _load_remote_corpus(
sub_dataset_name: Optional[str] = None,
save_dir: Optional[str] = None
) -> datasets.DatasetDict:
"""Load the corpus dataset from HF.
Args:
dataset_name (str): Name of the dataset.
sub_dataset_name (Optional[str]): Name of the sub-dataset. Defaults to ``None``.
save_dir (Optional[str], optional): Directory to save the dataset. Defaults to ``None``.
Returns:
datasets.DatasetDict: Loaded datasets instance of corpus.
"""
if dataset_name != 'cqadupstack':
corpus = datasets.load_dataset(
'BeIR/{d}'.format(d=dataset_name),
@@ -94,6 +131,17 @@ def _load_remote_qrels(
split: str = 'dev',
save_dir: Optional[str] = None
) -> datasets.DatasetDict:
"""Load the qrels from HF.
Args:
dataset_name (str): Name of the dataset.
sub_dataset_name (Optional[str]): Name of the sub-dataset. Defaults to ``None``.
split (str, optional): Split of the dataset. Defaults to ``'dev'``.
save_dir (Optional[str], optional): Directory to save the dataset. Defaults to ``None``.
Returns:
datasets.DatasetDict: Loaded datasets instance of qrel.
"""
if dataset_name != 'cqadupstack':
qrels = datasets.load_dataset(
'BeIR/{d}-qrels'.format(d=dataset_name),
@@ -168,6 +216,17 @@ def _load_remote_queries(
split: str = 'test',
save_dir: Optional[str] = None
) -> datasets.DatasetDict:
"""Load the queries from HF.
Args:
dataset_name (str): Name of the dataset.
sub_dataset_name (Optional[str]): Name of the sub-dataset. Defaults to ``None``.
split (str, optional): Split of the dataset. Defaults to ``'dev'``.
save_dir (Optional[str], optional): Directory to save the dataset. Defaults to ``None``.
Returns:
datasets.DatasetDict: Loaded datasets instance of queries.
"""
qrels = self.load_qrels(dataset_name=dataset_name, sub_dataset_name=sub_dataset_name, split=split)

if dataset_name != 'cqadupstack':
@@ -230,6 +289,15 @@ def _load_remote_queries(
return datasets.DatasetDict(queries_dict)

def load_corpus(self, dataset_name: Optional[str] = None, sub_dataset_name: Optional[str] = None) -> datasets.DatasetDict:
"""Load the corpus from the dataset.
Args:
dataset_name (Optional[str], optional): Name of the dataset. Defaults to ``None``.
sub_dataset_name (Optional[str], optional): Name of the sub-dataset. Defaults to ``None``.
Returns:
datasets.DatasetDict: A dict of corpus with id as key, title and text as value.
"""
if self.dataset_dir is not None:
if dataset_name is None:
save_dir = self.dataset_dir
@@ -240,6 +308,19 @@ def load_corpus(self, dataset_name: Optional[str] = None, sub_dataset_name: Opti
return self._load_remote_corpus(dataset_name=dataset_name, sub_dataset_name=sub_dataset_name)

def load_qrels(self, dataset_name: Optional[str] = None, sub_dataset_name: Optional[str] = None, split: str = 'test') -> datasets.DatasetDict:
"""Load the qrels from the dataset.
Args:
dataset_name (Optional[str], optional): Name of the dataset. Defaults to ``None``.
sub_dataset_name (Optional[str], optional): Name of the sub-dataset. Defaults to ``None``.
split (str, optional): The split to load relevance from. Defaults to ``'test'``.
Raises:
ValueError
Returns:
datasets.DatasetDict: A dict of relevance of query and document.
"""
if self.dataset_dir is not None:
if dataset_name is None:
save_dir = self.dataset_dir
@@ -256,6 +337,19 @@ def load_qrels(self, dataset_name: Optional[str] = None, sub_dataset_name: Optio
return self._load_remote_qrels(dataset_name=dataset_name, sub_dataset_name=sub_dataset_name, split=split)

def load_queries(self, dataset_name: Optional[str] = None, sub_dataset_name: Optional[str] = None, split: str = 'test') -> datasets.DatasetDict:
"""Load the queries from the dataset.
Args:
dataset_name (Optional[str], optional): Name of the dataset. Defaults to ``None``.
sub_dataset_name (Optional[str], optional): Name of the sub-dataset. Defaults to ``None``.
split (str, optional): The split to load queries from. Defaults to ``'test'``.
Raises:
ValueError
Returns:
datasets.DatasetDict: A dict of queries with id as key, query text as value.
"""
if self.dataset_dir is not None:
if dataset_name is None:
save_dir = self.dataset_dir
@@ -272,6 +366,16 @@ def load_queries(self, dataset_name: Optional[str] = None, sub_dataset_name: Opt
return self._load_remote_queries(dataset_name=dataset_name, sub_dataset_name=sub_dataset_name, split=split)

def _load_local_corpus(self, save_dir: str, dataset_name: Optional[str] = None, sub_dataset_name: Optional[str] = None) -> datasets.DatasetDict:
"""Load corpus from local dataset.
Args:
save_dir (str): Path to save the loaded corpus.
dataset_name (Optional[str], optional): Name of the dataset. Defaults to ``None``.
sub_dataset_name (Optional[str], optional): Name of the sub-dataset. Defaults to ``None``.
Returns:
datasets.DatasetDict: A dict of corpus with id as key, title and text as value.
"""
if sub_dataset_name is None:
corpus_path = os.path.join(save_dir, 'corpus.jsonl')
else:
@@ -291,6 +395,20 @@ def _load_local_corpus(self, save_dir: str, dataset_name: Optional[str] = None,
return datasets.DatasetDict(corpus)

def _load_local_qrels(self, save_dir: str, dataset_name: Optional[str] = None, sub_dataset_name: Optional[str] = None, split: str = 'test') -> datasets.DatasetDict:
"""Load relevance from local dataset.
Args:
save_dir (str): Path to save the loaded relevance.
dataset_name (Optional[str], optional): Name of the dataset. Defaults to ``None``.
sub_dataset_name (Optional[str], optional): Name of the sub-dataset. Defaults to ``None``.
split (str, optional): Split to load from the local dataset. Defaults to ``'test'``.
Raises:
ValueError
Returns:
datasets.DatasetDict: A dict of relevance of query and document.
"""
checked_split = self.check_splits(split)
if len(checked_split) == 0:
raise ValueError(f"Split {split} not found in the dataset.")
@@ -318,6 +436,20 @@ def _load_local_qrels(self, save_dir: str, dataset_name: Optional[str] = None, s
return datasets.DatasetDict(qrels)

def _load_local_queries(self, save_dir: str, dataset_name: Optional[str] = None, sub_dataset_name: Optional[str] = None, split: str = 'test') -> datasets.DatasetDict:
"""Load queries from local dataset.
Args:
save_dir (str): Path to save the loaded queries.
dataset_name (Optional[str], optional): Name of the dataset. Defaults to ``None``.
sub_dataset_name (Optional[str], optional): Name of the sub-dataset. Defaults to ``None``.
split (str, optional): Split to load from the local dataset. Defaults to ``'test'``.
Raises:
ValueError
Returns:
datasets.DatasetDict: A dict of queries with id as key, query text as value.
"""
checked_split = self.check_splits(split)
if len(checked_split) == 0:
raise ValueError(f"Split {split} not found in the dataset.")
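A usage sketch of the loader's public API as documented above. The constructor call mirrors the arguments the runner passes (eval_name, dataset_dir); any other parameters are assumed to default sensibly, and the directory and dataset names are placeholders.

from FlagEmbedding.evaluation.beir.data_loader import BEIREvalDataLoader

loader = BEIREvalDataLoader(eval_name="beir", dataset_dir="./beir/data")

print(loader.available_dataset_names())                   # ['arguana', 'climate-fever', ..., 'webis-touche2020']
print(loader.available_sub_dataset_names("cqadupstack"))  # ['android', 'english', 'gaming', ...]
print(loader.available_splits("msmarco"))                 # ['dev']; every other dataset uses ['test']

# Load one dataset; split defaults to 'test'.
corpus = loader.load_corpus(dataset_name="fiqa")
queries = loader.load_queries(dataset_name="fiqa", split="test")
qrels = loader.load_qrels(dataset_name="fiqa", split="test")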
42 changes: 42 additions & 0 deletions FlagEmbedding/evaluation/beir/evaluator.py
@@ -10,6 +10,9 @@


class BEIREvaluator(AbsEvaluator):
"""
Evaluator class of BEIR.
"""
def check_data_info(
self,
data_info: Dict[str, str],
@@ -19,6 +22,23 @@ def check_data_info(
dataset_name: Optional[str] = None,
sub_dataset_name: Optional[str] = None,
):
"""Check the validity of data info.
Args:
data_info (Dict[str, str]): The loaded data info to be check.
model_name (str): Name of model used.
reranker_name (str): Name of reranker used.
split (str): Split used in searching.
dataset_name (Optional[str], optional): Name of dataset used. Defaults to None.
sub_dataset_name (Optional[str], optional): Name of the sub-dataset. Defaults to ``None``.
Raises:
ValueError: eval_name mismatch
ValueError: model_name or reranker_name mismatch
ValueError: split mismatch
ValueError: dataset_name mismatch
ValueError: sub_dataset_name mismatch
"""
if data_info["eval_name"] != self.eval_name:
raise ValueError(
f'eval_name mismatch: {data_info["eval_name"]} vs {self.eval_name}'
@@ -317,11 +337,21 @@ def __call__(
self.output_eval_results_to_json(reranker_eval_results, eval_results_save_path)
if reranker is not None:
reranker.stop_multi_process_pool()

def evaluate_results(
self,
search_results_save_dir: str,
k_values: List[int] = [1, 3, 5, 10, 100, 1000]
):
"""Compute metrics according to the results in the directory.
Args:
search_results_save_dir (str): Path to the search results.
k_values (List[int], optional): Cutoffs. Defaults to :data:`[1, 3, 5, 10, 100, 1000]`.
Returns:
dict: Evaluation results.
"""
eval_results_dict = {}
cqadupstack_results = None
cqadupstack_num = 0
@@ -386,6 +416,18 @@ def save_search_results(
dataset_name: Optional[str] = None,
sub_dataset_name: Optional[str] = None,
):
"""Save the metadata and search results into a file.
Args:
eval_name (str): The experiment name of current evaluation.
model_name (str): Name of model used.
reranker_name (str): Name of reranker used.
search_results (Dict[str, Dict[str, float]]): Dictionary of search results.
output_path (str): Output path to write the results.
split (str): Split used in searching.
dataset_name (Optional[str], optional): Name of dataset used. Defaults to ``None``.
sub_dataset_name (Optional[str], optional): Name of the sub-dataset. Defaults to ``None``.
"""
data = {
"eval_name": eval_name,
"model_name": model_name,
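A sketch of recomputing metrics from previously saved search results. The evaluator is constructed the same way the runner constructs it (eval_name plus a data loader); the other constructor parameters are assumed to have defaults, and the directory paths are placeholders.

from FlagEmbedding.evaluation.beir.data_loader import BEIREvalDataLoader
from FlagEmbedding.evaluation.beir.evaluator import BEIREvaluator

loader = BEIREvalDataLoader(eval_name="beir", dataset_dir="./beir/data")
evaluator = BEIREvaluator(eval_name="beir", data_loader=loader)

# Compute metrics at the given cutoffs from the saved search results.
metrics = evaluator.evaluate_results("./beir/search_results", k_values=[1, 3, 5, 10, 100, 1000])
print(metrics)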
16 changes: 16 additions & 0 deletions FlagEmbedding/evaluation/beir/runner.py
@@ -9,7 +9,13 @@


class BEIREvalRunner(AbsEvalRunner):
"""
Runner class of BEIR evaluation.
"""
def run(self):
"""
Run the whole evaluation.
"""
if self.eval_args.dataset_names is None:
dataset_names = self.data_loader.available_dataset_names()
else:
@@ -54,6 +60,11 @@ def run(self):
)

def load_data_loader(self) -> BEIREvalDataLoader:
"""Load the data loader
Returns:
BEIREvalDataLoader: BEIR data loader object.
"""
data_loader = BEIREvalDataLoader(
eval_name=self.eval_args.eval_name,
dataset_dir=self.eval_args.dataset_dir,
@@ -64,6 +75,11 @@ def load_data_loader(self) -> BEIREvalDataLoader:
return data_loader

def load_evaluator(self) -> BEIREvaluator:
"""Load the evaluator for evaluation
Returns:
BEIREvaluator: The BEIR evaluator to run the evaluation.
"""
evaluator = BEIREvaluator(
eval_name=self.eval_args.eval_name,
data_loader=self.data_loader,
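A minimal end-to-end sketch of the BEIR runner. The BEIREvalArgs field names other than use_special_instructions, and the AbsEvalModelArgs import path, are assumptions based on the base evaluation classes rather than on this diff; the paths and model name are placeholders.

from FlagEmbedding.abc.evaluation import AbsEvalModelArgs  # assumed location of the shared model args
from FlagEmbedding.evaluation.beir.arguments import BEIREvalArgs
from FlagEmbedding.evaluation.beir.runner import BEIREvalRunner

eval_args = BEIREvalArgs(
    eval_name="beir",
    dataset_dir="./beir/data",      # leave dataset_names unset to evaluate every available dataset
    use_special_instructions=True,
)
model_args = AbsEvalModelArgs(embedder_name_or_path="BAAI/bge-m3")  # placeholder embedder

runner = BEIREvalRunner(eval_args=eval_args, model_args=model_args)
runner.run()  # iterates over available_dataset_names() when dataset_names is not set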
