Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

This commit implements the F-beta score metric #1543

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 0 additions & 10 deletions src/ragas/metrics/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@
import inspect
import sys

from ragas.metrics._answer_correctness import AnswerCorrectness, answer_correctness
from ragas.metrics._answer_relevance import (
AnswerRelevancy,
Expand Down Expand Up @@ -120,10 +117,3 @@
"MultiModalRelevance",
"multimodal_relevance",
]

current_module = sys.modules[__name__]
ALL_METRICS = [
obj
for name, obj in inspect.getmembers(current_module)
if name in __all__ and not inspect.isclass(obj) and not inspect.isbuiltin(obj)
]
9 changes: 8 additions & 1 deletion src/ragas/metrics/_answer_correctness.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
SingleTurnMetric,
get_segmenter,
)
from ragas.metrics.utils import fbeta_score
from ragas.prompt import PydanticPrompt
from ragas.run_config import RunConfig

Expand Down Expand Up @@ -167,6 +168,7 @@ class AnswerCorrectness(MetricWithLLM, MetricWithEmbeddings, SingleTurnMetric):
default_factory=LongFormAnswerPrompt
)
weights: list[float] = field(default_factory=lambda: [0.75, 0.25])
beta: float = 1.0
answer_similarity: t.Optional[AnswerSimilarity] = None
sentence_segmenter: t.Optional[HasSegmentMethod] = None
max_retries: int = 1
Expand All @@ -185,6 +187,11 @@ def __post_init__(self: t.Self):
language = self.long_form_answer_prompt.language
self.sentence_segmenter = get_segmenter(language=language, clean=False)

if type(self.beta) is not float:
raise ValueError(
"Beta must be a float. A beta > 1 gives more weight to recall, while beta < 1 favors precision."
)

def init(self, run_config: RunConfig):
super().init(run_config)
if self.answer_similarity is None and self.weights[1] != 0:
Expand All @@ -198,7 +205,7 @@ def _compute_statement_presence(
tp = len(prediction.TP)
fp = len(prediction.FP)
fn = len(prediction.FN)
score = tp / (tp + 0.5 * (fp + fn)) if tp > 0 else 0
score = fbeta_score(tp, fp, fn, self.beta)
return score

async def _create_simplified_statements(
Expand Down
44 changes: 35 additions & 9 deletions src/ragas/metrics/_factual_correctness.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
SingleTurnMetric,
get_segmenter,
)
from ragas.metrics.utils import fbeta_score
from ragas.prompt import PydanticPrompt

if t.TYPE_CHECKING:
Expand Down Expand Up @@ -181,11 +182,32 @@ class ClaimDecompositionPrompt(

@dataclass
class FactualCorrectness(MetricWithLLM, SingleTurnMetric):
"""
FactualCorrectness is a metric class that evaluates the factual correctness of responses
generated by a language model. It uses claim decomposition and natural language inference (NLI)
to verify the claims made in the responses against reference texts.

Attributes:
name (str): The name of the metric, default is "factual_correctness".
_required_columns (Dict[MetricType, Set[str]]): A dictionary specifying the required columns
for each metric type. Default is {"SINGLE_TURN": {"response", "reference"}}.
mode (Literal["precision", "recall", "f1"]): The mode of evaluation, can be "precision",
"recall", or "f1". Default is "f1".
beta (float): The beta value used for the F1 score calculation. A beta > 1 gives more weight
to recall, while beta < 1 favors precision. Default is 1.0.
atomicity (Literal["low", "high"]): The level of atomicity for claim decomposition. Default is "low".
coverage (Literal["low", "high"]): The level of coverage for claim decomposition. Default is "low".
claim_decomposition_prompt (PydanticPrompt): The prompt used for claim decomposition.
nli_prompt (PydanticPrompt): The prompt used for natural language inference (NLI).

"""

name: str = "factual_correctness" # type: ignore
_required_columns: t.Dict[MetricType, t.Set[str]] = field(
default_factory=lambda: {MetricType.SINGLE_TURN: {"response", "reference"}}
)
mode: t.Literal["precision", "recall", "f1"] = "f1"
beta: float = 1.0
atomicity: t.Literal["low", "high"] = "low"
coverage: t.Literal["low", "high"] = "low"
claim_decomposition_prompt: PydanticPrompt = ClaimDecompositionPrompt()
Expand All @@ -204,6 +226,11 @@ def __post_init__(self):
)
self.segmenter = get_segmenter(language="english")

if type(self.beta) is not float:
raise ValueError(
"Beta must be a float. A beta > 1 gives more weight to recall, while beta < 1 favors precision."
)

async def decompose_claims(
self, response: str, callbacks: Callbacks
) -> t.List[str]:
Expand Down Expand Up @@ -253,21 +280,20 @@ async def _single_turn_ascore(
else:
response_reference = np.array([])

true_positives = sum(reference_response)
false_positives = sum(~reference_response)
tp = sum(reference_response)
fp = sum(~reference_response)
if self.mode != "precision":
false_negatives = sum(~response_reference)
fn = sum(~response_reference)
else:
false_negatives = 0
fn = 0


if self.mode == "precision":
score = true_positives / (true_positives + false_positives + 1e-8)
score = tp / (tp + fp + 1e-8)
elif self.mode == "recall":
score = true_positives / (true_positives + false_negatives + 1e-8)
score = tp / (tp + fn + 1e-8)
else:
precision = true_positives / (true_positives + false_positives + 1e-8)
recall = true_positives / (true_positives + false_negatives + 1e-8)
score = 2 * (precision * recall) / (precision + recall + 1e-8)
score = fbeta_score(tp, fp, fn, self.beta)

return np.round(score, 2)

Expand Down
37 changes: 19 additions & 18 deletions src/ragas/metrics/utils.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,22 @@
from ragas.dataset_schema import EvaluationDataset
from ragas.metrics import ALL_METRICS
from ragas.metrics.base import Metric
from ragas.validation import validate_required_columns
def fbeta_score(tp, fp, fn, beta=1.0):
    """Return the F-beta score for the given confusion counts.

    Precision is tp / (tp + fp) and recall is tp / (tp + fn); each is
    taken as 0 when its denominator is zero.  A ``beta`` greater than 1
    weights recall more heavily, while a ``beta`` below 1 favors
    precision (``beta == 1`` gives the harmonic mean, i.e. the F1 score).

    Args:
        tp: Number of true positives.
        fp: Number of false positives.
        fn: Number of false negatives.
        beta: Relative weight of recall versus precision. Defaults to 1.0.

    Returns:
        The F-beta score as a float; 0.0 when both precision and recall
        are zero.
    """
    precision = 0 if tp + fp == 0 else tp / (tp + fp)
    recall = 0 if tp + fn == 0 else tp / (tp + fn)

    # Guard the degenerate case so the weighted-mean formula below
    # never divides by zero.
    if precision == 0 and recall == 0:
        return 0.0

    beta_sq = beta**2
    return (1 + beta_sq) * (precision * recall) / (beta_sq * precision + recall)
16 changes: 1 addition & 15 deletions tests/unit/test_metric.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,8 @@
import typing as t
from dataclasses import dataclass, field

from ragas.dataset_schema import EvaluationDataset, SingleTurnSample
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics.base import MetricType
from ragas.metrics.utils import get_available_metrics


def test_get_available_metrics():
sample1 = SingleTurnSample(user_input="What is X", response="Y")
sample2 = SingleTurnSample(user_input="What is Z", response="W")
ds = EvaluationDataset(samples=[sample1, sample2])

assert all(
[
m.required_columns["SINGLE_TURN"] == {"response", "user_input"}
for m in get_available_metrics(ds)
]
), "All metrics should have required columns ('user_input', 'response')"


def test_single_turn_metric():
Expand Down
Loading