
Commit

add fbeta score
shahules786 committed Oct 25, 2024
1 parent 92166c0 commit 69cfa99
Showing 1 changed file with 26 additions and 5 deletions.
31 changes: 26 additions & 5 deletions src/ragas/metrics/_factual_correctness.py
@@ -16,6 +16,7 @@
SingleTurnMetric,
get_segmenter,
)
from ragas.metrics.utils import fbeta_score
from ragas.prompt import PydanticPrompt

if t.TYPE_CHECKING:
@@ -181,12 +182,32 @@ class ClaimDecompositionPrompt(

@dataclass
class FactualCorrectness(MetricWithLLM, SingleTurnMetric):
"""
FactualCorrectness is a metric class that evaluates the factual correctness of responses
generated by a language model. It uses claim decomposition and natural language inference (NLI)
to verify the claims made in the responses against reference texts.

Attributes:
name (str): The name of the metric, default is "factual_correctness".
_required_columns (Dict[MetricType, Set[str]]): A dictionary specifying the required columns
for each metric type. Default is {"SINGLE_TURN": {"response", "reference"}}.
mode (Literal["precision", "recall", "f1"]): The mode of evaluation, can be "precision",
"recall", or "f1". Default is "f1".
beta (float): The beta value used when computing the F-beta score. A beta > 1 gives more weight
to recall, while beta < 1 favors precision. Default is 1.0.
atomicity (Literal["low", "high"]): The level of atomicity for claim decomposition. Default is "low".
coverage (Literal["low", "high"]): The level of coverage for claim decomposition. Default is "low".
claim_decomposition_prompt (PydanticPrompt): The prompt used for claim decomposition.
nli_prompt (PydanticPrompt): The prompt used for natural language inference (NLI).
"""

name: str = "factual_correctness" # type: ignore
_required_columns: t.Dict[MetricType, t.Set[str]] = field(
default_factory=lambda: {MetricType.SINGLE_TURN: {"response", "reference"}}
)
mode: t.Literal["precision", "recall", "f1"] = "f1"
beta: float = Field(default_factory=lambda: 1.0)
beta: float = 1.0
atomicity: t.Literal["low", "high"] = "low"
coverage: t.Literal["low", "high"] = "low"
claim_decomposition_prompt: PydanticPrompt = ClaimDecompositionPrompt()
@@ -206,7 +227,9 @@ def __post_init__(self):
self.segmenter = get_segmenter(language="english")

if type(self.beta) is not float:
raise ValueError("Beta must be a float. A beta > 1 gives more weight to recall, while beta < 1 favors precision.")
raise ValueError(
"Beta must be a float. A beta > 1 gives more weight to recall, while beta < 1 favors precision."
)

async def decompose_claims(
self, response: str, callbacks: Callbacks
@@ -257,14 +280,12 @@ async def _single_turn_ascore(
fp = sum(~reference_response)
fn = sum(~response_reference)

beta = self.beta

if self.mode == "precision":
score = tp / (tp + fp + 1e-8)
elif self.mode == "recall":
score = tp / (tp + fn + 1e-8)
else:
score = ((1 + beta * beta) * tp) / ((1 + beta * beta) * tp + fp + (beta * beta) * fn) if tp > 0 else 0
score = fbeta_score(tp, fp, fn, self.beta)

return np.round(score, 2)
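
The imported fbeta_score helper is not shown in this diff. Based on the inline expression it replaces, a minimal sketch (an assumption, not necessarily the actual ragas.metrics.utils implementation) would be:

def fbeta_score(tp: int, fp: int, fn: int, beta: float = 1.0) -> float:
    # F-beta from raw counts; beta > 1 weights recall, beta < 1 weights precision.
    # Mirrors the inline expression removed above, including the tp == 0 guard.
    if tp == 0:
        return 0.0
    beta2 = beta * beta
    return ((1 + beta2) * tp) / ((1 + beta2) * tp + fp + beta2 * fn)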

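For context, a rough usage sketch of the metric described in the docstring above. The import path and sample type are taken from the ragas public API; evaluator_llm is a placeholder for any ragas-compatible LLM wrapper, and the call assumes an async context:

from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import FactualCorrectness

# evaluator_llm: any ragas-compatible LLM wrapper (placeholder, not defined here).
scorer = FactualCorrectness(
    llm=evaluator_llm, mode="f1", beta=1.0, atomicity="low", coverage="low"
)
sample = SingleTurnSample(
    response="Albert Einstein was born in 1879 in Germany.",
    reference="Albert Einstein was born on 14 March 1879 in Ulm, Germany.",
)
# Inside an async function / event loop:
score = await scorer.single_turn_ascore(sample)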
