
Commit

add fbeta score
shahules786 committed Oct 25, 2024
1 parent 92166c0 commit 69cfa99
Showing 1 changed file with 26 additions and 5 deletions.
31 changes: 26 additions & 5 deletions src/ragas/metrics/_factual_correctness.py
@@ -16,6 +16,7 @@
SingleTurnMetric,
get_segmenter,
)
from ragas.metrics.utils import fbeta_score
from ragas.prompt import PydanticPrompt

if t.TYPE_CHECKING:
@@ -181,12 +182,32 @@ class ClaimDecompositionPrompt(

@dataclass
class FactualCorrectness(MetricWithLLM, SingleTurnMetric):
"""
FactualCorrectness is a metric class that evaluates the factual correctness of responses
generated by a language model. It uses claim decomposition and natural language inference (NLI)
to verify the claims made in the responses against reference texts.

Attributes:
name (str): The name of the metric, default is "factual_correctness".
_required_columns (Dict[MetricType, Set[str]]): A dictionary specifying the required columns
for each metric type. Default is {"SINGLE_TURN": {"response", "reference"}}.
mode (Literal["precision", "recall", "f1"]): The mode of evaluation, can be "precision",
"recall", or "f1". Default is "f1".
beta (float): The beta value used when computing the F-beta score. A beta > 1 gives more weight
to recall, while beta < 1 favors precision. Default is 1.0.
atomicity (Literal["low", "high"]): The level of atomicity for claim decomposition. Default is "low".
coverage (Literal["low", "high"]): The level of coverage for claim decomposition. Default is "low".
claim_decomposition_prompt (PydanticPrompt): The prompt used for claim decomposition.
nli_prompt (PydanticPrompt): The prompt used for natural language inference (NLI).
"""

name: str = "factual_correctness" # type: ignore
_required_columns: t.Dict[MetricType, t.Set[str]] = field(
default_factory=lambda: {MetricType.SINGLE_TURN: {"response", "reference"}}
)
mode: t.Literal["precision", "recall", "f1"] = "f1"
beta: float = Field(default_factory=lambda: 1.0)
beta: float = 1.0
atomicity: t.Literal["low", "high"] = "low"
coverage: t.Literal["low", "high"] = "low"
claim_decomposition_prompt: PydanticPrompt = ClaimDecompositionPrompt()
@@ -206,7 +227,9 @@ def __post_init__(self):
self.segmenter = get_segmenter(language="english")

if type(self.beta) is not float:
raise ValueError("Beta must be a float. A beta > 1 gives more weight to recall, while beta < 1 favors precision.")
raise ValueError(
"Beta must be a float. A beta > 1 gives more weight to recall, while beta < 1 favors precision."
)

async def decompose_claims(
self, response: str, callbacks: Callbacks
@@ -257,14 +280,12 @@ async def _single_turn_ascore(
fp = sum(~reference_response)
fn = sum(~response_reference)

beta = self.beta

if self.mode == "precision":
score = tp / (tp + fp + 1e-8)
elif self.mode == "recall":
score = tp / (tp + fn + 1e-8)
else:
score = ((1 + beta * beta) * tp) / ((1 + beta * beta) * tp + fp + (beta * beta) * fn) if tp > 0 else 0
score = fbeta_score(tp, fp, fn, self.beta)

return np.round(score, 2)
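
The imported fbeta_score helper is not shown in this diff. Based on the inline expression it replaces, a minimal sketch (an assumption, not necessarily the actual ragas.metrics.utils implementation) would be:

def fbeta_score(tp: int, fp: int, fn: int, beta: float = 1.0) -> float:
    # F-beta from raw counts; beta > 1 weights recall, beta < 1 weights precision.
    # Mirrors the inline expression removed above, including the tp == 0 guard.
    if tp == 0:
        return 0.0
    beta2 = beta * beta
    return ((1 + beta2) * tp) / ((1 + beta2) * tp + fp + beta2 * fn)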

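For context, a rough usage sketch of the metric described in the docstring above. The import path and sample type are taken from the ragas public API; evaluator_llm is a placeholder for any ragas-compatible LLM wrapper, and the call assumes an async context:

from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import FactualCorrectness

# evaluator_llm: any ragas-compatible LLM wrapper (placeholder, not defined here).
scorer = FactualCorrectness(
    llm=evaluator_llm, mode="f1", beta=1.0, atomicity="low", coverage="low"
)
sample = SingleTurnSample(
    response="Albert Einstein was born in 1879 in Germany.",
    reference="Albert Einstein was born on 14 March 1879 in Ulm, Germany.",
)
# Inside an async function / event loop:
score = await scorer.single_turn_ascore(sample)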
