feat: improvements in test synthesization (#1621)

PR 2 of improvements in test generation --------- Co-authored-by: Jin Lin Tham <jltham18@gmail.com>
explodinggradients · Nov 7, 2024 · d840b16 · d840b16
1 parent 5f74eb5
commit d840b16
Show file tree

Hide file tree

Showing 32 changed files with 1,336 additions and 1,151 deletions.
diff --git a/docs/getstarted/rag_testset_generation.md b/docs/getstarted/rag_testset_generation.md
@@ -141,9 +141,9 @@ query_distribution = default_query_distribution(generator_llm)
 ```
 ```
 [
-    (AbstractQuerySynthesizer(llm=generator_llm), 0.25),
-    (ComparativeAbstractQuerySynthesizer(llm=generator_llm), 0.25),
-    (SpecificQuerySynthesizer(llm=generator_llm), 0.5),
+    (SingleHopSpecificQuerySynthesizer(llm=generator_llm), 0.5),
+    (MultiHopAbstractQuerySynthesizer(llm=generator_llm), 0.25),
+    (MultiHopSpecificQuerySynthesizer(llm=generator_llm), 0.25),
 ]
 ```
 

diff --git a/docs/references/testset_schema.md b/docs/references/testset_schema.md
@@ -15,12 +15,12 @@
         members:
             - BaseScenario
 
-::: ragas.testset.synthesizers.specific_query.SpecificQueryScenario
+::: ragas.testset.synthesizers.single_hop.specific.SingleHopSpecificQuerySynthesizer
     options:
         show_root_heading: True
         show_root_full_path: False
 
-::: ragas.testset.synthesizers.abstract_query.AbstractQueryScenario
+::: ragas.testset.synthesizers.multi_hop.specific.MultiHopSpecificQuerySynthesizer
     options:
         show_root_heading: True
         show_root_full_path: False
diff --git a/src/ragas/metrics/_string.py b/src/ragas/metrics/_string.py
@@ -13,6 +13,7 @@ class DistanceMeasure(Enum):
     LEVENSHTEIN = "levenshtein"
     HAMMING = "hamming"
     JARO = "jaro"
+    JARO_WINKLER = "jaro_winkler"
 
 
 @dataclass
@@ -77,6 +78,7 @@ def __post_init__(self):
             DistanceMeasure.LEVENSHTEIN: distance.Levenshtein,
             DistanceMeasure.HAMMING: distance.Hamming,
             DistanceMeasure.JARO: distance.Jaro,
+            DistanceMeasure.JARO_WINKLER: distance.JaroWinkler,
         }
 
     def init(self, run_config: RunConfig):

diff --git a/src/ragas/testset/graph.py b/src/ragas/testset/graph.py
@@ -206,11 +206,15 @@ def __repr__(self) -> str:
     def __str__(self) -> str:
         return self.__repr__()
 
-    def find_clusters(
-        self, relationship_condition: t.Callable[[Relationship], bool] = lambda _: True
+    def find_indirect_clusters(
+        self,
+        relationship_condition: t.Callable[[Relationship], bool] = lambda _: True,
+        depth_limit: int = 3,
     ) -> t.List[t.Set[Node]]:
         """
-        Finds clusters of nodes in the knowledge graph based on a relationship condition.
+        Finds indirect clusters of nodes in the knowledge graph based on a relationship condition.
+        Here if A -> B -> C -> D, then A, B, C, and D form a cluster. If there's also a path A -> B -> C -> E,
+        it will form a separate cluster.
 
         Parameters
         ----------
@@ -223,31 +227,95 @@ def find_clusters(
             A list of sets, where each set contains nodes that form a cluster.
         """
         clusters = []
-        visited = set()
+        visited_paths = set()
 
         relationships = [
             rel for rel in self.relationships if relationship_condition(rel)
         ]
 
-        def dfs(node: Node, cluster: t.Set[Node]):
-            visited.add(node)
+        def dfs(node: Node, cluster: t.Set[Node], depth: int, path: t.Tuple[Node, ...]):
+            if depth >= depth_limit or path in visited_paths:
+                return
+            visited_paths.add(path)
             cluster.add(node)
+
             for rel in relationships:
-                if rel.source == node and rel.target not in visited:
-                    dfs(rel.target, cluster)
-                # if the relationship is bidirectional, we need to check the reverse
+                neighbor = None
+                if rel.source == node and rel.target not in cluster:
+                    neighbor = rel.target
                 elif (
                     rel.bidirectional
                     and rel.target == node
-                    and rel.source not in visited
+                    and rel.source not in cluster
                 ):
-                    dfs(rel.source, cluster)
+                    neighbor = rel.source
+
+                if neighbor is not None:
+                    dfs(neighbor, cluster.copy(), depth + 1, path + (neighbor,))
+
+            # Add completed path-based cluster
+            if len(cluster) > 1:
+                clusters.append(cluster)
 
         for node in self.nodes:
-            if node not in visited:
-                cluster = set()
-                dfs(node, cluster)
-                if len(cluster) > 1:
+            initial_cluster = set()
+            dfs(node, initial_cluster, 0, (node,))
+
+        # Remove duplicates by converting clusters to frozensets
+        unique_clusters = [
+            set(cluster) for cluster in set(frozenset(c) for c in clusters)
+        ]
+
+        return unique_clusters
+
+    def find_direct_clusters(
+        self, relationship_condition: t.Callable[[Relationship], bool] = lambda _: True
+    ) -> t.Dict[Node, t.List[t.Set[Node]]]:
+        """
+        Finds direct clusters of nodes in the knowledge graph based on a relationship condition.
+        Here if A->B, and A->C, then A, B, and C form a cluster.
+
+        Parameters
+        ----------
+        relationship_condition : Callable[[Relationship], bool], optional
+            A function that takes a Relationship and returns a boolean, by default lambda _: True
+
+        Returns
+        -------
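+        Dict[Node, Set[Node]]
+            A dictionary with one entry per cluster: the key is a node removed
+            from the cluster via ``set.pop()`` (an arbitrary member, not
+            necessarily the source node) and the value is the set of the
+            remaining nodes in that cluster.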
+        """
+
+        clusters = []
+        relationships = [
+            rel for rel in self.relationships if relationship_condition(rel)
+        ]
+        for node in self.nodes:
+            cluster = set()
+            cluster.add(node)
+            for rel in relationships:
+                if rel.bidirectional:
+                    if rel.source == node:
+                        cluster.add(rel.target)
+                    elif rel.target == node:
+                        cluster.add(rel.source)
+                else:
+                    if rel.source == node:
+                        cluster.add(rel.target)
+
+            if len(cluster) > 1:
+                if cluster not in clusters:
                     clusters.append(cluster)
 
-        return clusters
+        # Remove subsets from clusters
+        unique_clusters = []
+        for cluster in clusters:
+            if not any(cluster < other for other in clusters):
+                unique_clusters.append(cluster)
+        clusters = unique_clusters
+
+        cluster_dict = {}
+        for cluster in clusters:
+            cluster_dict.update({cluster.pop(): cluster})
+
+        return cluster_dict
diff --git a/src/ragas/testset/graph_queries.py b/src/ragas/testset/graph_queries.py
@@ -0,0 +1,38 @@
+import typing as t
+
+from ragas.testset.graph import KnowledgeGraph, Node
+
+
+def get_child_nodes(node: Node, graph: KnowledgeGraph, level: int = 1) -> t.List[Node]:
+    """
+    Get the child nodes of a given node up to a specified level.
+
+    Parameters
+    ----------
+    node : Node
+        The node to get the children of.
+    graph : KnowledgeGraph
+        The knowledge graph containing the node.
+    level : int
+        The maximum level to which child nodes are searched.
+
+    Returns
+    -------
+    List[Node]
+        The list of child nodes up to the specified level.
+    """
+    children = []
+
+    # Helper function to perform depth-limited search for child nodes
+    def dfs(current_node: Node, current_level: int):
+        if current_level > level:
+            return
+        for rel in graph.relationships:
+            if rel.source == current_node and rel.type == "child":
+                children.append(rel.target)
+                dfs(rel.target, current_level + 1)
+
+    # Start DFS from the initial node at level 1, so that level=1 returns only direct children
+    dfs(node, 1)
+
+    return children
diff --git a/src/ragas/testset/synthesizers/__init__.py b/src/ragas/testset/synthesizers/__init__.py
@@ -1,38 +1,29 @@
 import typing as t
 
 from ragas.llms import BaseRagasLLM
-
-from .abstract_query import (
-    AbstractQuerySynthesizer,
-    ComparativeAbstractQuerySynthesizer,
+from ragas.testset.synthesizers.multi_hop import (
+    MultiHopAbstractQuerySynthesizer,
+    MultiHopSpecificQuerySynthesizer,
+)
+from ragas.testset.synthesizers.single_hop.specific import (
+    SingleHopSpecificQuerySynthesizer,
 )
+
 from .base import BaseSynthesizer
-from .base_query import QuerySynthesizer
-from .specific_query import SpecificQuerySynthesizer
 
 QueryDistribution = t.List[t.Tuple[BaseSynthesizer, float]]
 
 
 def default_query_distribution(llm: BaseRagasLLM) -> QueryDistribution:
-    """
-    Default query distribution for the test set.
-
-    By default, 25% of the queries are generated using `AbstractQuerySynthesizer`,
-    25% are generated using `ComparativeAbstractQuerySynthesizer`, and 50% are
-    generated using `SpecificQuerySynthesizer`.
-    """
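+    """
+    Default query distribution for the test set.
+
+    By default, 50% of the queries are generated using
+    `SingleHopSpecificQuerySynthesizer`, 25% using
+    `MultiHopAbstractQuerySynthesizer`, and 25% using
+    `MultiHopSpecificQuerySynthesizer`.
+    """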
     return [
-        (AbstractQuerySynthesizer(llm=llm), 0.25),
-        (ComparativeAbstractQuerySynthesizer(llm=llm), 0.25),
-        (SpecificQuerySynthesizer(llm=llm), 0.5),
+        (SingleHopSpecificQuerySynthesizer(llm=llm), 0.5),
+        (MultiHopAbstractQuerySynthesizer(llm=llm), 0.25),
+        (MultiHopSpecificQuerySynthesizer(llm=llm), 0.25),
     ]
 
 
 __all__ = [
     "BaseSynthesizer",
-    "QuerySynthesizer",
-    "AbstractQuerySynthesizer",
-    "ComparativeAbstractQuerySynthesizer",
-    "SpecificQuerySynthesizer",
     "default_query_distribution",
 ]