Skip to content

Commit 8d71fdc

Browse files
feat: add CoTGenerator
1 parent a132627 commit 8d71fdc

12 files changed

Lines changed: 309 additions & 308 deletions

File tree

graphgen/bases/base_partitioner.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21,14 +21,6 @@ async def partition(
2121
:return: List of communities
2222
"""
2323

24-
@abstractmethod
25-
def split_communities(self, communities: List[Community]) -> List[Community]:
26-
"""
27-
Split large communities into smaller ones based on max_size.
28-
:param communities
29-
:return:
30-
"""
31-
3224
@staticmethod
3325
async def community2batch(
3426
communities: List[Community], g: BaseGraphStorage

graphgen/configs/cot_config.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@ partition: # graph partition configuration
1212
method: leiden # leiden is a partitioner detection algorithm
1313
method_params:
1414
max_size: 20 # Maximum size of communities
15-
use_lcc: false
16-
random_seed: 42
15+
use_lcc: false # whether to use the largest connected component
16+
random_seed: 42 # random seed for partitioning
1717
generate:
1818
mode: cot # atomic, aggregated, multi_hop, cot
1919
data_format: Sharegpt # Alpaca, Sharegpt, ChatML
Lines changed: 117 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,122 @@
1+
from dataclasses import dataclass
2+
from typing import Any
3+
14
from graphgen.bases import BaseGenerator
5+
from graphgen.templates import COT_GENERATION_PROMPT
6+
from graphgen.utils import compute_content_hash, detect_main_language, logger
27

38

9+
@dataclass
class CoTGenerator(BaseGenerator):
    """
    Generate Chain-of-Thought (CoT) QA pairs from a graph community batch.

    Two LLM round-trips per batch:
      1. design a question and a reasoning-path template from the batch,
      2. generate the final CoT answer following that template.
    """

    @staticmethod
    def _format_batch(
        batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]]
    ) -> tuple[str, str]:
        """
        Render the batch's nodes and edges as numbered description lists.

        :param batch: (nodes, edges) where each node is (name, data) and each
            edge is (src, dst, data); data dicts must carry a 'description' key.
        :return: (entities_str, relationships_str)
        """
        nodes, edges = batch
        entities_str = "\n".join(
            f"{index + 1}. {node[0]}: {node[1]['description']}"
            for index, node in enumerate(nodes)
        )
        relationships_str = "\n".join(
            f"{index + 1}. {edge[0]} -- {edge[1]}: {edge[2]['description']}"
            for index, edge in enumerate(edges)
        )
        return entities_str, relationships_str

    @staticmethod
    def build_prompt(
        batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]]
    ) -> str:
        """
        Build the prompt asking the LLM to design a CoT question and
        reasoning-path template for this batch.

        :param batch: (nodes, edges) community batch
        :return: formatted prompt string
        """
        entities_str, relationships_str = CoTGenerator._format_batch(batch)
        # Prompt language follows the dominant language of the batch content.
        language = detect_main_language(entities_str + relationships_str)
        return COT_GENERATION_PROMPT[language]["COT_TEMPLATE_DESIGN"].format(
            entities=entities_str, relationships=relationships_str
        )

    @staticmethod
    def build_prompt_for_cot_generation(
        batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]],
        question: str,
        reasoning_path: str,
    ) -> str:
        """
        Build the prompt asking the LLM to produce the final CoT answer.

        :param batch: (nodes, edges) community batch
        :param question: question produced by the template-design step
        :param reasoning_path: reasoning template produced by the design step
        :return: formatted prompt string
        """
        entities_str, relationships_str = CoTGenerator._format_batch(batch)
        language = detect_main_language(entities_str + relationships_str)
        return COT_GENERATION_PROMPT[language]["COT_GENERATION"].format(
            entities=entities_str,
            relationships=relationships_str,
            question=question,
            reasoning_template=reasoning_path,
        )

    @staticmethod
    def parse_response(response: str) -> dict:
        """
        Extract the question and reasoning path from the template-design
        response. Supports both English and Chinese section markers.

        :param response: raw LLM response
        :return: {'question': ..., 'reasoning_path': ...}, or {} when the
            response does not contain the expected markers.
        """
        if "Question:" in response and "Reasoning-Path Design:" in response:
            question = (
                response.split("Question:")[1]
                .split("Reasoning-Path Design:")[0]
                .strip()
            )
            reasoning_path = response.split("Reasoning-Path Design:")[1].strip()
        elif "问题:" in response and "推理路径设计:" in response:
            question = response.split("问题:")[1].split("推理路径设计:")[0].strip()
            reasoning_path = response.split("推理路径设计:")[1].strip()
        else:
            logger.warning("Failed to parse CoT template: %s", response)
            return {}

        # The model sometimes wraps sections in literal quotes; strip them.
        question = question.strip('"')
        reasoning_path = reasoning_path.strip('"')
        logger.info("CoT Question: %s", question)
        logger.info("CoT Reasoning Path: %s", reasoning_path)
        return {
            "question": question,
            "reasoning_path": reasoning_path,
        }

    async def generate(
        self,
        batch: tuple[
            list[tuple[str, dict]], list[tuple[Any, Any, dict] | tuple[Any, Any, Any]]
        ],
    ) -> dict[str, Any]:
        """
        Generate CoT QA pairs for a given batch.

        :param batch: (nodes, edges) community batch
        :return: mapping of content hash -> {question, answer, reasoning_path};
            empty dict when the template-design response could not be parsed.
        """
        prompt = self.build_prompt(batch)
        response = await self.llm_client.generate_answer(prompt)
        parsed = self.parse_response(response)
        if not parsed:
            # Template design failed to parse; skip this batch instead of
            # raising KeyError on the missing 'question' key.
            return {}
        question, reasoning_path = parsed["question"], parsed["reasoning_path"]
        prompt = self.build_prompt_for_cot_generation(batch, question, reasoning_path)
        cot_answer = await self.llm_client.generate_answer(prompt)
        logger.info("CoT Answer: %s", cot_answer)
        return {
            compute_content_hash(question): {
                "question": question,
                "answer": cot_answer,
                "reasoning_path": reasoning_path,
            }
        }

graphgen/models/generator/multi_hop_generator.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -34,12 +34,6 @@ def build_prompt(
3434

3535
@staticmethod
3636
def parse_response(response: str) -> dict:
37-
"""
38-
AtomicGenerator normally generates one QA pair per response.
39-
So we just need to parse one QA pair from the response.
40-
:param response:
41-
:return:
42-
"""
4337
if "Question:" in response and "Answer:" in response:
4438
question = response.split("Question:")[1].split("Answer:")[0].strip()
4539
answer = response.split("Answer:")[1].strip()

graphgen/models/partitioner/bfs_partitioner.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,3 @@ async def partition(
7676
)
7777

7878
return communities
79-
80-
def split_communities(self, communities: List[Community]) -> List[Community]:
81-
raise NotImplementedError("BFSPartitioner does not need to split communities.")

graphgen/models/partitioner/dfs_partitioner.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,3 @@ async def partition(
7373
)
7474

7575
return communities
76-
77-
def split_communities(self, communities: List[Community]) -> List[Community]:
78-
raise NotImplementedError("DFSPartitioner does not need to split communities.")
Lines changed: 77 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -1,95 +1,120 @@
11
from collections import defaultdict
22
from dataclasses import dataclass
3-
from typing import Any, Dict, List
3+
from typing import Any, Dict, List, Set, Tuple
44

5-
from graphgen.models.storage.networkx_storage import NetworkXStorage
5+
import igraph as ig
6+
from leidenalg import ModularityVertexPartition, find_partition
67

8+
from graphgen.bases import BaseGraphStorage, BasePartitioner
9+
from graphgen.bases.datatypes import Community
710

8-
@dataclass
9-
class LeidenPartitioner:
10-
"""Class for partitioner detection algorithms."""
1111

12-
graph_storage: NetworkXStorage = None
13-
method: str = "leiden"
14-
method_params: Dict[str, Any] = None
12+
@dataclass
class LeidenPartitioner(BasePartitioner):
    """
    Leiden partitioner that partitions the graph into communities using the Leiden algorithm.
    """

    async def partition(
        self,
        g: BaseGraphStorage,
        max_size: int = 20,
        use_lcc: bool = False,
        random_seed: int = 42,
        **kwargs: Any,
    ) -> List[Community]:
        """
        Leiden Partition follows these steps:
        1. export the graph from graph storage
        2. use the leiden algorithm to detect communities, get {node: community_id}
        3. split large communities if max_size is given
        4. convert {node: community_id} to List[Community]

        :param g: graph storage to partition
        :param max_size: maximum size of each community; if None or <=0, no limit
        :param use_lcc: whether to use the largest connected component only
        :param random_seed: seed for the Leiden algorithm
        :param kwargs: extra parameters, accepted for interface compatibility (unused)
        :return: list of detected communities
        """
        nodes = await g.get_all_nodes()  # List[Tuple[str, dict]]
        edges = await g.get_all_edges()  # List[Tuple[str, str, dict]]

        node2cid: Dict[str, int] = await self._run_leiden(
            nodes, edges, use_lcc, random_seed
        )

        if max_size is not None and max_size > 0:
            node2cid = await self._split_communities(node2cid, max_size)

        # Invert the mapping: community id -> member nodes.
        cid2nodes: Dict[int, List[str]] = defaultdict(list)
        for n, cid in node2cid.items():
            cid2nodes[cid].append(n)

        communities: List[Community] = []
        # NOTE: loop variable renamed from `nodes` to `comm_nodes` to avoid
        # shadowing the node list exported from storage above.
        for cid, comm_nodes in cid2nodes.items():
            node_set: Set[str] = set(comm_nodes)
            # Keep only edges whose both endpoints fall inside this community.
            comm_edges: List[Tuple[str, str]] = [
                (u, v) for u, v, _ in edges if u in node_set and v in node_set
            ]
            communities.append(Community(id=cid, nodes=comm_nodes, edges=comm_edges))
        return communities

    @staticmethod
    async def _run_leiden(
        nodes: List[Tuple[str, dict]],
        edges: List[Tuple[str, str, dict]],
        use_lcc: bool = False,
        random_seed: int = 42,
    ) -> Dict[str, int]:
        """
        Run the Leiden algorithm and return {node_name: community_id}.

        The igraph graph is built from edges only, so nodes without any edge
        never enter the partition (`nodes` is accepted for interface symmetry
        but not consulted here).
        """
        # build igraph
        ig_graph = ig.Graph.TupleList(((u, v) for u, v, _ in edges), directed=False)

        # remove isolated nodes
        ig_graph.delete_vertices(ig_graph.vs.select(_degree_eq=0))

        node2cid: Dict[str, int] = {}
        if use_lcc:
            # Partition only the largest connected component.
            lcc = ig_graph.components().giant()
            partition = find_partition(lcc, ModularityVertexPartition, seed=random_seed)
            for part_id, cluster in enumerate(partition):
                for v in cluster:
                    node2cid[lcc.vs[v]["name"]] = part_id
        else:
            # Partition each connected component independently, offsetting
            # community ids so they stay globally unique.
            offset = 0
            for component in ig_graph.components():
                subgraph = ig_graph.induced_subgraph(component)
                partition = find_partition(
                    subgraph, ModularityVertexPartition, seed=random_seed
                )
                for part_id, cluster in enumerate(partition):
                    for v in cluster:
                        original_node = subgraph.vs[v]["name"]
                        node2cid[original_node] = part_id + offset
                offset += len(partition)
        return node2cid

    @staticmethod
    async def _split_communities(
        node2cid: Dict[str, int], max_size: int
    ) -> Dict[str, int]:
        """
        Split communities larger than max_size into smaller sub-communities.

        Community ids are renumbered densely from 0; node order within each
        community follows the insertion order of `node2cid`.
        """
        cid2nodes: Dict[int, List[str]] = defaultdict(list)
        for n, cid in node2cid.items():
            cid2nodes[cid].append(n)

        new_mapping: Dict[str, int] = {}
        new_cid = 0
        for members in cid2nodes.values():
            if len(members) <= max_size:
                for n in members:
                    new_mapping[n] = new_cid
                new_cid += 1
            else:
                # Chunk oversized communities into consecutive max_size slices.
                for start in range(0, len(members), max_size):
                    chunk = members[start : start + max_size]
                    for n in chunk:
                        new_mapping[n] = new_cid
                    new_cid += 1
        return new_mapping

0 commit comments

Comments
 (0)