Skip to content

Commit 9ed56a6

Browse files
feat: add ECEPartitioner
1 parent dcae889 commit 9ed56a6

5 files changed

Lines changed: 192 additions & 23 deletions

File tree

graphgen/graphgen.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -231,7 +231,9 @@ async def quiz_and_judge(self, quiz_and_judge_config: Dict):
231231
@async_to_sync_method
232232
async def generate(self, partition_config: Dict, generate_config: Dict):
233233
# Step 1: partition the graph
234-
batches = await partition_kg(self.graph_storage, partition_config)
234+
batches = await partition_kg(
235+
self.graph_storage, self.tokenizer_instance, partition_config
236+
)
235237

236238
# Step 2: generate QA pairs
237239
results = await generate_qas(
Lines changed: 131 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,21 @@
1-
from typing import List
1+
import asyncio
2+
import random
3+
from dataclasses import dataclass
4+
from typing import Any, Dict, List, Set, Tuple
5+
6+
from tqdm.asyncio import tqdm as tqdm_async
27

38
from graphgen.bases import BaseGraphStorage
49
from graphgen.bases.datatypes import Community
5-
from graphgen.models import BFSPartitioner
10+
from graphgen.models.partitioner.bfs_partitioner import BFSPartitioner
611

712

13+
@dataclass
class ECEPartitioner(BFSPartitioner):
    """
    Partition the graph into communities guided by Expected Calibration Error (ECE).

    ECE is computed for edges in the KG (stored on the edge data as a
    'comprehension loss') and units with similar ECE values are grouped into
    the same community:
    1. Select a sampling strategy ("random", "min_loss", "max_loss").
    2. Choose a seed unit according to the sampling strategy.
    3. Expand the community from the seed using BFS, respecting the
       per-community unit and token budgets.
    (A unit is either a node or an edge.)
    """

    @staticmethod
    def _sort_units(units: list, edge_sampling: str) -> list:
        """
        Order units according to the edge sampling strategy.

        :param units: unit tuples whose last element is the data dict
        :param edge_sampling: one of "random", "min_loss", "max_loss"
        :return: the ordered units
        :raises ValueError: on an unknown strategy
        """
        if edge_sampling == "random":
            random.shuffle(units)
        elif edge_sampling in ("min_loss", "max_loss"):
            # Units without a 'loss' key (e.g. nodes that were never judged)
            # sort as 0.0 instead of raising KeyError.
            units = sorted(
                units,
                key=lambda x: x[-1].get("loss", 0.0),
                reverse=edge_sampling == "max_loss",
            )
        else:
            raise ValueError(f"Invalid edge sampling: {edge_sampling}")
        return units

    async def partition(
        self,
        g: BaseGraphStorage,
        max_units_per_community: int = 10,
        max_tokens_per_community: int = 10240,
        edge_sampling: str = "random",
        **kwargs: Any,
    ) -> List[Community]:
        """
        Partition ``g`` into communities bounded by unit count and token budget.

        :param g: graph storage to partition
        :param max_units_per_community: cap on nodes + edges per community
        :param max_tokens_per_community: cap on the summed 'length' of claimed units
        :param edge_sampling: ordering of seeds/neighbors ("random", "min_loss", "max_loss")
        :return: communities that together cover every node and edge exactly once
        """
        nodes: List[Tuple[str, dict]] = await g.get_all_nodes()
        edges: List[Tuple[str, str, dict]] = await g.get_all_edges()

        adj, _ = self._build_adjacency_list(nodes, edges)
        node_dict = dict(nodes)
        # frozenset keys make (u, v) and (v, u) the same undirected edge.
        edge_dict = {frozenset((u, v)): d for u, v, d in edges}

        # A unit is ("n", node_id, data) or ("e", frozenset({u, v}), data).
        all_units: List[Tuple[str, Any, dict]] = [("n", nid, d) for nid, d in nodes] + [
            ("e", frozenset((u, v)), d) for u, v, d in edges
        ]

        used_n: Set[str] = set()
        used_e: Set[frozenset] = set()
        communities: List[Community] = []

        all_units = self._sort_units(all_units, edge_sampling)

        async def _grow_community(seed_unit: Tuple[str, Any, dict]) -> Community:
            """BFS-expand a single community from ``seed_unit`` until a cap is hit."""
            community_nodes: Dict[str, dict] = {}
            community_edges: Dict[frozenset, dict] = {}
            queue: asyncio.Queue = asyncio.Queue()
            token_sum = 0

            async def _add_unit(unit) -> bool:
                """Claim a unit for this community; False if it is already taken."""
                nonlocal token_sum
                utype, uid, data = unit
                if utype == "n":
                    if uid in used_n or uid in community_nodes:
                        return False
                    community_nodes[uid] = data
                    used_n.add(uid)
                else:  # edge
                    if uid in used_e or uid in community_edges:
                        return False
                    community_edges[uid] = data
                    used_e.add(uid)
                # 'length' is the pre-tokenized token count (see pre_tokenize).
                token_sum += data.get("length", 0)
                return True

            def _is_full() -> bool:
                """True once either the unit or the token budget is exhausted."""
                return (
                    len(community_nodes) + len(community_edges)
                    >= max_units_per_community
                    or token_sum >= max_tokens_per_community
                )

            await _add_unit(seed_unit)
            await queue.put(seed_unit)

            # BFS over alternating node/edge layers.
            while not queue.empty():
                if _is_full():
                    break

                cur_type, cur_id, _ = await queue.get()

                neighbors: List[Tuple[str, Any, dict]] = []
                if cur_type == "n":
                    for nb_id in adj.get(cur_id, []):
                        e_key = frozenset((cur_id, nb_id))
                        if e_key not in used_e and e_key not in community_edges:
                            neighbors.append(("e", e_key, edge_dict[e_key]))
                else:
                    # An edge's neighbors are its (one or two) endpoint nodes.
                    for n_id in cur_id:
                        if n_id not in used_n and n_id not in community_nodes:
                            neighbors.append(("n", n_id, node_dict[n_id]))

                neighbors = self._sort_units(neighbors, edge_sampling)
                for nb in neighbors:
                    if _is_full():
                        break
                    if await _add_unit(nb):
                        await queue.put(nb)

            # A self-loop collapses to a 1-element frozenset; expand it back to
            # (u, u) instead of crashing on a two-name tuple unpack.
            edge_pairs: List[Tuple[str, str]] = []
            for key in community_edges:
                members = tuple(key)
                if len(members) == 2:
                    edge_pairs.append((members[0], members[1]))
                else:
                    edge_pairs.append((members[0], members[0]))

            return Community(
                id=len(communities),
                nodes=list(community_nodes.keys()),
                edges=edge_pairs,
            )

        async for unit in tqdm_async(all_units, desc="ECE partition"):
            utype, uid, _ = unit
            if (utype == "n" and uid in used_n) or (utype == "e" and uid in used_e):
                continue
            communities.append(await _grow_community(unit))

        return communities

graphgen/operators/__init__.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
from graphgen.operators.partition.traverse_graph import (
22
traverse_graph_for_aggregated,
3-
traverse_graph_for_atomic,
43
traverse_graph_for_multi_hop,
54
)
65

graphgen/operators/partition/partition_kg.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
1-
from typing import Any, List, Tuple
1+
from typing import Any
22

3-
from graphgen.bases import BaseGraphStorage
4-
from graphgen.bases.datatypes import Community
3+
from graphgen.bases import BaseGraphStorage, BaseTokenizer
54
from graphgen.models import (
65
BFSPartitioner,
76
DFSPartitioner,
@@ -10,9 +9,12 @@
109
)
1110
from graphgen.utils import logger
1211

12+
from .pre_tokenize import pre_tokenize
13+
1314

1415
async def partition_kg(
1516
kg_instance: BaseGraphStorage,
17+
tokenizer: Any = BaseTokenizer,
1618
partition_config: dict = None,
1719
) -> list[
1820
tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict] | tuple[Any, Any, Any]]]
@@ -27,6 +29,12 @@ async def partition_kg(
2729
partitioner = DFSPartitioner()
2830
elif method == "ece":
2931
logger.info("Partitioning knowledge graph using ECE method.")
32+
# TODO: before ECE partitioning, we need to:
33+
# 1. 'quiz and judge' to get the comprehension loss
34+
# 2. pre-tokenize nodes and edges to get the token length
35+
edges = await kg_instance.get_all_edges()
36+
nodes = await kg_instance.get_all_nodes()
37+
await pre_tokenize(kg_instance, tokenizer, edges, nodes)
3038
partitioner = ECEPartitioner()
3139
elif method == "leiden":
3240
logger.info("Partitioning knowledge graph using Leiden method.")
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
import asyncio
2+
from typing import List, Tuple
3+
4+
from graphgen.bases import BaseGraphStorage, BaseTokenizer
5+
from graphgen.utils import run_concurrent
6+
7+
8+
async def pre_tokenize(
    graph_storage: BaseGraphStorage,
    tokenizer: BaseTokenizer,
    edges: List[Tuple],
    nodes: List[Tuple],
) -> Tuple[List, List]:
    """
    Backfill the token length ('length') of every edge/node and persist it.

    Entries that already carry a 'length' are written back unchanged. Encoding
    runs in the default executor (tokenization is CPU-bound), at most 1000
    tasks in flight, with a progress bar per collection.

    :param graph_storage: storage the patched nodes/edges are written back to
    :param tokenizer: tokenizer whose ``encode`` is used to count tokens
    :param edges: (src, dst, data) tuples
    :param nodes: (node_id, data) tuples
    :return: the (edges, nodes) lists with 'length' filled in
    """
    sem = asyncio.Semaphore(1000)

    async def _patch_and_write(obj: Tuple, *, is_node: bool) -> Tuple:
        """Ensure obj's data has a 'length', then persist it to storage."""
        async with sem:
            data = obj[1] if is_node else obj[2]
            if "length" not in data:
                # Entries without a description count as zero tokens instead
                # of raising KeyError.
                text = data.get("description", "")
                # get_event_loop() is deprecated inside coroutines; use the
                # running loop explicitly.
                loop = asyncio.get_running_loop()
                tokens = await loop.run_in_executor(None, tokenizer.encode, text)
                data["length"] = len(tokens)
            if is_node:
                await graph_storage.update_node(obj[0], obj[1])
            else:
                await graph_storage.update_edge(obj[0], obj[1], obj[2])
            return obj

    new_edges, new_nodes = await asyncio.gather(
        run_concurrent(
            lambda e: _patch_and_write(e, is_node=False),
            edges,
            desc="Pre-tokenizing edges",
        ),
        run_concurrent(
            lambda n: _patch_and_write(n, is_node=True),
            nodes,
            desc="Pre-tokenizing nodes",
        ),
    )

    # Flush/commit the storage after all updates have been applied.
    await graph_storage.index_done_callback()
    return new_edges, new_nodes

0 commit comments

Comments
 (0)