Skip to content

Commit ed27cd1

Browse files
refactor: adjust templates
1 parent 7d4f1e5 commit ed27cd1

7 files changed

Lines changed: 38 additions & 125 deletions

File tree

Lines changed: 32 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,37 @@
1-
from typing import Any, List
1+
from typing import List
22

3-
from graphgen.bases import BaseGraphStorage, BasePartitioner
3+
from graphgen.bases import BaseGraphStorage
44
from graphgen.bases.datatypes import Community
5+
from graphgen.models import BFSPartitioner
56

67

7-
class ECEPartitioner(BasePartitioner):
8-
def partition(
9-
self,
10-
g: BaseGraphStorage,
11-
bidirectional: bool = False,
12-
**kwargs: Any,
13-
) -> List[Community]:
14-
pass
8+
class ECEPartitioner(BFSPartitioner):
9+
"""
10+
ECE partitioner that partitions the graph into communities based on Expected Calibration Error (ECE).
11+
We calculate ECE for edges in KG (represented as 'comprehension loss') and group edges with similar ECE values into the same community.
12+
1. Select a sampling strategy.
13+
2. Choose a unit based on the sampling strategy.
14+
3. Expand the community using BFS.
15+
4. When expanding, prefer to add units with the sampling strategy.
16+
5. Stop when the max unit size is reached or the max input length is reached.
17+
(A unit is a node or an edge.)
18+
"""
1519

16-
def split_communities(self, communities: List[Community]) -> List[Community]:
17-
pass
20+
# async def partition(
21+
# self,
22+
# g: BaseGraphStorage,
23+
# *,
24+
# ):
25+
# pass
26+
27+
28+
# 修改
29+
# max_depth 取消
30+
# expand_method 改名为 xxx
31+
# edge_sampling
32+
# loss_strategy取消,因为node和edge可以看作同一种unit
33+
# bidirectional 取消
34+
# max_extra_edges 改名为 max_units_per_community
35+
# max_tokens 改名为 max_tokens_per_community
36+
37+
# 可以退化成BFS

graphgen/operators/partition/traverse_graph.py

Lines changed: 0 additions & 111 deletions
Original file line numberDiff line numberDiff line change
@@ -299,117 +299,6 @@ async def _process_single_batch(
299299
return results
300300

301301

302-
# pylint: disable=too-many-branches, too-many-statements
303-
async def traverse_graph_for_atomic(
304-
llm_client: OpenAIClient,
305-
tokenizer: Tokenizer,
306-
graph_storage: NetworkXStorage,
307-
traverse_strategy: Dict,
308-
text_chunks_storage: JsonKVStorage,
309-
progress_bar: gr.Progress = None,
310-
max_concurrent: int = 1000,
311-
) -> dict:
312-
"""
313-
Traverse the graph atomically
314-
315-
:param llm_client
316-
:param tokenizer
317-
:param graph_storage
318-
:param traverse_strategy
319-
:param text_chunks_storage
320-
:param progress_bar
321-
:param max_concurrent
322-
:return: question and answer
323-
"""
324-
325-
semaphore = asyncio.Semaphore(max_concurrent)
326-
327-
def _parse_qa(qa: str) -> tuple:
328-
if "Question:" in qa and "Answer:" in qa:
329-
question = qa.split("Question:")[1].split("Answer:")[0].strip()
330-
answer = qa.split("Answer:")[1].strip()
331-
elif "问题:" in qa and "答案:" in qa:
332-
question = qa.split("问题:")[1].split("答案:")[0].strip()
333-
answer = qa.split("答案:")[1].strip()
334-
else:
335-
return None, None
336-
return question.strip('"'), answer.strip('"')
337-
338-
async def _generate_question(node_or_edge: tuple):
339-
if len(node_or_edge) == 2:
340-
des = node_or_edge[0] + ": " + node_or_edge[1]["description"]
341-
loss = node_or_edge[1]["loss"] if "loss" in node_or_edge[1] else -1.0
342-
else:
343-
des = node_or_edge[2]["description"]
344-
loss = node_or_edge[2]["loss"] if "loss" in node_or_edge[2] else -1.0
345-
346-
async with semaphore:
347-
try:
348-
language = "Chinese" if detect_main_language(des) == "zh" else "English"
349-
350-
qa = await llm_client.generate_answer(
351-
QUESTION_GENERATION_PROMPT[language]["SINGLE_QA_TEMPLATE"].format(
352-
doc=des
353-
)
354-
)
355-
356-
question, answer = _parse_qa(qa)
357-
if question is None or answer is None:
358-
return {}
359-
360-
question = question.strip('"')
361-
answer = answer.strip('"')
362-
363-
logger.info("Question: %s", question)
364-
logger.info("Answer: %s", answer)
365-
return {
366-
compute_content_hash(question): {
367-
"question": question,
368-
"answer": answer,
369-
"loss": loss,
370-
}
371-
}
372-
except Exception as e: # pylint: disable=broad-except
373-
logger.error("Error occurred while generating question: %s", e)
374-
return {}
375-
376-
results = {}
377-
edges = list(await graph_storage.get_all_edges())
378-
nodes = list(await graph_storage.get_all_nodes())
379-
380-
edges, nodes = await _pre_tokenize(graph_storage, tokenizer, edges, nodes)
381-
382-
tasks = []
383-
for node in nodes:
384-
if "<SEP>" in node[1]["description"]:
385-
description_list = node[1]["description"].split("<SEP>")
386-
for item in description_list:
387-
tasks.append((node[0], {"description": item}))
388-
if "loss" in node[1]:
389-
tasks[-1][1]["loss"] = node[1]["loss"]
390-
else:
391-
tasks.append((node[0], node[1]))
392-
for edge in edges:
393-
if "<SEP>" in edge[2]["description"]:
394-
description_list = edge[2]["description"].split("<SEP>")
395-
for item in description_list:
396-
tasks.append((edge[0], edge[1], {"description": item}))
397-
if "loss" in edge[2]:
398-
tasks[-1][2]["loss"] = edge[2]["loss"]
399-
else:
400-
tasks.append((edge[0], edge[1], edge[2]))
401-
402-
results_list = await run_concurrent(
403-
_generate_question,
404-
tasks,
405-
progress_bar=progress_bar,
406-
desc="[4/4]Generating QAs",
407-
)
408-
for res in results_list:
409-
results.update(res)
410-
return results
411-
412-
413302
async def traverse_graph_for_multi_hop(
414303
llm_client: OpenAIClient,
415304
tokenizer: Tokenizer,

graphgen/templates/__init__.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
11
from .answer_rephrasing import ANSWER_REPHRASING_PROMPT
2-
from .atomic_generation import ATOMIC_GENERATION_PROMPT
3-
from .community import COT_GENERATION_PROMPT, COT_TEMPLATE_DESIGN_PROMPT
42
from .coreference_resolution import COREFERENCE_RESOLUTION_PROMPT
53
from .description_rephrasing import DESCRIPTION_REPHRASING_PROMPT
4+
from .generation import (
5+
ATOMIC_GENERATION_PROMPT,
6+
COT_GENERATION_PROMPT,
7+
COT_TEMPLATE_DESIGN_PROMPT,
8+
)
69
from .kg_extraction import KG_EXTRACTION_PROMPT
710
from .kg_summarization import KG_SUMMARIZATION_PROMPT
811
from .multi_hop_generation import MULTI_HOP_GENERATION_PROMPT
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
1+
from .atomic_generation import ATOMIC_GENERATION_PROMPT
12
from .cot_generation import COT_GENERATION_PROMPT
23
from .cot_template_design import COT_TEMPLATE_DESIGN_PROMPT
File renamed without changes.
File renamed without changes.
File renamed without changes.

0 commit comments

Comments
 (0)