Skip to content

Commit 55667d7

Browse files
feat: add AggregatedGenerator
1 parent f072c2e commit 55667d7

9 files changed

Lines changed: 331 additions & 455 deletions

File tree

graphgen/bases/base_generator.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,16 @@ class BaseGenerator(ABC):
1313

1414
llm_client: BaseLLMClient
1515

16+
@staticmethod
1617
@abstractmethod
1718
def build_prompt(
18-
self, batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]]
19+
batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]]
1920
) -> str:
2021
"""Build prompt for LLM based on the given batch"""
2122

23+
@staticmethod
2224
@abstractmethod
23-
def parse_response(self, response: str) -> Any:
25+
def parse_response(response: str) -> Any:
2426
"""Parse the LLM response and return the generated QAs"""
2527

2628
async def generate(

graphgen/configs/aggregated_config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
1313
partition: # graph partition configuration
1414
method: ece # ece is a custom partition method based on comprehension loss
1515
method_params:
16-
max_units_per_community: 10 # max nodes and edges per community
16+
max_units_per_community: 20 # max nodes and edges per community
1717
max_tokens_per_community: 10240 # max tokens per community
1818
unit_sampling: max_loss # edge sampling strategy, support: random, max_loss, min_loss
1919
generate:
Lines changed: 122 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,127 @@
1+
from dataclasses import dataclass
2+
from typing import Any
3+
14
from graphgen.bases import BaseGenerator
5+
from graphgen.templates import AGGREGATED_GENERATION_PROMPT
6+
from graphgen.utils import compute_content_hash, detect_main_language, logger
27

38

@dataclass
class AggregatedGenerator(BaseGenerator):
    """
    Two-step aggregated QA generator.

    Step 1 (rephrase): rewrite the batch's nodes and edges into one coherent
    passage that keeps the original meaning; that passage becomes the answer.
    Step 2 (question generation): derive a question for the rephrased passage.
    """

    @staticmethod
    def build_prompt(
        batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]]
    ) -> str:
        """
        Build the REPHRASE prompt for a batch.

        :param batch: pair of (nodes, edges) taken from the graph; each node is
            (name, data) and each edge is (src, dst, data), where data carries
            a "description" entry
        :return: prompt string for the answer-rephrasing step
        """
        nodes, edges = batch

        entity_lines = []
        for idx, (name, data) in enumerate(nodes, start=1):
            entity_lines.append(f"{idx}. {name}: {data['description']}")
        entities_str = "\n".join(entity_lines)

        relation_lines = []
        for idx, (src, dst, data) in enumerate(edges, start=1):
            relation_lines.append(f"{idx}. {src} -- {dst}: {data['description']}")
        relations_str = "\n".join(relation_lines)

        # Prompt language follows the dominant language of the batch content.
        language = detect_main_language(entities_str + relations_str)

        # TODO: configure add_context — optionally prepend the original source
        # chunks (looked up via each unit's first source_id) to the prompt.
        return AGGREGATED_GENERATION_PROMPT[language]["ANSWER_REPHRASING"].format(
            language=language, entities=entities_str, relationships=relations_str
        )

    @staticmethod
    def parse_rephrased_text(response: str) -> str:
        """
        Extract the rephrased passage from an LLM response.

        :param response: raw LLM output
        :return: rephrased text, stripped of surrounding double quotes
        """
        for marker in ("Rephrased Text:", "重述文本:"):
            if marker in response:
                return response.split(marker)[1].strip().strip('"')
        # No marker present: treat the whole response as the passage.
        return response.strip().strip('"')

    @staticmethod
    def _build_prompt_for_question_generation(answer: str) -> str:
        """
        Build the QUESTION GENERATION prompt for a rephrased answer.

        :param answer: rephrased passage produced by step 1
        :return: prompt string for the question-generation step
        """
        lang = detect_main_language(answer)
        template = AGGREGATED_GENERATION_PROMPT[lang]["QUESTION_GENERATION"]
        return template.format(answer=answer)

    @staticmethod
    def parse_response(response: str) -> dict:
        """
        Extract the generated question from an LLM response.

        :param response: raw LLM output
        :return: dict with a single "question" entry
        """
        question = response.strip()
        for prefix in ("Question:", "问题:"):
            if response.startswith(prefix):
                question = response[len(prefix):].strip()
                break
        return {
            "question": question,
        }

    async def generate(
        self,
        batch: tuple[
            list[tuple[str, dict]], list[tuple[Any, Any, dict] | tuple[Any, Any, Any]]
        ],
    ) -> dict[str, Any]:
        """
        Run the two-step pipeline for one batch.

        :param batch: pair of (nodes, edges) from the graph
        :return: QA pairs keyed by the question's content hash, each value a
            dict with "question" and "answer" entries
        """
        # Step 1: rephrase the batch into a coherent passage (the answer).
        rephrase_response = await self.llm_client.generate_answer(
            self.build_prompt(batch)
        )
        answer = self.parse_rephrased_text(rephrase_response)

        # Step 2: generate a question for the rephrased passage.
        question_response = await self.llm_client.generate_answer(
            self._build_prompt_for_question_generation(answer)
        )
        question = self.parse_response(question_response)["question"]

        logger.info("Question: %s", question)
        logger.info("Answer: %s", answer)

        return {
            compute_content_hash(question): {
                "question": question,
                "answer": answer,
            }
        }

graphgen/models/generator/atomic_generator.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,16 @@
1+
from dataclasses import dataclass
12
from typing import Any
23

3-
from graphgen.utils import compute_content_hash
44
from graphgen.bases import BaseGenerator
55
from graphgen.templates import ATOMIC_GENERATION_PROMPT
6-
from graphgen.utils import detect_main_language, logger
6+
from graphgen.utils import compute_content_hash, detect_main_language, logger
77

88

9+
@dataclass
910
class AtomicGenerator(BaseGenerator):
11+
@staticmethod
1012
def build_prompt(
11-
self, batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]]
13+
batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]]
1214
) -> str:
1315
nodes, edges = batch
1416
context = ""
@@ -21,7 +23,8 @@ def build_prompt(
2123
prompt = ATOMIC_GENERATION_PROMPT[language].format(context=context)
2224
return prompt
2325

24-
def parse_response(self, response: str) -> dict:
26+
@staticmethod
27+
def parse_response(response: str) -> dict:
2528
"""
2629
AtomicGenerator normally generates one QA pair per response.
2730
So we just need to parse one QA pair from the response.

0 commit comments

Comments
 (0)