|
| 1 | +from dataclasses import dataclass |
| 2 | +from typing import Any |
| 3 | + |
1 | 4 | from graphgen.bases import BaseGenerator |
| 5 | +from graphgen.templates import AGGREGATED_GENERATION_PROMPT |
| 6 | +from graphgen.utils import compute_content_hash, detect_main_language, logger |
2 | 7 |
|
3 | 8 |
|
@dataclass
class AggregatedGenerator(BaseGenerator):
    """
    Generate QA pairs from a (nodes, edges) batch via a TWO-STEP process:

    1. rephrase: rewrite the input nodes and edges into a coherent text that
       preserves the original meaning; the rephrased text becomes the answer.
    2. question generation: generate a relevant question for that answer.
    """

    @staticmethod
    def build_prompt(
        batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]]
    ) -> str:
        """
        Build the ANSWER_REPHRASING prompt for a batch.

        :param batch: ``(nodes, edges)`` where each node is ``(name, attrs)``
            and each edge is ``(src, dst, attrs)``; ``attrs`` must contain a
            ``"description"`` key.
        :return: the formatted rephrasing prompt.
        """
        nodes, edges = batch
        entities_str = "\n".join(
            f"{index + 1}. {name}: {attrs['description']}"
            for index, (name, attrs) in enumerate(nodes)
        )
        relations_str = "\n".join(
            f"{index + 1}. {src} -- {dst}: {attrs['description']}"
            for index, (src, dst, attrs) in enumerate(edges)
        )
        # Pick the prompt template matching the dominant language of the batch.
        language = detect_main_language(entities_str + relations_str)

        # TODO(review): support an `add_context` option that prepends the
        # original source chunks (looked up in text_chunks_storage via the
        # first "<SEP>"-separated source_id of each node/edge).
        return AGGREGATED_GENERATION_PROMPT[language]["ANSWER_REPHRASING"].format(
            language=language, entities=entities_str, relationships=relations_str
        )

    @staticmethod
    def parse_rephrased_text(response: str) -> str:
        """
        Extract the rephrased text from an LLM response.

        Strips a leading "Rephrased Text:" / "重述文本:" marker when present;
        otherwise the whole response is used. Surrounding double quotes are
        removed from the result.

        :param response: raw LLM output.
        :return: the rephrased text.
        """
        for marker in ("Rephrased Text:", "重述文本:"):
            if marker in response:
                # maxsplit=1 keeps everything after the FIRST marker intact
                # even if the marker string recurs later in the body
                # (a plain split()[1] would truncate at the second occurrence).
                return response.split(marker, 1)[1].strip().strip('"')
        return response.strip().strip('"')

    @staticmethod
    def _build_prompt_for_question_generation(answer: str) -> str:
        """
        Build the QUESTION_GENERATION prompt for a rephrased answer.

        :param answer: the rephrased text produced by step 1.
        :return: the formatted question-generation prompt.
        """
        language = detect_main_language(answer)
        return AGGREGATED_GENERATION_PROMPT[language]["QUESTION_GENERATION"].format(
            answer=answer
        )

    @staticmethod
    def parse_response(response: str) -> dict:
        """
        Extract the generated question from an LLM response.

        :param response: raw LLM output, optionally prefixed with
            "Question:" / "问题:".
        :return: ``{"question": <question text>}``.
        """
        question = response.strip()
        for prefix in ("Question:", "问题:"):
            if response.startswith(prefix):
                question = response[len(prefix) :].strip()
                break
        return {"question": question}

    async def generate(
        self,
        batch: tuple[
            list[tuple[str, dict]], list[tuple[Any, Any, dict] | tuple[Any, Any, Any]]
        ],
    ) -> dict[str, Any]:
        """
        Generate a QA pair for one batch.

        Step 1 rephrases the batch into an answer; step 2 generates a
        matching question. The resulting pair is keyed by the question's
        content hash.

        :param batch: ``(nodes, edges)`` subgraph to rephrase.
        :return: ``{hash(question): {"question": ..., "answer": ...}}``.
        """
        rephrasing_prompt = self.build_prompt(batch)
        rephrased = await self.llm_client.generate_answer(rephrasing_prompt)
        answer = self.parse_rephrased_text(rephrased)

        question_prompt = self._build_prompt_for_question_generation(answer)
        question_response = await self.llm_client.generate_answer(question_prompt)
        question = self.parse_response(question_response)["question"]

        logger.info("Question: %s", question)
        logger.info("Answer: %s", answer)
        return {
            compute_content_hash(question): {
                "question": question,
                "answer": answer,
            }
        }
0 commit comments