Skip to content

Commit 2a87d57

Browse files
feat: add fill-in-blank qa generation
1 parent d885096 commit 2a87d57

11 files changed

Lines changed: 273 additions & 0 deletions

File tree

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# Generate Fill-in-blank QAs
2+
3+
Fill-in-blank question answering (QA) involves creating questions where a key piece of information is omitted, requiring the respondent to fill in the missing word or phrase. This format is commonly used in educational assessments to test knowledge and comprehension.
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
# Pipeline configuration for fill-in-the-blank QA generation.
# Stage order: read -> chunk -> build_kg -> quiz -> judge -> partition -> generate.
global_params:
  working_dir: cache
  graph_backend: kuzu # graph database backend, support: kuzu, networkx
  kv_backend: rocksdb # key-value store backend, support: rocksdb, json_kv

nodes:
  - id: read_files # id is unique in the pipeline, and can be referenced by other steps
    op_name: read
    type: source
    dependencies: []
    params:
      input_path:
        - examples/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See examples/input_examples for examples

  - id: chunk_documents
    op_name: chunk
    type: map_batch
    dependencies:
      - read_files
    execution_params:
      replicas: 4
    params:
      chunk_size: 1024 # chunk size for text splitting
      chunk_overlap: 100 # chunk overlap for text splitting

  - id: build_kg
    op_name: build_kg
    type: map_batch
    dependencies:
      - chunk_documents
    execution_params:
      replicas: 1
      batch_size: 128

  - id: quiz
    op_name: quiz
    type: map_batch
    dependencies:
      - build_kg
    execution_params:
      replicas: 1
      batch_size: 128
    params:
      quiz_samples: 2 # number of quiz samples to generate

  - id: judge
    op_name: judge
    type: map_batch
    dependencies:
      - quiz
    execution_params:
      replicas: 1
      batch_size: 128

  - id: partition
    op_name: partition
    type: aggregate
    dependencies:
      - judge
    params:
      method: ece # ece is a custom partition method based on comprehension loss
      method_params:
        max_units_per_community: 20 # max nodes and edges per community
        min_units_per_community: 5 # min nodes and edges per community
        max_tokens_per_community: 10240 # max tokens per community
        unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss

  - id: generate
    op_name: generate
    type: map_batch
    dependencies:
      - partition
    execution_params:
      replicas: 1
      batch_size: 128
      save_output: true # save output
    params:
      method: fill_in_blank
      num_of_questions: 5
      data_format: Alpaca # Alpaca, Sharegpt, ChatML
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
python3 -m graphgen.run \
2+
--config_file examples/generate/generate_fill_in_blank_qa/fill_in_blank_config.yaml

graphgen/models/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
AggregatedGenerator,
1212
AtomicGenerator,
1313
CoTGenerator,
14+
FillInBlankGenerator,
1415
MultiAnswerGenerator,
1516
MultiChoiceGenerator,
1617
MultiHopGenerator,

graphgen/models/generator/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from .aggregated_generator import AggregatedGenerator
22
from .atomic_generator import AtomicGenerator
33
from .cot_generator import CoTGenerator
4+
from .fill_in_blank_generator import FillInBlankGenerator
45
from .multi_answer_generator import MultiAnswerGenerator
56
from .multi_choice_generator import MultiChoiceGenerator
67
from .multi_hop_generator import MultiHopGenerator
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
import re
2+
from typing import Any
3+
4+
from graphgen.bases import BaseGenerator
5+
from graphgen.templates import FILL_IN_BLANK_GENERATION_PROMPT
6+
from graphgen.utils import compute_content_hash, detect_main_language, logger
7+
8+
9+
class FillInBlankGenerator(BaseGenerator):
    """Generate fill-in-the-blank QA pairs from a knowledge-graph community.

    The LLM is prompted (via ``FILL_IN_BLANK_GENERATION_PROMPT``) to emit
    XML-style ``<qa_pair>`` blocks, which ``parse_response`` turns into a
    dict keyed by the question's content hash.
    """

    def __init__(self, llm_client, num_of_questions) -> None:
        super().__init__(llm_client)
        # Number of questions requested per community/context.
        self.num_of_questions = num_of_questions

    @staticmethod
    def parse_response(response: str) -> Any:
        """
        Parse fill-in-the-blank QA pairs from the LLM response.
        Each QA pair contains question text with placeholders and the correct answer(s).

        :param response: The LLM response containing XML-formatted QA pairs
        :return: Dictionary mapping question hash to question data, where each
                 value is a dict with "question", "answer", and "answers" keys
        """
        qa_pairs = {}

        # Extract all QA pair blocks
        qa_blocks = re.findall(r"<qa_pair>(.*?)</qa_pair>", response, re.DOTALL)

        if not qa_blocks:
            logger.warning("No QA pairs found in response: %s", response)
            return {}

        for block in qa_blocks:
            # Extract and clean question text
            q_match = re.search(r"<question>(.*?)</question>", block, re.DOTALL)
            if not q_match:
                logger.warning("Failed to parse question from block: %s", block)
                continue
            question = q_match.group(1).strip().strip('"').strip("'")

            # Extract and clean answer text
            ans_match = re.search(r"<answer>(.*?)</answer>", block, re.DOTALL)
            if not ans_match:
                logger.warning("Failed to parse answer from block: %s", block)
                continue

            answer_text = ans_match.group(1).strip().strip('"').strip("'")

            # Parse multiple answers (e.g., "A8X, 八百万" or "A8X").
            # Split on BOTH the ASCII comma "," and the fullwidth comma ",":
            # the Chinese prompt template's own example answers separate
            # blanks with ",", which an ASCII-only split would miss.
            answers = [
                ans.strip() for ans in re.split(r"[,,]", answer_text) if ans.strip()
            ]

            # Ensure at least one valid answer
            if not answers:
                logger.warning("No valid answers found in: %s", answer_text)
                continue

            # Build result entry with question hash as key
            question_hash = compute_content_hash(question)
            qa_pairs[question_hash] = {
                "question": question,
                "answer": answer_text,  # Original answer text with commas
                "answers": answers,  # List of individual answers: ["A8X"] or ["A8X", "八百万"]
            }

            logger.debug(
                "Successfully parsed fill-in-the-blank question: %s", question[:50]
            )

        if not qa_pairs:
            logger.error("Failed to parse any valid QA pairs from response")

        return qa_pairs

    # pylint: disable=W0221
    def build_prompt(
        self, batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]]
    ) -> str:
        """Render the generation prompt for one (nodes, edges) community batch.

        :param batch: tuple of (nodes, edges); each node is (name, data) and
                      each edge is (src, dst, data), where data carries a
                      "description" field.
        :return: the formatted prompt string in the context's main language.
        """
        nodes, edges = batch
        entities_str = "\n".join(
            [
                f"{index + 1}. {node[0]}: {node[1]['description']}"
                for index, node in enumerate(nodes)
            ]
        )

        relationships_str = "\n".join(
            [
                f"{index + 1}. {edge[0]} -- {edge[1]}: {edge[2]['description']}"
                for index, edge in enumerate(edges)
            ]
        )
        context = entities_str + "\n" + relationships_str
        # Pick the zh/en template based on the dominant language of the context.
        language = detect_main_language(entities_str + relationships_str)
        prompt = FILL_IN_BLANK_GENERATION_PROMPT[language].format(
            context=context,
            num_of_questions=self.num_of_questions,
        )
        return prompt

graphgen/operators/generate/generate_service.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,13 @@ def __init__(
5757
self.llm_client,
5858
num_of_questions=generate_kwargs.get("num_of_questions", 3),
5959
)
60+
elif self.method == "fill_in_blank":
61+
from graphgen.models import FillInBlankGenerator
62+
63+
self.generator = FillInBlankGenerator(
64+
self.llm_client,
65+
num_of_questions=generate_kwargs.get("num_of_questions", 5),
66+
)
6067
else:
6168
raise ValueError(f"Unsupported generation mode: {method}")
6269

graphgen/templates/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
AGGREGATED_GENERATION_PROMPT,
77
ATOMIC_GENERATION_PROMPT,
88
COT_GENERATION_PROMPT,
9+
FILL_IN_BLANK_GENERATION_PROMPT,
910
MAQ_GENERATION_PROMPT,
1011
MCQ_GENERATION_PROMPT,
1112
MULTI_HOP_GENERATION_PROMPT,

graphgen/templates/generation/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from .aggregated_generation import AGGREGATED_GENERATION_PROMPT
22
from .atomic_generation import ATOMIC_GENERATION_PROMPT
33
from .cot_generation import COT_GENERATION_PROMPT
4+
from .fill_in_blank_generation import FILL_IN_BLANK_GENERATION_PROMPT
45
from .multi_answer_generation import MAQ_GENERATION_PROMPT
56
from .multi_choice_generation import MCQ_GENERATION_PROMPT
67
from .multi_hop_generation import MULTI_HOP_GENERATION_PROMPT
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
# Prompt templates for fill-in-the-blank QA generation, keyed by language.
# NOTE: the context slot was previously written as "{{context}}", which
# str.format renders as the literal text "{context}" — so the graph context
# was never injected into the prompt. It must be a single-brace "{context}".

TEMPLATE_ZH = """请根据上下文资料生成独立的知识问答填空题。填空题的答案必须能在原文中直接找到。

生成要求:
1. **语言一致性**:若上下文资料为中文,则生成中文问题;若为英文,则生成英文问题
2. **数量**:每个上下文资料生成{num_of_questions}个填空题
3. **独立性**:每个问题必须完整独立,不依赖其他问题
4. **准确性**:正确答案必须能从原文直接得出
5. **占位符格式**:使用________(四个下划线)作为填空占位符

输出格式:
<qa_pairs>
<qa_pair>
<question>问题文本(使用________作为占位符)</question>
<answer>正确答案文本(多个空用逗号分隔)</answer>
</qa_pair>
</qa_pairs>

示例(根据iPad Air 2生成2题):
<qa_pairs>
<qa_pair>
<question>iPad Air 2 是由________制造的?</question>
<answer>美国苹果公司(Apple)</answer>
</qa_pair>
<qa_pair>
<question>iPad Air 2 的发布日期是________,上市日期是________。</question>
<answer>2014年10月16日,2014年10月22日</answer>
</qa_pair>
</qa_pairs>


上下文资料:
{context}

请为以下资料生成{num_of_questions}个填空题:
"""


TEMPLATE_EN = """Generate independent fill-in-the-blank questions based on the provided context. \
Answers must be directly derivable from the text.

Requirements:
1. **Language Consistency**: Generate in the same language as the context (Chinese/English)
2. **Quantity**: Generate {num_of_questions} questions per context
3. **Independence**: Each question must be self-contained
4. **Accuracy**: Correct answer must be directly found in the source text
5. **Placeholder Format**: Use ________ (four underscores) as the blank placeholder

Output Format:
<qa_pairs>
<qa_pair>
<question>Question text (use ________ as placeholder)</question>
<answer>Correct answer text (separate multiple blanks with commas)</answer>
</qa_pair>
</qa_pairs>

Example (2 questions):
<qa_pairs>
<qa_pair>
<question>The iPad Air 2 was manufactured by ________?</question>
<answer>Apple Inc.</answer>
</qa_pair>
<qa_pair>
<question>The iPad Air 2 was released on ________ and launched on ________.</question>
<answer>October 16, 2014, October 22, 2014</answer>
</qa_pair>
</qa_pairs>

Context:
{context}

Please generate {num_of_questions} fill-in-the-blank questions for the following context:
"""


FILL_IN_BLANK_GENERATION_PROMPT = {
    "zh": TEMPLATE_ZH,
    "en": TEMPLATE_EN,
}

0 commit comments

Comments
 (0)