Skip to content

Commit 2a87d57

Browse files
feat: add fill-in-blank qa generation
1 parent d885096 commit 2a87d57

11 files changed

Lines changed: 273 additions & 0 deletions

File tree

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# Generate Fill-in-blank QAs
2+
3+
Fill-in-blank question answering (QA) involves creating questions where a key piece of information is omitted, requiring the respondent to fill in the missing word or phrase. This format is commonly used in educational assessments to test knowledge and comprehension.
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
# Pipeline configuration for fill-in-the-blank QA generation.
# Stage order: read -> chunk -> build_kg -> quiz -> judge -> partition -> generate.
global_params:
  working_dir: cache
  graph_backend: kuzu # graph database backend, support: kuzu, networkx
  kv_backend: rocksdb # key-value store backend, support: rocksdb, json_kv

nodes:
  - id: read_files # id is unique in the pipeline, and can be referenced by other steps
    op_name: read
    type: source
    dependencies: []
    params:
      input_path:
        - examples/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See examples/input_examples for examples

  - id: chunk_documents
    op_name: chunk
    type: map_batch
    dependencies:
      - read_files
    execution_params:
      replicas: 4
    params:
      chunk_size: 1024 # chunk size for text splitting
      chunk_overlap: 100 # chunk overlap for text splitting

  - id: build_kg
    op_name: build_kg
    type: map_batch
    dependencies:
      - chunk_documents
    execution_params:
      replicas: 1
      batch_size: 128

  - id: quiz
    op_name: quiz
    type: map_batch
    dependencies:
      - build_kg
    execution_params:
      replicas: 1
      batch_size: 128
    params:
      quiz_samples: 2 # number of quiz samples to generate

  - id: judge
    op_name: judge
    type: map_batch
    dependencies:
      - quiz
    execution_params:
      replicas: 1
      batch_size: 128

  - id: partition
    op_name: partition
    type: aggregate
    dependencies:
      - judge
    params:
      method: ece # ece is a custom partition method based on comprehension loss
      method_params:
        max_units_per_community: 20 # max nodes and edges per community
        min_units_per_community: 5 # min nodes and edges per community
        max_tokens_per_community: 10240 # max tokens per community
        unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss

  - id: generate
    op_name: generate
    type: map_batch
    dependencies:
      - partition
    execution_params:
      replicas: 1
      batch_size: 128
      save_output: true # save output
    params:
      method: fill_in_blank
      num_of_questions: 5
      data_format: Alpaca # Alpaca, Sharegpt, ChatML
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
python3 -m graphgen.run \
2+
--config_file examples/generate/generate_fill_in_blank_qa/fill_in_blank_config.yaml

graphgen/models/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
AggregatedGenerator,
1212
AtomicGenerator,
1313
CoTGenerator,
14+
FillInBlankGenerator,
1415
MultiAnswerGenerator,
1516
MultiChoiceGenerator,
1617
MultiHopGenerator,

graphgen/models/generator/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from .aggregated_generator import AggregatedGenerator
22
from .atomic_generator import AtomicGenerator
33
from .cot_generator import CoTGenerator
4+
from .fill_in_blank_generator import FillInBlankGenerator
45
from .multi_answer_generator import MultiAnswerGenerator
56
from .multi_choice_generator import MultiChoiceGenerator
67
from .multi_hop_generator import MultiHopGenerator
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
import re
2+
from typing import Any
3+
4+
from graphgen.bases import BaseGenerator
5+
from graphgen.templates import FILL_IN_BLANK_GENERATION_PROMPT
6+
from graphgen.utils import compute_content_hash, detect_main_language, logger
7+
8+
9+
class FillInBlankGenerator(BaseGenerator):
    """Generate fill-in-the-blank QA pairs from a knowledge-graph community.

    The LLM is prompted (via ``FILL_IN_BLANK_GENERATION_PROMPT``) to emit
    XML-style ``<qa_pair>`` blocks, which ``parse_response`` turns into a
    dict keyed by the question's content hash.
    """

    def __init__(self, llm_client, num_of_questions) -> None:
        super().__init__(llm_client)
        # Number of questions requested per community/context.
        self.num_of_questions = num_of_questions

    @staticmethod
    def parse_response(response: str) -> Any:
        """
        Parse fill-in-the-blank QA pairs from the LLM response.
        Each QA pair contains question text with placeholders and the correct answer(s).

        :param response: The LLM response containing XML-formatted QA pairs
        :return: Dictionary mapping question hash to question data, where each
                 value is a dict with "question", "answer", and "answers" keys
        """
        qa_pairs = {}

        # Extract all QA pair blocks
        qa_blocks = re.findall(r"<qa_pair>(.*?)</qa_pair>", response, re.DOTALL)

        if not qa_blocks:
            logger.warning("No QA pairs found in response: %s", response)
            return {}

        for block in qa_blocks:
            # Extract and clean question text
            q_match = re.search(r"<question>(.*?)</question>", block, re.DOTALL)
            if not q_match:
                logger.warning("Failed to parse question from block: %s", block)
                continue
            question = q_match.group(1).strip().strip('"').strip("'")

            # Extract and clean answer text
            ans_match = re.search(r"<answer>(.*?)</answer>", block, re.DOTALL)
            if not ans_match:
                logger.warning("Failed to parse answer from block: %s", block)
                continue

            answer_text = ans_match.group(1).strip().strip('"').strip("'")

            # Parse multiple answers (e.g., "A8X, 八百万" or "A8X").
            # Split on BOTH the ASCII comma "," and the fullwidth comma ",":
            # the Chinese prompt template's own example answers separate
            # blanks with ",", which an ASCII-only split would miss.
            answers = [
                ans.strip() for ans in re.split(r"[,,]", answer_text) if ans.strip()
            ]

            # Ensure at least one valid answer
            if not answers:
                logger.warning("No valid answers found in: %s", answer_text)
                continue

            # Build result entry with question hash as key
            question_hash = compute_content_hash(question)
            qa_pairs[question_hash] = {
                "question": question,
                "answer": answer_text,  # Original answer text with commas
                "answers": answers,  # List of individual answers: ["A8X"] or ["A8X", "八百万"]
            }

            logger.debug(
                "Successfully parsed fill-in-the-blank question: %s", question[:50]
            )

        if not qa_pairs:
            logger.error("Failed to parse any valid QA pairs from response")

        return qa_pairs

    # pylint: disable=W0221
    def build_prompt(
        self, batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]]
    ) -> str:
        """Render the generation prompt for one (nodes, edges) community batch.

        :param batch: tuple of (nodes, edges); each node is (name, data) and
                      each edge is (src, dst, data), where data carries a
                      "description" field.
        :return: the formatted prompt string in the context's main language.
        """
        nodes, edges = batch
        entities_str = "\n".join(
            [
                f"{index + 1}. {node[0]}: {node[1]['description']}"
                for index, node in enumerate(nodes)
            ]
        )

        relationships_str = "\n".join(
            [
                f"{index + 1}. {edge[0]} -- {edge[1]}: {edge[2]['description']}"
                for index, edge in enumerate(edges)
            ]
        )
        context = entities_str + "\n" + relationships_str
        # Pick the zh/en template based on the dominant language of the context.
        language = detect_main_language(entities_str + relationships_str)
        prompt = FILL_IN_BLANK_GENERATION_PROMPT[language].format(
            context=context,
            num_of_questions=self.num_of_questions,
        )
        return prompt

graphgen/operators/generate/generate_service.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,13 @@ def __init__(
5757
self.llm_client,
5858
num_of_questions=generate_kwargs.get("num_of_questions", 3),
5959
)
60+
elif self.method == "fill_in_blank":
61+
from graphgen.models import FillInBlankGenerator
62+
63+
self.generator = FillInBlankGenerator(
64+
self.llm_client,
65+
num_of_questions=generate_kwargs.get("num_of_questions", 5),
66+
)
6067
else:
6168
raise ValueError(f"Unsupported generation mode: {method}")
6269

graphgen/templates/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
AGGREGATED_GENERATION_PROMPT,
77
ATOMIC_GENERATION_PROMPT,
88
COT_GENERATION_PROMPT,
9+
FILL_IN_BLANK_GENERATION_PROMPT,
910
MAQ_GENERATION_PROMPT,
1011
MCQ_GENERATION_PROMPT,
1112
MULTI_HOP_GENERATION_PROMPT,

graphgen/templates/generation/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from .aggregated_generation import AGGREGATED_GENERATION_PROMPT
22
from .atomic_generation import ATOMIC_GENERATION_PROMPT
33
from .cot_generation import COT_GENERATION_PROMPT
4+
from .fill_in_blank_generation import FILL_IN_BLANK_GENERATION_PROMPT
45
from .multi_answer_generation import MAQ_GENERATION_PROMPT
56
from .multi_choice_generation import MCQ_GENERATION_PROMPT
67
from .multi_hop_generation import MULTI_HOP_GENERATION_PROMPT
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
# Prompt templates for fill-in-the-blank QA generation, keyed by language.
# NOTE: the context slot was previously written as "{{context}}", which
# str.format renders as the literal text "{context}" — so the graph context
# was never injected into the prompt. It must be a single-brace "{context}".

TEMPLATE_ZH = """请根据上下文资料生成独立的知识问答填空题。填空题的答案必须能在原文中直接找到。

生成要求:
1. **语言一致性**:若上下文资料为中文,则生成中文问题;若为英文,则生成英文问题
2. **数量**:每个上下文资料生成{num_of_questions}个填空题
3. **独立性**:每个问题必须完整独立,不依赖其他问题
4. **准确性**:正确答案必须能从原文直接得出
5. **占位符格式**:使用________(四个下划线)作为填空占位符

输出格式:
<qa_pairs>
<qa_pair>
<question>问题文本(使用________作为占位符)</question>
<answer>正确答案文本(多个空用逗号分隔)</answer>
</qa_pair>
</qa_pairs>

示例(根据iPad Air 2生成2题):
<qa_pairs>
<qa_pair>
<question>iPad Air 2 是由________制造的?</question>
<answer>美国苹果公司(Apple)</answer>
</qa_pair>
<qa_pair>
<question>iPad Air 2 的发布日期是________,上市日期是________。</question>
<answer>2014年10月16日,2014年10月22日</answer>
</qa_pair>
</qa_pairs>


上下文资料:
{context}

请为以下资料生成{num_of_questions}个填空题:
"""


TEMPLATE_EN = """Generate independent fill-in-the-blank questions based on the provided context. \
Answers must be directly derivable from the text.

Requirements:
1. **Language Consistency**: Generate in the same language as the context (Chinese/English)
2. **Quantity**: Generate {num_of_questions} questions per context
3. **Independence**: Each question must be self-contained
4. **Accuracy**: Correct answer must be directly found in the source text
5. **Placeholder Format**: Use ________ (four underscores) as the blank placeholder

Output Format:
<qa_pairs>
<qa_pair>
<question>Question text (use ________ as placeholder)</question>
<answer>Correct answer text (separate multiple blanks with commas)</answer>
</qa_pair>
</qa_pairs>

Example (2 questions):
<qa_pairs>
<qa_pair>
<question>The iPad Air 2 was manufactured by ________?</question>
<answer>Apple Inc.</answer>
</qa_pair>
<qa_pair>
<question>The iPad Air 2 was released on ________ and launched on ________.</question>
<answer>October 16, 2014, October 22, 2014</answer>
</qa_pair>
</qa_pairs>

Context:
{context}

Please generate {num_of_questions} fill-in-the-blank questions for the following context:
"""


FILL_IN_BLANK_GENERATION_PROMPT = {
    "zh": TEMPLATE_ZH,
    "en": TEMPLATE_EN,
}

0 commit comments

Comments
 (0)