Skip to content

Commit d849e23

Browse files
feat: add multi choice qa generation
1 parent 02dcafe commit d849e23

15 files changed

Lines changed: 371 additions & 76 deletions
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# Generate Multi-Choice QAs
2+
3+
Multiple-choice question answering (QA) tasks present a question together with several answer options; the goal is to select the correct answer from the given choices.
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
python3 -m graphgen.run \
2+
--config_file examples/generate/generate_multi_choice_qa/multi_choice_config.yaml
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
global_params:
2+
working_dir: cache
3+
graph_backend: kuzu # graph database backend, support: kuzu, networkx
4+
kv_backend: rocksdb # key-value store backend, support: rocksdb, json_kv
5+
6+
nodes:
7+
- id: read_files # id is unique in the pipeline, and can be referenced by other steps
8+
op_name: read
9+
type: source
10+
dependencies: []
11+
params:
12+
input_path:
13+
- examples/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See examples/input_examples for examples
14+
15+
- id: chunk_documents
16+
op_name: chunk
17+
type: map_batch
18+
dependencies:
19+
- read_files
20+
execution_params:
21+
replicas: 4
22+
params:
23+
chunk_size: 1024 # chunk size for text splitting
24+
chunk_overlap: 100 # chunk overlap for text splitting
25+
26+
- id: build_kg
27+
op_name: build_kg
28+
type: map_batch
29+
dependencies:
30+
- chunk_documents
31+
execution_params:
32+
replicas: 1
33+
batch_size: 128
34+
35+
- id: quiz
36+
op_name: quiz
37+
type: map_batch
38+
dependencies:
39+
- build_kg
40+
execution_params:
41+
replicas: 1
42+
batch_size: 128
43+
params:
44+
quiz_samples: 2 # number of quiz samples to generate
45+
46+
- id: judge
47+
op_name: judge
48+
type: map_batch
49+
dependencies:
50+
- quiz
51+
execution_params:
52+
replicas: 1
53+
batch_size: 128
54+
55+
- id: partition
56+
op_name: partition
57+
type: aggregate
58+
dependencies:
59+
- judge
60+
params:
61+
method: ece # ece is a custom partition method based on comprehension loss
62+
method_params:
63+
max_units_per_community: 20 # max nodes and edges per community
64+
min_units_per_community: 5 # min nodes and edges per community
65+
max_tokens_per_community: 10240 # max tokens per community
66+
unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss
67+
68+
- id: generate
69+
op_name: generate
70+
type: map_batch
71+
dependencies:
72+
- partition
73+
execution_params:
74+
replicas: 1
75+
batch_size: 128
76+
save_output: true # save output
77+
params:
78+
method: multi_choice
79+
num_of_questions: 5
80+
data_format: Alpaca # Alpaca, Sharegpt, ChatML

graphgen/bases/base_generator.py

Lines changed: 44 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -46,38 +46,47 @@ async def generate(
4646
def format_generation_results(
    results: list[dict], output_data_format: str
) -> list[dict[str, Any]]:
    """Flatten generated QA pairs into a training-data format.

    :param results: list of dicts mapping question hashes to QA data; each
        QA dict has "question" and "answer" keys and, for multiple-choice
        items, an "options" dict such as {"A": "...", "B": "..."}.
    :param output_data_format: one of "Alpaca", "Sharegpt", "ChatML".
    :return: flat list of records in the requested format.
    :raises ValueError: if output_data_format is not one of the supported
        formats (raised on the first QA pair encountered).
    """
    flat_results: list[dict[str, Any]] = []
    for item in results:
        for _, qa_data in item.items():
            question = qa_data.get("question", "")
            answer = qa_data.get("answer", "")
            # For multiple-choice QAs, append the options (sorted by their
            # letter key) to the question text so the record is self-contained.
            if "options" in qa_data and qa_data["options"]:
                options = qa_data["options"]
                options_str = "\n".join(
                    [f"{key}. {options[key]}" for key in sorted(options.keys())]
                )
                question += f"\nOptions:\n{options_str}"

            if output_data_format == "Alpaca":
                flat_results.append(
                    {
                        "instruction": question,
                        "input": "",
                        "output": answer,
                    }
                )
            elif output_data_format == "Sharegpt":
                flat_results.append(
                    {
                        "conversations": [
                            {"from": "human", "value": question},
                            {"from": "gpt", "value": answer},
                        ]
                    }
                )
            elif output_data_format == "ChatML":
                # BUG FIX: this branch previously appended to `results` (the
                # input list being iterated) instead of `flat_results`, which
                # dropped all ChatML output from the return value and mutated
                # the input during iteration.
                flat_results.append(
                    {
                        "messages": [
                            {"role": "user", "content": question},
                            {"role": "assistant", "content": answer},
                        ]
                    }
                )
            else:
                raise ValueError(
                    f"Unknown output data format: {output_data_format}"
                )
    return flat_results

graphgen/models/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
AggregatedGenerator,
1212
AtomicGenerator,
1313
CoTGenerator,
14+
MultiChoiceGenerator,
1415
MultiHopGenerator,
1516
QuizGenerator,
1617
VQAGenerator,
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from .aggregated_generator import AggregatedGenerator
22
from .atomic_generator import AtomicGenerator
33
from .cot_generator import CoTGenerator
4+
from .multi_choice_generator import MultiChoiceGenerator
45
from .multi_hop_generator import MultiHopGenerator
56
from .quiz_generator import QuizGenerator
67
from .vqa_generator import VQAGenerator
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
import re
from typing import Any

from graphgen.bases import BaseGenerator
from graphgen.templates import MCQ_GENERATION_PROMPT
from graphgen.utils import compute_content_hash, detect_main_language, logger


class MultiChoiceGenerator(BaseGenerator):
    """Generate multiple-choice QA pairs from a knowledge-graph community.

    Builds an LLM prompt out of the community's entity and relationship
    descriptions, then parses the XML-tagged LLM response into
    question / options / answer records keyed by question hash.
    """

    def __init__(self, llm_client, num_of_questions) -> None:
        super().__init__(llm_client)
        # Number of questions the prompt asks the LLM to produce per batch.
        self.num_of_questions = num_of_questions

    @staticmethod
    def parse_response(response: str) -> Any:
        """
        Parse multiple choice QA pairs from the LLM response.
        Each QA pair contains question text, four options, and the correct answer.

        :param response: The LLM response containing XML-formatted QA pairs
        :return: Dictionary mapping question hash to question data, where each
                 value is a dict with "question", "options", "answer", and
                 "correct_answer_text" keys
        """
        qa_pairs = {}

        # Extract all QA pair blocks
        qa_blocks = re.findall(r"<qa_pair>(.*?)</qa_pair>", response, re.DOTALL)

        if not qa_blocks:
            logger.warning("No QA pairs found in response: %s", response)
            return {}

        # Malformed blocks are skipped with a warning rather than aborting the
        # whole parse, so one bad QA pair does not discard the rest.
        for block in qa_blocks:
            # Extract and clean question text (strip stray surrounding quotes)
            q_match = re.search(r"<question>(.*?)</question>", block, re.DOTALL)
            if not q_match:
                logger.warning("Failed to parse question from block: %s", block)
                continue
            question = q_match.group(1).strip().strip('"').strip("'")

            # Extract and parse options (A, B, C, D)
            opt_match = re.search(r"<options>(.*?)</options>", block, re.DOTALL)
            if not opt_match:
                logger.warning("Failed to parse options from block: %s", block)
                continue

            options = {}
            options_text = opt_match.group(1).strip()
            for line in options_text.split("\n"):
                line = line.strip()
                if not line:
                    continue
                # Match patterns like "A. text" or "A text" (letter then
                # a dot or whitespace separator)
                if m := re.match(r"^([A-D])[.\s]\s*(.*)$", line):
                    letter, text = m.groups()
                    options[letter] = text.strip()

            # Validate options count: exactly the four letters A-D are required
            if len(options) != 4:
                logger.warning(
                    "Expected 4 options, found %d: %s", len(options), options_text
                )
                continue

            # Extract and validate answer
            ans_match = re.search(r"<answer>(.*?)</answer>", block, re.DOTALL)
            if not ans_match:
                logger.warning("Failed to parse answer from block: %s", block)
                continue
            answer = ans_match.group(1).strip().strip('"').strip("'")

            # Ensure answer exists in options
            if answer not in options:
                logger.warning(
                    "Answer '%s' not found in options: %s", answer, list(options.keys())
                )
                continue

            # Build result entry with question hash as key; duplicate questions
            # therefore collapse to a single entry.
            question_hash = compute_content_hash(question)
            qa_pairs[question_hash] = {
                "question": question,
                "options": options,  # Dict like {"A": "text", "B": "text", ...}
                "answer": answer,  # Single letter: "A", "B", "C", or "D"
                "correct_answer_text": options[
                    answer
                ],  # The actual text of correct answer
            }

            logger.debug("Successfully parsed MCQ: %s", question[:50])

        if not qa_pairs:
            logger.error("Failed to parse any valid MCQ pairs from response")

        return qa_pairs

    # pylint: disable=W0221
    def build_prompt(
        self, batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]]
    ) -> str:
        """Build the MCQ-generation prompt for one (nodes, edges) community.

        :param batch: tuple of (nodes, edges); each node is (name, data) and
            each edge is (src, dst, data), where data has a "description" key
        :return: the filled prompt template, selected by the main language
            detected from the entity/relationship descriptions
        """
        nodes, edges = batch
        # Numbered entity list: "1. name: description"
        entities_str = "\n".join(
            [
                f"{index + 1}. {node[0]}: {node[1]['description']}"
                for index, node in enumerate(nodes)
            ]
        )

        # Numbered relationship list: "1. src -- dst: description"
        relationships_str = "\n".join(
            [
                f"{index + 1}. {edge[0]} -- {edge[1]}: {edge[2]['description']}"
                for index, edge in enumerate(edges)
            ]
        )
        context = entities_str + "\n" + relationships_str
        # Pick the prompt template matching the KG content's language.
        language = detect_main_language(entities_str + relationships_str)
        prompt = MCQ_GENERATION_PROMPT[language].format(
            context=context,
            num_of_questions=self.num_of_questions,
        )
        return prompt

graphgen/operators/generate/generate_service.py

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,6 @@
22

33
from graphgen.bases import BaseLLMWrapper, BaseOperator
44
from graphgen.common import init_llm
5-
from graphgen.models import (
6-
AggregatedGenerator,
7-
AtomicGenerator,
8-
CoTGenerator,
9-
MultiHopGenerator,
10-
VQAGenerator,
11-
)
125
from graphgen.utils import logger, run_concurrent
136

147

@@ -22,6 +15,7 @@ def __init__(
2215
working_dir: str = "cache",
2316
method: str = "aggregated",
2417
data_format: str = "ChatML",
18+
**generate_kwargs,
2519
):
2620
super().__init__(working_dir=working_dir, op_name="generate_service")
2721
self.llm_client: BaseLLMWrapper = init_llm("synthesizer")
@@ -30,15 +24,32 @@ def __init__(
3024
self.data_format = data_format
3125

3226
if self.method == "atomic":
27+
from graphgen.models import AtomicGenerator
28+
3329
self.generator = AtomicGenerator(self.llm_client)
3430
elif self.method == "aggregated":
31+
from graphgen.models import AggregatedGenerator
32+
3533
self.generator = AggregatedGenerator(self.llm_client)
3634
elif self.method == "multi_hop":
35+
from graphgen.models import MultiHopGenerator
36+
3737
self.generator = MultiHopGenerator(self.llm_client)
3838
elif self.method == "cot":
39+
from graphgen.models import CoTGenerator
40+
3941
self.generator = CoTGenerator(self.llm_client)
40-
elif self.method in ["vqa"]:
42+
elif self.method == "vqa":
43+
from graphgen.models import VQAGenerator
44+
4145
self.generator = VQAGenerator(self.llm_client)
46+
elif self.method == "multi_choice":
47+
from graphgen.models import MultiChoiceGenerator
48+
49+
self.generator = MultiChoiceGenerator(
50+
self.llm_client,
51+
num_of_questions=generate_kwargs.get("num_of_questions", 5),
52+
)
4253
else:
4354
raise ValueError(f"Unsupported generation mode: {method}")
4455

graphgen/templates/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,10 @@
66
AGGREGATED_GENERATION_PROMPT,
77
ATOMIC_GENERATION_PROMPT,
88
COT_GENERATION_PROMPT,
9+
MCQ_GENERATION_PROMPT,
910
MULTI_HOP_GENERATION_PROMPT,
1011
VQA_GENERATION_PROMPT,
1112
)
1213
from .kg import KG_EXTRACTION_PROMPT, KG_SUMMARIZATION_PROMPT, MMKG_EXTRACTION_PROMPT
13-
from .question_generation import QUESTION_GENERATION_PROMPT
1414
from .search_judgement import SEARCH_JUDGEMENT_PROMPT
1515
from .statement_judgement import STATEMENT_JUDGEMENT_PROMPT
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from .aggregated_generation import AGGREGATED_GENERATION_PROMPT
22
from .atomic_generation import ATOMIC_GENERATION_PROMPT
33
from .cot_generation import COT_GENERATION_PROMPT
4+
from .multi_choice_generation import MCQ_GENERATION_PROMPT
45
from .multi_hop_generation import MULTI_HOP_GENERATION_PROMPT
56
from .vqa_generation import VQA_GENERATION_PROMPT

0 commit comments

Comments
 (0)