Added example of multi-choice QA (#1634)

yoavkatz · web-flow · commit 1ab791d226c8 · 2025-04-09T22:20:47.000+03:00
diff --git a/docs/docs/adding_operator.rst b/docs/docs/adding_operator.rst
@@ -55,12 +55,12 @@ To manipulate a single field, inherit from :class:`FieldOperator <operator.Field
 
 .. code-block:: python
 
-    from unitxt.operator import FieldOperator
+    from unitxt.operators import FieldOperator
 
     class AddNumber(FieldOperator):
         number: float
 
-        def process(self, value):
+        def process_value(self, value):
             return value + self.number
 
 **Explanation**: This class adds a specified number to the input value. It inherits from `FieldOperator` which is designed to operate on a single field.
diff --git a/docs/docs/examples.rst b/docs/docs/examples.rst
@@ -74,6 +74,15 @@ Classical f1_micro, f1_macro, and per-entity-type f1 metrics are reported.
 
 Related documentation: :ref:`Add new dataset tutorial <adding_dataset>`, :ref:`NER task in catalog <catalog.tasks.ner.all_entity_types>`, :ref:`Inference Engines <inference>`.
 
+Evaluate a multiple-choice QA dataset
++++++++++++++++++++++++++++++++++++++++++++++++++++
+
+This example demonstrates how to evaluate a multiple-choice question answering dataset.
+ 
+`Example code  <https://github.com/IBM/unitxt/blob/main/examples/multiple_choice_qa_evaluation.py>`__
+
+Related documentation: :ref:`Add new dataset tutorial <adding_dataset>`, :ref:`Multiple choice task in catalog <catalog.tasks.qa.multiple_choice.open>`, :ref:`Inference Engines <inference>`.
+
 Evaluate API Call 
 +++++++++++++++++++++++++++++++++++++++++
 
diff --git a/examples/multiple_choice_qa_evaluation.py b/examples/multiple_choice_qa_evaluation.py
@@ -0,0 +1,73 @@
+import json
+
+from unitxt import get_logger, load_dataset
+from unitxt.api import LoadFromDictionary, TaskCard, evaluate
+from unitxt.blocks import Rename
+from unitxt.inference import HFPipelineBasedInferenceEngine
+from unitxt.operators import IndexOf, ListFieldValues
+from unitxt.templates import MultipleChoiceTemplate
+
+logger = get_logger()  # unitxt logger; not used further in this script
+
+# Set up the multiple-choice questions as a list of dictionaries: each item holds the question, three options and the text of the correct answer.
+data = [
+    {"Question": "What is the capital of Texas?", "Option A": "Austin", "Option B": "Houston", "Option C": "Dallas", "Answer" : "Austin"},
+    {"Question": "What is the color of the sky?", "Option A": "Pink",   "Option B":  "Red", "Option C": "Blue" , "Answer" : "Blue"},
+]
+
+
+# Create a unitxt card that converts the input data to the format required by the
+# `tasks.qa.multiple_choice.open` task.
+#
+# It renames 'Question'/'Answer', gathers the separate option fields into the 'choices' field,
+# and then sets the 'answer' field to the index of the correct answer in the 'choices' field.
+card =  TaskCard(
+        loader=LoadFromDictionary(data = { "test": data }),
+        preprocess_steps=[
+            Rename(
+                field_to_field={"Answer": "answer", "Question" : "question"},
+            ),
+            ListFieldValues(fields=["Option A", "Option B", "Option C"], to_field="choices"),
+            IndexOf(search_in="choices", index_of="answer", to_field="answer")
+        ],
+        task="tasks.qa.multiple_choice.open"
+)
+
+template = MultipleChoiceTemplate(
+        input_format="Answer the following question, returning only a single letter.  Do not any add any explanations. \n\nQuestion: {question}\nAnswers:\n{choices}\nAnswer:",
+        target_field="answer",
+        choices_separator="\n",
+        postprocessors=["processors.lower_case","processors.first_character"],
+    )
+
+dataset = load_dataset(  # Load the "test" split defined by the card, using the template above and the chat API format.
+    card = card,
+    template=template,
+    split="test",
+    format="formats.chat_api",
+)
+
+# Infer with SmolLM2-1.7B-Instruct using the Hugging Face pipeline-based inference engine
+model = HFPipelineBasedInferenceEngine(
+    model_name="HuggingFaceTB/SmolLM2-1.7B-Instruct", max_new_tokens=32
+)
+# To infer with an external API instead, replace the engine above with:
+# from unitxt.inference import CrossProviderInferenceEngine
+# model = CrossProviderInferenceEngine(model="llama-3-2-1b-instruct", provider="watsonx")
+# The provider can be one of: ["watsonx", "together-ai", "open-ai", "aws", "ollama", "bam"]
+
+
+predictions = model(dataset)  # run the inference engine on every instance of the dataset
+results = evaluate(predictions=predictions, data=dataset)  # score the predictions against the dataset
+
+print("Example prompt:")
+print(json.dumps(results.instance_scores[0]["source"], indent=4))  # "source" field of the first instance, pretty-printed
+
+
+print("Instance Results:")
+print(results.instance_scores)
+
+print("Global Results:")
+print(results.global_scores.summary)
+
+