import json

from unitxt import add_to_catalog
from unitxt.blocks import TaskCard
from unitxt.collections_operators import Wrap
from unitxt.image_operators import HashImage, ToImage
from unitxt.loaders import LoadHF
from unitxt.operators import (
    AddIncrementalId,
    Cast,
    Copy,
    Deduplicate,
    FilterByCondition,
)
from unitxt.splitters import RenameSplits, SplitRandomMix
from unitxt.templates import InputOutputTemplate
from unitxt.test_utils.card import test_card

description = (
    "We introduced REAL-MM-RAG-Bench, a real-world multi-modal retrieval benchmark designed to evaluate "
    "retrieval models in reliable, challenging, and realistic settings. The benchmark was constructed using "
    "an automated pipeline, where queries were generated by a vision-language model (VLM), filtered by a large "
    "language model (LLM), and rephrased by an LLM to ensure high-quality retrieval evaluation. "
    "To simulate real-world retrieval challenges, we introduce multi-level query rephrasing, modifying queries "
    "at three distinct levels—from minor wording adjustments to significant structural changes—ensuring models "
    "are tested on their true semantic understanding rather than simple keyword matching."
)

datasets = [
    {"hf_name": "REAL-MM-RAG_FinSlides", "subset": "fin_slides"},
    {"hf_name": "REAL-MM-RAG_FinReport", "subset": "fin_report"},
    {"hf_name": "REAL-MM-RAG_TechReport", "subset": "tech_report"},
    {"hf_name": "REAL-MM-RAG_TechSlides", "subset": "tech_slides"},
]

hf_ibm_research = "ibm-research"
hf_url_base = "https://huggingface.co/datasets/"

for dataset in datasets:
    hf_name = dataset["hf_name"]
    hf_dataset_id = f"{hf_ibm_research}/{hf_name}"
    hf_url = f"{hf_url_base}{hf_dataset_id}"
    subset = dataset["subset"]

    # first we create the card for the benchmark
    card = TaskCard(
        loader=LoadHF(
            path=hf_dataset_id,
            name="default",
            split="test",
            data_classification_policy=["public"],
        ),
        preprocess_steps=[
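            # Keep only rows that actually have a query.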
            FilterByCondition(values={"query": None}, condition="ne"),
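            # Hash the page image; the hash serves as the gold context id and matches
            # the document_id produced for the corpus card below.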
            HashImage(
                field="image",
                to_field="reference_context_ids",
            ),
            Copy(field="query", to_field="question"),
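            # Assign each question an incremental id and cast it to a string.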
            AddIncrementalId(to_field="question_id"),
            Cast(field="question_id", to="str"),
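            # Carve the loaded test split into 30% test and 70% train.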
            SplitRandomMix(
                {
                    "test": "test[30%]",
                    "train": "test[70%]",
                }
            ),
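            # Wrap the answer and the gold context id in single-element lists,
            # as the task fields are list-valued.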
            Wrap(
                field="answer",
                inside="list",
                to_field="reference_answers",
            ),
            Wrap(
                field="reference_context_ids",
                inside="list",
                to_field="reference_context_ids",
            ),
        ],
        task="tasks.rag.end_to_end",
        templates={"default": "templates.rag.end_to_end.json_predictions"},
        __tags__={"license": "cdla-permissive-2.0", "url": hf_url},
        __title__=dataset["hf_name"].replace("-", "").replace("_", ": "),
        __description__=description,
    )

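    # A deliberately mismatched prediction, passed to test_card so the metrics
    # are expected to give it a low score.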
    wrong_answer = {
        "contexts": ["hi"],
        "is_answerable": True,
        "answer": "Don't know",
        "context_ids": ["id0"],
    }

    test_card(
        card,
        strict=True,
        full_mismatch_prediction_values=[json.dumps(wrong_answer)],
        debug=False,
    )

    add_to_catalog(card, f"cards.rag.benchmark.real_mm_rag_{subset}.en", overwrite=True)

    # next we create the card for the pages (documents)
    card = TaskCard(
        loader=LoadHF(
            path=hf_dataset_id,
            name="default",
            split="test",
            data_classification_policy=["public"],
        ),
        preprocess_steps=[
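            # Only the test split was loaded; expose it as train for the corpus card.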
            RenameSplits({"test": "train"}),
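            # Hash each page image to obtain a document id, then drop duplicate pages.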
            HashImage(
                field="image",
                to_field="document_id",
            ),
            Deduplicate(by=["document_id"]),
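            # Convert the raw image and wrap it as a one-element passages list.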
            ToImage(field="image"),
            Wrap(field="image", inside="list", to_field="passages"),
        ],
        task="tasks.rag.corpora",
        templates={
            "empty": InputOutputTemplate(
                input_format="",
                output_format="",
            ),
        },
        __tags__={"license": "cdla-permissive-2.0", "url": hf_url},
        __title__=dataset["hf_name"].replace("-", "").replace("_", ": "),
        __description__=description,
    )
    # Not testing the card, because documents are not evaluated on their own.
    add_to_catalog(card, f"cards.rag.documents.real_mm_rag_{subset}.en", overwrite=True)