Adding optional dataset_name arg to index() method (#138)

gabrielspmoreira · web-flow · commit a70f23af8bb3 · 2026-03-25T17:38:17.000+01:00
* Adding optional dataset_name arg to index() method

* Fixing linting issues

* Fixing linting issues
diff --git a/README.md b/README.md
@@ -126,7 +126,7 @@ class MyCustomPipeline(BasePipeline):
         self.model_name = model_name
         # Initialize your model here
 
-    def index(self, corpus_ids, corpus_images, corpus_texts):
+    def index(self, corpus_ids, corpus_images, corpus_texts, dataset_name: str = None):
         # Indexing function to process corpus, should store anything
         # relevant as class attributes
         self.corpus_ids = corpus_ids
diff --git a/pipeline_implementations/jinav4_text_zerank2textual_pipeline.py b/pipeline_implementations/jinav4_text_zerank2textual_pipeline.py
@@ -257,7 +257,7 @@ def _rerank_candidates(
 
         return results
 
-    def index(self, corpus_ids: List[str], corpus_images: List[str], corpus_texts: List[str]) -> None:
+    def index(self, corpus_ids: List[str], corpus_images: List[str], corpus_texts: List[str], dataset_name: str = None) -> None:
         """
         Indexing step for the pipeline. For this implementation, we don't need to do
         anything here since we compute embeddings on the fly in the search method.
diff --git a/pipeline_implementations/jinav4_vision_jinavisualreranker_pipeline.py b/pipeline_implementations/jinav4_vision_jinavisualreranker_pipeline.py
@@ -301,7 +301,7 @@ def _rerank_candidates(
 
         return results
 
-    def index(self, corpus_ids, corpus_images, corpus_texts):
+    def index(self, corpus_ids, corpus_images, corpus_texts, dataset_name: str = None):
         """
         Store corpus data for use in search().
 
diff --git a/pipeline_implementations/mxbai_edge_colbert_pipeline.py b/pipeline_implementations/mxbai_edge_colbert_pipeline.py
@@ -159,7 +159,7 @@ def _compute_maxsim_scores(
 
         return torch.cat(scores, dim=0)
 
-    def index(self, corpus_ids: List[str], corpus_images: List[Any], corpus_texts: List[str]) -> None:
+    def index(self, corpus_ids: List[str], corpus_images: List[Any], corpus_texts: List[str], dataset_name: str = None) -> None:
         """
         Indexing is performed on-the-fly in the retrieve method for this pipeline.
         This method is not used but must be implemented to satisfy the BasePipeline interface.
diff --git a/pipeline_implementations/nemotron_colembed_8b_v2.py b/pipeline_implementations/nemotron_colembed_8b_v2.py
@@ -297,7 +297,7 @@ def __init__(self, model_name = "nvidia/nemotron-colembed-vl-8b-v2", batch_size:
         self.batch_size = batch_size
         self.embedding_model = NemotronColEmbed8B(model_name=model_name, batch_size=batch_size)
 
-    def index(self, corpus_ids, corpus_images, corpus_texts):
+    def index(self, corpus_ids, corpus_images, corpus_texts, dataset_name = None):
         """
         Store corpus data for use in search().
 
diff --git a/pipeline_implementations/nemotron_embed_and_rerank_vl_v2.py b/pipeline_implementations/nemotron_embed_and_rerank_vl_v2.py
@@ -400,7 +400,7 @@ def __init__(self,
                                         batch_size=ranker_batch_size,
                                         modality=self.modality)
         
-    def index(self, corpus_ids, corpus_images, corpus_texts):
+    def index(self, corpus_ids, corpus_images, corpus_texts, dataset_name = None):
         """
         Store corpus data for use in search().
 
diff --git a/pipeline_implementations/nemotron_embed_vl_v2.py b/pipeline_implementations/nemotron_embed_vl_v2.py
@@ -308,7 +308,7 @@ def __init__(self, model_name = "nvidia/llama-nemotron-embed-vl-1b-v2", batch_si
         self.batch_size = batch_size
         self.embedding_model = NemotronEmbedVL(model_name=model_name, batch_size=batch_size, modality=self.modality)
 
-    def index(self, corpus_ids, corpus_images, corpus_texts):
+    def index(self, corpus_ids, corpus_images, corpus_texts, dataset_name = None):
         """
         Store corpus data for use in search().
 
diff --git a/pipeline_implementations/qwen3_embedding_8b_pipeline.py b/pipeline_implementations/qwen3_embedding_8b_pipeline.py
@@ -178,7 +178,7 @@ def _compute_similarity(self, query_embeddings: torch.Tensor, corpus_embeddings:
 
         return scores
 
-    def index(self, corpus_ids: List[str], corpus_images: List[Any], corpus_texts: List[str]) -> None:
+    def index(self, corpus_ids: List[str], corpus_images: List[Any], corpus_texts: List[str], dataset_name = None) -> None:
         """
         Index the corpus by embedding all texts and storing them in memory.
         The embeddings are stored in self.corpus_embeddings and the corresponding IDs and texts are stored
diff --git a/src/vidore_benchmark/cli/pipeline_evaluation.py b/src/vidore_benchmark/cli/pipeline_evaluation.py
@@ -217,6 +217,7 @@ def evaluate(
             corpus_images=corpus_images,
             corpus_texts=corpus_texts,
             qrels=qrels,
+            dataset_name=dataset_name,
             metrics=[
                 "ndcg_cut_1",
                 "ndcg_cut_5",
@@ -452,6 +453,7 @@ def evaluate_all(
                 corpus_images=corpus_images,
                 corpus_texts=corpus_texts,
                 qrels=qrels,
+                dataset_name=dataset_name,
                 metrics=[
                     "ndcg_cut_1",
                     "ndcg_cut_5",
diff --git a/src/vidore_benchmark/pipeline_evaluation/base_pipeline.py b/src/vidore_benchmark/pipeline_evaluation/base_pipeline.py
@@ -14,7 +14,9 @@ class BasePipeline(ABC):
     with their custom pipeline logic.
     """
 
-    def index(self, corpus_ids: List[str], corpus_images: List[Any], corpus_texts: List[str]) -> None:
+    def index(
+        self, corpus_ids: List[str], corpus_images: List[Any], corpus_texts: List[str], dataset_name=None
+    ) -> None:
         """
         Optional method to perform indexing or preprocessing on the corpus.
 
diff --git a/src/vidore_benchmark/pipeline_evaluation/evaluator.py b/src/vidore_benchmark/pipeline_evaluation/evaluator.py
@@ -19,6 +19,7 @@ def evaluate_retrieval(
     corpus_images: List[Any],
     corpus_texts: List[str],
     qrels: Dict[str, Dict[str, int]],
+    dataset_name: Optional[str] = None,
     metrics: List[str] = None,
     track_time: bool = True,
 ) -> Dict[str, Dict[str, float]]:
@@ -34,6 +35,7 @@ def evaluate_retrieval(
         corpus_texts: List of corpus texts (markdown strings)
         qrels: Ground truth relevance judgments in pytrec_eval format
                {query_id: {doc_id: relevance_score}}
+        dataset_name: Optional dataset name, passed through to pipeline.index() (default: None)
         metrics: List of metrics to calculate (default: ['ndcg_cut_10'])
         track_time: Whether to track retrieval time (default: True)
 
@@ -52,7 +54,9 @@ def evaluate_retrieval(
     # Call the pipeline's method to get retrieval results
     # Indexing step
     start_time_indexing = time.time()
-    pipeline.index(corpus_ids=corpus_ids, corpus_images=corpus_images, corpus_texts=corpus_texts)
+    pipeline.index(
+        corpus_ids=corpus_ids, corpus_images=corpus_images, corpus_texts=corpus_texts, dataset_name=dataset_name
+    )
     indexing_time = time.time() - start_time_indexing
 
     # Avoid tracking indexing time if no other thing is done than storing the corpus
diff --git a/tests/pipeline_evaluation/test_evaluator.py b/tests/pipeline_evaluation/test_evaluator.py
@@ -17,7 +17,7 @@ def __init__(self, results: Dict[str, Dict[str, float]], infos: Optional[Dict[st
         self.results = results
         self.infos = infos
 
-    def index(self, corpus_ids: List[str], corpus_images: List[Any], corpus_texts: List[str]):
+    def index(self, corpus_ids: List[str], corpus_images: List[Any], corpus_texts: List[str], dataset_name=None):
         """Mock index method."""
         pass