Skip to content

Commit a70f23a

Browse files
Adding optional dataset_name arg to index() method (#138)
* Adding optional dataset_name arg to index() method
* Fixing linting issues
* Fixing linting issues
1 parent 01ee2d9 commit a70f23a

12 files changed

Lines changed: 19 additions & 11 deletions

README.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -126,7 +126,7 @@ class MyCustomPipeline(BasePipeline):
126126
self.model_name = model_name
127127
# Initialize your model here
128128

129-
def index(self, corpus_ids, corpus_images, corpus_texts):
129+
def index(self, corpus_ids, corpus_images, corpus_texts, dataset_name: str = None):
130130
# Indexing function to process corpus, should store anything
131131
# relevant as class attributes
132132
self.corpus_ids = corpus_ids

pipeline_implementations/jinav4_text_zerank2textual_pipeline.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -257,7 +257,7 @@ def _rerank_candidates(
257257

258258
return results
259259

260-
def index(self, corpus_ids: List[str], corpus_images: List[str], corpus_texts: List[str]) -> None:
260+
def index(self, corpus_ids: List[str], corpus_images: List[str], corpus_texts: List[str], dataset_name: str = None) -> None:
261261
"""
262262
Indexing step for the pipeline. For this implementation, we don't need to do
263263
anything here since we compute embeddings on the fly in the search method.

pipeline_implementations/jinav4_vision_jinavisualreranker_pipeline.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -301,7 +301,7 @@ def _rerank_candidates(
301301

302302
return results
303303

304-
def index(self, corpus_ids, corpus_images, corpus_texts):
304+
def index(self, corpus_ids, corpus_images, corpus_texts, dataset_name: str = None):
305305
"""
306306
Store corpus data for use in search().
307307

pipeline_implementations/mxbai_edge_colbert_pipeline.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -159,7 +159,7 @@ def _compute_maxsim_scores(
159159

160160
return torch.cat(scores, dim=0)
161161

162-
def index(self, corpus_ids: List[str], corpus_images: List[Any], corpus_texts: List[str]) -> None:
162+
def index(self, corpus_ids: List[str], corpus_images: List[Any], corpus_texts: List[str], dataset_name: str = None) -> None:
163163
"""
164164
Indexing is performed on-the-fly in the retrieve method for this pipeline.
165165
This method is not used but must be implemented to satisfy the BasePipeline interface.

pipeline_implementations/nemotron_colembed_8b_v2.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -297,7 +297,7 @@ def __init__(self, model_name = "nvidia/nemotron-colembed-vl-8b-v2", batch_size:
297297
self.batch_size = batch_size
298298
self.embedding_model = NemotronColEmbed8B(model_name=model_name, batch_size=batch_size)
299299

300-
def index(self, corpus_ids, corpus_images, corpus_texts):
300+
def index(self, corpus_ids, corpus_images, corpus_texts, dataset_name = None):
301301
"""
302302
Store corpus data for use in search().
303303

pipeline_implementations/nemotron_embed_and_rerank_vl_v2.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -400,7 +400,7 @@ def __init__(self,
400400
batch_size=ranker_batch_size,
401401
modality=self.modality)
402402

403-
def index(self, corpus_ids, corpus_images, corpus_texts):
403+
def index(self, corpus_ids, corpus_images, corpus_texts, dataset_name = None):
404404
"""
405405
Store corpus data for use in search().
406406

pipeline_implementations/nemotron_embed_vl_v2.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -308,7 +308,7 @@ def __init__(self, model_name = "nvidia/llama-nemotron-embed-vl-1b-v2", batch_si
308308
self.batch_size = batch_size
309309
self.embedding_model = NemotronEmbedVL(model_name=model_name, batch_size=batch_size, modality=self.modality)
310310

311-
def index(self, corpus_ids, corpus_images, corpus_texts):
311+
def index(self, corpus_ids, corpus_images, corpus_texts, dataset_name = None):
312312
"""
313313
Store corpus data for use in search().
314314

pipeline_implementations/qwen3_embedding_8b_pipeline.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -178,7 +178,7 @@ def _compute_similarity(self, query_embeddings: torch.Tensor, corpus_embeddings:
178178

179179
return scores
180180

181-
def index(self, corpus_ids: List[str], corpus_images: List[Any], corpus_texts: List[str]) -> None:
181+
def index(self, corpus_ids: List[str], corpus_images: List[Any], corpus_texts: List[str], dataset_name = None) -> None:
182182
"""
183183
Index the corpus by embedding all texts and storing them in memory.
184184
The embeddings are stored in self.corpus_embeddings and the corresponding IDs and texts are stored

src/vidore_benchmark/cli/pipeline_evaluation.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -217,6 +217,7 @@ def evaluate(
217217
corpus_images=corpus_images,
218218
corpus_texts=corpus_texts,
219219
qrels=qrels,
220+
dataset_name=dataset_name,
220221
metrics=[
221222
"ndcg_cut_1",
222223
"ndcg_cut_5",
@@ -452,6 +453,7 @@ def evaluate_all(
452453
corpus_images=corpus_images,
453454
corpus_texts=corpus_texts,
454455
qrels=qrels,
456+
dataset_name=dataset_name,
455457
metrics=[
456458
"ndcg_cut_1",
457459
"ndcg_cut_5",

src/vidore_benchmark/pipeline_evaluation/base_pipeline.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,9 @@ class BasePipeline(ABC):
1414
with their custom pipeline logic.
1515
"""
1616

17-
def index(self, corpus_ids: List[str], corpus_images: List[Any], corpus_texts: List[str]) -> None:
17+
def index(
18+
self, corpus_ids: List[str], corpus_images: List[Any], corpus_texts: List[str], dataset_name=None
19+
) -> None:
1820
"""
1921
Optional method to perform indexing or preprocessing on the corpus.
2022

0 commit comments

Comments
 (0)