Skip to content

Commit fb68651

Browse files
Added diver retriever and BRIGHT benchmark retrieval example to README
1 parent 36446a6 commit fb68651

File tree

2 files changed

+54
-15
lines changed

2 files changed

+54
-15
lines changed

README.md

Lines changed: 47 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -849,7 +849,7 @@ rankify-index index data/wikipedia_100.jsonl \
849849
---
850850

851851
### 2️⃣ Running Retrieval
852-
To perform retrieval using **Rankify**, you can choose from various retrieval methods such as **BM25, DPR, ANCE, Contriever, ColBERT, and BGE**.
852+
To perform retrieval using **Rankify**, you can choose from various retrieval methods such as **BM25, DPR, ANCE, Contriever, ColBERT, BGE, Sbert, Nomic, Instructor, DiverRetriever, SRF, E5, RaDeR, M2, GritLM, ReasonEmbed, ReasonIR, and BGEReasoner**.
853853

854854
### Step 1: Setup example queries
855855

@@ -882,64 +882,64 @@ Retrieval ```index_type``` (e.g., ```"wiki"```, ```"msmarco"```) to load pre-co
882882
bm25_retriever_wiki = Retriever(method="bm25", n_docs=5, index_type="wiki")
883883

884884
# BM25 retrieval on MS MARCO
885-
bm25_retriever_msmacro = Retriever(method="bm25", n_docs=5, index_type="msmarco")
885+
bm25_retriever_msmarco = Retriever(method="bm25", n_docs=5, index_type="msmarco")
886886

887887

888888
# DPR (multi-encoder) retrieval on Wikipedia
889889
dpr_retriever_wiki = Retriever(method="dpr", model="dpr-multi", n_docs=5, index_type="wiki")
890890

891891
# DPR (multi-encoder) retrieval on MS MARCO
892-
dpr_retriever_msmacro = Retriever(method="dpr", model="dpr-multi", n_docs=5, index_type="msmarco")
892+
dpr_retriever_msmarco = Retriever(method="dpr", model="dpr-multi", n_docs=5, index_type="msmarco")
893893

894894

895895
# DPR (single-encoder) retrieval on Wikipedia
896896
dpr_retriever_wiki = Retriever(method="dpr", model="dpr-single", n_docs=5, index_type="wiki")
897897

898898
# DPR (single-encoder) retrieval on MS MARCO
899-
dpr_retriever_msmacro = Retriever(method="dpr", model="dpr-single", n_docs=5, index_type="msmarco")
899+
dpr_retriever_msmarco = Retriever(method="dpr", model="dpr-single", n_docs=5, index_type="msmarco")
900900

901901

902902
# ANCE retrieval on Wikipedia
903903
ance_retriever_wiki = Retriever(method="ance", model="ance-multi", n_docs=5, index_type="wiki")
904904

905905
# ANCE retrieval on MS MARCO
906-
ance_retriever_msmacro = Retriever(method="ance", model="ance-multi", n_docs=5, index_type="msmarco")
906+
ance_retriever_msmarco = Retriever(method="ance", model="ance-multi", n_docs=5, index_type="msmarco")
907907

908908

909909
# Contriever retrieval on Wikipedia
910910
contriever_retriever_wiki = Retriever(method="contriever", model="facebook/contriever-msmarco", n_docs=5, index_type="wiki")
911911

912912
# Contriever retrieval on MS MARCO
913-
contriever_retriever_msmacro = Retriever(method="contriever", model="facebook/contriever-msmarco", n_docs=5, index_type="msmarco")
913+
contriever_retriever_msmarco = Retriever(method="contriever", model="facebook/contriever-msmarco", n_docs=5, index_type="msmarco")
914914

915915

916916
# ColBERT retrieval on Wikipedia
917917
colbert_retriever_wiki = Retriever(method="colbert", model="colbert-ir/colbertv2.0", n_docs=5, index_type="wiki")
918918

919919
# ColBERT retrieval on MS MARCO
920-
colbert_retriever_msmacro = Retriever(method="colbert", model="colbert-ir/colbertv2.0", n_docs=5, index_type="msmarco")
920+
colbert_retriever_msmarco = Retriever(method="colbert", model="colbert-ir/colbertv2.0", n_docs=5, index_type="msmarco")
921921

922922

923923
# BGE retrieval on Wikipedia
924924
bge_retriever_wiki = Retriever(method="bge", model="BAAI/bge-large-en-v1.5", n_docs=5, index_type="wiki")
925925

926926
# BGE retrieval on MS MARCO
927-
bge_retriever_msmacro = Retriever(method="bge", model="BAAI/bge-large-en-v1.5", n_docs=5, index_type="msmarco")
927+
bge_retriever_msmarco = Retriever(method="bge", model="BAAI/bge-large-en-v1.5", n_docs=5, index_type="msmarco")
928928

929929

930930
# Hyde retrieval on Wikipedia
931931
hyde_retriever_wiki = Retriever(method="hyde" , n_docs=5, index_type="wiki", api_key=OPENAI_API_KEY )
932932

933933
# Hyde retrieval on MS MARCO
934-
hyde_retriever_msmacro = Retriever(method="hyde", n_docs=5, index_type="msmarco", api_key=OPENAI_API_KEY)
934+
hyde_retriever_msmarco = Retriever(method="hyde", n_docs=5, index_type="msmarco", api_key=OPENAI_API_KEY)
935935
```
936936

937937
**Option B:**
938938
Retrieval with custom datasets and automated caching.
939939

940940
Featuring some of the latest 7B+ parameter models, all of the models below are intended only for use with custom datasets.
941941

942-
Simply provide a corpus_path to a ```.jsonl``` file, and the model will embed and cache the data locally on the first run.
942+
Simply pass a ```.jsonl``` file to ```corpus_path```, ensuring your data maps to the required ```id:``` and ```text:``` fields, and the model will embed and cache the data locally on the first run.
943943

944944
```python
945945
# Bi-encoders as implemented in the diver framework (11 configurable models, specified by model_id)
@@ -965,6 +965,8 @@ rader_retriever = Retriever(method="diver-dense", model_id="rader", corpus_path=
965965

966966
nomic_retriever = Retriever(method="diver-dense", model_id="nomic", corpus_path="data/my_corpus.jsonl", encode_batch_size=4, n_docs=5)
967967

968+
diver_retriever = Retriever(method="diver-dense", model_id="diver", corpus_path="data/my_corpus.jsonl", encode_batch_size=4, n_docs=5)
969+
968970

969971
# Reasonir retrieval
970972
reasonir_retriever = Retriever(method="reasonir", corpus_path="data/my_corpus.jsonl", encode_batch_size=4, n_docs=5)
@@ -980,6 +982,40 @@ reasonembed_qwen4b_retriever = Retriever(method="reason-embed", model_id="llama-
980982

981983
# BgeReasonEmbed retrieval
982984
bge_reasoner_retriever = Retriever(method="bge-reasoner-embed", corpus_path="data/my_corpus.jsonl", encode_batch_size=8, n_docs=5)
985+
```
986+
### Retrieval Example: ReasonIR on the BRIGHT Benchmark (Biology queries)
987+
This example demonstrates how to evaluate the `reasonir/ReasonIR-8B` model on the reasoning-intensive BRIGHT benchmark.
988+
989+
```python
990+
from datasets import load_dataset
991+
from rankify.dataset.dataset import Document, Question, Answer
992+
from rankify.retrievers.retriever import Retriever
993+
994+
corpus_path = "bright_biology_corpus.jsonl" # .jsonl corpus for retrieval
995+
996+
docs = load_dataset("xlangai/BRIGHT", "documents", split="biology")
997+
docs.to_json(corpus_path, force_ascii=False)
998+
999+
queries = load_dataset("xlangai/BRIGHT", "examples", split="biology")
1000+
1001+
documents = []
1002+
for item in queries:
1003+
doc = Document(id=item["id"],
1004+
question=Question(question=item["query"]),
1005+
answers=Answer(answers=item.get("gold_ids", [])))
1006+
documents.append(doc)
1007+
    break  # demo only: stop after the first query to keep the example fast
1008+
1009+
retriever = Retriever(
1010+
method="reasonir", # Use ReasonIR retriever
1011+
n_docs=3, # Retrieve top 3 documents per query
1012+
corpus_path=corpus_path, # Path to the JSONL we just created
1013+
text_field="content", # BRIGHT uses 'content' instead of 'text'
1014+
batch_size=4,
1015+
)
1016+
1017+
results = retriever.retrieve(documents)
1018+
9831019
```
9841020

9851021
### Step 3: Execute and View Results
@@ -1413,6 +1449,7 @@ print("RAGAS (OpenAI):", {k: v for k, v in scores_openai.items() if k.startswith
14131449
- **[ReasonIR](https://arxiv.org/abs/2504.20595)**
14141450
- **[BGE-Reasoner](https://huggingface.co/BAAI/bge-en-icl)**
14151451
- **[ReasonEmbed](https://arxiv.org/abs/2510.08252)**
1452+
- **[DiverRetriever](https://huggingface.co/AQ-MedAI/Diver-Retriever-4B)**
14161453
- 🕒 **RepLlama**
14171454
- 🕒 **coCondenser**
14181455
- 🕒 **Spar**

rankify/retrievers/diver_dense_retriever.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@
33
- https://github.com/AQ-MedAI/Diver/blob/main/Retriever/retrievers.py
44
55
Supports:
6-
- SentenceTransformers (bge, sbert, nomic, instructor)
7-
- HF AutoModels (sf, qwen, e5, rader, contriever, m2)
6+
- SentenceTransformers (bge, sbert, nomic, instructor, diver-retriever)
7+
- HF AutoModels (sf, e5, rader, contriever, m2)
88
- GritLM (grit)
99
"""
1010

@@ -155,7 +155,9 @@ def _load_model(self):
155155
elif self.model_id == "contriever_st":
156156
self.model = SentenceTransformer('nishimoto/contriever-sentencetransformer', device=self.device)
157157
elif self.model_id == "nomic":
158-
self.model = SentenceTransformer(self.checkpoint or "nomic-ai/nomic-embed-text-v1", trust_remote_code=True, device=self.device)
158+
self.model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True, device=self.device)
159+
elif self.model_id == "diver":
160+
self.model = SentenceTransformer("AQ-MedAI/Diver-Retriever-4B", trust_remote_code=True, device=self.device)
159161
elif self.model_id == "inst-l":
160162
self.model = SentenceTransformer("hkunlp/instructor-large", device=self.device)
161163
self.model.max_seq_length = self.doc_max_length
@@ -312,7 +314,7 @@ def _load_or_build_doc_embeddings(self):
312314
docs = self.doc_texts
313315

314316
# Encoding logic based on model_id
315-
if self.model_id in ["bge", "sbert", "contriever_st", "nomic"]:
317+
if self.model_id in ["bge", "sbert", "contriever_st", "nomic", "diver"]:
316318
doc_emb = self.model.encode(docs, show_progress_bar=True, batch_size=self.encode_batch_size, normalize_embeddings=True)
317319

318320
elif self.model_id in ["inst-l", "inst-xl"]:
@@ -352,7 +354,7 @@ def _initialize_searcher(self):
352354
return None
353355

354356
def _encode_queries(self, queries: List[str]):
355-
if self.model_id in ["bge", "sbert", "contriever_st", "nomic"]:
357+
if self.model_id in ["bge", "sbert", "contriever_st", "nomic", "diver"]:
356358
if self.model_id == "bge":
357359
queries = add_instruct_concatenate(texts=queries, task=self.task, instruction=self.instruction_query)
358360
return self.model.encode(queries, show_progress_bar=True, batch_size=self.encode_batch_size, normalize_embeddings=True)

0 commit comments

Comments
 (0)