@@ -849,7 +849,7 @@ rankify-index index data/wikipedia_100.jsonl \
849849---
850850
851851### 2️⃣ Running Retrieval
852- To perform retrieval using ** Rankify** , you can choose from various retrieval methods such as ** BM25, DPR, ANCE, Contriever, ColBERT, and BGE ** .
852+ To perform retrieval using **Rankify**, you can choose from various retrieval methods such as **BM25, DPR, ANCE, Contriever, ColBERT, BGE, Sbert, Nomic, Instructor, DiverRetriever, SRF, E5, RaDeR, M2, GritLM, ReasonEmbed, ReasonIR, and BGEReasoner**.
853853
854854### Step 1: Setup example queries
855855
@@ -882,64 +882,64 @@ Retrieval ```index_type``` (e.g., ```"wiki```", "```msmarco```") to load pre-co
882882bm25_retriever_wiki = Retriever(method = " bm25" , n_docs = 5 , index_type = " wiki" )
883883
884884# BM25 retrieval on MS MARCO
885- bm25_retriever_msmacro = Retriever(method = " bm25" , n_docs = 5 , index_type = " msmarco" )
885+ bm25_retriever_msmarco = Retriever(method = " bm25" , n_docs = 5 , index_type = " msmarco" )
886886
887887
888888# DPR (multi-encoder) retrieval on Wikipedia
889889dpr_retriever_wiki = Retriever(method = " dpr" , model = " dpr-multi" , n_docs = 5 , index_type = " wiki" )
890890
891891# DPR (multi-encoder) retrieval on MS MARCO
892- dpr_retriever_msmacro = Retriever(method = " dpr" , model = " dpr-multi" , n_docs = 5 , index_type = " msmarco" )
892+ dpr_retriever_msmarco = Retriever(method = " dpr" , model = " dpr-multi" , n_docs = 5 , index_type = " msmarco" )
893893
894894
895895# DPR (single-encoder) retrieval on Wikipedia
896896dpr_retriever_wiki = Retriever(method = " dpr" , model = " dpr-single" , n_docs = 5 , index_type = " wiki" )
897897
898898# DPR (single-encoder) retrieval on MS MARCO
899- dpr_retriever_msmacro = Retriever(method = " dpr" , model = " dpr-single" , n_docs = 5 , index_type = " msmarco" )
899+ dpr_retriever_msmarco = Retriever(method = " dpr" , model = " dpr-single" , n_docs = 5 , index_type = " msmarco" )
900900
901901
902902# ANCE retrieval on Wikipedia
903903ance_retriever_wiki = Retriever(method = " ance" , model = " ance-multi" , n_docs = 5 , index_type = " wiki" )
904904
905905# ANCE retrieval on MS MARCO
906- ance_retriever_msmacro = Retriever(method = " ance" , model = " ance-multi" , n_docs = 5 , index_type = " msmarco" )
906+ ance_retriever_msmarco = Retriever(method = " ance" , model = " ance-multi" , n_docs = 5 , index_type = " msmarco" )
907907
908908
909909# Contriever retrieval on Wikipedia
910910contriever_retriever_wiki = Retriever(method = " contriever" , model = " facebook/contriever-msmarco" , n_docs = 5 , index_type = " wiki" )
911911
912912# Contriever retrieval on MS MARCO
913- contriever_retriever_msmacro = Retriever(method = " contriever" , model = " facebook/contriever-msmarco" , n_docs = 5 , index_type = " msmarco" )
913+ contriever_retriever_msmarco = Retriever(method = " contriever" , model = " facebook/contriever-msmarco" , n_docs = 5 , index_type = " msmarco" )
914914
915915
916916# ColBERT retrieval on Wikipedia
917917colbert_retriever_wiki = Retriever(method = " colbert" , model = " colbert-ir/colbertv2.0" , n_docs = 5 , index_type = " wiki" )
918918
919919# ColBERT retrieval on MS MARCO
920- colbert_retriever_msmacro = Retriever(method = " colbert" , model = " colbert-ir/colbertv2.0" , n_docs = 5 , index_type = " msmarco" )
920+ colbert_retriever_msmarco = Retriever(method = " colbert" , model = " colbert-ir/colbertv2.0" , n_docs = 5 , index_type = " msmarco" )
921921
922922
923923# BGE retrieval on Wikipedia
924924bge_retriever_wiki = Retriever(method = " bge" , model = " BAAI/bge-large-en-v1.5" , n_docs = 5 , index_type = " wiki" )
925925
926926# BGE retrieval on MS MARCO
927- bge_retriever_msmacro = Retriever(method = " bge" , model = " BAAI/bge-large-en-v1.5" , n_docs = 5 , index_type = " msmarco" )
927+ bge_retriever_msmarco = Retriever(method = " bge" , model = " BAAI/bge-large-en-v1.5" , n_docs = 5 , index_type = " msmarco" )
928928
929929
930930# Hyde retrieval on Wikipedia
931931hyde_retriever_wiki = Retriever(method = " hyde" , n_docs = 5 , index_type = " wiki" , api_key = OPENAI_API_KEY )
932932
933933# Hyde retrieval on MS MARCO
934- hyde_retriever_msmacro = Retriever(method = " hyde" , n_docs = 5 , index_type = " msmarco" , api_key = OPENAI_API_KEY )
934+ hyde_retriever_msmarco = Retriever(method = " hyde" , n_docs = 5 , index_type = " msmarco" , api_key = OPENAI_API_KEY )
935935```
936936
937937** Option B:**
938938Retrieval with custom datasets and automated caching.
939939
940940Featuring some of the latest 7B+ parameter models, all of the models below are intended solely for use with custom datasets.
941941
942- Simply provide a corpus_path to a ``` .jsonl ``` file , and the model will embed and cache the data locally on the first run.
942+ Simply pass a ``` .jsonl ``` file to ``` corpus_path ```, ensuring your data maps to the required ``` id: ``` and ``` text: ``` fields, and the model will embed and cache the data locally on the first run.
943943
944944``` python
945945# Bi-encoders as implemented in the diver framework (11 configurable models, specified by model_id)
@@ -965,6 +965,8 @@ rader_retriever = Retriever(method="diver-dense", model_id="rader", corpus_path=
965965
966966nomic_retriever = Retriever(method = " diver-dense" , model_id = " nomic" , corpus_path = " data/my_corpus.jsonl" , encode_batch_size = 4 , n_docs = 5 )
967967
968+ diver_retriever = Retriever(method = " diver-dense" , model_id = " diver" , corpus_path = " data/my_corpus.jsonl" , encode_batch_size = 4 , n_docs = 5 )
969+
968970
969971# Reasonir retrieval
970972reasonir_retriever = Retriever(method = " reasonir" , corpus_path = " data/my_corpus.jsonl" , encode_batch_size = 4 , n_docs = 5 )
@@ -980,6 +982,40 @@ reasonembed_qwen4b_retriever = Retriever(method="reason-embed", model_id="llama-
980982
981983# BgeReasonEmbed retrieval
982984bge_reasoner_retriever = Retriever(method = " bge-reasoner-embed" , corpus_path = " data/my_corpus.jsonl" , encode_batch_size = 8 , n_docs = 5 )
985+ ```
986+ ### Retrieval Example: ReasonIR on the BRIGHT Benchmark (Biology queries)
987+ This example demonstrates how to evaluate the ` reasonir/ReasonIR-8B ` model on the reasoning-intensive BRIGHT benchmark.
988+
989+ ``` python
990+ from datasets import load_dataset
991+ from rankify.dataset.dataset import Document, Question, Answer
992+ from rankify.retrievers.retriever import Retriever
993+
994+ corpus_path = " bright_biology_corpus.jsonl" # .jsonl corpus for retrieval
995+
996+ docs = load_dataset(" xlangai/BRIGHT" , " documents" , split = " biology" )
997+ docs.to_json(corpus_path, force_ascii = False )
998+
999+ queries = load_dataset(" xlangai/BRIGHT" , " examples" , split = " biology" )
1000+
1001+ documents = []
1002+ for item in queries:
1003+ doc = Document(id = item[" id" ],
1004+ question = Question(question = item[" query" ]),
1005+ answers = Answer(answers = item.get(" gold_ids" , [])))
1006+ documents.append(doc)
1007+ break
1008+
1009+ retriever = Retriever(
1010+ method = " reasonir" , # Use ReasonIR retriever
1011+ n_docs = 3 , # Retrieve top 3 documents per query
1012+ corpus_path = corpus_path, # Path to the JSONL we just created
1013+ text_field = " content" , # BRIGHT uses 'content' instead of 'text'
1014+ batch_size = 4 ,
1015+ )
1016+
1017+ results = retriever.retrieve(documents)
1018+
9831019```
9841020
9851021### Step 3: Execute and View Results
@@ -1413,6 +1449,7 @@ print("RAGAS (OpenAI):", {k: v for k, v in scores_openai.items() if k.startswith
14131449- ✅ ** [ ReasonIR] ( https://arxiv.org/abs/2504.20595 ) **
14141450- ✅ ** [ BGE-Reasoner] ( https://huggingface.co/BAAI/bge-en-icl ) **
14151451- ✅ ** [ ReasonEmbed] ( https://arxiv.org/abs/2510.08252 ) **
1452+ - ✅ ** [ DiverRetriever] ( https://huggingface.co/AQ-MedAI/Diver-Retriever-4B ) **
14161453- 🕒 ** RepLlama**
14171454- 🕒 ** coCondenser**
14181455- 🕒 ** Spar**
0 commit comments