@@ -851,6 +851,8 @@ rankify-index index data/wikipedia_100.jsonl \
851851### 2️⃣ Running Retrieval
852852To perform retrieval using **Rankify**, you can choose from various retrieval methods such as **BM25, DPR, ANCE, Contriever, ColBERT, and BGE**.
853853
854+ ### Step 1: Setup example queries
855+
854856** Example: Running Retrieval on Sample Queries**
855857``` python
856858from rankify.dataset.dataset import Document, Question, Answer, Context
@@ -870,6 +872,10 @@ documents = [
870872 Document(question = Question(" Who wrote Hamlet?" ), answers = Answer([" Shakespeare" ]), contexts = [])
871873]
872874```
875+ ### Step 2: Choose Retrieval Option
876+
877+ **Option A:**
878+ Retrieval with a pre-built `index_type` (e.g., `"wiki"`, `"msmarco"`) to load pre-computed FAISS indices.
873879
874880``` python
875881# BM25 retrieval on Wikipedia
@@ -885,12 +891,14 @@ dpr_retriever_wiki = Retriever(method="dpr", model="dpr-multi", n_docs=5, index_
885891# DPR (multi-encoder) retrieval on MS MARCO
886892dpr_retriever_msmarco = Retriever(method="dpr", model="dpr-multi", n_docs=5, index_type="msmarco")
887893
894+
888895# DPR (single-encoder) retrieval on Wikipedia
889896dpr_retriever_wiki = Retriever(method = " dpr" , model = " dpr-single" , n_docs = 5 , index_type = " wiki" )
890897
891898# DPR (single-encoder) retrieval on MS MARCO
892899dpr_retriever_msmarco = Retriever(method="dpr", model="dpr-single", n_docs=5, index_type="msmarco")
893900
901+
894902# ANCE retrieval on Wikipedia
895903ance_retriever_wiki = Retriever(method = " ance" , model = " ance-multi" , n_docs = 5 , index_type = " wiki" )
896904
@@ -926,6 +934,55 @@ hyde_retriever_wiki = Retriever(method="hyde" , n_docs=5, index_type="wiki", api
927935hyde_retriever_msmarco = Retriever(method="hyde", n_docs=5, index_type="msmarco", api_key=OPENAI_API_KEY)
927935```
928936
937+ ** Option B:**
938+ Retrieval with custom datasets and automated caching.
939+
940+ Featuring some of the latest 7B+ parameter models, all of the models below are intended solely for use with custom datasets.
941+
942+ Simply provide a `corpus_path` to a `.jsonl` file, and the model will embed and cache the data locally on the first run.
943+
944+ ``` python
945+ # Bi-encoders as implemented in the diver framework (11 configurable models, specified by model_id)
946+ bge_large_retriever = Retriever(method = " diver-dense" , model_id = " bge" , corpus_path = " data/my_corpus.jsonl" , encode_batch_size = 8 , n_docs = 5 )
947+
948+ sbert_retriever = Retriever(method = " diver-dense" , model_id = " sbert" , corpus_path = " data/my_corpus.jsonl" , encode_batch_size = 8 , n_docs = 5 )
949+
950+ inst_l_retriever = Retriever(method = " diver-dense" , model_id = " inst-l" , corpus_path = " data/my_corpus.jsonl" , encode_batch_size = 8 , n_docs = 5 )
951+
952+ inst_xl_retriever = Retriever(method = " diver-dense" , model_id = " inst-xl" , corpus_path = " data/my_corpus.jsonl" , encode_batch_size = 8 , n_docs = 5 )
953+
954+ sfr_retriever = Retriever(method = " diver-dense" , model_id = " sf" , corpus_path = " data/my_corpus.jsonl" , encode_batch_size = 8 , n_docs = 5 )
955+
956+ e5_retriever = Retriever(method = " diver-dense" , model_id = " e5" , corpus_path = " data/my_corpus.jsonl" , encode_batch_size = 8 , n_docs = 5 )
957+
958+ contriever_retriever = Retriever(method = " diver-dense" , model_id = " contriever" , corpus_path = " data/my_corpus.jsonl" , encode_batch_size = 8 , n_docs = 5 )
959+
960+ m2_retriever = Retriever(method = " diver-dense" , model_id = " m2" , corpus_path = " data/my_corpus.jsonl" , encode_batch_size = 8 , n_docs = 5 )
961+
962+ grit_retriever = Retriever(method = " diver-dense" , model_id = " grit" , corpus_path = " data/my_corpus.jsonl" , encode_batch_size = 8 , n_docs = 5 )
963+
964+ rader_retriever = Retriever(method = " diver-dense" , model_id = " rader" , corpus_path = " data/my_corpus.jsonl" , encode_batch_size = 8 , n_docs = 5 )
965+
966+ nomic_retriever = Retriever(method = " diver-dense" , model_id = " nomic" , corpus_path = " data/my_corpus.jsonl" , encode_batch_size = 4 , n_docs = 5 )
967+
968+
969+ # Reasonir retrieval
970+ reasonir_retriever = Retriever(method = " reasonir" , corpus_path = " data/my_corpus.jsonl" , encode_batch_size = 4 , n_docs = 5 )
971+
972+
973+ # ReasonEmbed retrieval (3 configurable models specified by model_id)
974+ reasonembed_qwen8b_retriever = Retriever(method = " reason-embed" , model_id = " qwen3-8b" , corpus_path = " data/my_corpus.jsonl" , encode_batch_size = 8 , n_docs = 5 )
975+
976+ reasonembed_qwen4b_retriever = Retriever(method="reason-embed", model_id="qwen3-4b", corpus_path="data/my_corpus.jsonl", encode_batch_size=8, n_docs=5)
977+ 
978+ reasonembed_llama8b_retriever = Retriever(method="reason-embed", model_id="llama-8b", corpus_path="data/my_corpus.jsonl", encode_batch_size=8, n_docs=5)
979+
980+
981+ # BgeReasonEmbed retrieval
982+ bge_reasoner_retriever = Retriever(method = " bge-reasoner-embed" , corpus_path = " data/my_corpus.jsonl" , encode_batch_size = 8 , n_docs = 5 )
983+ ```
984+
985+ ### Step 3: Execute and View Results
929986** Running Retrieval**
930987
931988After defining the retriever, you can retrieve documents using:
0 commit comments