@@ -81,7 +81,7 @@ def sanity_check(self):
8181 continue
8282 raise ValueError ("Sanity check failed after timeout" )
8383
84- def evaluate (self , retriever : str ):
84+ def evaluate (self , retriever : str , limit : int | None = None ):
8585 retrieval_tcs = []
8686 response_times = []
8787
@@ -93,7 +93,8 @@ def evaluate(self, retriever: str):
9393 )
9494
9595 # retrieval test cases
96- for i , qa_pair in enumerate (tqdm (self .qns , desc = "Evaluating" )):
96+ questions = self .qns [:limit ] if limit else self .qns
97+ for i , qa_pair in enumerate (tqdm (questions , desc = "Evaluating" )):
9798 question , ground_truth = qa_pair ["question" ], qa_pair ["ground_truth" ]
9899 response , response_time = self .query (retriever , question )
99100 response_text = response ["response" ]
@@ -114,7 +115,6 @@ def evaluate(self, retriever: str):
114115 evaluate (
115116 test_cases = retrieval_tcs ,
116117 metrics = [precision , recall , hallucination ],
117- print_results = False ,
118118 )
119119
120120 # parse deepeval results
@@ -155,11 +155,14 @@ def query(self, retriever: str, query: str) -> tuple[dict, float]:
155155 )
156156 parser .add_argument ("--dataset" , type = str , help = "Path to dataset to evaluate on" )
157157 parser .add_argument ("--retriever" , type = str , help = "Retriever to evaluate on" )
158+ parser .add_argument (
159+ "--limit" , type = int , help = "Limit number of questions to evaluate" , default = None
160+ )
158161 args = parser .parse_args ()
159162
160163 # Pull the dataset from huggingface hub
161164 hf_pull .main ()
162165
163166 # Evaluate the model on the dataset
164167 harness = EvaluationHarness (args .base_url , args .dataset , args .reranker_base_url )
165- harness .evaluate (args .retriever )
168+ harness .evaluate (args .retriever , limit = args . limit )
0 commit comments