@@ -86,6 +86,11 @@ def _calculate_agent_latency_per_request(request: EvaluationRequest) -> float:
8686 return sum (latencies ) / len (latencies ) if latencies else 0.0
8787
8888
89+ def _measure_latency (start_time : float ) -> float :
90+ """Calculate evaluation latency given start time."""
91+ return time .perf_counter () - start_time
92+
93+
8994class MetricsEvaluator :
9095 """Handles individual metric evaluation with proper scoring and status determination."""
9196
@@ -155,7 +160,7 @@ def evaluate_metric( # pylint: disable=too-many-locals
155160 EvaluationResult with score, result, token usage, and execution time,
156161 or None if metric should be skipped (e.g., script metrics when API disabled).
157162 """
158- start_time = time .time ()
163+ start_time = time .perf_counter ()
159164
160165 try :
161166 # Create logging summary
@@ -184,9 +189,8 @@ def evaluate_metric( # pylint: disable=too-many-locals
184189
185190 # Route to appropriate handler
186191 if framework not in self .handlers :
187- execution_time = time .time () - start_time
188192 return self ._create_error_result (
189- request , f"Unsupported framework: { framework } " , execution_time
193+ request , f"Unsupported framework: { framework } " , start_time
190194 )
191195
192196 # Check required data for metric (after API call); skip with ERROR if missing
@@ -198,11 +202,10 @@ def evaluate_metric( # pylint: disable=too-many-locals
198202 request .turn_data , request .metric_identifier
199203 )
200204 if not ok :
201- execution_time = time .time () - start_time
202205 logger .warning (
203206 "Skipping metric due to missing required data: %s" , msg
204207 )
205- return self ._create_error_result (request , msg , execution_time )
208+ return self ._create_error_result (request , msg , start_time )
206209
207210 # Create evaluation scope
208211 evaluation_scope = EvaluationScope (
@@ -224,7 +227,7 @@ def evaluate_metric( # pylint: disable=too-many-locals
224227 # Evaluate metric
225228 metric_result = self ._evaluate_wrapper (request , evaluation_scope , threshold )
226229
227- execution_time = time . time () - start_time
230+ evaluation_latency = _measure_latency ( start_time )
228231
229232 turn_data = request .turn_data
230233 api_input_tokens , api_output_tokens = (
@@ -240,8 +243,9 @@ def evaluate_metric( # pylint: disable=too-many-locals
240243 metric_metadata = self ._extract_metadata_for_csv (request ),
241244 query = turn_data .query if turn_data else "" ,
242245 response = turn_data .response or "" if turn_data else "" ,
243- execution_time = execution_time ,
246+ evaluation_latency = evaluation_latency ,
244247 agent_latency = agent_latency ,
248+ execution_time = evaluation_latency + agent_latency ,
245249 api_input_tokens = api_input_tokens ,
246250 api_output_tokens = api_output_tokens ,
247251 # Streaming performance metrics
@@ -266,9 +270,8 @@ def evaluate_metric( # pylint: disable=too-many-locals
266270
267271 except EvaluationError as e :
268272 # Any evaluation error should result in ERROR status
269- execution_time = time .time () - start_time
270273 return self ._create_error_result (
271- request , f"Evaluation error: { e } " , execution_time
274+ request , f"Evaluation error: { e } " , start_time
272275 )
273276
274277 def _will_use_panel (self , metric_identifier : str ) -> bool :
@@ -720,14 +723,15 @@ def _evaluate_non_llm(
720723 )
721724
722725 def _create_error_result (
723- self , request : EvaluationRequest , reason : str , execution_time : float
726+ self , request : EvaluationRequest , reason : str , start_time : float
724727 ) -> EvaluationResult :
725728 """Create an ERROR result for failed evaluation."""
726729 turn_data = request .turn_data
727730 api_input_tokens , api_output_tokens = _calculate_api_token_counts_per_request (
728731 request
729732 )
730733 agent_latency = _calculate_agent_latency_per_request (request )
734+ evaluation_latency = _measure_latency (start_time )
731735 return EvaluationResult (
732736 conversation_group_id = request .conv_data .conversation_group_id ,
733737 tag = request .conv_data .tag ,
@@ -740,8 +744,9 @@ def _create_error_result(
740744 reason = reason ,
741745 query = turn_data .query if turn_data else "" ,
742746 response = turn_data .response or "" if turn_data else "" ,
743- execution_time = execution_time ,
747+ evaluation_latency = evaluation_latency ,
744748 agent_latency = agent_latency ,
749+ execution_time = evaluation_latency + agent_latency ,
745750 api_input_tokens = api_input_tokens ,
746751 api_output_tokens = api_output_tokens ,
747752 # Streaming performance metrics
0 commit comments