Skip to content

Commit 3eed19a

Browse files
committed
Execution time and evaluation latency impl
1 parent 2d96f6d commit 3eed19a

15 files changed

Lines changed: 205 additions & 588 deletions

File tree

config/system.yaml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -270,6 +270,7 @@ storage:
270270
- "threshold"
271271
- "reason"
272272
- "execution_time"
273+
- "evaluation_latency"
273274
- "query"
274275
- "response"
275276
- "api_input_tokens"

src/lightspeed_evaluation/core/constants.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -100,6 +100,7 @@
100100
"query",
101101
"response",
102102
"execution_time",
103+
"evaluation_latency",
103104
"api_input_tokens",
104105
"api_output_tokens",
105106
"judge_llm_input_tokens",

src/lightspeed_evaluation/core/models/data.py

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -529,10 +529,13 @@ class EvaluationResult(MetricResult, StreamingMetricsMixin):
529529
query: str = Field(default="", description="Query text")
530530
response: str = Field(default="", description="Response text")
531531
execution_time: float = Field(
532-
default=0, ge=0, description="Execution time in seconds"
532+
default=0.0, ge=0, description="Execution time for entire turn in seconds"
533+
)
534+
evaluation_latency: float = Field(
535+
default=0.0, ge=0, description="Evaluation latency in seconds"
533536
)
534537
agent_latency: float = Field(
535-
default=0,
538+
default=0.0,
536539
ge=0,
537540
description="API latency in seconds (per turn or average for conversation)",
538541
)

src/lightspeed_evaluation/core/output/generator.py

Lines changed: 2 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -269,11 +269,8 @@ def _generate_csv_report(
269269
for column in csv_columns:
270270
if hasattr(result, column):
271271
value = getattr(result, column)
272-
# Special formatting for execution_time
273-
if column == "execution_time" and value is not None:
274-
row_data.append(f"{value:.3f}")
275272
# Convert judge_scores to JSON string
276-
elif column == "judge_scores" and value is not None:
273+
if column == "judge_scores" and value is not None:
277274
row_data.append(
278275
json.dumps(
279276
[js.model_dump() for js in value], default=str
@@ -822,7 +819,7 @@ def _result_to_json_dict(r: EvaluationResult) -> dict[str, Any]:
822819
"result": r.result,
823820
"score": r.score,
824821
"threshold": r.threshold,
825-
"execution_time": round(r.execution_time, 3),
822+
"evaluation_latency": r.evaluation_latency,
826823
"judge_llm_input_tokens": r.judge_llm_input_tokens,
827824
"judge_llm_output_tokens": r.judge_llm_output_tokens,
828825
"judge_scores": (

src/lightspeed_evaluation/core/storage/sql_storage.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -57,7 +57,7 @@ class EvaluationResultDB(Base): # pylint: disable=too-few-public-methods
5757
reason = Column(Text, nullable=True)
5858
query = Column(Text, nullable=True)
5959
response = Column(Text, nullable=True)
60-
execution_time = Column(Float, nullable=True)
60+
evaluation_latency = Column(Float, nullable=True)
6161
api_input_tokens = Column(Integer, nullable=True)
6262
api_output_tokens = Column(Integer, nullable=True)
6363
judge_llm_input_tokens = Column(Integer, nullable=True)
@@ -318,7 +318,7 @@ def _result_to_db_record(self, result: EvaluationResult) -> EvaluationResultDB:
318318
reason=result.reason,
319319
query=result.query,
320320
response=result.response,
321-
execution_time=result.execution_time,
321+
evaluation_latency=result.evaluation_latency,
322322
api_input_tokens=result.api_input_tokens,
323323
api_output_tokens=result.api_output_tokens,
324324
judge_llm_input_tokens=result.judge_llm_input_tokens,

src/lightspeed_evaluation/pipeline/evaluation/evaluator.py

Lines changed: 16 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -86,6 +86,11 @@ def _calculate_agent_latency_per_request(request: EvaluationRequest) -> float:
8686
return sum(latencies) / len(latencies) if latencies else 0.0
8787

8888

89+
def _measure_latency(start_time: float) -> float:
90+
"""Calculate evaluation latency given start time."""
91+
return time.perf_counter() - start_time
92+
93+
8994
class MetricsEvaluator:
9095
"""Handles individual metric evaluation with proper scoring and status determination."""
9196

@@ -155,7 +160,7 @@ def evaluate_metric( # pylint: disable=too-many-locals
155160
EvaluationResult with score, result, token usage, and execution time,
156161
or None if metric should be skipped (e.g., script metrics when API disabled).
157162
"""
158-
start_time = time.time()
163+
start_time = time.perf_counter()
159164

160165
try:
161166
# Create logging summary
@@ -184,9 +189,8 @@ def evaluate_metric( # pylint: disable=too-many-locals
184189

185190
# Route to appropriate handler
186191
if framework not in self.handlers:
187-
execution_time = time.time() - start_time
188192
return self._create_error_result(
189-
request, f"Unsupported framework: {framework}", execution_time
193+
request, f"Unsupported framework: {framework}", start_time
190194
)
191195

192196
# Check required data for metric (after API call); skip with ERROR if missing
@@ -198,11 +202,10 @@ def evaluate_metric( # pylint: disable=too-many-locals
198202
request.turn_data, request.metric_identifier
199203
)
200204
if not ok:
201-
execution_time = time.time() - start_time
202205
logger.warning(
203206
"Skipping metric due to missing required data: %s", msg
204207
)
205-
return self._create_error_result(request, msg, execution_time)
208+
return self._create_error_result(request, msg, start_time)
206209

207210
# Create evaluation scope
208211
evaluation_scope = EvaluationScope(
@@ -224,7 +227,7 @@ def evaluate_metric( # pylint: disable=too-many-locals
224227
# Evaluate metric
225228
metric_result = self._evaluate_wrapper(request, evaluation_scope, threshold)
226229

227-
execution_time = time.time() - start_time
230+
evaluation_latency = _measure_latency(start_time)
228231

229232
turn_data = request.turn_data
230233
api_input_tokens, api_output_tokens = (
@@ -240,8 +243,9 @@ def evaluate_metric( # pylint: disable=too-many-locals
240243
metric_metadata=self._extract_metadata_for_csv(request),
241244
query=turn_data.query if turn_data else "",
242245
response=turn_data.response or "" if turn_data else "",
243-
execution_time=execution_time,
246+
evaluation_latency=evaluation_latency,
244247
agent_latency=agent_latency,
248+
execution_time=evaluation_latency + agent_latency,
245249
api_input_tokens=api_input_tokens,
246250
api_output_tokens=api_output_tokens,
247251
# Streaming performance metrics
@@ -266,9 +270,8 @@ def evaluate_metric( # pylint: disable=too-many-locals
266270

267271
except EvaluationError as e:
268272
# Any evaluation error should result in ERROR status
269-
execution_time = time.time() - start_time
270273
return self._create_error_result(
271-
request, f"Evaluation error: {e}", execution_time
274+
request, f"Evaluation error: {e}", start_time
272275
)
273276

274277
def _will_use_panel(self, metric_identifier: str) -> bool:
@@ -720,14 +723,15 @@ def _evaluate_non_llm(
720723
)
721724

722725
def _create_error_result(
723-
self, request: EvaluationRequest, reason: str, execution_time: float
726+
self, request: EvaluationRequest, reason: str, start_time: float
724727
) -> EvaluationResult:
725728
"""Create an ERROR result for failed evaluation."""
726729
turn_data = request.turn_data
727730
api_input_tokens, api_output_tokens = _calculate_api_token_counts_per_request(
728731
request
729732
)
730733
agent_latency = _calculate_agent_latency_per_request(request)
734+
evaluation_latency = _measure_latency(start_time)
731735
return EvaluationResult(
732736
conversation_group_id=request.conv_data.conversation_group_id,
733737
tag=request.conv_data.tag,
@@ -740,8 +744,9 @@ def _create_error_result(
740744
reason=reason,
741745
query=turn_data.query if turn_data else "",
742746
response=turn_data.response or "" if turn_data else "",
743-
execution_time=execution_time,
747+
evaluation_latency=evaluation_latency,
744748
agent_latency=agent_latency,
749+
execution_time=evaluation_latency + agent_latency,
745750
api_input_tokens=api_input_tokens,
746751
api_output_tokens=api_output_tokens,
747752
# Streaming performance metrics

tests/script/conftest.py

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -29,7 +29,7 @@ def sample_evaluation_data() -> tuple[list[dict], list[dict]]:
2929
"result": "PASS",
3030
"score": 0.8,
3131
"threshold": 0.7,
32-
"execution_time": 1.0,
32+
"evaluation_latency": 1.0,
3333
},
3434
{
3535
"conversation_group_id": "conv1",
@@ -38,7 +38,7 @@ def sample_evaluation_data() -> tuple[list[dict], list[dict]]:
3838
"result": "PASS",
3939
"score": 0.9,
4040
"threshold": 0.7,
41-
"execution_time": 1.2,
41+
"evaluation_latency": 1.2,
4242
},
4343
]
4444

@@ -50,7 +50,7 @@ def sample_evaluation_data() -> tuple[list[dict], list[dict]]:
5050
"result": "PASS",
5151
"score": 0.85,
5252
"threshold": 0.7,
53-
"execution_time": 1.1,
53+
"evaluation_latency": 1.1,
5454
},
5555
{
5656
"conversation_group_id": "conv1",
@@ -59,7 +59,7 @@ def sample_evaluation_data() -> tuple[list[dict], list[dict]]:
5959
"result": "FAIL",
6060
"score": 0.6,
6161
"threshold": 0.7,
62-
"execution_time": 1.0,
62+
"evaluation_latency": 1.0,
6363
},
6464
]
6565

@@ -198,7 +198,7 @@ def sample_evaluation_summary() -> dict[str, Any]:
198198
"result": "PASS",
199199
"score": 0.95,
200200
"threshold": 0.8,
201-
"execution_time": 1.0,
201+
"evaluation_latency": 1.0,
202202
},
203203
{
204204
"conversation_group_id": "conv1",
@@ -207,7 +207,7 @@ def sample_evaluation_summary() -> dict[str, Any]:
207207
"result": "PASS",
208208
"score": 0.85,
209209
"threshold": 0.7,
210-
"execution_time": 1.2,
210+
"evaluation_latency": 1.2,
211211
},
212212
]
213213
* 5, # Repeat to get 10 results

tests/unit/core/models/test_data.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -469,7 +469,7 @@ def test_default_values(self) -> None:
469469
assert result.tag == "eval"
470470
assert result.score is None
471471
assert result.reason == ""
472-
assert result.execution_time == 0
472+
assert result.evaluation_latency == 0
473473

474474
def test_explicit_tag_value(self) -> None:
475475
"""Test EvaluationResult with explicit tag value."""
@@ -507,16 +507,16 @@ def test_invalid_result_status_rejected(self) -> None:
507507
threshold=0.7,
508508
)
509509

510-
def test_negative_execution_time_rejected(self) -> None:
511-
"""Test that negative execution_time is rejected."""
510+
def test_negative_evaluation_latency_rejected(self) -> None:
511+
"""Test that negative evaluation_latency is rejected."""
512512
with pytest.raises(ValidationError):
513513
EvaluationResult(
514514
conversation_group_id="conv1",
515515
turn_id="turn1",
516516
metric_identifier="metric1",
517517
result="PASS",
518518
threshold=0.7,
519-
execution_time=-1,
519+
evaluation_latency=-1,
520520
)
521521

522522
def test_conversation_level_metric_allows_none_turn_id(self) -> None:

tests/unit/core/models/test_summary.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,7 @@
2525
"score": 0.85,
2626
"threshold": 0.7,
2727
"reason": "Good",
28-
"execution_time": 1.0,
28+
"evaluation_latency": 1.0,
2929
"judge_llm_input_tokens": 100,
3030
"judge_llm_output_tokens": 50,
3131
}

tests/unit/core/output/test_generator.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -323,7 +323,7 @@ def test_generate_csv_with_specific_results(self, tmp_path: Path) -> None:
323323
reason="Score is 0.8",
324324
query="What is OpenShift?",
325325
response="OpenShift is a container platform.",
326-
execution_time=1.5,
326+
evaluation_latency=1.5,
327327
),
328328
EvaluationResult(
329329
conversation_group_id="test_conv",
@@ -335,7 +335,7 @@ def test_generate_csv_with_specific_results(self, tmp_path: Path) -> None:
335335
reason="Poor performance",
336336
query="How to deploy?",
337337
response="Use oc apply.",
338-
execution_time=0.8,
338+
evaluation_latency=0.8,
339339
expected_response="Use oc apply -f deployment.yaml",
340340
),
341341
EvaluationResult(
@@ -348,7 +348,7 @@ def test_generate_csv_with_specific_results(self, tmp_path: Path) -> None:
348348
reason="API connection failed",
349349
query="Create namespace",
350350
response="",
351-
execution_time=0.0,
351+
evaluation_latency=0.0,
352352
),
353353
]
354354

0 commit comments

Comments
 (0)