Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions config/system.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,7 @@ storage:
- "threshold"
- "reason"
- "execution_time"
- "evaluation_latency"
- "query"
- "response"
- "api_input_tokens"
Expand Down
1 change: 1 addition & 0 deletions src/lightspeed_evaluation/core/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@
"query",
"response",
"execution_time",
"evaluation_latency",
"api_input_tokens",
"api_output_tokens",
"judge_llm_input_tokens",
Expand Down
7 changes: 5 additions & 2 deletions src/lightspeed_evaluation/core/models/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -529,10 +529,13 @@ class EvaluationResult(MetricResult, StreamingMetricsMixin):
query: str = Field(default="", description="Query text")
response: str = Field(default="", description="Response text")
execution_time: float = Field(
default=0, ge=0, description="Execution time in seconds"
default=0.0, ge=0, description="Execution time for entire turn in seconds"
)
evaluation_latency: float = Field(
default=0.0, ge=0, description="Evaluation latency in seconds"
)
agent_latency: float = Field(
default=0,
default=0.0,
ge=0,
description="API latency in seconds (per turn or average for conversation)",
)
Expand Down
8 changes: 3 additions & 5 deletions src/lightspeed_evaluation/core/output/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,11 +271,8 @@ def _generate_csv_report(
for column in csv_columns:
if hasattr(result, column):
value = getattr(result, column)
# Special formatting for execution_time
if column == "execution_time" and value is not None:
row_data.append(f"{value:.3f}")
# Convert judge_scores to JSON string
elif column == "judge_scores" and value is not None:
if column == "judge_scores" and value is not None:
row_data.append(
json.dumps(
[js.model_dump() for js in value], default=str
Expand Down Expand Up @@ -817,7 +814,8 @@ def _result_to_json_dict(r: EvaluationResult) -> dict[str, Any]:
"result": r.result,
"score": r.score,
"threshold": r.threshold,
"execution_time": round(r.execution_time, 3),
"execution_time": r.execution_time,
"evaluation_latency": r.evaluation_latency,
Comment thread
xmican10 marked this conversation as resolved.
"judge_llm_input_tokens": r.judge_llm_input_tokens,
"judge_llm_output_tokens": r.judge_llm_output_tokens,
"judge_scores": (
Expand Down
2 changes: 2 additions & 0 deletions src/lightspeed_evaluation/core/storage/sql_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ class EvaluationResultDB(Base): # pylint: disable=too-few-public-methods
query = Column(Text, nullable=True)
response = Column(Text, nullable=True)
execution_time = Column(Float, nullable=True)
evaluation_latency = Column(Float, nullable=True)
Comment thread
xmican10 marked this conversation as resolved.
api_input_tokens = Column(Integer, nullable=True)
api_output_tokens = Column(Integer, nullable=True)
judge_llm_input_tokens = Column(Integer, nullable=True)
Expand Down Expand Up @@ -319,6 +320,7 @@ def _result_to_db_record(self, result: EvaluationResult) -> EvaluationResultDB:
query=result.query,
response=result.response,
execution_time=result.execution_time,
evaluation_latency=result.evaluation_latency,
api_input_tokens=result.api_input_tokens,
api_output_tokens=result.api_output_tokens,
judge_llm_input_tokens=result.judge_llm_input_tokens,
Expand Down
27 changes: 16 additions & 11 deletions src/lightspeed_evaluation/pipeline/evaluation/evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,11 @@ def _compute_agent_latency_per_request(request: EvaluationRequest) -> float:
return sum(latencies)


def _measure_latency(start_time: float) -> float:
    """Return the number of seconds elapsed since ``start_time``.

    The value is the difference between a fresh ``time.perf_counter()``
    reading and ``start_time``, so ``start_time`` should itself be a
    ``time.perf_counter()`` reading.
    """
    now = time.perf_counter()
    return now - start_time


class MetricsEvaluator:
"""Handles individual metric evaluation with proper scoring and status determination."""

Expand Down Expand Up @@ -155,7 +160,7 @@ def evaluate_metric( # pylint: disable=too-many-locals
EvaluationResult with score, result, token usage, and execution time,
or None if metric should be skipped (e.g., script metrics when API disabled).
"""
start_time = time.time()
start_time = time.perf_counter()

try:
# Create logging summary
Expand Down Expand Up @@ -184,9 +189,8 @@ def evaluate_metric( # pylint: disable=too-many-locals

# Route to appropriate handler
if framework not in self.handlers:
execution_time = time.time() - start_time
return self._create_error_result(
request, f"Unsupported framework: {framework}", execution_time
request, f"Unsupported framework: {framework}", start_time
)

# Check required data for metric (after API call); skip with ERROR if missing
Expand All @@ -198,11 +202,10 @@ def evaluate_metric( # pylint: disable=too-many-locals
request.turn_data, request.metric_identifier
)
if not ok:
execution_time = time.time() - start_time
logger.warning(
"Skipping metric due to missing required data: %s", msg
)
return self._create_error_result(request, msg, execution_time)
return self._create_error_result(request, msg, start_time)

# Create evaluation scope
evaluation_scope = EvaluationScope(
Expand All @@ -224,7 +227,7 @@ def evaluate_metric( # pylint: disable=too-many-locals
# Evaluate metric
metric_result = self._evaluate_wrapper(request, evaluation_scope, threshold)

execution_time = time.time() - start_time
evaluation_latency = _measure_latency(start_time)

turn_data = request.turn_data
api_input_tokens, api_output_tokens = _compute_api_token_counts_per_request(
Expand All @@ -240,8 +243,9 @@ def evaluate_metric( # pylint: disable=too-many-locals
metric_metadata=self._extract_metadata_for_csv(request),
query=turn_data.query if turn_data else "",
response=turn_data.response or "" if turn_data else "",
execution_time=execution_time,
evaluation_latency=evaluation_latency,
agent_latency=agent_latency,
execution_time=evaluation_latency + agent_latency,
api_input_tokens=api_input_tokens,
api_output_tokens=api_output_tokens,
# Streaming performance metrics
Expand All @@ -266,9 +270,8 @@ def evaluate_metric( # pylint: disable=too-many-locals

except EvaluationError as e:
# Any evaluation error should result in ERROR status
execution_time = time.time() - start_time
return self._create_error_result(
request, f"Evaluation error: {e}", execution_time
request, f"Evaluation error: {e}", start_time
)

def _will_use_panel(self, metric_identifier: str) -> bool:
Expand Down Expand Up @@ -720,14 +723,15 @@ def _evaluate_non_llm(
)

def _create_error_result(
self, request: EvaluationRequest, reason: str, execution_time: float
self, request: EvaluationRequest, reason: str, start_time: float
) -> EvaluationResult:
"""Create an ERROR result for failed evaluation."""
turn_data = request.turn_data
api_input_tokens, api_output_tokens = _compute_api_token_counts_per_request(
request
)
agent_latency = _compute_agent_latency_per_request(request)
evaluation_latency = _measure_latency(start_time)
return EvaluationResult(
conversation_group_id=request.conv_data.conversation_group_id,
tag=request.conv_data.tag,
Expand All @@ -740,8 +744,9 @@ def _create_error_result(
reason=reason,
query=turn_data.query if turn_data else "",
response=turn_data.response or "" if turn_data else "",
execution_time=execution_time,
evaluation_latency=evaluation_latency,
agent_latency=agent_latency,
execution_time=evaluation_latency + agent_latency,
api_input_tokens=api_input_tokens,
api_output_tokens=api_output_tokens,
# Streaming performance metrics
Expand Down
12 changes: 6 additions & 6 deletions tests/script/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def sample_evaluation_data() -> tuple[list[dict], list[dict]]:
"result": "PASS",
"score": 0.8,
"threshold": 0.7,
"execution_time": 1.0,
"evaluation_latency": 1.0,
},
{
"conversation_group_id": "conv1",
Expand All @@ -38,7 +38,7 @@ def sample_evaluation_data() -> tuple[list[dict], list[dict]]:
"result": "PASS",
"score": 0.9,
"threshold": 0.7,
"execution_time": 1.2,
"evaluation_latency": 1.2,
},
]

Expand All @@ -50,7 +50,7 @@ def sample_evaluation_data() -> tuple[list[dict], list[dict]]:
"result": "PASS",
"score": 0.85,
"threshold": 0.7,
"execution_time": 1.1,
"evaluation_latency": 1.1,
},
{
"conversation_group_id": "conv1",
Expand All @@ -59,7 +59,7 @@ def sample_evaluation_data() -> tuple[list[dict], list[dict]]:
"result": "FAIL",
"score": 0.6,
"threshold": 0.7,
"execution_time": 1.0,
"evaluation_latency": 1.0,
},
]

Expand Down Expand Up @@ -198,7 +198,7 @@ def sample_evaluation_summary() -> dict[str, Any]:
"result": "PASS",
"score": 0.95,
"threshold": 0.8,
"execution_time": 1.0,
"evaluation_latency": 1.0,
},
{
"conversation_group_id": "conv1",
Expand All @@ -207,7 +207,7 @@ def sample_evaluation_summary() -> dict[str, Any]:
"result": "PASS",
"score": 0.85,
"threshold": 0.7,
"execution_time": 1.2,
"evaluation_latency": 1.2,
},
]
* 5, # Repeat to get 10 results
Expand Down
8 changes: 4 additions & 4 deletions tests/unit/core/models/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -469,7 +469,7 @@ def test_default_values(self) -> None:
assert result.tag == "eval"
assert result.score is None
assert result.reason == ""
assert result.execution_time == 0
assert result.evaluation_latency == 0

def test_explicit_tag_value(self) -> None:
"""Test EvaluationResult with explicit tag value."""
Expand Down Expand Up @@ -507,16 +507,16 @@ def test_invalid_result_status_rejected(self) -> None:
threshold=0.7,
)

def test_negative_execution_time_rejected(self) -> None:
    """Test that negative execution_time is rejected."""
    # Only execution_time is invalid here, so the ValidationError can be
    # attributed to that field alone.
    with pytest.raises(ValidationError):
        EvaluationResult(
            conversation_group_id="conv1",
            turn_id="turn1",
            metric_identifier="metric1",
            result="PASS",
            threshold=0.7,
            execution_time=-1,
        )

def test_negative_evaluation_latency_rejected(self) -> None:
    """Test that negative evaluation_latency is rejected."""
    # Only evaluation_latency is invalid here, so this test fails if that
    # field stops rejecting negative values, independently of execution_time.
    with pytest.raises(ValidationError):
        EvaluationResult(
            conversation_group_id="conv1",
            turn_id="turn1",
            metric_identifier="metric1",
            result="PASS",
            threshold=0.7,
            evaluation_latency=-1,
        )

def test_conversation_level_metric_allows_none_turn_id(self) -> None:
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/core/models/test_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
"score": 0.85,
"threshold": 0.7,
"reason": "Good",
"execution_time": 1.0,
"evaluation_latency": 1.0,
"judge_llm_input_tokens": 100,
"judge_llm_output_tokens": 50,
}
Expand Down
6 changes: 3 additions & 3 deletions tests/unit/core/output/test_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,7 @@ def test_generate_csv_with_specific_results(self, tmp_path: Path) -> None:
reason="Score is 0.8",
query="What is OpenShift?",
response="OpenShift is a container platform.",
execution_time=1.5,
evaluation_latency=1.5,
),
EvaluationResult(
conversation_group_id="test_conv",
Expand All @@ -335,7 +335,7 @@ def test_generate_csv_with_specific_results(self, tmp_path: Path) -> None:
reason="Poor performance",
query="How to deploy?",
response="Use oc apply.",
execution_time=0.8,
evaluation_latency=0.8,
expected_response="Use oc apply -f deployment.yaml",
),
EvaluationResult(
Expand All @@ -348,7 +348,7 @@ def test_generate_csv_with_specific_results(self, tmp_path: Path) -> None:
reason="API connection failed",
query="Create namespace",
response="",
execution_time=0.0,
evaluation_latency=0.0,
),
]

Expand Down
9 changes: 5 additions & 4 deletions tests/unit/core/storage/test_sql_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def sample_result() -> EvaluationResult:
reason="Good response",
query="What is Python?",
response="Python is a programming language.",
execution_time=1.5,
evaluation_latency=1.5,
api_input_tokens=100,
api_output_tokens=50,
)
Expand Down Expand Up @@ -280,7 +280,7 @@ def test_all_fields_stored(self, temp_db_url: str) -> None:
reason="Excellent response",
query="Complex question?",
response="Detailed answer.",
execution_time=2.5,
evaluation_latency=2.5,
api_input_tokens=200,
api_output_tokens=150,
judge_llm_input_tokens=50,
Expand Down Expand Up @@ -309,7 +309,7 @@ def test_all_fields_stored(self, temp_db_url: str) -> None:
assert row is not None
assert row.conversation_group_id == "conv_full"
assert row.score == 0.92
assert row.execution_time == 2.5
assert row.evaluation_latency == 2.5
assert row.tool_calls == '[{"name": "search"}]'

def test_null_fields_handled(self, temp_db_url: str) -> None:
Expand Down Expand Up @@ -414,7 +414,7 @@ def test_all_csv_columns_present(self) -> None:
"reason",
"query",
"response",
"execution_time",
"evaluation_latency",
"api_input_tokens",
"api_output_tokens",
"judge_llm_input_tokens",
Expand All @@ -425,6 +425,7 @@ def test_all_csv_columns_present(self) -> None:
"streaming_duration",
"agent_latency",
"tokens_per_second",
"execution_time",
"tool_calls",
"contexts",
"expected_response",
Expand Down
16 changes: 16 additions & 0 deletions tests/unit/core/system/test_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,22 @@
class TestConfigLoader:
"""Unit tests for ConfigLoader."""

def test_load_default_system_yaml_loads_config_successfully(self) -> None:
    """Check that the default config/system.yaml loads without validation errors."""
    # Four levels above this file's directory, then into config/system.yaml.
    repo_root = Path(__file__).parents[4]
    default_system_yaml_path = repo_root / "config" / "system.yaml"
    assert (
        default_system_yaml_path.exists()
    ), f"system.yaml not found at {default_system_yaml_path}"

    config_loader = ConfigLoader()

    # Loading is expected to finish without raising a validation error.
    loaded_config = config_loader.load_system_config(str(default_system_yaml_path))

    # Sanity checks: a config object is returned and is also kept on the loader.
    assert loaded_config is not None
    assert config_loader.system_config is not None

def test_load_system_config_file_not_found(self) -> None:
"""Test loading non-existent config file raises error."""
loader = ConfigLoader()
Expand Down
21 changes: 21 additions & 0 deletions tests/unit/pipeline/evaluation/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,3 +252,24 @@ def processor(
) -> ConversationProcessor:
"""Create ConversationProcessor instance for PR tests."""
return ConversationProcessor(config_loader, processor_components_pr)


@pytest.fixture
def evaluator(
    config_loader: ConfigLoader,
    mock_metric_manager: MetricManager,
    mock_script_manager: ScriptExecutionManager,
    mocker: MockerFixture,
) -> MetricsEvaluator:
    """Build a MetricsEvaluator with its handler dependencies patched out.

    The LLM manager is mocked first via ``create_mock_llm_manager``; then each
    name listed below is patched in the evaluator module, in order, before the
    evaluator is constructed.
    """
    create_mock_llm_manager(mocker)

    evaluator_module = "lightspeed_evaluation.pipeline.evaluation.evaluator"
    patched_names = (
        "EmbeddingManager",
        "RagasMetrics",
        "DeepEvalMetrics",
        "CustomMetrics",
        "ScriptEvalMetrics",
        "NLPMetrics",
    )
    for patched_name in patched_names:
        mocker.patch(f"{evaluator_module}.{patched_name}")

    return MetricsEvaluator(config_loader, mock_metric_manager, mock_script_manager)
1 change: 1 addition & 0 deletions tests/unit/pipeline/evaluation/test_errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ def test_mark_turn_metrics_as_error(self) -> None:
assert results[0].reason == error_reason
assert results[0].query == "Test query"
assert results[0].response == ""
assert results[0].evaluation_latency == 0.0
Comment thread
xmican10 marked this conversation as resolved.
assert results[0].execution_time == 0.0

# Check second error result
Expand Down
Loading
Loading