diff --git a/config/system.yaml b/config/system.yaml index 4ff16404..1ad8873f 100644 --- a/config/system.yaml +++ b/config/system.yaml @@ -270,6 +270,7 @@ storage: - "threshold" - "reason" - "execution_time" + - "evaluation_latency" - "query" - "response" - "api_input_tokens" diff --git a/src/lightspeed_evaluation/core/constants.py b/src/lightspeed_evaluation/core/constants.py index bb4cf114..abafe343 100644 --- a/src/lightspeed_evaluation/core/constants.py +++ b/src/lightspeed_evaluation/core/constants.py @@ -100,6 +100,7 @@ "query", "response", "execution_time", + "evaluation_latency", "api_input_tokens", "api_output_tokens", "judge_llm_input_tokens", diff --git a/src/lightspeed_evaluation/core/models/data.py b/src/lightspeed_evaluation/core/models/data.py index 5dfbbca4..aeabbdc0 100644 --- a/src/lightspeed_evaluation/core/models/data.py +++ b/src/lightspeed_evaluation/core/models/data.py @@ -529,10 +529,13 @@ class EvaluationResult(MetricResult, StreamingMetricsMixin): query: str = Field(default="", description="Query text") response: str = Field(default="", description="Response text") execution_time: float = Field( - default=0, ge=0, description="Execution time in seconds" + default=0.0, ge=0, description="Execution time for entire turn in seconds" + ) + evaluation_latency: float = Field( + default=0.0, ge=0, description="Evaluation latency in seconds" ) agent_latency: float = Field( - default=0, + default=0.0, ge=0, description="API latency in seconds (per turn or average for conversation)", ) diff --git a/src/lightspeed_evaluation/core/output/generator.py b/src/lightspeed_evaluation/core/output/generator.py index 3ddfeb1e..8b56884b 100644 --- a/src/lightspeed_evaluation/core/output/generator.py +++ b/src/lightspeed_evaluation/core/output/generator.py @@ -271,11 +271,8 @@ def _generate_csv_report( for column in csv_columns: if hasattr(result, column): value = getattr(result, column) - # Special formatting for execution_time - if column == "execution_time" and value is not None: - row_data.append(f"{value:.3f}") # Convert judge_scores to JSON string - elif column == "judge_scores" and value is not None: + if column == "judge_scores" and value is not None: row_data.append( json.dumps( [js.model_dump() for js in value], default=str @@ -817,7 +814,8 @@ def _result_to_json_dict(r: EvaluationResult) -> dict[str, Any]: "result": r.result, "score": r.score, "threshold": r.threshold, - "execution_time": round(r.execution_time, 3), + "execution_time": r.execution_time, + "evaluation_latency": r.evaluation_latency, "judge_llm_input_tokens": r.judge_llm_input_tokens, "judge_llm_output_tokens": r.judge_llm_output_tokens, "judge_scores": ( diff --git a/src/lightspeed_evaluation/core/storage/sql_storage.py b/src/lightspeed_evaluation/core/storage/sql_storage.py index ae0b07b3..1cb545a8 100644 --- a/src/lightspeed_evaluation/core/storage/sql_storage.py +++ b/src/lightspeed_evaluation/core/storage/sql_storage.py @@ -58,6 +58,7 @@ class EvaluationResultDB(Base): # pylint: disable=too-few-public-methods query = Column(Text, nullable=True) response = Column(Text, nullable=True) execution_time = Column(Float, nullable=True) + evaluation_latency = Column(Float, nullable=True) api_input_tokens = Column(Integer, nullable=True) api_output_tokens = Column(Integer, nullable=True) judge_llm_input_tokens = Column(Integer, nullable=True) @@ -319,6 +320,7 @@ def _result_to_db_record(self, result: EvaluationResult) -> EvaluationResultDB: query=result.query, response=result.response, execution_time=result.execution_time, + evaluation_latency=result.evaluation_latency, api_input_tokens=result.api_input_tokens, api_output_tokens=result.api_output_tokens, judge_llm_input_tokens=result.judge_llm_input_tokens, diff --git a/src/lightspeed_evaluation/pipeline/evaluation/evaluator.py b/src/lightspeed_evaluation/pipeline/evaluation/evaluator.py index d447a7bf..f091266b 100644 --- a/src/lightspeed_evaluation/pipeline/evaluation/evaluator.py +++ b/src/lightspeed_evaluation/pipeline/evaluation/evaluator.py @@ -86,6 +86,11 @@ def _compute_agent_latency_per_request(request: EvaluationRequest) -> float: return sum(latencies) +def _measure_latency(start_time: float) -> float: + """Calculate evaluation latency given start time.""" + return time.perf_counter() - start_time + + class MetricsEvaluator: """Handles individual metric evaluation with proper scoring and status determination.""" @@ -155,7 +160,7 @@ def evaluate_metric( # pylint: disable=too-many-locals EvaluationResult with score, result, token usage, and execution time, or None if metric should be skipped (e.g., script metrics when API disabled). """ - start_time = time.time() + start_time = time.perf_counter() try: # Create logging summary @@ -184,9 +189,8 @@ def evaluate_metric( # pylint: disable=too-many-locals # Route to appropriate handler if framework not in self.handlers: - execution_time = time.time() - start_time return self._create_error_result( - request, f"Unsupported framework: {framework}", execution_time + request, f"Unsupported framework: {framework}", start_time ) # Check required data for metric (after API call); skip with ERROR if missing @@ -198,11 +202,10 @@ def evaluate_metric( # pylint: disable=too-many-locals request.turn_data, request.metric_identifier ) if not ok: - execution_time = time.time() - start_time logger.warning( "Skipping metric due to missing required data: %s", msg ) - return self._create_error_result(request, msg, execution_time) + return self._create_error_result(request, msg, start_time) # Create evaluation scope evaluation_scope = EvaluationScope( @@ -224,7 +227,7 @@ def evaluate_metric( # pylint: disable=too-many-locals # Evaluate metric metric_result = self._evaluate_wrapper(request, evaluation_scope, threshold) - execution_time = time.time() - start_time + evaluation_latency = _measure_latency(start_time) turn_data = request.turn_data api_input_tokens, api_output_tokens = _compute_api_token_counts_per_request( @@ -240,8 +243,9 @@ def evaluate_metric( # pylint: disable=too-many-locals metric_metadata=self._extract_metadata_for_csv(request), query=turn_data.query if turn_data else "", response=turn_data.response or "" if turn_data else "", - execution_time=execution_time, + evaluation_latency=evaluation_latency, agent_latency=agent_latency, + execution_time=evaluation_latency + agent_latency, api_input_tokens=api_input_tokens, api_output_tokens=api_output_tokens, # Streaming performance metrics @@ -266,9 +270,8 @@ def evaluate_metric( # pylint: disable=too-many-locals except EvaluationError as e: # Any evaluation error should result in ERROR status - execution_time = time.time() - start_time return self._create_error_result( - request, f"Evaluation error: {e}", execution_time + request, f"Evaluation error: {e}", start_time ) def _will_use_panel(self, metric_identifier: str) -> bool: @@ -720,7 +723,7 @@ def _evaluate_non_llm( ) def _create_error_result( - self, request: EvaluationRequest, reason: str, execution_time: float + self, request: EvaluationRequest, reason: str, start_time: float ) -> EvaluationResult: """Create an ERROR result for failed evaluation.""" turn_data = request.turn_data @@ -728,6 +731,7 @@ def _create_error_result( request ) agent_latency = _compute_agent_latency_per_request(request) + evaluation_latency = _measure_latency(start_time) return EvaluationResult( conversation_group_id=request.conv_data.conversation_group_id, tag=request.conv_data.tag, @@ -740,8 +744,9 @@ def _create_error_result( reason=reason, query=turn_data.query if turn_data else "", response=turn_data.response or "" if turn_data else "", - execution_time=execution_time, + evaluation_latency=evaluation_latency, agent_latency=agent_latency, + execution_time=evaluation_latency + agent_latency, api_input_tokens=api_input_tokens, api_output_tokens=api_output_tokens, # Streaming performance metrics diff --git a/tests/script/conftest.py b/tests/script/conftest.py index 752800a2..ad002dee 100644 --- a/tests/script/conftest.py +++ b/tests/script/conftest.py @@ -29,7 +29,7 @@ def sample_evaluation_data() -> tuple[list[dict], list[dict]]: "result": "PASS", "score": 0.8, "threshold": 0.7, - "execution_time": 1.0, + "evaluation_latency": 1.0, }, { "conversation_group_id": "conv1", @@ -38,7 +38,7 @@ def sample_evaluation_data() -> tuple[list[dict], list[dict]]: "result": "PASS", "score": 0.9, "threshold": 0.7, - "execution_time": 1.2, + "evaluation_latency": 1.2, }, ] @@ -50,7 +50,7 @@ def sample_evaluation_data() -> tuple[list[dict], list[dict]]: "result": "PASS", "score": 0.85, "threshold": 0.7, - "execution_time": 1.1, + "evaluation_latency": 1.1, }, { "conversation_group_id": "conv1", @@ -59,7 +59,7 @@ def sample_evaluation_data() -> tuple[list[dict], list[dict]]: "result": "FAIL", "score": 0.6, "threshold": 0.7, - "execution_time": 1.0, + "evaluation_latency": 1.0, }, ] @@ -198,7 +198,7 @@ def sample_evaluation_summary() -> dict[str, Any]: "result": "PASS", "score": 0.95, "threshold": 0.8, - "execution_time": 1.0, + "evaluation_latency": 1.0, }, { "conversation_group_id": "conv1", @@ -207,7 +207,7 @@ def sample_evaluation_summary() -> dict[str, Any]: "result": "PASS", "score": 0.85, "threshold": 0.7, - "execution_time": 1.2, + "evaluation_latency": 1.2, }, ] * 5, # Repeat to get 10 results diff --git a/tests/unit/core/models/test_data.py b/tests/unit/core/models/test_data.py index da8588e6..10c882fa 100644 --- a/tests/unit/core/models/test_data.py +++ b/tests/unit/core/models/test_data.py @@ -469,7 +469,7 @@ def test_default_values(self) -> None: assert result.tag == "eval" assert result.score is None assert result.reason == "" - assert result.execution_time == 0 + assert result.evaluation_latency == 0 def test_explicit_tag_value(self) -> None: """Test EvaluationResult with explicit tag value.""" @@ -507,8 +507,8 @@ def test_invalid_result_status_rejected(self) -> None: threshold=0.7, ) - def test_negative_execution_time_rejected(self) -> None: - """Test that negative execution_time is rejected.""" + def test_negative_evaluation_latency_rejected(self) -> None: + """Test that negative evaluation_latency is rejected.""" with pytest.raises(ValidationError): EvaluationResult( conversation_group_id="conv1", @@ -516,7 +516,7 @@ def test_negative_execution_time_rejected(self) -> None: metric_identifier="metric1", result="PASS", threshold=0.7, - execution_time=-1, + evaluation_latency=-1, ) def test_conversation_level_metric_allows_none_turn_id(self) -> None: diff --git a/tests/unit/core/models/test_summary.py b/tests/unit/core/models/test_summary.py index 3e53d697..c84a6f01 100644 --- a/tests/unit/core/models/test_summary.py +++ b/tests/unit/core/models/test_summary.py @@ -26,7 +26,7 @@ "score": 0.85, "threshold": 0.7, "reason": "Good", - "execution_time": 1.0, + "evaluation_latency": 1.0, "judge_llm_input_tokens": 100, "judge_llm_output_tokens": 50, } diff --git a/tests/unit/core/output/test_generator.py b/tests/unit/core/output/test_generator.py index 21ffd1ec..c8aa29a3 100644 --- a/tests/unit/core/output/test_generator.py +++ b/tests/unit/core/output/test_generator.py @@ -323,7 +323,7 @@ def test_generate_csv_with_specific_results(self, tmp_path: Path) -> None: reason="Score is 0.8", query="What is OpenShift?", response="OpenShift is a container platform.", - execution_time=1.5, + evaluation_latency=1.5, ), EvaluationResult( conversation_group_id="test_conv", @@ -335,7 +335,7 @@ def test_generate_csv_with_specific_results(self, tmp_path: Path) -> None: reason="Poor performance", query="How to deploy?", response="Use oc apply.", - execution_time=0.8, + evaluation_latency=0.8, expected_response="Use oc apply -f deployment.yaml", ), EvaluationResult( @@ -348,7 +348,7 @@ def test_generate_csv_with_specific_results(self, tmp_path: Path) -> None: reason="API connection failed", query="Create namespace", response="", - execution_time=0.0, + evaluation_latency=0.0, ), ] diff --git a/tests/unit/core/storage/test_sql_storage.py b/tests/unit/core/storage/test_sql_storage.py index 29493135..d79b698a 100644 --- a/tests/unit/core/storage/test_sql_storage.py +++ b/tests/unit/core/storage/test_sql_storage.py @@ -50,7 +50,7 @@ def sample_result() -> EvaluationResult: reason="Good response", query="What is Python?", response="Python is a programming language.", - execution_time=1.5, + evaluation_latency=1.5, api_input_tokens=100, api_output_tokens=50, ) @@ -280,7 +280,7 @@ def test_all_fields_stored(self, temp_db_url: str) -> None: reason="Excellent response", query="Complex question?", response="Detailed answer.", - execution_time=2.5, + evaluation_latency=2.5, api_input_tokens=200, api_output_tokens=150, judge_llm_input_tokens=50, @@ -309,7 +309,7 @@ def test_all_fields_stored(self, temp_db_url: str) -> None: assert row is not None assert row.conversation_group_id == "conv_full" assert row.score == 0.92 - assert row.execution_time == 2.5 + assert row.evaluation_latency == 2.5 assert row.tool_calls == '[{"name": "search"}]' def test_null_fields_handled(self, temp_db_url: str) -> None: @@ -414,7 +414,7 @@ def test_all_csv_columns_present(self) -> None: "reason", "query", "response", - "execution_time", + "evaluation_latency", "api_input_tokens", "api_output_tokens", "judge_llm_input_tokens", @@ -425,6 +425,7 @@ def test_all_csv_columns_present(self) -> None: "streaming_duration", "agent_latency", "tokens_per_second", + "execution_time", "tool_calls", "contexts", "expected_response", diff --git a/tests/unit/core/system/test_loader.py b/tests/unit/core/system/test_loader.py index b2108152..86799bcd 100644 --- a/tests/unit/core/system/test_loader.py +++ b/tests/unit/core/system/test_loader.py @@ -15,6 +15,22 @@ class TestConfigLoader: """Unit tests for ConfigLoader.""" + def test_load_default_system_yaml_loads_config_successfully(self) -> None: + """Test that default config/system.yaml can be loaded without validation errors.""" + default_system_yaml_path = Path(__file__).parents[4] / "config" / "system.yaml" + assert ( + default_system_yaml_path.exists() + ), f"system.yaml not found at {default_system_yaml_path}" + + loader = ConfigLoader() + + # This should not raise any validation errors + config = loader.load_system_config(str(default_system_yaml_path)) + + # Basic sanity checks + assert config is not None + assert loader.system_config is not None + def test_load_system_config_file_not_found(self) -> None: """Test loading non-existent config file raises error.""" loader = ConfigLoader() diff --git a/tests/unit/pipeline/evaluation/conftest.py b/tests/unit/pipeline/evaluation/conftest.py index 2d06d16a..959713a2 100644 --- a/tests/unit/pipeline/evaluation/conftest.py +++ b/tests/unit/pipeline/evaluation/conftest.py @@ -252,3 +252,24 @@ def processor( ) -> ConversationProcessor: """Create ConversationProcessor instance for PR tests.""" return ConversationProcessor(config_loader, processor_components_pr) + + +@pytest.fixture +def evaluator( + config_loader: ConfigLoader, + mock_metric_manager: MetricManager, + mock_script_manager: ScriptExecutionManager, + mocker: MockerFixture, +) -> MetricsEvaluator: + """Create MetricsEvaluator with all handlers mocked.""" + create_mock_llm_manager(mocker) + mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.EmbeddingManager") + mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.RagasMetrics") + mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.DeepEvalMetrics") + mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.CustomMetrics") + mocker.patch( + "lightspeed_evaluation.pipeline.evaluation.evaluator.ScriptEvalMetrics" + ) + mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.NLPMetrics") + + return MetricsEvaluator(config_loader, mock_metric_manager, mock_script_manager) diff --git a/tests/unit/pipeline/evaluation/test_errors.py b/tests/unit/pipeline/evaluation/test_errors.py index b8477c08..04f97222 100644 --- a/tests/unit/pipeline/evaluation/test_errors.py +++ b/tests/unit/pipeline/evaluation/test_errors.py @@ -157,6 +157,7 @@ def test_mark_turn_metrics_as_error(self) -> None: assert results[0].reason == error_reason assert results[0].query == "Test query" assert results[0].response == "" + assert results[0].evaluation_latency == 0.0 assert results[0].execution_time == 0.0 # Check second error result diff --git a/tests/unit/pipeline/evaluation/test_evaluator.py b/tests/unit/pipeline/evaluation/test_evaluator.py index 2d93fd32..82b63555 100644 --- a/tests/unit/pipeline/evaluation/test_evaluator.py +++ b/tests/unit/pipeline/evaluation/test_evaluator.py @@ -1,4 +1,4 @@ -# pylint: disable=protected-access,redefined-outer-name,too-many-arguments,too-many-positional-arguments,too-many-lines +# pylint: disable=protected-access,redefined-outer-name,too-many-arguments,too-many-positional-arguments,too-many-lines, too-many-public-methods """Unit tests for pipeline evaluation evaluator module.""" @@ -21,41 +21,17 @@ from lightspeed_evaluation.core.script import ScriptExecutionManager from lightspeed_evaluation.pipeline.evaluation.evaluator import MetricsEvaluator -from tests.unit.pipeline.evaluation.conftest import create_mock_llm_manager - class TestMetricsEvaluator: """Unit tests for MetricsEvaluator.""" def test_initialization( self, + evaluator: MetricsEvaluator, config_loader: ConfigLoader, mock_metric_manager: MetricManager, - mock_script_manager: ScriptExecutionManager, - mocker: MockerFixture, ) -> None: """Test evaluator initialization.""" - # Mock the metric handlers - create_mock_llm_manager(mocker) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.EmbeddingManager" - ) - mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.RagasMetrics") - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.DeepEvalMetrics" - ) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.CustomMetrics" - ) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.ScriptEvalMetrics" - ) - mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.NLPMetrics") - - evaluator = MetricsEvaluator( - config_loader, mock_metric_manager, mock_script_manager - ) - assert evaluator.config_loader == config_loader assert evaluator.metric_manager == mock_metric_manager assert ( @@ -74,39 +50,9 @@ def test_initialization_raises_error_without_config( with pytest.raises(RuntimeError, match="Uninitialized system_config"): MetricsEvaluator(loader, mock_metric_manager, mock_script_manager) - def test_evaluate_metric_turn_level_pass( - self, - config_loader: ConfigLoader, - mock_metric_manager: MetricManager, - mock_script_manager: ScriptExecutionManager, - mocker: MockerFixture, - ) -> None: + def test_evaluate_metric_turn_level_pass(self, evaluator: MetricsEvaluator) -> None: """Test evaluating turn-level metric that passes.""" - # Mock the handlers - create_mock_llm_manager(mocker) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.EmbeddingManager" - ) - - mock_ragas = mocker.Mock() - mock_ragas.evaluate.return_value = (0.85, "Good faithfulness") - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.RagasMetrics", - return_value=mock_ragas, - ) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.DeepEvalMetrics" - ) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.CustomMetrics" - ) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.ScriptEvalMetrics" - ) - - evaluator = MetricsEvaluator( - config_loader, mock_metric_manager, mock_script_manager - ) + evaluator.handlers["ragas"].evaluate.return_value = (0.85, "Good faithfulness") turn_data = TurnData( turn_id="1", @@ -115,7 +61,6 @@ def test_evaluate_metric_turn_level_pass( contexts=["Context"], ) conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn_data]) - request = EvaluationRequest.for_turn( conv_data, "ragas:faithfulness", 0, turn_data ) @@ -130,49 +75,26 @@ def test_evaluate_metric_turn_level_pass( assert result.conversation_group_id == "test_conv" assert result.turn_id == "1" assert result.metric_identifier == "ragas:faithfulness" - assert result.query == "What is Python?" assert result.response == "Python is a programming language." assert result.contexts == '["Context"]' + # Verify execution_time is populated + assert result.execution_time >= 0.0 + assert result.evaluation_latency >= 0.0 + assert result.agent_latency >= 0.0 + assert result.execution_time == result.evaluation_latency + result.agent_latency - def test_evaluate_metric_turn_level_fail( - self, - config_loader: ConfigLoader, - mock_metric_manager: MetricManager, - mock_script_manager: ScriptExecutionManager, - mocker: MockerFixture, - ) -> None: + def test_evaluate_metric_turn_level_fail(self, evaluator: MetricsEvaluator) -> None: """Test evaluating turn-level metric that fails.""" - create_mock_llm_manager(mocker) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.EmbeddingManager" - ) - - mock_ragas = mocker.Mock() - mock_ragas.evaluate.return_value = (0.3, "Low faithfulness score") - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.RagasMetrics", - return_value=mock_ragas, - ) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.DeepEvalMetrics" - ) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.CustomMetrics" - ) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.ScriptEvalMetrics" - ) - - evaluator = MetricsEvaluator( - config_loader, mock_metric_manager, mock_script_manager + evaluator.handlers["ragas"].evaluate.return_value = ( + 0.3, + "Low faithfulness score", ) turn_data = TurnData( turn_id="1", query="Query", response="Response", contexts=["Context"] ) conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn_data]) - request = EvaluationRequest.for_turn( conv_data, "ragas:faithfulness", 0, turn_data ) @@ -185,35 +107,10 @@ def test_evaluate_metric_turn_level_fail( assert result.threshold == 0.7 def test_evaluate_metric_missing_required_data_returns_error( - self, - config_loader: ConfigLoader, - mock_metric_manager: MetricManager, - mock_script_manager: ScriptExecutionManager, - mocker: MockerFixture, + self, evaluator: MetricsEvaluator ) -> None: """When required data is missing or empty, return ERROR and skip metric processing.""" - create_mock_llm_manager(mocker) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.EmbeddingManager" - ) - mock_ragas = mocker.Mock() - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.RagasMetrics", - return_value=mock_ragas, - ) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.DeepEvalMetrics" - ) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.CustomMetrics" - ) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.ScriptEvalMetrics" - ) - - evaluator = MetricsEvaluator( - config_loader, mock_metric_manager, mock_script_manager - ) + mock_ragas = evaluator.handlers["ragas"] turn_data = TurnData( turn_id="1", @@ -236,34 +133,12 @@ def test_evaluate_metric_missing_required_data_returns_error( mock_ragas.evaluate.assert_not_called() def test_evaluate_metric_conversation_level( - self, - config_loader: ConfigLoader, - mock_metric_manager: MetricManager, - mock_script_manager: ScriptExecutionManager, - mocker: MockerFixture, + self, evaluator: MetricsEvaluator ) -> None: """Test evaluating conversation-level metric.""" - create_mock_llm_manager(mocker) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.EmbeddingManager" - ) - mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.RagasMetrics") - - mock_deepeval = mocker.Mock() - mock_deepeval.evaluate.return_value = (0.75, "Complete conversation") - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.DeepEvalMetrics", - return_value=mock_deepeval, - ) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.CustomMetrics" - ) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.ScriptEvalMetrics" - ) - - evaluator = MetricsEvaluator( - config_loader, mock_metric_manager, mock_script_manager + evaluator.handlers["deepeval"].evaluate.return_value = ( + 0.75, + "Complete conversation", ) turn1 = TurnData( @@ -283,12 +158,12 @@ def test_evaluate_metric_conversation_level( conv_data = EvaluationData( conversation_group_id="test_conv", turns=[turn1, turn2] ) - request = EvaluationRequest.for_conversation( conv_data, "deepeval:conversation_completeness" ) result = evaluator.evaluate_metric(request) + assert result is not None assert result.result == "PASS" assert result.score == 0.75 @@ -297,32 +172,9 @@ def test_evaluate_metric_conversation_level( assert result.api_output_tokens == 20 def test_evaluate_metric_unsupported_framework( - self, - config_loader: ConfigLoader, - mock_metric_manager: MetricManager, - mock_script_manager: ScriptExecutionManager, - mocker: MockerFixture, + self, evaluator: MetricsEvaluator ) -> None: """Test unsupported framework returns ERROR and aggregates API tokens across turns.""" - create_mock_llm_manager(mocker) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.EmbeddingManager" - ) - mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.RagasMetrics") - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.DeepEvalMetrics" - ) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.CustomMetrics" - ) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.ScriptEvalMetrics" - ) - - evaluator = MetricsEvaluator( - config_loader, mock_metric_manager, mock_script_manager - ) - turn1 = TurnData( turn_id="1", query="Q", @@ -352,41 +204,13 @@ def test_evaluate_metric_unsupported_framework( assert result.api_output_tokens == 5 def test_evaluate_metric_returns_none_score( - self, - config_loader: ConfigLoader, - mock_metric_manager: MetricManager, - mock_script_manager: ScriptExecutionManager, - mocker: MockerFixture, + self, evaluator: MetricsEvaluator ) -> None: """Test handling when metric evaluation returns None score.""" - create_mock_llm_manager(mocker) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.EmbeddingManager" - ) - - mock_ragas = mocker.Mock() - mock_ragas.evaluate.return_value = (None, "Evaluation failed") - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.RagasMetrics", - return_value=mock_ragas, - ) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.DeepEvalMetrics" - ) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.CustomMetrics" - ) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.ScriptEvalMetrics" - ) - - evaluator = MetricsEvaluator( - config_loader, mock_metric_manager, mock_script_manager - ) + evaluator.handlers["ragas"].evaluate.return_value = (None, "Evaluation failed") turn_data = TurnData(turn_id="1", query="Q", response="R", contexts=["C"]) conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn_data]) - request = EvaluationRequest.for_turn( conv_data, "ragas:faithfulness", 0, turn_data ) @@ -399,45 +223,19 @@ def test_evaluate_metric_returns_none_score( assert result.reason == "Evaluation failed" def test_evaluate_metric_exception_handling( - self, - config_loader: ConfigLoader, - mock_metric_manager: MetricManager, - mock_script_manager: ScriptExecutionManager, - mocker: MockerFixture, + self, evaluator: MetricsEvaluator ) -> None: """Test exception handling during metric evaluation. Note: Even on error, turn data fields (query, response, contexts) should be preserved in the result for debugging and analysis purposes. """ - create_mock_llm_manager(mocker) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.EmbeddingManager" - ) - - mock_ragas = mocker.Mock() - mock_ragas.evaluate.side_effect = EvaluationError("Unexpected error") - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.RagasMetrics", - return_value=mock_ragas, - ) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.DeepEvalMetrics" - ) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.CustomMetrics" - ) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.ScriptEvalMetrics" - ) - - evaluator = MetricsEvaluator( - config_loader, mock_metric_manager, mock_script_manager + evaluator.handlers["ragas"].evaluate.side_effect = EvaluationError( + "Unexpected error" ) turn_data = TurnData(turn_id="1", query="Q", response="R", contexts=["C"]) conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn_data]) - request = EvaluationRequest.for_turn( conv_data, "ragas:faithfulness", 0, turn_data ) @@ -448,214 +246,57 @@ def test_evaluate_metric_exception_handling( assert result.result == "ERROR" assert "Evaluation error" in result.reason assert "Unexpected error" in result.reason - # Turn data should be preserved even on error for debugging assert result.query == "Q" assert result.response == "R" - assert result.contexts == '["C"]' # JSON-serialized contexts preserved on error - assert result.expected_response is None # Was not set in turn_data + assert result.contexts == '["C"]' + assert result.expected_response is None def test_evaluate_metric_skip_script_when_api_disabled( - self, - config_loader: ConfigLoader, - mock_metric_manager: MetricManager, - mock_script_manager: ScriptExecutionManager, - mocker: MockerFixture, + self, evaluator: MetricsEvaluator, config_loader: ConfigLoader ) -> None: """Test script metrics are skipped when API is disabled.""" assert config_loader.system_config is not None config_loader.system_config.agents = None - create_mock_llm_manager(mocker) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.EmbeddingManager" - ) - mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.RagasMetrics") - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.DeepEvalMetrics" - ) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.CustomMetrics" - ) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.ScriptEvalMetrics" - ) - - evaluator = MetricsEvaluator( - config_loader, mock_metric_manager, mock_script_manager - ) - turn_data = TurnData(turn_id="1", query="Q", response="R") conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn_data]) - request = EvaluationRequest.for_turn( conv_data, "script:action_eval", 0, turn_data ) result = evaluator.evaluate_metric(request) - # Should return None when API is disabled for script metrics - assert result is None + assert ( + result is None + ) # Should return None when API is disabled for script metrics - def test_determine_status_with_threshold( - self, - config_loader: ConfigLoader, - mock_metric_manager: MetricManager, - mock_script_manager: ScriptExecutionManager, - mocker: MockerFixture, - ) -> None: + def test_determine_status_with_threshold(self, evaluator: MetricsEvaluator) -> None: """Test _determine_status method.""" - create_mock_llm_manager(mocker) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.EmbeddingManager" - ) - mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.RagasMetrics") - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.DeepEvalMetrics" - ) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.CustomMetrics" - ) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.ScriptEvalMetrics" - ) - - evaluator = MetricsEvaluator( - config_loader, mock_metric_manager, mock_script_manager - ) - - # Test PASS assert evaluator._determine_status(0.8, 0.7) == "PASS" assert evaluator._determine_status(0.7, 0.7) == "PASS" # Equal passes - - # Test FAIL assert evaluator._determine_status(0.6, 0.7) == "FAIL" def test_determine_status_without_threshold( - self, - config_loader: ConfigLoader, - mock_metric_manager: MetricManager, - mock_script_manager: ScriptExecutionManager, - mocker: MockerFixture, + self, evaluator: MetricsEvaluator ) -> None: """Test _determine_status uses default 0.5 when threshold is None.""" - create_mock_llm_manager(mocker) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.EmbeddingManager" - ) - mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.RagasMetrics") - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.DeepEvalMetrics" - ) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.CustomMetrics" - ) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.ScriptEvalMetrics" - ) - - evaluator = MetricsEvaluator( - config_loader, mock_metric_manager, mock_script_manager - ) - - # Should use 0.5 as default assert evaluator._determine_status(0.6, None) == "PASS" assert evaluator._determine_status(0.4, None) == "FAIL" - def _setup_evaluate_test( - self, - config_loader: ConfigLoader, - mock_metric_manager: MetricManager, - mock_script_manager: ScriptExecutionManager, - mocker: MockerFixture, - mock_return: tuple[float, str] | list[tuple[float, str]], - ) -> tuple[MetricsEvaluator, dict]: - """Helper to setup common mocks for _evaluate_wrapper() tests. - - Returns: - tuple: (evaluator, mock_handlers) where mock_handlers is a dict with keys: - 'ragas', 'geval', 'custom', 'script', 'nlp' - """ - create_mock_llm_manager(mocker) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.EmbeddingManager" - ) - - # Create a helper to setup mock with return values - def create_mock_handler( # type: ignore[no-untyped-def] - mocker: MockerFixture, - mock_return: tuple[float, str] | list[tuple[float, str]], - ): - mock = mocker.Mock() - if isinstance(mock_return, list): - mock.evaluate.side_effect = mock_return - else: - mock.evaluate.return_value = mock_return - return mock - - # Setup all handler mocks - mock_ragas = create_mock_handler(mocker, mock_return) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.RagasMetrics", - return_value=mock_ragas, - ) - - mock_deepeval = create_mock_handler(mocker, mock_return) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.DeepEvalMetrics", - return_value=mock_deepeval, - ) - - mock_custom = create_mock_handler(mocker, mock_return) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.CustomMetrics", - return_value=mock_custom, - ) - - mock_nlp = create_mock_handler(mocker, mock_return) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.NLPMetrics", - return_value=mock_nlp, - ) - - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.ScriptEvalMetrics" - ) - - evaluator = MetricsEvaluator( - config_loader, mock_metric_manager, mock_script_manager - ) - - # Return evaluator and dict of all mocks - mock_handlers = { - "ragas": mock_ragas, - "geval": mock_deepeval, - "custom": mock_custom, - "nlp": mock_nlp, - } - - return evaluator, mock_handlers - @pytest.mark.parametrize( "metric_identifier", ["ragas:context_recall", "custom:answer_correctness", "nlp:rouge"], ) def test_evaluate_with_expected_response_list( - self, - config_loader: ConfigLoader, - mock_metric_manager: MetricManager, - mock_script_manager: ScriptExecutionManager, - mocker: MockerFixture, - metric_identifier: str, + self, evaluator: MetricsEvaluator, metric_identifier: str ) -> None: """Test _evaluate_wrapper() with list expected_response for metric that requires it.""" - evaluator, mock_handlers = self._setup_evaluate_test( - config_loader, - mock_metric_manager, - mock_script_manager, - mocker, - [(0.3, "Low score"), (0.85, "High score")], - ) + framework = metric_identifier.split(":")[0] + evaluator.handlers[framework].evaluate.side_effect = [ + (0.3, "Low score"), + (0.85, "High score"), + ] turn_data = TurnData( turn_id="1", @@ -673,27 +314,14 @@ def test_evaluate_with_expected_response_list( assert metric_result.score == 0.85 assert metric_result.reason == "High score" assert metric_result.result == "PASS" - - # Check the appropriate handler was called based on metric framework - framework = metric_identifier.split(":")[0] - assert mock_handlers[framework].evaluate.call_count == 2 + assert evaluator.handlers[framework].evaluate.call_count == 2 def test_evaluate_with_expected_response_list_fail( - self, - config_loader: ConfigLoader, - mock_metric_manager: MetricManager, - mock_script_manager: ScriptExecutionManager, - mocker: MockerFixture, + self, evaluator: MetricsEvaluator ) -> None: """Test _evaluate_wrapper() with list expected_response for metric that requires it.""" scores_reasons = [(0.3, "Score 1"), (0.65, "Score 2"), (0.45, "Score 3")] - evaluator, mock_handlers = self._setup_evaluate_test( - config_loader, - mock_metric_manager, - mock_script_manager, - mocker, - scores_reasons, - ) + evaluator.handlers["ragas"].evaluate.side_effect = scores_reasons turn_data = TurnData( turn_id="1", @@ -716,23 +344,13 @@ def test_evaluate_with_expected_response_list_fail( assert metric_result.score == 0.65 assert metric_result.reason == reason_combined assert metric_result.result == "FAIL" - assert mock_handlers["ragas"].evaluate.call_count == 3 + assert evaluator.handlers["ragas"].evaluate.call_count == 3 def test_evaluate_with_expected_response_string( - self, - config_loader: ConfigLoader, - mock_metric_manager: MetricManager, - mock_script_manager: ScriptExecutionManager, - mocker: MockerFixture, + self, evaluator: MetricsEvaluator ) -> None: """Test _evaluate_wrapper() with string expected_response.""" - evaluator, mock_handlers = self._setup_evaluate_test( - config_loader, - mock_metric_manager, - mock_script_manager, - mocker, - (0.85, "Good score"), - ) + evaluator.handlers["ragas"].evaluate.return_value = (0.85, "Good score") turn_data = TurnData( turn_id="1", query="Q", response="R", expected_response="A", contexts=["C"] @@ -748,7 +366,7 @@ def test_evaluate_with_expected_response_string( assert metric_result.score == 0.85 assert metric_result.reason == "Good score" assert metric_result.result == "PASS" - assert mock_handlers["ragas"].evaluate.call_count == 1 + assert evaluator.handlers["ragas"].evaluate.call_count == 1 @pytest.mark.parametrize( "metric_identifier", ["ragas:faithfulness", "geval:technical_accuracy"] @@ -760,21 +378,16 @@ def test_evaluate_with_expected_response_string( ) def test_evaluate_with_expected_response_not_needed( self, - config_loader: ConfigLoader, - mock_metric_manager: MetricManager, - mock_script_manager: ScriptExecutionManager, - mocker: MockerFixture, + evaluator: MetricsEvaluator, metric_identifier: str, expected_response: str | list[str] | None, ) -> None: """Test _evaluate_wrapper() with metric that does not require expected_response.""" - evaluator, mock_handlers = self._setup_evaluate_test( - config_loader, - mock_metric_manager, - mock_script_manager, - mocker, - [(0.3, "Low score"), (0.85, "High score")], - ) + framework = metric_identifier.split(":")[0] + evaluator.handlers[framework].evaluate.side_effect = [ + (0.3, "Low score"), + (0.85, "High score"), + ] turn_data = TurnData( turn_id="1", @@ -792,54 +405,21 @@ def test_evaluate_with_expected_response_not_needed( assert metric_result.score == 0.3 assert metric_result.reason == "Low score" assert metric_result.result == "FAIL" - - # Check the appropriate handler was called based on metric - framework = metric_identifier.split(":")[0] - assert mock_handlers[framework].evaluate.call_count == 1 + assert evaluator.handlers[framework].evaluate.call_count == 1 def test_evaluate_multiple_expected_responses_error_preserves_tokens( - self, - config_loader: ConfigLoader, - mock_metric_manager: MetricManager, - mock_script_manager: ScriptExecutionManager, - mocker: MockerFixture, + self, evaluator: MetricsEvaluator, mocker: MockerFixture ) -> None: """Test token preservation when error occurs during multiple expected responses evaluation. Scenario: First iteration succeeds with tokens, second iteration fails. Expected: Error result should preserve tokens from first iteration. """ - create_mock_llm_manager(mocker) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.EmbeddingManager" - ) - - # First call returns valid result with tokens, second call raises exception - mock_ragas = mocker.Mock() - mock_ragas.evaluate.side_effect = [ + evaluator.handlers["ragas"].evaluate.side_effect = [ (0.3, "First iteration failed threshold"), EvaluationError("LLM error in second iteration"), ] - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.RagasMetrics", - return_value=mock_ragas, - ) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.DeepEvalMetrics" - ) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.CustomMetrics" - ) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.ScriptEvalMetrics" - ) - mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.NLPMetrics") - - evaluator = MetricsEvaluator( - config_loader, mock_metric_manager, mock_script_manager - ) - # Mock token tracker to simulate tokens from first iteration original_evaluate = evaluator._evaluate def mock_evaluate_with_tokens( @@ -849,7 +429,6 @@ def mock_evaluate_with_tokens( threshold: Optional[float], ) -> MetricResult: result = original_evaluate(request, scope, token_tracker, threshold) - # Simulate tokens were added after each successful call result.judge_llm_input_tokens = 150 result.judge_llm_output_tokens = 50 return result @@ -873,60 +452,25 @@ def mock_evaluate_with_tokens( result = evaluator.evaluate_metric(request) assert result is not None - # With consistent flow, both iterations complete and accumulate tokens - # Result is not ERROR because we get partial results (first iteration FAIL, second ERROR) - # The highest score from iterations is preserved assert result.score == 0.3 # From first iteration - # Tokens accumulated from both iterations (first success + second failure mock) assert result.judge_llm_input_tokens == 300 # 150 + 150 assert result.judge_llm_output_tokens == 100 # 50 + 50 - # Error from second iteration captured in accumulated reason assert "error" in result.reason.lower() def test_evaluate_single_path_error_preserves_tokens( - self, - config_loader: ConfigLoader, - mock_metric_manager: MetricManager, - mock_script_manager: ScriptExecutionManager, - mocker: MockerFixture, + self, evaluator: MetricsEvaluator ) -> None: """Test token preservation when error occurs in single evaluation path. Scenario: Single evaluation call fails but tokens were tracked. Expected: Error result should preserve any tokens captured. """ - create_mock_llm_manager(mocker) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.EmbeddingManager" - ) - - mock_ragas = mocker.Mock() - mock_ragas.evaluate.side_effect = EvaluationError("LLM connection failed") - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.RagasMetrics", - return_value=mock_ragas, - ) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.DeepEvalMetrics" - ) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.CustomMetrics" - ) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.ScriptEvalMetrics" - ) - mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.NLPMetrics") - - evaluator = MetricsEvaluator( - config_loader, mock_metric_manager, mock_script_manager + evaluator.handlers["ragas"].evaluate.side_effect = EvaluationError( + "LLM connection failed" ) turn_data = TurnData( - turn_id="1", - query="Q", - response="R", - expected_response="A", # Single expected response - contexts=["C"], + turn_id="1", query="Q", response="R", expected_response="A", contexts=["C"] ) conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn_data]) request = EvaluationRequest.for_turn( @@ -938,46 +482,17 @@ def test_evaluate_single_path_error_preserves_tokens( assert result is not None assert result.result == "ERROR" assert "LLM connection failed" in result.reason - # Token counts should be present (even if 0) assert result.judge_llm_input_tokens >= 0 assert result.judge_llm_output_tokens >= 0 def test_multiple_expected_responses_error_no_double_counting( - self, - config_loader: ConfigLoader, - mock_metric_manager: MetricManager, - mock_script_manager: ScriptExecutionManager, - mocker: MockerFixture, + self, evaluator: MetricsEvaluator, mocker: MockerFixture ) -> None: """Test token counts use deltas not cumulative totals when error on iteration 2+.""" - create_mock_llm_manager(mocker) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.EmbeddingManager" - ) - - mock_ragas = mocker.Mock() - mock_ragas.evaluate.side_effect = [ + evaluator.handlers["ragas"].evaluate.side_effect = [ (0.3, "First iteration"), EvaluationError("Second iteration failed"), ] - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.RagasMetrics", - return_value=mock_ragas, - ) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.DeepEvalMetrics" - ) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.CustomMetrics" - ) - mocker.patch( - "lightspeed_evaluation.pipeline.evaluation.evaluator.ScriptEvalMetrics" - ) - mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.NLPMetrics") - - evaluator = MetricsEvaluator( - config_loader, mock_metric_manager, mock_script_manager - ) call_count = [0] @@ -989,7 +504,6 @@ def mock_evaluate_with_tokens( ) -> MetricResult: call_count[0] += 1 if call_count[0] == 1: - # Iteration 1: add tokens and return success token_tracker.add_judge_tokens(100, 50) token_tracker.add_embedding_tokens(20) return MetricResult( @@ -1001,7 +515,6 @@ def mock_evaluate_with_tokens( judge_llm_output_tokens=50, embedding_tokens=20, ) - # Iteration 2: add tokens then raise error token_tracker.add_judge_tokens(150, 75) token_tracker.add_embedding_tokens(30) raise EvaluationError("Second iteration failed") @@ -1030,3 +543,63 @@ def mock_evaluate_with_tokens( assert result.judge_llm_input_tokens == 250 # 100+150 assert result.judge_llm_output_tokens == 125 # 50+75 assert result.embedding_tokens == 50 # 20+30 + + def test_execution_time_calculation(self, evaluator: MetricsEvaluator) -> None: + """Test execution_time is correctly calculated as evaluation_latency + agent_latency.""" + mock_ragas = evaluator.handlers["ragas"] + mock_ragas.evaluate.return_value = (0.85, "Good score") + + turn_data = TurnData( + turn_id="1", query="Q", response="R", contexts=["C"], agent_latency=1.5 + ) + conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn_data]) + request = EvaluationRequest.for_turn( + conv_data, "ragas:faithfulness", 0, turn_data + ) + + result = evaluator.evaluate_metric(request) + + assert result is not None + assert result.agent_latency == 1.5 + assert result.evaluation_latency > 0.0 + assert result.execution_time == result.evaluation_latency + result.agent_latency + assert result.execution_time >= 1.5 + + def test_execution_time_in_error_result(self, evaluator: MetricsEvaluator) -> None: + """Test execution_time is populated even in ERROR results.""" + turn_data = TurnData(turn_id="1", query="Q", response="R", agent_latency=2.0) + conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn_data]) + request = EvaluationRequest.for_turn(conv_data, "unknown:metric", 0, turn_data) + + result = evaluator.evaluate_metric(request) + + assert result is not None + assert result.result == "ERROR" + assert result.agent_latency == 2.0 + assert result.evaluation_latency > 0.0 + assert result.execution_time == result.evaluation_latency + result.agent_latency + assert result.execution_time >= 2.0 + + def test_execution_time_conversation_level_sums_agent_latency( + self, evaluator: MetricsEvaluator + ) -> None: + """Test execution_time uses the summed agent_latency for conversation-level metrics.""" + mock_deepeval = evaluator.handlers["deepeval"] + mock_deepeval.evaluate.return_value = (0.75, "Good conversation") + + turn1 = TurnData(turn_id="1", query="Q1", response="R1", agent_latency=1.0) + turn2 = TurnData(turn_id="2", query="Q2", response="R2", agent_latency=3.0) + conv_data = EvaluationData( + conversation_group_id="test_conv", turns=[turn1, turn2] + ) + request = EvaluationRequest.for_conversation( + conv_data, "deepeval:conversation_completeness" + ) + + result = evaluator.evaluate_metric(request) + + assert result is not None + assert result.agent_latency == 4.0 + assert result.evaluation_latency > 0.0 + assert result.execution_time == result.evaluation_latency + result.agent_latency + assert result.execution_time >= 4.0