diff --git a/config/system.yaml b/config/system.yaml
index 4ff16404..1ad8873f 100644
--- a/config/system.yaml
+++ b/config/system.yaml
@@ -270,6 +270,7 @@ storage:
       - "threshold"
       - "reason"
       - "execution_time"
+      - "evaluation_latency"
       - "query"
       - "response"
       - "api_input_tokens"
diff --git a/src/lightspeed_evaluation/core/constants.py b/src/lightspeed_evaluation/core/constants.py
index bb4cf114..abafe343 100644
--- a/src/lightspeed_evaluation/core/constants.py
+++ b/src/lightspeed_evaluation/core/constants.py
@@ -100,6 +100,7 @@
     "query",
     "response",
     "execution_time",
+    "evaluation_latency",
     "api_input_tokens",
     "api_output_tokens",
     "judge_llm_input_tokens",
diff --git a/src/lightspeed_evaluation/core/models/data.py b/src/lightspeed_evaluation/core/models/data.py
index 5dfbbca4..aeabbdc0 100644
--- a/src/lightspeed_evaluation/core/models/data.py
+++ b/src/lightspeed_evaluation/core/models/data.py
@@ -529,10 +529,13 @@ class EvaluationResult(MetricResult, StreamingMetricsMixin):
     query: str = Field(default="", description="Query text")
     response: str = Field(default="", description="Response text")
     execution_time: float = Field(
-        default=0, ge=0, description="Execution time in seconds"
+        default=0.0, ge=0, description="Execution time for entire turn in seconds"
+    )
+    evaluation_latency: float = Field(
+        default=0.0, ge=0, description="Evaluation latency in seconds"
     )
     agent_latency: float = Field(
-        default=0,
+        default=0.0,
         ge=0,
         description="API latency in seconds (per turn or average for conversation)",
     )
diff --git a/src/lightspeed_evaluation/core/output/generator.py b/src/lightspeed_evaluation/core/output/generator.py
index 3ddfeb1e..8b56884b 100644
--- a/src/lightspeed_evaluation/core/output/generator.py
+++ b/src/lightspeed_evaluation/core/output/generator.py
@@ -271,11 +271,8 @@ def _generate_csv_report(
                 for column in csv_columns:
                     if hasattr(result, column):
                         value = getattr(result, column)
-                        # Special formatting for execution_time
-                        if column == "execution_time" and value is not None:
-                            row_data.append(f"{value:.3f}")
                         # Convert judge_scores to JSON string
-                        elif column == "judge_scores" and value is not None:
+                        if column == "judge_scores" and value is not None:
                             row_data.append(
                                 json.dumps(
                                     [js.model_dump() for js in value], default=str
@@ -817,7 +814,8 @@ def _result_to_json_dict(r: EvaluationResult) -> dict[str, Any]:
         "result": r.result,
         "score": r.score,
         "threshold": r.threshold,
-        "execution_time": round(r.execution_time, 3),
+        "execution_time": r.execution_time,
+        "evaluation_latency": r.evaluation_latency,
         "judge_llm_input_tokens": r.judge_llm_input_tokens,
         "judge_llm_output_tokens": r.judge_llm_output_tokens,
         "judge_scores": (
diff --git a/src/lightspeed_evaluation/core/storage/sql_storage.py b/src/lightspeed_evaluation/core/storage/sql_storage.py
index ae0b07b3..1cb545a8 100644
--- a/src/lightspeed_evaluation/core/storage/sql_storage.py
+++ b/src/lightspeed_evaluation/core/storage/sql_storage.py
@@ -58,6 +58,7 @@ class EvaluationResultDB(Base):  # pylint: disable=too-few-public-methods
     query = Column(Text, nullable=True)
     response = Column(Text, nullable=True)
     execution_time = Column(Float, nullable=True)
+    evaluation_latency = Column(Float, nullable=True)
     api_input_tokens = Column(Integer, nullable=True)
     api_output_tokens = Column(Integer, nullable=True)
     judge_llm_input_tokens = Column(Integer, nullable=True)
@@ -319,6 +320,7 @@ def _result_to_db_record(self, result: EvaluationResult) -> EvaluationResultDB:
             query=result.query,
             response=result.response,
             execution_time=result.execution_time,
+            evaluation_latency=result.evaluation_latency,
             api_input_tokens=result.api_input_tokens,
             api_output_tokens=result.api_output_tokens,
             judge_llm_input_tokens=result.judge_llm_input_tokens,
diff --git a/src/lightspeed_evaluation/pipeline/evaluation/evaluator.py b/src/lightspeed_evaluation/pipeline/evaluation/evaluator.py
index d447a7bf..f091266b 100644
--- a/src/lightspeed_evaluation/pipeline/evaluation/evaluator.py
+++ b/src/lightspeed_evaluation/pipeline/evaluation/evaluator.py
@@ -86,6 +86,11 @@ def _compute_agent_latency_per_request(request: EvaluationRequest) -> float:
     return sum(latencies)
 
 
+def _measure_latency(start_time: float) -> float:
+    """Return seconds elapsed since start_time (a time.perf_counter() reading)."""
+    return time.perf_counter() - start_time
+
+
 class MetricsEvaluator:
     """Handles individual metric evaluation with proper scoring and status determination."""
 
@@ -155,7 +160,7 @@ def evaluate_metric(  # pylint: disable=too-many-locals
             EvaluationResult with score, result, token usage, and execution time,
             or None if metric should be skipped (e.g., script metrics when API disabled).
         """
-        start_time = time.time()
+        start_time = time.perf_counter()
 
         try:
             # Create logging summary
@@ -184,9 +189,8 @@ def evaluate_metric(  # pylint: disable=too-many-locals
 
             # Route to appropriate handler
             if framework not in self.handlers:
-                execution_time = time.time() - start_time
                 return self._create_error_result(
-                    request, f"Unsupported framework: {framework}", execution_time
+                    request, f"Unsupported framework: {framework}", start_time
                 )
 
             # Check required data for metric (after API call); skip with ERROR if missing
@@ -198,11 +202,10 @@ def evaluate_metric(  # pylint: disable=too-many-locals
                     request.turn_data, request.metric_identifier
                 )
                 if not ok:
-                    execution_time = time.time() - start_time
                     logger.warning(
                         "Skipping metric due to missing required data: %s", msg
                     )
-                    return self._create_error_result(request, msg, execution_time)
+                    return self._create_error_result(request, msg, start_time)
 
             # Create evaluation scope
             evaluation_scope = EvaluationScope(
@@ -224,7 +227,7 @@ def evaluate_metric(  # pylint: disable=too-many-locals
             # Evaluate metric
             metric_result = self._evaluate_wrapper(request, evaluation_scope, threshold)
 
-            execution_time = time.time() - start_time
+            evaluation_latency = _measure_latency(start_time)
 
             turn_data = request.turn_data
             api_input_tokens, api_output_tokens = _compute_api_token_counts_per_request(
@@ -240,8 +243,9 @@ def evaluate_metric(  # pylint: disable=too-many-locals
                 metric_metadata=self._extract_metadata_for_csv(request),
                 query=turn_data.query if turn_data else "",
                 response=turn_data.response or "" if turn_data else "",
-                execution_time=execution_time,
+                evaluation_latency=evaluation_latency,
                 agent_latency=agent_latency,
+                execution_time=evaluation_latency + agent_latency,
                 api_input_tokens=api_input_tokens,
                 api_output_tokens=api_output_tokens,
                 # Streaming performance metrics
@@ -266,9 +270,8 @@ def evaluate_metric(  # pylint: disable=too-many-locals
 
         except EvaluationError as e:
             # Any evaluation error should result in ERROR status
-            execution_time = time.time() - start_time
             return self._create_error_result(
-                request, f"Evaluation error: {e}", execution_time
+                request, f"Evaluation error: {e}", start_time
             )
 
     def _will_use_panel(self, metric_identifier: str) -> bool:
@@ -720,7 +723,7 @@ def _evaluate_non_llm(
         )
 
     def _create_error_result(
-        self, request: EvaluationRequest, reason: str, execution_time: float
+        self, request: EvaluationRequest, reason: str, start_time: float
     ) -> EvaluationResult:
         """Create an ERROR result for failed evaluation."""
         turn_data = request.turn_data
@@ -728,6 +731,7 @@ def _create_error_result(
             request
         )
         agent_latency = _compute_agent_latency_per_request(request)
+        evaluation_latency = _measure_latency(start_time)
         return EvaluationResult(
             conversation_group_id=request.conv_data.conversation_group_id,
             tag=request.conv_data.tag,
@@ -740,8 +744,9 @@ def _create_error_result(
             reason=reason,
             query=turn_data.query if turn_data else "",
             response=turn_data.response or "" if turn_data else "",
-            execution_time=execution_time,
+            evaluation_latency=evaluation_latency,
             agent_latency=agent_latency,
+            execution_time=evaluation_latency + agent_latency,
             api_input_tokens=api_input_tokens,
             api_output_tokens=api_output_tokens,
             # Streaming performance metrics
diff --git a/tests/script/conftest.py b/tests/script/conftest.py
index 752800a2..ad002dee 100644
--- a/tests/script/conftest.py
+++ b/tests/script/conftest.py
@@ -29,7 +29,7 @@ def sample_evaluation_data() -> tuple[list[dict], list[dict]]:
             "result": "PASS",
             "score": 0.8,
             "threshold": 0.7,
-            "execution_time": 1.0,
+            "evaluation_latency": 1.0,
         },
         {
             "conversation_group_id": "conv1",
@@ -38,7 +38,7 @@ def sample_evaluation_data() -> tuple[list[dict], list[dict]]:
             "result": "PASS",
             "score": 0.9,
             "threshold": 0.7,
-            "execution_time": 1.2,
+            "evaluation_latency": 1.2,
         },
     ]
 
@@ -50,7 +50,7 @@ def sample_evaluation_data() -> tuple[list[dict], list[dict]]:
             "result": "PASS",
             "score": 0.85,
             "threshold": 0.7,
-            "execution_time": 1.1,
+            "evaluation_latency": 1.1,
         },
         {
             "conversation_group_id": "conv1",
@@ -59,7 +59,7 @@ def sample_evaluation_data() -> tuple[list[dict], list[dict]]:
             "result": "FAIL",
             "score": 0.6,
             "threshold": 0.7,
-            "execution_time": 1.0,
+            "evaluation_latency": 1.0,
         },
     ]
 
@@ -198,7 +198,7 @@ def sample_evaluation_summary() -> dict[str, Any]:
                 "result": "PASS",
                 "score": 0.95,
                 "threshold": 0.8,
-                "execution_time": 1.0,
+                "evaluation_latency": 1.0,
             },
             {
                 "conversation_group_id": "conv1",
@@ -207,7 +207,7 @@ def sample_evaluation_summary() -> dict[str, Any]:
                 "result": "PASS",
                 "score": 0.85,
                 "threshold": 0.7,
-                "execution_time": 1.2,
+                "evaluation_latency": 1.2,
             },
         ]
         * 5,  # Repeat to get 10 results
diff --git a/tests/unit/core/models/test_data.py b/tests/unit/core/models/test_data.py
index da8588e6..10c882fa 100644
--- a/tests/unit/core/models/test_data.py
+++ b/tests/unit/core/models/test_data.py
@@ -469,7 +469,7 @@ def test_default_values(self) -> None:
         assert result.tag == "eval"
         assert result.score is None
         assert result.reason == ""
-        assert result.execution_time == 0
+        assert result.evaluation_latency == 0
 
     def test_explicit_tag_value(self) -> None:
         """Test EvaluationResult with explicit tag value."""
@@ -507,8 +507,8 @@ def test_invalid_result_status_rejected(self) -> None:
                 threshold=0.7,
             )
 
-    def test_negative_execution_time_rejected(self) -> None:
-        """Test that negative execution_time is rejected."""
+    def test_negative_evaluation_latency_rejected(self) -> None:
+        """Test that negative evaluation_latency is rejected."""
         with pytest.raises(ValidationError):
             EvaluationResult(
                 conversation_group_id="conv1",
@@ -516,7 +516,7 @@ def test_negative_execution_time_rejected(self) -> None:
                 metric_identifier="metric1",
                 result="PASS",
                 threshold=0.7,
-                execution_time=-1,
+                evaluation_latency=-1,
             )
 
     def test_conversation_level_metric_allows_none_turn_id(self) -> None:
diff --git a/tests/unit/core/models/test_summary.py b/tests/unit/core/models/test_summary.py
index 3e53d697..c84a6f01 100644
--- a/tests/unit/core/models/test_summary.py
+++ b/tests/unit/core/models/test_summary.py
@@ -26,7 +26,7 @@
     "score": 0.85,
     "threshold": 0.7,
     "reason": "Good",
-    "execution_time": 1.0,
+    "evaluation_latency": 1.0,
     "judge_llm_input_tokens": 100,
     "judge_llm_output_tokens": 50,
 }
diff --git a/tests/unit/core/output/test_generator.py b/tests/unit/core/output/test_generator.py
index 21ffd1ec..c8aa29a3 100644
--- a/tests/unit/core/output/test_generator.py
+++ b/tests/unit/core/output/test_generator.py
@@ -323,7 +323,7 @@ def test_generate_csv_with_specific_results(self, tmp_path: Path) -> None:
                 reason="Score is 0.8",
                 query="What is OpenShift?",
                 response="OpenShift is a container platform.",
-                execution_time=1.5,
+                evaluation_latency=1.5,
             ),
             EvaluationResult(
                 conversation_group_id="test_conv",
@@ -335,7 +335,7 @@ def test_generate_csv_with_specific_results(self, tmp_path: Path) -> None:
                 reason="Poor performance",
                 query="How to deploy?",
                 response="Use oc apply.",
-                execution_time=0.8,
+                evaluation_latency=0.8,
                 expected_response="Use oc apply -f deployment.yaml",
             ),
             EvaluationResult(
@@ -348,7 +348,7 @@ def test_generate_csv_with_specific_results(self, tmp_path: Path) -> None:
                 reason="API connection failed",
                 query="Create namespace",
                 response="",
-                execution_time=0.0,
+                evaluation_latency=0.0,
             ),
         ]
 
diff --git a/tests/unit/core/storage/test_sql_storage.py b/tests/unit/core/storage/test_sql_storage.py
index 29493135..d79b698a 100644
--- a/tests/unit/core/storage/test_sql_storage.py
+++ b/tests/unit/core/storage/test_sql_storage.py
@@ -50,7 +50,7 @@ def sample_result() -> EvaluationResult:
         reason="Good response",
         query="What is Python?",
         response="Python is a programming language.",
-        execution_time=1.5,
+        evaluation_latency=1.5,
         api_input_tokens=100,
         api_output_tokens=50,
     )
@@ -280,7 +280,7 @@ def test_all_fields_stored(self, temp_db_url: str) -> None:
             reason="Excellent response",
             query="Complex question?",
             response="Detailed answer.",
-            execution_time=2.5,
+            evaluation_latency=2.5,
             api_input_tokens=200,
             api_output_tokens=150,
             judge_llm_input_tokens=50,
@@ -309,7 +309,7 @@ def test_all_fields_stored(self, temp_db_url: str) -> None:
         assert row is not None
         assert row.conversation_group_id == "conv_full"
         assert row.score == 0.92
-        assert row.execution_time == 2.5
+        assert row.evaluation_latency == 2.5
         assert row.tool_calls == '[{"name": "search"}]'
 
     def test_null_fields_handled(self, temp_db_url: str) -> None:
@@ -414,7 +414,7 @@ def test_all_csv_columns_present(self) -> None:
             "reason",
             "query",
             "response",
-            "execution_time",
+            "evaluation_latency",
             "api_input_tokens",
             "api_output_tokens",
             "judge_llm_input_tokens",
@@ -425,6 +425,7 @@ def test_all_csv_columns_present(self) -> None:
             "streaming_duration",
             "agent_latency",
             "tokens_per_second",
+            "execution_time",
             "tool_calls",
             "contexts",
             "expected_response",
diff --git a/tests/unit/core/system/test_loader.py b/tests/unit/core/system/test_loader.py
index b2108152..86799bcd 100644
--- a/tests/unit/core/system/test_loader.py
+++ b/tests/unit/core/system/test_loader.py
@@ -15,6 +15,22 @@
 class TestConfigLoader:
     """Unit tests for ConfigLoader."""
 
+    def test_load_default_system_yaml_loads_config_successfully(self) -> None:
+        """Test that the default config/system.yaml loads without validation errors."""
+        default_system_yaml_path = Path(__file__).parents[4] / "config" / "system.yaml"
+        assert (
+            default_system_yaml_path.exists()
+        ), f"system.yaml not found at {default_system_yaml_path}"
+
+        loader = ConfigLoader()
+
+        # Any exception raised here (e.g. a validation error) fails the test
+        config = loader.load_system_config(str(default_system_yaml_path))
+
+        # Sanity checks: a config is returned and loader.system_config is set
+        assert config is not None
+        assert loader.system_config is not None
+
     def test_load_system_config_file_not_found(self) -> None:
         """Test loading non-existent config file raises error."""
         loader = ConfigLoader()
diff --git a/tests/unit/pipeline/evaluation/conftest.py b/tests/unit/pipeline/evaluation/conftest.py
index 2d06d16a..959713a2 100644
--- a/tests/unit/pipeline/evaluation/conftest.py
+++ b/tests/unit/pipeline/evaluation/conftest.py
@@ -252,3 +252,24 @@ def processor(
 ) -> ConversationProcessor:
     """Create ConversationProcessor instance for PR tests."""
     return ConversationProcessor(config_loader, processor_components_pr)
+
+
+@pytest.fixture
+def evaluator(
+    config_loader: ConfigLoader,
+    mock_metric_manager: MetricManager,
+    mock_script_manager: ScriptExecutionManager,
+    mocker: MockerFixture,
+) -> MetricsEvaluator:
+    """Create MetricsEvaluator with metric and embedding classes patched by mocks."""
+    create_mock_llm_manager(mocker)
+    mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.EmbeddingManager")
+    mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.RagasMetrics")
+    mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.DeepEvalMetrics")
+    mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.CustomMetrics")
+    mocker.patch(
+        "lightspeed_evaluation.pipeline.evaluation.evaluator.ScriptEvalMetrics"
+    )
+    mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.NLPMetrics")
+
+    return MetricsEvaluator(config_loader, mock_metric_manager, mock_script_manager)
diff --git a/tests/unit/pipeline/evaluation/test_errors.py b/tests/unit/pipeline/evaluation/test_errors.py
index b8477c08..04f97222 100644
--- a/tests/unit/pipeline/evaluation/test_errors.py
+++ b/tests/unit/pipeline/evaluation/test_errors.py
@@ -157,6 +157,7 @@ def test_mark_turn_metrics_as_error(self) -> None:
         assert results[0].reason == error_reason
         assert results[0].query == "Test query"
         assert results[0].response == ""
+        assert results[0].evaluation_latency == 0.0
         assert results[0].execution_time == 0.0
 
         # Check second error result
diff --git a/tests/unit/pipeline/evaluation/test_evaluator.py b/tests/unit/pipeline/evaluation/test_evaluator.py
index 2d93fd32..82b63555 100644
--- a/tests/unit/pipeline/evaluation/test_evaluator.py
+++ b/tests/unit/pipeline/evaluation/test_evaluator.py
@@ -1,4 +1,4 @@
-# pylint: disable=protected-access,redefined-outer-name,too-many-arguments,too-many-positional-arguments,too-many-lines
+# pylint: disable=protected-access,redefined-outer-name,too-many-arguments,too-many-positional-arguments,too-many-lines,too-many-public-methods
 
 """Unit tests for pipeline evaluation evaluator module."""
 
@@ -21,41 +21,17 @@
 from lightspeed_evaluation.core.script import ScriptExecutionManager
 from lightspeed_evaluation.pipeline.evaluation.evaluator import MetricsEvaluator
 
-from tests.unit.pipeline.evaluation.conftest import create_mock_llm_manager
-
 
 class TestMetricsEvaluator:
     """Unit tests for MetricsEvaluator."""
 
     def test_initialization(
         self,
+        evaluator: MetricsEvaluator,
         config_loader: ConfigLoader,
         mock_metric_manager: MetricManager,
-        mock_script_manager: ScriptExecutionManager,
-        mocker: MockerFixture,
     ) -> None:
         """Test evaluator initialization."""
-        # Mock the metric handlers
-        create_mock_llm_manager(mocker)
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.EmbeddingManager"
-        )
-        mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.RagasMetrics")
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.DeepEvalMetrics"
-        )
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.CustomMetrics"
-        )
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.ScriptEvalMetrics"
-        )
-        mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.NLPMetrics")
-
-        evaluator = MetricsEvaluator(
-            config_loader, mock_metric_manager, mock_script_manager
-        )
-
         assert evaluator.config_loader == config_loader
         assert evaluator.metric_manager == mock_metric_manager
         assert (
@@ -74,39 +50,9 @@ def test_initialization_raises_error_without_config(
         with pytest.raises(RuntimeError, match="Uninitialized system_config"):
             MetricsEvaluator(loader, mock_metric_manager, mock_script_manager)
 
-    def test_evaluate_metric_turn_level_pass(
-        self,
-        config_loader: ConfigLoader,
-        mock_metric_manager: MetricManager,
-        mock_script_manager: ScriptExecutionManager,
-        mocker: MockerFixture,
-    ) -> None:
+    def test_evaluate_metric_turn_level_pass(self, evaluator: MetricsEvaluator) -> None:
         """Test evaluating turn-level metric that passes."""
-        # Mock the handlers
-        create_mock_llm_manager(mocker)
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.EmbeddingManager"
-        )
-
-        mock_ragas = mocker.Mock()
-        mock_ragas.evaluate.return_value = (0.85, "Good faithfulness")
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.RagasMetrics",
-            return_value=mock_ragas,
-        )
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.DeepEvalMetrics"
-        )
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.CustomMetrics"
-        )
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.ScriptEvalMetrics"
-        )
-
-        evaluator = MetricsEvaluator(
-            config_loader, mock_metric_manager, mock_script_manager
-        )
+        evaluator.handlers["ragas"].evaluate.return_value = (0.85, "Good faithfulness")
 
         turn_data = TurnData(
             turn_id="1",
@@ -115,7 +61,6 @@ def test_evaluate_metric_turn_level_pass(
             contexts=["Context"],
         )
         conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn_data])
-
         request = EvaluationRequest.for_turn(
             conv_data, "ragas:faithfulness", 0, turn_data
         )
@@ -130,49 +75,26 @@ def test_evaluate_metric_turn_level_pass(
         assert result.conversation_group_id == "test_conv"
         assert result.turn_id == "1"
         assert result.metric_identifier == "ragas:faithfulness"
-
         assert result.query == "What is Python?"
         assert result.response == "Python is a programming language."
         assert result.contexts == '["Context"]'
+        # execution_time must equal evaluation_latency + agent_latency (all >= 0)
+        assert result.execution_time >= 0.0
+        assert result.evaluation_latency >= 0.0
+        assert result.agent_latency >= 0.0
+        assert result.execution_time == result.evaluation_latency + result.agent_latency
 
-    def test_evaluate_metric_turn_level_fail(
-        self,
-        config_loader: ConfigLoader,
-        mock_metric_manager: MetricManager,
-        mock_script_manager: ScriptExecutionManager,
-        mocker: MockerFixture,
-    ) -> None:
+    def test_evaluate_metric_turn_level_fail(self, evaluator: MetricsEvaluator) -> None:
         """Test evaluating turn-level metric that fails."""
-        create_mock_llm_manager(mocker)
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.EmbeddingManager"
-        )
-
-        mock_ragas = mocker.Mock()
-        mock_ragas.evaluate.return_value = (0.3, "Low faithfulness score")
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.RagasMetrics",
-            return_value=mock_ragas,
-        )
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.DeepEvalMetrics"
-        )
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.CustomMetrics"
-        )
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.ScriptEvalMetrics"
-        )
-
-        evaluator = MetricsEvaluator(
-            config_loader, mock_metric_manager, mock_script_manager
+        evaluator.handlers["ragas"].evaluate.return_value = (
+            0.3,
+            "Low faithfulness score",
         )
 
         turn_data = TurnData(
             turn_id="1", query="Query", response="Response", contexts=["Context"]
         )
         conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn_data])
-
         request = EvaluationRequest.for_turn(
             conv_data, "ragas:faithfulness", 0, turn_data
         )
@@ -185,35 +107,10 @@ def test_evaluate_metric_turn_level_fail(
         assert result.threshold == 0.7
 
     def test_evaluate_metric_missing_required_data_returns_error(
-        self,
-        config_loader: ConfigLoader,
-        mock_metric_manager: MetricManager,
-        mock_script_manager: ScriptExecutionManager,
-        mocker: MockerFixture,
+        self, evaluator: MetricsEvaluator
     ) -> None:
         """When required data is missing or empty, return ERROR and skip metric processing."""
-        create_mock_llm_manager(mocker)
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.EmbeddingManager"
-        )
-        mock_ragas = mocker.Mock()
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.RagasMetrics",
-            return_value=mock_ragas,
-        )
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.DeepEvalMetrics"
-        )
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.CustomMetrics"
-        )
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.ScriptEvalMetrics"
-        )
-
-        evaluator = MetricsEvaluator(
-            config_loader, mock_metric_manager, mock_script_manager
-        )
+        mock_ragas = evaluator.handlers["ragas"]
 
         turn_data = TurnData(
             turn_id="1",
@@ -236,34 +133,12 @@ def test_evaluate_metric_missing_required_data_returns_error(
         mock_ragas.evaluate.assert_not_called()
 
     def test_evaluate_metric_conversation_level(
-        self,
-        config_loader: ConfigLoader,
-        mock_metric_manager: MetricManager,
-        mock_script_manager: ScriptExecutionManager,
-        mocker: MockerFixture,
+        self, evaluator: MetricsEvaluator
     ) -> None:
         """Test evaluating conversation-level metric."""
-        create_mock_llm_manager(mocker)
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.EmbeddingManager"
-        )
-        mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.RagasMetrics")
-
-        mock_deepeval = mocker.Mock()
-        mock_deepeval.evaluate.return_value = (0.75, "Complete conversation")
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.DeepEvalMetrics",
-            return_value=mock_deepeval,
-        )
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.CustomMetrics"
-        )
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.ScriptEvalMetrics"
-        )
-
-        evaluator = MetricsEvaluator(
-            config_loader, mock_metric_manager, mock_script_manager
+        evaluator.handlers["deepeval"].evaluate.return_value = (
+            0.75,
+            "Complete conversation",
         )
 
         turn1 = TurnData(
@@ -283,12 +158,12 @@ def test_evaluate_metric_conversation_level(
         conv_data = EvaluationData(
             conversation_group_id="test_conv", turns=[turn1, turn2]
         )
-
         request = EvaluationRequest.for_conversation(
             conv_data, "deepeval:conversation_completeness"
         )
 
         result = evaluator.evaluate_metric(request)
+
         assert result is not None
         assert result.result == "PASS"
         assert result.score == 0.75
@@ -297,32 +172,9 @@ def test_evaluate_metric_conversation_level(
         assert result.api_output_tokens == 20
 
     def test_evaluate_metric_unsupported_framework(
-        self,
-        config_loader: ConfigLoader,
-        mock_metric_manager: MetricManager,
-        mock_script_manager: ScriptExecutionManager,
-        mocker: MockerFixture,
+        self, evaluator: MetricsEvaluator
     ) -> None:
         """Test unsupported framework returns ERROR and aggregates API tokens across turns."""
-        create_mock_llm_manager(mocker)
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.EmbeddingManager"
-        )
-        mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.RagasMetrics")
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.DeepEvalMetrics"
-        )
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.CustomMetrics"
-        )
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.ScriptEvalMetrics"
-        )
-
-        evaluator = MetricsEvaluator(
-            config_loader, mock_metric_manager, mock_script_manager
-        )
-
         turn1 = TurnData(
             turn_id="1",
             query="Q",
@@ -352,41 +204,13 @@ def test_evaluate_metric_unsupported_framework(
         assert result.api_output_tokens == 5
 
     def test_evaluate_metric_returns_none_score(
-        self,
-        config_loader: ConfigLoader,
-        mock_metric_manager: MetricManager,
-        mock_script_manager: ScriptExecutionManager,
-        mocker: MockerFixture,
+        self, evaluator: MetricsEvaluator
     ) -> None:
         """Test handling when metric evaluation returns None score."""
-        create_mock_llm_manager(mocker)
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.EmbeddingManager"
-        )
-
-        mock_ragas = mocker.Mock()
-        mock_ragas.evaluate.return_value = (None, "Evaluation failed")
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.RagasMetrics",
-            return_value=mock_ragas,
-        )
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.DeepEvalMetrics"
-        )
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.CustomMetrics"
-        )
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.ScriptEvalMetrics"
-        )
-
-        evaluator = MetricsEvaluator(
-            config_loader, mock_metric_manager, mock_script_manager
-        )
+        evaluator.handlers["ragas"].evaluate.return_value = (None, "Evaluation failed")
 
         turn_data = TurnData(turn_id="1", query="Q", response="R", contexts=["C"])
         conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn_data])
-
         request = EvaluationRequest.for_turn(
             conv_data, "ragas:faithfulness", 0, turn_data
         )
@@ -399,45 +223,19 @@ def test_evaluate_metric_returns_none_score(
         assert result.reason == "Evaluation failed"
 
     def test_evaluate_metric_exception_handling(
-        self,
-        config_loader: ConfigLoader,
-        mock_metric_manager: MetricManager,
-        mock_script_manager: ScriptExecutionManager,
-        mocker: MockerFixture,
+        self, evaluator: MetricsEvaluator
     ) -> None:
         """Test exception handling during metric evaluation.
 
         Note: Even on error, turn data fields (query, response, contexts) should be
         preserved in the result for debugging and analysis purposes.
         """
-        create_mock_llm_manager(mocker)
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.EmbeddingManager"
-        )
-
-        mock_ragas = mocker.Mock()
-        mock_ragas.evaluate.side_effect = EvaluationError("Unexpected error")
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.RagasMetrics",
-            return_value=mock_ragas,
-        )
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.DeepEvalMetrics"
-        )
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.CustomMetrics"
-        )
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.ScriptEvalMetrics"
-        )
-
-        evaluator = MetricsEvaluator(
-            config_loader, mock_metric_manager, mock_script_manager
+        evaluator.handlers["ragas"].evaluate.side_effect = EvaluationError(
+            "Unexpected error"
         )
 
         turn_data = TurnData(turn_id="1", query="Q", response="R", contexts=["C"])
         conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn_data])
-
         request = EvaluationRequest.for_turn(
             conv_data, "ragas:faithfulness", 0, turn_data
         )
@@ -448,214 +246,57 @@ def test_evaluate_metric_exception_handling(
         assert result.result == "ERROR"
         assert "Evaluation error" in result.reason
         assert "Unexpected error" in result.reason
-
         # Turn data should be preserved even on error for debugging
         assert result.query == "Q"
         assert result.response == "R"
-        assert result.contexts == '["C"]'  # JSON-serialized contexts preserved on error
-        assert result.expected_response is None  # Was not set in turn_data
+        assert result.contexts == '["C"]'
+        assert result.expected_response is None
 
     def test_evaluate_metric_skip_script_when_api_disabled(
-        self,
-        config_loader: ConfigLoader,
-        mock_metric_manager: MetricManager,
-        mock_script_manager: ScriptExecutionManager,
-        mocker: MockerFixture,
+        self, evaluator: MetricsEvaluator, config_loader: ConfigLoader
     ) -> None:
         """Test script metrics are skipped when API is disabled."""
         assert config_loader.system_config is not None
         config_loader.system_config.agents = None
 
-        create_mock_llm_manager(mocker)
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.EmbeddingManager"
-        )
-        mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.RagasMetrics")
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.DeepEvalMetrics"
-        )
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.CustomMetrics"
-        )
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.ScriptEvalMetrics"
-        )
-
-        evaluator = MetricsEvaluator(
-            config_loader, mock_metric_manager, mock_script_manager
-        )
-
         turn_data = TurnData(turn_id="1", query="Q", response="R")
         conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn_data])
-
         request = EvaluationRequest.for_turn(
             conv_data, "script:action_eval", 0, turn_data
         )
 
         result = evaluator.evaluate_metric(request)
 
-        # Should return None when API is disabled for script metrics
-        assert result is None
+        assert (
+            result is None
+        )  # Should return None when API is disabled for script metrics
 
-    def test_determine_status_with_threshold(
-        self,
-        config_loader: ConfigLoader,
-        mock_metric_manager: MetricManager,
-        mock_script_manager: ScriptExecutionManager,
-        mocker: MockerFixture,
-    ) -> None:
+    def test_determine_status_with_threshold(self, evaluator: MetricsEvaluator) -> None:
         """Test _determine_status method."""
-        create_mock_llm_manager(mocker)
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.EmbeddingManager"
-        )
-        mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.RagasMetrics")
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.DeepEvalMetrics"
-        )
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.CustomMetrics"
-        )
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.ScriptEvalMetrics"
-        )
-
-        evaluator = MetricsEvaluator(
-            config_loader, mock_metric_manager, mock_script_manager
-        )
-
-        # Test PASS
         assert evaluator._determine_status(0.8, 0.7) == "PASS"
         assert evaluator._determine_status(0.7, 0.7) == "PASS"  # Equal passes
-
-        # Test FAIL
         assert evaluator._determine_status(0.6, 0.7) == "FAIL"
 
     def test_determine_status_without_threshold(
-        self,
-        config_loader: ConfigLoader,
-        mock_metric_manager: MetricManager,
-        mock_script_manager: ScriptExecutionManager,
-        mocker: MockerFixture,
+        self, evaluator: MetricsEvaluator
     ) -> None:
         """Test _determine_status uses default 0.5 when threshold is None."""
-        create_mock_llm_manager(mocker)
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.EmbeddingManager"
-        )
-        mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.RagasMetrics")
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.DeepEvalMetrics"
-        )
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.CustomMetrics"
-        )
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.ScriptEvalMetrics"
-        )
-
-        evaluator = MetricsEvaluator(
-            config_loader, mock_metric_manager, mock_script_manager
-        )
-
-        # Should use 0.5 as default
         assert evaluator._determine_status(0.6, None) == "PASS"
         assert evaluator._determine_status(0.4, None) == "FAIL"
 
-    def _setup_evaluate_test(
-        self,
-        config_loader: ConfigLoader,
-        mock_metric_manager: MetricManager,
-        mock_script_manager: ScriptExecutionManager,
-        mocker: MockerFixture,
-        mock_return: tuple[float, str] | list[tuple[float, str]],
-    ) -> tuple[MetricsEvaluator, dict]:
-        """Helper to setup common mocks for _evaluate_wrapper() tests.
-
-        Returns:
-            tuple: (evaluator, mock_handlers) where mock_handlers is a dict with keys:
-                   'ragas', 'geval', 'custom', 'script', 'nlp'
-        """
-        create_mock_llm_manager(mocker)
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.EmbeddingManager"
-        )
-
-        # Create a helper to setup mock with return values
-        def create_mock_handler(  # type: ignore[no-untyped-def]
-            mocker: MockerFixture,
-            mock_return: tuple[float, str] | list[tuple[float, str]],
-        ):
-            mock = mocker.Mock()
-            if isinstance(mock_return, list):
-                mock.evaluate.side_effect = mock_return
-            else:
-                mock.evaluate.return_value = mock_return
-            return mock
-
-        # Setup all handler mocks
-        mock_ragas = create_mock_handler(mocker, mock_return)
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.RagasMetrics",
-            return_value=mock_ragas,
-        )
-
-        mock_deepeval = create_mock_handler(mocker, mock_return)
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.DeepEvalMetrics",
-            return_value=mock_deepeval,
-        )
-
-        mock_custom = create_mock_handler(mocker, mock_return)
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.CustomMetrics",
-            return_value=mock_custom,
-        )
-
-        mock_nlp = create_mock_handler(mocker, mock_return)
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.NLPMetrics",
-            return_value=mock_nlp,
-        )
-
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.ScriptEvalMetrics"
-        )
-
-        evaluator = MetricsEvaluator(
-            config_loader, mock_metric_manager, mock_script_manager
-        )
-
-        # Return evaluator and dict of all mocks
-        mock_handlers = {
-            "ragas": mock_ragas,
-            "geval": mock_deepeval,
-            "custom": mock_custom,
-            "nlp": mock_nlp,
-        }
-
-        return evaluator, mock_handlers
-
     @pytest.mark.parametrize(
         "metric_identifier",
         ["ragas:context_recall", "custom:answer_correctness", "nlp:rouge"],
     )
     def test_evaluate_with_expected_response_list(
-        self,
-        config_loader: ConfigLoader,
-        mock_metric_manager: MetricManager,
-        mock_script_manager: ScriptExecutionManager,
-        mocker: MockerFixture,
-        metric_identifier: str,
+        self, evaluator: MetricsEvaluator, metric_identifier: str
     ) -> None:
         """Test _evaluate_wrapper() with list expected_response for metric that requires it."""
-        evaluator, mock_handlers = self._setup_evaluate_test(
-            config_loader,
-            mock_metric_manager,
-            mock_script_manager,
-            mocker,
-            [(0.3, "Low score"), (0.85, "High score")],
-        )
+        framework = metric_identifier.split(":")[0]
+        evaluator.handlers[framework].evaluate.side_effect = [
+            (0.3, "Low score"),
+            (0.85, "High score"),
+        ]
 
         turn_data = TurnData(
             turn_id="1",
@@ -673,27 +314,14 @@ def test_evaluate_with_expected_response_list(
         assert metric_result.score == 0.85
         assert metric_result.reason == "High score"
         assert metric_result.result == "PASS"
-
-        # Check the appropriate handler was called based on metric framework
-        framework = metric_identifier.split(":")[0]
-        assert mock_handlers[framework].evaluate.call_count == 2
+        assert evaluator.handlers[framework].evaluate.call_count == 2
 
     def test_evaluate_with_expected_response_list_fail(
-        self,
-        config_loader: ConfigLoader,
-        mock_metric_manager: MetricManager,
-        mock_script_manager: ScriptExecutionManager,
-        mocker: MockerFixture,
+        self, evaluator: MetricsEvaluator
     ) -> None:
         """Test _evaluate_wrapper() with list expected_response for metric that requires it."""
         scores_reasons = [(0.3, "Score 1"), (0.65, "Score 2"), (0.45, "Score 3")]
-        evaluator, mock_handlers = self._setup_evaluate_test(
-            config_loader,
-            mock_metric_manager,
-            mock_script_manager,
-            mocker,
-            scores_reasons,
-        )
+        evaluator.handlers["ragas"].evaluate.side_effect = scores_reasons
 
         turn_data = TurnData(
             turn_id="1",
@@ -716,23 +344,13 @@ def test_evaluate_with_expected_response_list_fail(
         assert metric_result.score == 0.65
         assert metric_result.reason == reason_combined
         assert metric_result.result == "FAIL"
-        assert mock_handlers["ragas"].evaluate.call_count == 3
+        assert evaluator.handlers["ragas"].evaluate.call_count == 3
 
     def test_evaluate_with_expected_response_string(
-        self,
-        config_loader: ConfigLoader,
-        mock_metric_manager: MetricManager,
-        mock_script_manager: ScriptExecutionManager,
-        mocker: MockerFixture,
+        self, evaluator: MetricsEvaluator
     ) -> None:
         """Test _evaluate_wrapper() with string expected_response."""
-        evaluator, mock_handlers = self._setup_evaluate_test(
-            config_loader,
-            mock_metric_manager,
-            mock_script_manager,
-            mocker,
-            (0.85, "Good score"),
-        )
+        evaluator.handlers["ragas"].evaluate.return_value = (0.85, "Good score")
 
         turn_data = TurnData(
             turn_id="1", query="Q", response="R", expected_response="A", contexts=["C"]
@@ -748,7 +366,7 @@ def test_evaluate_with_expected_response_string(
         assert metric_result.score == 0.85
         assert metric_result.reason == "Good score"
         assert metric_result.result == "PASS"
-        assert mock_handlers["ragas"].evaluate.call_count == 1
+        assert evaluator.handlers["ragas"].evaluate.call_count == 1
 
     @pytest.mark.parametrize(
         "metric_identifier", ["ragas:faithfulness", "geval:technical_accuracy"]
@@ -760,21 +378,16 @@ def test_evaluate_with_expected_response_string(
     )
     def test_evaluate_with_expected_response_not_needed(
         self,
-        config_loader: ConfigLoader,
-        mock_metric_manager: MetricManager,
-        mock_script_manager: ScriptExecutionManager,
-        mocker: MockerFixture,
+        evaluator: MetricsEvaluator,
         metric_identifier: str,
         expected_response: str | list[str] | None,
     ) -> None:
         """Test _evaluate_wrapper() with metric that does not require expected_response."""
-        evaluator, mock_handlers = self._setup_evaluate_test(
-            config_loader,
-            mock_metric_manager,
-            mock_script_manager,
-            mocker,
-            [(0.3, "Low score"), (0.85, "High score")],
-        )
+        framework = metric_identifier.split(":")[0]
+        evaluator.handlers[framework].evaluate.side_effect = [
+            (0.3, "Low score"),
+            (0.85, "High score"),
+        ]
 
         turn_data = TurnData(
             turn_id="1",
@@ -792,54 +405,21 @@ def test_evaluate_with_expected_response_not_needed(
         assert metric_result.score == 0.3
         assert metric_result.reason == "Low score"
         assert metric_result.result == "FAIL"
-
-        # Check the appropriate handler was called based on metric
-        framework = metric_identifier.split(":")[0]
-        assert mock_handlers[framework].evaluate.call_count == 1
+        assert evaluator.handlers[framework].evaluate.call_count == 1
 
     def test_evaluate_multiple_expected_responses_error_preserves_tokens(
-        self,
-        config_loader: ConfigLoader,
-        mock_metric_manager: MetricManager,
-        mock_script_manager: ScriptExecutionManager,
-        mocker: MockerFixture,
+        self, evaluator: MetricsEvaluator, mocker: MockerFixture
     ) -> None:
         """Test token preservation when error occurs during multiple expected responses evaluation.
 
         Scenario: First iteration succeeds with tokens, second iteration fails.
         Expected: Error result should preserve tokens from first iteration.
         """
-        create_mock_llm_manager(mocker)
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.EmbeddingManager"
-        )
-
-        # First call returns valid result with tokens, second call raises exception
-        mock_ragas = mocker.Mock()
-        mock_ragas.evaluate.side_effect = [
+        evaluator.handlers["ragas"].evaluate.side_effect = [
             (0.3, "First iteration failed threshold"),
             EvaluationError("LLM error in second iteration"),
         ]
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.RagasMetrics",
-            return_value=mock_ragas,
-        )
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.DeepEvalMetrics"
-        )
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.CustomMetrics"
-        )
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.ScriptEvalMetrics"
-        )
-        mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.NLPMetrics")
-
-        evaluator = MetricsEvaluator(
-            config_loader, mock_metric_manager, mock_script_manager
-        )
 
-        # Mock token tracker to simulate tokens from first iteration
         original_evaluate = evaluator._evaluate
 
         def mock_evaluate_with_tokens(
@@ -849,7 +429,6 @@ def mock_evaluate_with_tokens(
             threshold: Optional[float],
         ) -> MetricResult:
             result = original_evaluate(request, scope, token_tracker, threshold)
-            # Simulate tokens were added after each successful call
             result.judge_llm_input_tokens = 150
             result.judge_llm_output_tokens = 50
             return result
@@ -873,60 +452,25 @@ def mock_evaluate_with_tokens(
         result = evaluator.evaluate_metric(request)
 
         assert result is not None
-        # With consistent flow, both iterations complete and accumulate tokens
-        # Result is not ERROR because we get partial results (first iteration FAIL, second ERROR)
-        # The highest score from iterations is preserved
         assert result.score == 0.3  # From first iteration
-        # Tokens accumulated from both iterations (first success + second failure mock)
         assert result.judge_llm_input_tokens == 300  # 150 + 150
         assert result.judge_llm_output_tokens == 100  # 50 + 50
-        # Error from second iteration captured in accumulated reason
         assert "error" in result.reason.lower()
 
     def test_evaluate_single_path_error_preserves_tokens(
-        self,
-        config_loader: ConfigLoader,
-        mock_metric_manager: MetricManager,
-        mock_script_manager: ScriptExecutionManager,
-        mocker: MockerFixture,
+        self, evaluator: MetricsEvaluator
     ) -> None:
         """Test token preservation when error occurs in single evaluation path.
 
         Scenario: Single evaluation call fails but tokens were tracked.
         Expected: Error result should preserve any tokens captured.
         """
-        create_mock_llm_manager(mocker)
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.EmbeddingManager"
-        )
-
-        mock_ragas = mocker.Mock()
-        mock_ragas.evaluate.side_effect = EvaluationError("LLM connection failed")
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.RagasMetrics",
-            return_value=mock_ragas,
-        )
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.DeepEvalMetrics"
-        )
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.CustomMetrics"
-        )
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.ScriptEvalMetrics"
-        )
-        mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.NLPMetrics")
-
-        evaluator = MetricsEvaluator(
-            config_loader, mock_metric_manager, mock_script_manager
+        evaluator.handlers["ragas"].evaluate.side_effect = EvaluationError(
+            "LLM connection failed"
         )
 
         turn_data = TurnData(
-            turn_id="1",
-            query="Q",
-            response="R",
-            expected_response="A",  # Single expected response
-            contexts=["C"],
+            turn_id="1", query="Q", response="R", expected_response="A", contexts=["C"]
         )
         conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn_data])
         request = EvaluationRequest.for_turn(
@@ -938,46 +482,17 @@ def test_evaluate_single_path_error_preserves_tokens(
         assert result is not None
         assert result.result == "ERROR"
         assert "LLM connection failed" in result.reason
-        # Token counts should be present (even if 0)
         assert result.judge_llm_input_tokens >= 0
         assert result.judge_llm_output_tokens >= 0
 
     def test_multiple_expected_responses_error_no_double_counting(
-        self,
-        config_loader: ConfigLoader,
-        mock_metric_manager: MetricManager,
-        mock_script_manager: ScriptExecutionManager,
-        mocker: MockerFixture,
+        self, evaluator: MetricsEvaluator, mocker: MockerFixture
     ) -> None:
         """Test token counts use deltas not cumulative totals when error on iteration 2+."""
-        create_mock_llm_manager(mocker)
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.EmbeddingManager"
-        )
-
-        mock_ragas = mocker.Mock()
-        mock_ragas.evaluate.side_effect = [
+        evaluator.handlers["ragas"].evaluate.side_effect = [
             (0.3, "First iteration"),
             EvaluationError("Second iteration failed"),
         ]
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.RagasMetrics",
-            return_value=mock_ragas,
-        )
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.DeepEvalMetrics"
-        )
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.CustomMetrics"
-        )
-        mocker.patch(
-            "lightspeed_evaluation.pipeline.evaluation.evaluator.ScriptEvalMetrics"
-        )
-        mocker.patch("lightspeed_evaluation.pipeline.evaluation.evaluator.NLPMetrics")
-
-        evaluator = MetricsEvaluator(
-            config_loader, mock_metric_manager, mock_script_manager
-        )
 
         call_count = [0]
 
@@ -989,7 +504,6 @@ def mock_evaluate_with_tokens(
         ) -> MetricResult:
             call_count[0] += 1
             if call_count[0] == 1:
-                # Iteration 1: add tokens and return success
                 token_tracker.add_judge_tokens(100, 50)
                 token_tracker.add_embedding_tokens(20)
                 return MetricResult(
@@ -1001,7 +515,6 @@ def mock_evaluate_with_tokens(
                     judge_llm_output_tokens=50,
                     embedding_tokens=20,
                 )
-            # Iteration 2: add tokens then raise error
             token_tracker.add_judge_tokens(150, 75)
             token_tracker.add_embedding_tokens(30)
             raise EvaluationError("Second iteration failed")
@@ -1030,3 +543,63 @@ def mock_evaluate_with_tokens(
         assert result.judge_llm_input_tokens == 250  # 100+150
         assert result.judge_llm_output_tokens == 125  # 50+75
         assert result.embedding_tokens == 50  # 20+30
+
+    def test_execution_time_calculation(self, evaluator: MetricsEvaluator) -> None:
+        """Test execution_time is correctly calculated as evaluation_latency + agent_latency."""
+        mock_ragas = evaluator.handlers["ragas"]
+        mock_ragas.evaluate.return_value = (0.85, "Good score")
+
+        turn_data = TurnData(
+            turn_id="1", query="Q", response="R", contexts=["C"], agent_latency=1.5
+        )
+        conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn_data])
+        request = EvaluationRequest.for_turn(
+            conv_data, "ragas:faithfulness", 0, turn_data
+        )
+
+        result = evaluator.evaluate_metric(request)
+
+        assert result is not None
+        assert result.agent_latency == 1.5
+        assert result.evaluation_latency > 0.0
+        assert result.execution_time == result.evaluation_latency + result.agent_latency
+        assert result.execution_time >= 1.5
+
+    def test_execution_time_in_error_result(self, evaluator: MetricsEvaluator) -> None:
+        """Test execution_time is populated even in ERROR results."""
+        turn_data = TurnData(turn_id="1", query="Q", response="R", agent_latency=2.0)
+        conv_data = EvaluationData(conversation_group_id="test_conv", turns=[turn_data])
+        request = EvaluationRequest.for_turn(conv_data, "unknown:metric", 0, turn_data)
+
+        result = evaluator.evaluate_metric(request)
+
+        assert result is not None
+        assert result.result == "ERROR"
+        assert result.agent_latency == 2.0
+        assert result.evaluation_latency > 0.0
+        assert result.execution_time == result.evaluation_latency + result.agent_latency
+        assert result.execution_time >= 2.0
+
+    def test_execution_time_conversation_level_sums_agent_latency(
+        self, evaluator: MetricsEvaluator
+    ) -> None:
+        """Test execution_time uses the summed agent_latency for conversation-level metrics."""
+        mock_deepeval = evaluator.handlers["deepeval"]
+        mock_deepeval.evaluate.return_value = (0.75, "Good conversation")
+
+        turn1 = TurnData(turn_id="1", query="Q1", response="R1", agent_latency=1.0)
+        turn2 = TurnData(turn_id="2", query="Q2", response="R2", agent_latency=3.0)
+        conv_data = EvaluationData(
+            conversation_group_id="test_conv", turns=[turn1, turn2]
+        )
+        request = EvaluationRequest.for_conversation(
+            conv_data, "deepeval:conversation_completeness"
+        )
+
+        result = evaluator.evaluate_metric(request)
+
+        assert result is not None
+        assert result.agent_latency == 4.0
+        assert result.evaluation_latency > 0.0
+        assert result.execution_time == result.evaluation_latency + result.agent_latency
+        assert result.execution_time >= 4.0