39 changes: 25 additions & 14 deletions backend/python-eval-function/src/accuracy_evaluator.py
@@ -1,7 +1,9 @@
import logging
from typing import Optional, List, Dict
from typing import Optional, List, Mapping
from dataclasses import dataclass

from metrics import is_metric_enabled

logger = logging.getLogger(__name__)


@@ -29,7 +31,8 @@ def __init__(self):
def calculate_accuracy_metrics(
self,
predictions: List[str],
references: Optional[List[str]]
references: Optional[List[str]],
selected: Optional[Mapping[str, bool]] = None,
) -> Optional[AccuracyMetrics]:
if references is None or len(references) == 0:
logger.info("No reference outputs available, skipping accuracy metrics")
@@ -38,28 +41,36 @@ def calculate_accuracy_metrics(
if all(ref is None or ref == "" for ref in references):
logger.info("All reference outputs are empty, skipping accuracy metrics")
return None

logger.info(f"Calculating accuracy metrics for {len(predictions)} predictions")


logger.info(
f"Calculating accuracy metrics for {len(predictions)} predictions "
f"(selected={dict(selected) if selected is not None else 'all'})"
)

try:
metrics = AccuracyMetrics()

metrics.bleu = self._calculate_bleu(predictions, references)
metrics.rouge = self._calculate_rouge(predictions, references)
metrics.meteor = self._calculate_meteor(predictions, references)
metrics.levenshtein = self._calculate_levenshtein(predictions, references)
metrics.bertscore = self._calculate_bertscore(predictions, references)


if is_metric_enabled(selected, 'bleu'):
metrics.bleu = self._calculate_bleu(predictions, references)
if is_metric_enabled(selected, 'rouge'):
metrics.rouge = self._calculate_rouge(predictions, references)
if is_metric_enabled(selected, 'meteor'):
metrics.meteor = self._calculate_meteor(predictions, references)
if is_metric_enabled(selected, 'levenshtein'):
metrics.levenshtein = self._calculate_levenshtein(predictions, references)
if is_metric_enabled(selected, 'bertscore'):
metrics.bertscore = self._calculate_bertscore(predictions, references)

def fmt(v): return f"{v:.4f}" if v is not None else "N/A"
logger.info(
f"Accuracy metrics calculated - "
f"BLEU={fmt(metrics.bleu)}, ROUGE={fmt(metrics.rouge)}, "
f"METEOR={fmt(metrics.meteor)}, Levenshtein={fmt(metrics.levenshtein)}, "
f"BERTScore={fmt(metrics.bertscore)}"
)

return metrics

except Exception as e:
logger.error(f"Error calculating accuracy metrics: {e}", exc_info=True)
return None
41 changes: 34 additions & 7 deletions backend/python-eval-function/src/classification_evaluator.py
@@ -1,7 +1,9 @@
import logging
from typing import Optional, List
from typing import Optional, List, Mapping
from dataclasses import dataclass

from metrics import is_metric_enabled

logger = logging.getLogger(__name__)


@@ -13,6 +15,14 @@ class ClassificationMetrics:
f1_macro: Optional[float] = None
f1_weighted: Optional[float] = None

_CLASSIFICATION_KEYS: tuple[str, ...] = (
'classification_accuracy',
'precision_macro',
'recall_macro',
'f1_macro',
'f1_weighted',
)


def normalize_prediction(prediction: str, valid_classes: List[str]) -> str:
cleaned = prediction.strip()
@@ -38,6 +48,7 @@ def calculate_classification_metrics(
predictions: List[str],
references: List[str],
valid_classes: Optional[List[str]] = None,
selected: Optional[Mapping[str, bool]] = None,
) -> Optional[ClassificationMetrics]:
if not references or not predictions:
logger.info("No references or predictions, skipping classification metrics")
@@ -47,6 +58,12 @@ def calculate_classification_metrics(
logger.info("All references are empty, skipping classification metrics")
return None

if selected is not None and not any(
is_metric_enabled(selected, k) for k in _CLASSIFICATION_KEYS
):
logger.info("All classification metrics disabled, skipping computation")
return ClassificationMetrics()

if valid_classes is None:
valid_classes = list(set(references))

@@ -56,7 +73,7 @@

logger.info(
f"Calculating classification metrics for {len(normalized_preds)} predictions "
f"across {len(valid_classes)} classes"
f"across {len(valid_classes)} classes (selected={dict(selected) if selected is not None else 'all'})"
)

try:
@@ -86,11 +103,21 @@
)

metrics = ClassificationMetrics(
accuracy=round(acc, 4),
precision_macro=round(precision, 4),
recall_macro=round(recall, 4),
f1_macro=round(f1, 4),
f1_weighted=round(f1_w, 4),
accuracy=round(acc, 4)
if is_metric_enabled(selected, 'classification_accuracy')
else None,
precision_macro=round(precision, 4)
if is_metric_enabled(selected, 'precision_macro')
else None,
recall_macro=round(recall, 4)
if is_metric_enabled(selected, 'recall_macro')
else None,
f1_macro=round(f1, 4)
if is_metric_enabled(selected, 'f1_macro')
else None,
f1_weighted=round(f1_w, 4)
if is_metric_enabled(selected, 'f1_weighted')
else None,
)

logger.info(
22 changes: 19 additions & 3 deletions backend/python-eval-function/src/dynamodb_service.py
@@ -7,6 +7,8 @@
import boto3
from botocore.exceptions import ClientError

from metrics import normalize_metrics_config

logger = logging.getLogger(__name__)


@@ -28,18 +30,32 @@ def load_job(self, evaluation_id: str) -> Dict[str, Any]:
item = response['Item']
models = json.loads(item.get('models', '[]'))
weights = json.loads(item.get('weights', '{}'))

raw_metrics = item.get('metrics')
stored_metrics: Optional[Dict[str, Any]] = None
if raw_metrics:
try:
stored_metrics = json.loads(raw_metrics)
except (TypeError, ValueError) as parse_err:
logger.warning(
f"Failed to parse stored metrics config "
f"({parse_err}); defaulting to all metrics enabled"
)
metrics = normalize_metrics_config(stored_metrics)

job_config = {
'evaluation_id': item['evaluation_id'],
'dataset_id': item['dataset_id'],
'models': models,
'weights': weights,
'metrics': metrics,
'status': item.get('status', 'pending'),
'created_at': item.get('created_at', ''),
'total_samples': item.get('total_samples')
}

logger.info(f"Loaded job config: {len(models)} models, weights: {weights}")

logger.info(
f"Loaded job config: {len(models)} models, weights: {weights}, metrics: {metrics}"
)
return job_config

except ClientError as e:
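Side note for reviewers: these files import normalize_metrics_config and is_metric_enabled from a metrics module that is not part of this diff. Based purely on how they are called here (a None or missing config means every metric is enabled, and load_job is expected to hand back a dict with one bool per known metric key), a minimal sketch of those helpers could look like the following. The key list, the default-to-enabled behaviour for absent keys, and the function bodies are assumptions, not the module's actual code.

from typing import Any, Mapping, Optional

# Assumed catalogue of metric keys, collected from the call sites in this PR.
METRIC_KEYS = (
    'bleu', 'rouge', 'meteor', 'levenshtein', 'bertscore',
    'classification_accuracy', 'precision_macro', 'recall_macro',
    'f1_macro', 'f1_weighted',
    'geval_reasoning', 'geval_faithfulness',
)

def normalize_metrics_config(raw: Optional[Mapping[str, Any]]) -> dict:
    # No stored config (or an unparsable one) -> every metric enabled,
    # matching the "defaulting to all metrics enabled" warning above.
    if not raw:
        return {key: True for key in METRIC_KEYS}
    # Otherwise coerce to "every key present, all bool", as the comment
    # in main.py describes.
    return {key: bool(raw.get(key, True)) for key in METRIC_KEYS}

def is_metric_enabled(selected: Optional[Mapping[str, bool]], key: str) -> bool:
    # selected=None is treated as "compute everything", which is what the
    # evaluators rely on when no config is passed.
    if selected is None:
        return True
    return bool(selected.get(key, True))

Whether keys missing from a stored config should default to enabled or disabled is a judgement call; the sketch defaults to enabled so that older jobs with no metrics attribute keep their previous behaviour.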
54 changes: 35 additions & 19 deletions backend/python-eval-function/src/geval_evaluator.py
@@ -99,6 +99,8 @@ def evaluate(
predictions: List[str],
references: Optional[List[str]] = None,
task_type: str = "summarization",
compute_reasoning: bool = True,
compute_faithfulness: bool = True,
) -> GEvalMetrics:
if not inputs or not predictions:
logger.warning("Empty inputs or predictions, skipping G-Eval")
@@ -108,10 +110,22 @@
logger.error("inputs and predictions length mismatch, skipping G-Eval")
return GEvalMetrics()

if not compute_reasoning and not compute_faithfulness:
logger.info("Both G-Eval metrics disabled, skipping")
return GEvalMetrics()

try:
model = self._build_judge_model()
reasoning_metric = self._build_reasoning_metric(model, task_type)
faithfulness_metric = self._build_faithfulness_metric(model, task_type)
reasoning_metric = (
self._build_reasoning_metric(model, task_type)
if compute_reasoning
else None
)
faithfulness_metric = (
self._build_faithfulness_metric(model, task_type)
if compute_faithfulness
else None
)
except Exception as e:
logger.error(f"Failed to initialize G-Eval components: {e}", exc_info=True)
return GEvalMetrics()
@@ -131,23 +145,25 @@
actual_output=pred,
)

try:
reasoning_metric.measure(test_case)
reasoning_scores.append(reasoning_metric.score)
logger.debug(
f"[{idx}] Reasoning score={reasoning_metric.score:.4f} reason={reasoning_metric.reason}"
)
except Exception as e:
logger.warning(f"[{idx}] Reasoning metric failed: {e}")

try:
faithfulness_metric.measure(test_case)
faithfulness_scores.append(faithfulness_metric.score)
logger.debug(
f"[{idx}] Faithfulness score={faithfulness_metric.score:.4f} reason={faithfulness_metric.reason}"
)
except Exception as e:
logger.warning(f"[{idx}] Faithfulness metric failed: {e}")
if reasoning_metric is not None:
try:
reasoning_metric.measure(test_case)
reasoning_scores.append(reasoning_metric.score)
logger.debug(
f"[{idx}] Reasoning score={reasoning_metric.score:.4f} reason={reasoning_metric.reason}"
)
except Exception as e:
logger.warning(f"[{idx}] Reasoning metric failed: {e}")

if faithfulness_metric is not None:
try:
faithfulness_metric.measure(test_case)
faithfulness_scores.append(faithfulness_metric.score)
logger.debug(
f"[{idx}] Faithfulness score={faithfulness_metric.score:.4f} reason={faithfulness_metric.reason}"
)
except Exception as e:
logger.warning(f"[{idx}] Faithfulness metric failed: {e}")

result = GEvalMetrics()

77 changes: 51 additions & 26 deletions backend/python-eval-function/src/main.py
@@ -74,8 +74,18 @@ def main():
dataset_id = job_config['dataset_id']
models = job_config['models']
weights = job_config['weights']

logger.info(f"Job config loaded - dataset: {dataset_id}, models: {len(models)}")
# `metrics_config` is already normalized by DynamoDBService.load_job —
# every key present, all bool.
metrics_config = job_config.get('metrics') or {}

from metrics import is_metric_enabled
compute_geval_reasoning = is_metric_enabled(metrics_config, 'geval_reasoning')
compute_geval_faithfulness = is_metric_enabled(metrics_config, 'geval_faithfulness')

logger.info(
f"Job config loaded - dataset: {dataset_id}, models: {len(models)}, "
f"metrics={metrics_config or 'all'}"
)

from dataset_loader import DatasetLoader
dataset_loader = DatasetLoader()
@@ -134,7 +144,9 @@ def main():
predictions = [invocation_results[i].response_text for i in successful_indices]
references = [all_references[i] for i in successful_indices] if all_references else None

accuracy_metrics = accuracy_evaluator.calculate_accuracy_metrics(predictions, references)
accuracy_metrics = accuracy_evaluator.calculate_accuracy_metrics(
predictions, references, selected=metrics_config
)
accuracy_results[model_id] = accuracy_metrics or AccuracyMetrics()

logger.info(f"Summarization accuracy complete for {len(accuracy_results)} models")
@@ -149,7 +161,8 @@
references = [all_references[i] for i in successful_indices] if all_references else None

cls_metrics = classification_evaluator.calculate_classification_metrics(
predictions, references, valid_classes=unique_classes
predictions, references, valid_classes=unique_classes,
selected=metrics_config,
)

acc = AccuracyMetrics()
@@ -163,28 +176,40 @@

logger.info(f"Classification accuracy complete for {len(accuracy_results)} models")

from geval_evaluator import GEvalEvaluator
geval_evaluator = GEvalEvaluator()

for model_id, invocation_results in results_by_model.items():
successful_indices = [i for i, r in enumerate(invocation_results) if r.error is None]
predictions = [invocation_results[i].response_text for i in successful_indices]
inputs = [dataset.documents[i] for i in successful_indices]

logger.info(f"Running G-Eval for model {model_id} on {len(predictions)} samples")
geval_metrics = geval_evaluator.evaluate(inputs, predictions, task_type=task_type)

acc = accuracy_results.get(model_id)
if acc is None:
acc = AccuracyMetrics()
accuracy_results[model_id] = acc

acc.geval_reasoning = geval_metrics.reasoning
acc.geval_faithfulness = geval_metrics.faithfulness

logger.info(f"G-Eval complete for {model_id}")

logger.info(f"G-Eval evaluation complete for all models")
if compute_geval_reasoning or compute_geval_faithfulness:
from geval_evaluator import GEvalEvaluator
geval_evaluator = GEvalEvaluator()

for model_id, invocation_results in results_by_model.items():
successful_indices = [i for i, r in enumerate(invocation_results) if r.error is None]
predictions = [invocation_results[i].response_text for i in successful_indices]
inputs = [dataset.documents[i] for i in successful_indices]

logger.info(
f"Running G-Eval for model {model_id} on {len(predictions)} samples "
f"(reasoning={compute_geval_reasoning}, faithfulness={compute_geval_faithfulness})"
)
geval_metrics = geval_evaluator.evaluate(
inputs, predictions, task_type=task_type,
compute_reasoning=compute_geval_reasoning,
compute_faithfulness=compute_geval_faithfulness,
)

acc = accuracy_results.get(model_id)
if acc is None:
acc = AccuracyMetrics()
accuracy_results[model_id] = acc

if compute_geval_reasoning:
acc.geval_reasoning = geval_metrics.reasoning
if compute_geval_faithfulness:
acc.geval_faithfulness = geval_metrics.faithfulness

logger.info(f"G-Eval complete for {model_id}")

logger.info("G-Eval evaluation complete for all models")
else:
logger.info("G-Eval disabled via metrics config, skipping")

from cost_calculator import CostCalculator
cost_calculator = CostCalculator()
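To illustrate the end-to-end flow this enables, here is a small usage sketch built on the assumed helpers from the note above (so the exact behaviour is an assumption rather than something this diff guarantees): a job whose stored metrics attribute selects only BLEU, ROUGE and the faithfulness judge would skip METEOR, Levenshtein, BERTScore, every classification metric and the reasoning judge.

import json

# Hypothetical value of the DynamoDB item's 'metrics' attribute (a JSON string);
# any key left out would fall back to the sketch's default of "enabled".
raw_metrics = json.dumps({
    'bleu': True, 'rouge': True, 'geval_faithfulness': True,
    'meteor': False, 'levenshtein': False, 'bertscore': False,
    'classification_accuracy': False, 'precision_macro': False,
    'recall_macro': False, 'f1_macro': False, 'f1_weighted': False,
    'geval_reasoning': False,
})

# What load_job would hand to main() as job_config['metrics']:
metrics_config = normalize_metrics_config(json.loads(raw_metrics))

# main() then gates the expensive G-Eval pass on these two flags:
assert is_metric_enabled(metrics_config, 'geval_faithfulness') is True
assert is_metric_enabled(metrics_config, 'geval_reasoning') is False

# And calculate_accuracy_metrics(..., selected=metrics_config) would compute
# BLEU and ROUGE while leaving meteor/levenshtein/bertscore as None.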