From 4ab84b3246c3cf48f348730477f944f096ff37fc Mon Sep 17 00:00:00 2001
From: LimRaymond <raymond.lim672@gmail.com>
Date: Wed, 13 May 2026 23:38:06 +0200
Subject: [PATCH 1/9] feat: Initial toggle for metrics

---
 .../src/accuracy_evaluator.py                 |  39 ++-
 .../src/classification_evaluator.py           |  46 +++-
 .../src/dynamodb_service.py                   |  24 +-
 .../src/geval_evaluator.py                    |  54 ++--
 backend/python-eval-function/src/main.py      |  77 ++++--
 backend/python-eval-function/src/metrics.py   |  53 ++++
 .../EvaluationLaunchAdapter.ts                |  12 +-
 backend/src/models/Evaluation.ts              |  69 +++++
 .../EvaluationJobsRepository.ts               |   9 +
 .../EvaluationLaunchUseCase.test.ts           |  96 ++++++-
 .../EvaluationLaunchUseCase.ts                |   3 +
 .../FakeEvaluationLaunchUseCase.ts            |  13 +-
 .../components/evaluator/MetricsWeights.tsx   | 246 +++++++++++++++++-
 frontend/src/pages/Index.tsx                  |   9 +
 frontend/src/types/evaluation.ts              |  55 +++-
 15 files changed, 715 insertions(+), 90 deletions(-)
 create mode 100644 backend/python-eval-function/src/metrics.py

diff --git a/backend/python-eval-function/src/accuracy_evaluator.py b/backend/python-eval-function/src/accuracy_evaluator.py
index a2785a3..a73538b 100644
--- a/backend/python-eval-function/src/accuracy_evaluator.py
+++ b/backend/python-eval-function/src/accuracy_evaluator.py
@@ -1,7 +1,9 @@
 import logging
-from typing import Optional, List, Dict
+from typing import Optional, List, Mapping
 from dataclasses import dataclass
 
+from metrics import is_metric_enabled
+
 logger = logging.getLogger(__name__)
 
 
@@ -29,7 +31,8 @@ def __init__(self):
     def calculate_accuracy_metrics(
         self,
         predictions: List[str],
-        references: Optional[List[str]]
+        references: Optional[List[str]],
+        selected: Optional[Mapping[str, bool]] = None,
     ) -> Optional[AccuracyMetrics]:
         if references is None or len(references) == 0:
             logger.info("No reference outputs available, skipping accuracy metrics")
@@ -38,18 +41,26 @@ def calculate_accuracy_metrics(
         if all(ref is None or ref == "" for ref in references):
             logger.info("All reference outputs are empty, skipping accuracy metrics")
             return None
-        
-        logger.info(f"Calculating accuracy metrics for {len(predictions)} predictions")
-        
+
+        logger.info(
+            f"Calculating accuracy metrics for {len(predictions)} predictions "
+            f"(selected={dict(selected) if selected is not None else 'all'})"
+        )
+
         try:
             metrics = AccuracyMetrics()
-            
-            metrics.bleu = self._calculate_bleu(predictions, references)
-            metrics.rouge = self._calculate_rouge(predictions, references)
-            metrics.meteor = self._calculate_meteor(predictions, references)
-            metrics.levenshtein = self._calculate_levenshtein(predictions, references)
-            metrics.bertscore = self._calculate_bertscore(predictions, references)
-            
+
+            if is_metric_enabled(selected, 'bleu'):
+                metrics.bleu = self._calculate_bleu(predictions, references)
+            if is_metric_enabled(selected, 'rouge'):
+                metrics.rouge = self._calculate_rouge(predictions, references)
+            if is_metric_enabled(selected, 'meteor'):
+                metrics.meteor = self._calculate_meteor(predictions, references)
+            if is_metric_enabled(selected, 'levenshtein'):
+                metrics.levenshtein = self._calculate_levenshtein(predictions, references)
+            if is_metric_enabled(selected, 'bertscore'):
+                metrics.bertscore = self._calculate_bertscore(predictions, references)
+
             def fmt(v): return f"{v:.4f}" if v is not None else "N/A"
             logger.info(
                 f"Accuracy metrics calculated - "
@@ -57,9 +68,9 @@ def fmt(v): return f"{v:.4f}" if v is not None else "N/A"
                 f"METEOR={fmt(metrics.meteor)}, Levenshtein={fmt(metrics.levenshtein)}, "
                 f"BERTScore={fmt(metrics.bertscore)}"
             )
-            
+
             return metrics
-            
+
         except Exception as e:
             logger.error(f"Error calculating accuracy metrics: {e}", exc_info=True)
             return None
diff --git a/backend/python-eval-function/src/classification_evaluator.py b/backend/python-eval-function/src/classification_evaluator.py
index 44a64d7..0438870 100644
--- a/backend/python-eval-function/src/classification_evaluator.py
+++ b/backend/python-eval-function/src/classification_evaluator.py
@@ -1,7 +1,9 @@
 import logging
-from typing import Optional, List
+from typing import Optional, List, Mapping
 from dataclasses import dataclass
 
+from metrics import is_metric_enabled
+
 logger = logging.getLogger(__name__)
 
 
@@ -14,6 +16,17 @@ class ClassificationMetrics:
     f1_weighted: Optional[float] = None
 
 
+# External (request-level) metric keys that gate the ClassificationMetrics
+# fields; note 'classification_accuracy' controls the internal `accuracy` field.
+_CLASSIFICATION_KEYS: tuple[str, ...] = (
+    'classification_accuracy',
+    'precision_macro',
+    'recall_macro',
+    'f1_macro',
+    'f1_weighted',
+)
+
+
 def normalize_prediction(prediction: str, valid_classes: List[str]) -> str:
     cleaned = prediction.strip()
 
@@ -38,6 +51,7 @@ def calculate_classification_metrics(
         predictions: List[str],
         references: List[str],
         valid_classes: Optional[List[str]] = None,
+        selected: Optional[Mapping[str, bool]] = None,
     ) -> Optional[ClassificationMetrics]:
         if not references or not predictions:
             logger.info("No references or predictions, skipping classification metrics")
@@ -47,6 +61,14 @@ def calculate_classification_metrics(
             logger.info("All references are empty, skipping classification metrics")
             return None
 
+        # If the user opted out of every classification metric, skip the
+        # sklearn computation entirely.
+        if selected is not None and not any(
+            is_metric_enabled(selected, k) for k in _CLASSIFICATION_KEYS
+        ):
+            logger.info("All classification metrics disabled, skipping computation")
+            return ClassificationMetrics()
+
         if valid_classes is None:
             valid_classes = list(set(references))
 
@@ -56,7 +78,7 @@ def calculate_classification_metrics(
 
         logger.info(
             f"Calculating classification metrics for {len(normalized_preds)} predictions "
-            f"across {len(valid_classes)} classes"
+            f"across {len(valid_classes)} classes (selected={dict(selected) if selected is not None else 'all'})"
         )
 
         try:
@@ -86,11 +108,21 @@ def calculate_classification_metrics(
             )
 
             metrics = ClassificationMetrics(
-                accuracy=round(acc, 4),
-                precision_macro=round(precision, 4),
-                recall_macro=round(recall, 4),
-                f1_macro=round(f1, 4),
-                f1_weighted=round(f1_w, 4),
+                accuracy=round(acc, 4)
+                if is_metric_enabled(selected, 'classification_accuracy')
+                else None,
+                precision_macro=round(precision, 4)
+                if is_metric_enabled(selected, 'precision_macro')
+                else None,
+                recall_macro=round(recall, 4)
+                if is_metric_enabled(selected, 'recall_macro')
+                else None,
+                f1_macro=round(f1, 4)
+                if is_metric_enabled(selected, 'f1_macro')
+                else None,
+                f1_weighted=round(f1_w, 4)
+                if is_metric_enabled(selected, 'f1_weighted')
+                else None,
             )
 
             logger.info(
diff --git a/backend/python-eval-function/src/dynamodb_service.py b/backend/python-eval-function/src/dynamodb_service.py
index 4841e83..cf8fc03 100644
--- a/backend/python-eval-function/src/dynamodb_service.py
+++ b/backend/python-eval-function/src/dynamodb_service.py
@@ -7,6 +7,8 @@
 import boto3
 from botocore.exceptions import ClientError
 
+from metrics import normalize_metrics_config
+
 logger = logging.getLogger(__name__)
 
 
@@ -28,18 +30,34 @@ def load_job(self, evaluation_id: str) -> Dict[str, Any]:
             item = response['Item']
             models = json.loads(item.get('models', '[]'))
             weights = json.loads(item.get('weights', '{}'))
-            
+            # Older jobs may not have a `metrics` field; a blob that fails JSON
+            # parsing also falls back to all-enabled so it cannot wedge a job.
+            raw_metrics = item.get('metrics')
+            stored_metrics: Optional[Dict[str, Any]] = None
+            if raw_metrics:
+                try:
+                    stored_metrics = json.loads(raw_metrics)
+                except (TypeError, ValueError) as parse_err:
+                    logger.warning(
+                        f"Failed to parse stored metrics config "
+                        f"({parse_err}); defaulting to all metrics enabled"
+                    )
+            metrics = normalize_metrics_config(stored_metrics)
+
             job_config = {
                 'evaluation_id': item['evaluation_id'],
                 'dataset_id': item['dataset_id'],
                 'models': models,
                 'weights': weights,
+                'metrics': metrics,
                 'status': item.get('status', 'pending'),
                 'created_at': item.get('created_at', ''),
                 'total_samples': item.get('total_samples')
             }
-            
-            logger.info(f"Loaded job config: {len(models)} models, weights: {weights}")
+
+            logger.info(
+                f"Loaded job config: {len(models)} models, weights: {weights}, metrics: {metrics}"
+            )
             return job_config
             
         except ClientError as e:
diff --git a/backend/python-eval-function/src/geval_evaluator.py b/backend/python-eval-function/src/geval_evaluator.py
index 0665200..8a6d581 100644
--- a/backend/python-eval-function/src/geval_evaluator.py
+++ b/backend/python-eval-function/src/geval_evaluator.py
@@ -99,6 +99,8 @@ def evaluate(
         predictions: List[str],
         references: Optional[List[str]] = None,
         task_type: str = "summarization",
+        compute_reasoning: bool = True,
+        compute_faithfulness: bool = True,
     ) -> GEvalMetrics:
         if not inputs or not predictions:
             logger.warning("Empty inputs or predictions, skipping G-Eval")
@@ -108,10 +110,22 @@ def evaluate(
             logger.error("inputs and predictions length mismatch, skipping G-Eval")
             return GEvalMetrics()
 
+        if not compute_reasoning and not compute_faithfulness:
+            logger.info("Both G-Eval metrics disabled, skipping")
+            return GEvalMetrics()
+
         try:
             model = self._build_judge_model()
-            reasoning_metric = self._build_reasoning_metric(model, task_type)
-            faithfulness_metric = self._build_faithfulness_metric(model, task_type)
+            reasoning_metric = (
+                self._build_reasoning_metric(model, task_type)
+                if compute_reasoning
+                else None
+            )
+            faithfulness_metric = (
+                self._build_faithfulness_metric(model, task_type)
+                if compute_faithfulness
+                else None
+            )
         except Exception as e:
             logger.error(f"Failed to initialize G-Eval components: {e}", exc_info=True)
             return GEvalMetrics()
@@ -131,23 +145,25 @@ def evaluate(
                 actual_output=pred,
             )
 
-            try:
-                reasoning_metric.measure(test_case)
-                reasoning_scores.append(reasoning_metric.score)
-                logger.debug(
-                    f"[{idx}] Reasoning score={reasoning_metric.score:.4f} reason={reasoning_metric.reason}"
-                )
-            except Exception as e:
-                logger.warning(f"[{idx}] Reasoning metric failed: {e}")
-
-            try:
-                faithfulness_metric.measure(test_case)
-                faithfulness_scores.append(faithfulness_metric.score)
-                logger.debug(
-                    f"[{idx}] Faithfulness score={faithfulness_metric.score:.4f} reason={faithfulness_metric.reason}"
-                )
-            except Exception as e:
-                logger.warning(f"[{idx}] Faithfulness metric failed: {e}")
+            if reasoning_metric is not None:
+                try:
+                    reasoning_metric.measure(test_case)
+                    reasoning_scores.append(reasoning_metric.score)
+                    logger.debug(
+                        f"[{idx}] Reasoning score={reasoning_metric.score:.4f} reason={reasoning_metric.reason}"
+                    )
+                except Exception as e:
+                    logger.warning(f"[{idx}] Reasoning metric failed: {e}")
+
+            if faithfulness_metric is not None:
+                try:
+                    faithfulness_metric.measure(test_case)
+                    faithfulness_scores.append(faithfulness_metric.score)
+                    logger.debug(
+                        f"[{idx}] Faithfulness score={faithfulness_metric.score:.4f} reason={faithfulness_metric.reason}"
+                    )
+                except Exception as e:
+                    logger.warning(f"[{idx}] Faithfulness metric failed: {e}")
 
         result = GEvalMetrics()
 
diff --git a/backend/python-eval-function/src/main.py b/backend/python-eval-function/src/main.py
index 662f58a..ebfdd3f 100644
--- a/backend/python-eval-function/src/main.py
+++ b/backend/python-eval-function/src/main.py
@@ -74,8 +74,18 @@ def main():
         dataset_id = job_config['dataset_id']
         models = job_config['models']
         weights = job_config['weights']
-        
-        logger.info(f"Job config loaded - dataset: {dataset_id}, models: {len(models)}")
+        # `metrics_config` is already normalized by DynamoDBService.load_job —
+        # every key present, all bool.
+        metrics_config = job_config.get('metrics') or {}
+
+        from metrics import is_metric_enabled
+        compute_geval_reasoning = is_metric_enabled(metrics_config, 'geval_reasoning')
+        compute_geval_faithfulness = is_metric_enabled(metrics_config, 'geval_faithfulness')
+
+        logger.info(
+            f"Job config loaded - dataset: {dataset_id}, models: {len(models)}, "
+            f"metrics={metrics_config or 'all'}"
+        )
         
         from dataset_loader import DatasetLoader
         dataset_loader = DatasetLoader()
@@ -134,7 +144,9 @@ def main():
                 predictions = [invocation_results[i].response_text for i in successful_indices]
                 references = [all_references[i] for i in successful_indices] if all_references else None
 
-                accuracy_metrics = accuracy_evaluator.calculate_accuracy_metrics(predictions, references)
+                accuracy_metrics = accuracy_evaluator.calculate_accuracy_metrics(
+                    predictions, references, selected=metrics_config
+                )
                 accuracy_results[model_id] = accuracy_metrics or AccuracyMetrics()
 
             logger.info(f"Summarization accuracy complete for {len(accuracy_results)} models")
@@ -149,7 +161,8 @@ def main():
                 references = [all_references[i] for i in successful_indices] if all_references else None
 
                 cls_metrics = classification_evaluator.calculate_classification_metrics(
-                    predictions, references, valid_classes=unique_classes
+                    predictions, references, valid_classes=unique_classes,
+                    selected=metrics_config,
                 )
 
                 acc = AccuracyMetrics()
@@ -163,28 +176,40 @@ def main():
 
             logger.info(f"Classification accuracy complete for {len(accuracy_results)} models")
 
-        from geval_evaluator import GEvalEvaluator
-        geval_evaluator = GEvalEvaluator()
-        
-        for model_id, invocation_results in results_by_model.items():
-            successful_indices = [i for i, r in enumerate(invocation_results) if r.error is None]
-            predictions = [invocation_results[i].response_text for i in successful_indices]
-            inputs = [dataset.documents[i] for i in successful_indices]
-            
-            logger.info(f"Running G-Eval for model {model_id} on {len(predictions)} samples")
-            geval_metrics = geval_evaluator.evaluate(inputs, predictions, task_type=task_type)
-            
-            acc = accuracy_results.get(model_id)
-            if acc is None:
-                acc = AccuracyMetrics()
-                accuracy_results[model_id] = acc
-            
-            acc.geval_reasoning = geval_metrics.reasoning
-            acc.geval_faithfulness = geval_metrics.faithfulness
-            
-            logger.info(f"G-Eval complete for {model_id}")
-        
-        logger.info(f"G-Eval evaluation complete for all models")
+        if compute_geval_reasoning or compute_geval_faithfulness:
+            from geval_evaluator import GEvalEvaluator
+            geval_evaluator = GEvalEvaluator()
+
+            for model_id, invocation_results in results_by_model.items():
+                successful_indices = [i for i, r in enumerate(invocation_results) if r.error is None]
+                predictions = [invocation_results[i].response_text for i in successful_indices]
+                inputs = [dataset.documents[i] for i in successful_indices]
+
+                logger.info(
+                    f"Running G-Eval for model {model_id} on {len(predictions)} samples "
+                    f"(reasoning={compute_geval_reasoning}, faithfulness={compute_geval_faithfulness})"
+                )
+                geval_metrics = geval_evaluator.evaluate(
+                    inputs, predictions, task_type=task_type,
+                    compute_reasoning=compute_geval_reasoning,
+                    compute_faithfulness=compute_geval_faithfulness,
+                )
+
+                acc = accuracy_results.get(model_id)
+                if acc is None:
+                    acc = AccuracyMetrics()
+                    accuracy_results[model_id] = acc
+
+                if compute_geval_reasoning:
+                    acc.geval_reasoning = geval_metrics.reasoning
+                if compute_geval_faithfulness:
+                    acc.geval_faithfulness = geval_metrics.faithfulness
+
+                logger.info(f"G-Eval complete for {model_id}")
+
+            logger.info("G-Eval evaluation complete for all models")
+        else:
+            logger.info("G-Eval disabled via metrics config, skipping")
         
         from cost_calculator import CostCalculator
         cost_calculator = CostCalculator()
diff --git a/backend/python-eval-function/src/metrics.py b/backend/python-eval-function/src/metrics.py
new file mode 100644
index 0000000..ac025b4
--- /dev/null
+++ b/backend/python-eval-function/src/metrics.py
@@ -0,0 +1,53 @@
+"""Single source of truth for metric keys and config helpers used by the
+evaluation engine.
+
+Keep `METRIC_KEYS` in sync with the TypeScript counterpart at
+`backend/src/models/Evaluation.ts` (and the frontend at
+`frontend/src/types/evaluation.ts`). The shape stored in DynamoDB is just a
+JSON object keyed by these names with boolean values.
+"""
+from typing import Dict, Mapping, Optional
+
+
+METRIC_KEYS: tuple[str, ...] = (
+    # Algorithmic — summarization
+    'bleu',
+    'rouge',
+    'meteor',
+    'levenshtein',
+    'bertscore',
+    # Algorithmic — classification
+    'classification_accuracy',
+    'precision_macro',
+    'recall_macro',
+    'f1_macro',
+    'f1_weighted',
+    # LLM-as-judge
+    'geval_reasoning',
+    'geval_faithfulness',
+)
+
+
+def is_metric_enabled(
+    selected: Optional[Mapping[str, bool]],
+    key: str,
+) -> bool:
+    """Return whether `key` should be computed.
+
+    Missing keys default to True so legacy jobs (and partial configs) keep the
+    pre-toggle behaviour of computing every metric.
+    """
+    if selected is None:
+        return True
+    return bool(selected.get(key, True))
+
+
+def normalize_metrics_config(
+    stored: Optional[Mapping[str, object]],
+) -> Dict[str, bool]:
+    """Coerce a stored metrics blob into a complete `{key: bool}` mapping.
+
+    Unknown keys in `stored` are dropped; missing keys default to True.
+    """
+    source: Mapping[str, object] = stored or {}
+    return {key: bool(source.get(key, True)) for key in METRIC_KEYS}
diff --git a/backend/src/handlers/EvaluationLaunch/EvaluationLaunchAdapter.ts b/backend/src/handlers/EvaluationLaunch/EvaluationLaunchAdapter.ts
index 283a89e..c130c7f 100644
--- a/backend/src/handlers/EvaluationLaunch/EvaluationLaunchAdapter.ts
+++ b/backend/src/handlers/EvaluationLaunch/EvaluationLaunchAdapter.ts
@@ -3,12 +3,21 @@ import type {
   APIGatewayProxyEventV2,
   APIGatewayProxyResultV2,
 } from 'aws-lambda';
-import { z } from 'zod';
+import { z, ZodRawShape } from 'zod';
 
+import { METRIC_KEYS } from '../../models/Evaluation';
 import { tokenEvaluationLaunchUseCase } from '../../useCases/EvaluationLaunch/EvaluationLaunchUseCase';
 import { handleHttpRequest } from '../api/handleHttpRequest';
 import { parseApiEvent } from '../api/parseApiEvent';
 
+const MetricsConfigSchema = z
+  .object(
+    Object.fromEntries(
+      METRIC_KEYS.map((key) => [key, z.boolean().optional()]),
+    ) as ZodRawShape,
+  )
+  .optional();
+
 const EvaluationRequestSchema = z.object({
   dataset_id: z.string().min(1),
   models: z
@@ -32,6 +41,7 @@ const EvaluationRequestSchema = z.object({
       cost: z.number().optional(),
     })
     .optional(),
+  metrics: MetricsConfigSchema,
 });
 
 export class EvaluationLaunchAdapter {
diff --git a/backend/src/models/Evaluation.ts b/backend/src/models/Evaluation.ts
index 60e7d2e..6c5917e 100644
--- a/backend/src/models/Evaluation.ts
+++ b/backend/src/models/Evaluation.ts
@@ -9,10 +9,78 @@ export interface WeightConfig {
   cost: number;
 }
 
+/**
+ * Canonical list of every metric the engine knows how to compute, in display
+ * order, grouped by semantic category.
+ *
+ * This is the SINGLE SOURCE OF TRUTH for metric keys in the backend. Within
+ * the TypeScript backend, adding a metric is a one-line change here — the type
+ * alias, the default config and the Zod payload schema derive from this list.
+ *
+ * Keep this list in sync with the frontend `METRIC_KEYS` (and the Python
+ * `metrics.METRIC_KEYS`); a cross-language enum would be ideal, but absent
+ * that, mirror the additions manually.
+ */
+export const METRIC_KEYS = [
+  // Algorithmic — summarization
+  'bleu',
+  'rouge',
+  'meteor',
+  'levenshtein',
+  'bertscore',
+  // Algorithmic — classification
+  'classification_accuracy',
+  'precision_macro',
+  'recall_macro',
+  'f1_macro',
+  'f1_weighted',
+  // LLM-as-judge
+  'geval_reasoning',
+  'geval_faithfulness',
+] as const;
+
+export type MetricKey = (typeof METRIC_KEYS)[number];
+
+/**
+ * Per-metric opt-in flags.
+ *
+ * Latency and cost are always computed (they come for free with the inference data).
+ *
+ * - Algorithmic / programmatic metrics are deterministic and run locally.
+ *   They are mostly fast; only BERTScore has a meaningful loading cost.
+ * - LLM-as-judge metrics call an extra Bedrock model per sample — they are
+ *   the main lever for both speed and spend.
+ *
+ * Metrics that are not applicable to the uploaded dataset (e.g. classification
+ * metrics on a summarization run) are skipped automatically.
+ */
+export type MetricsConfig = Record<MetricKey, boolean>;
+
+export const DEFAULT_METRICS_CONFIG: MetricsConfig = Object.fromEntries(
+  METRIC_KEYS.map((k) => [k, true]),
+) as MetricsConfig;
+
+/**
+ * Merge a partial user-provided config with the defaults (all enabled).
+ * Unknown keys are ignored; missing keys fall back to the default value.
+ */
+export function resolveMetricsConfig(
+  metrics?: Partial<MetricsConfig>,
+): MetricsConfig {
+  const result = { ...DEFAULT_METRICS_CONFIG };
+  if (!metrics) return result;
+  for (const key of METRIC_KEYS) {
+    const provided = metrics[key];
+    if (provided !== undefined) result[key] = provided;
+  }
+  return result;
+}
+
 export interface EvaluationRequest {
   dataset_id: string;
   models: ModelConfig[];
   weights?: Partial<WeightConfig>;
+  metrics?: Partial<MetricsConfig>;
 }
 
 export interface EvaluationJob {
@@ -20,6 +88,7 @@ export interface EvaluationJob {
   dataset_id: string;
   models: ModelConfig[];
   weights: WeightConfig;
+  metrics: MetricsConfig;
   status: JobStatus;
   progress: number;
   current_model?: string;
diff --git a/backend/src/services/EvaluationJobsRepository/EvaluationJobsRepository.ts b/backend/src/services/EvaluationJobsRepository/EvaluationJobsRepository.ts
index 446cc54..a911806 100644
--- a/backend/src/services/EvaluationJobsRepository/EvaluationJobsRepository.ts
+++ b/backend/src/services/EvaluationJobsRepository/EvaluationJobsRepository.ts
@@ -8,8 +8,10 @@ import {
 import { createInjectionToken, inject } from '@trackit.io/di-container';
 import { randomUUID } from 'crypto';
 import {
+  DEFAULT_METRICS_CONFIG,
   EvaluationJob,
   JobStatus,
+  MetricsConfig,
   ModelConfig,
   ModelResult,
   Recommendation,
@@ -21,6 +23,7 @@ export type EvaluationJobsRepository = {
     datasetId: string,
     models: ModelConfig[],
     weights: WeightConfig,
+    metrics: MetricsConfig,
   ): Promise<EvaluationJob>;
 
   updateEvaluation(
@@ -51,6 +54,7 @@ class EvaluationJobsRepositoryImpl implements EvaluationJobsRepository {
     datasetId: string,
     models: ModelConfig[],
     weights: WeightConfig,
+    metrics: MetricsConfig,
   ): Promise<EvaluationJob> {
     const evaluationId = randomUUID();
     const now = new Date().toISOString();
@@ -60,6 +64,7 @@ class EvaluationJobsRepositoryImpl implements EvaluationJobsRepository {
       dataset_id: datasetId,
       models,
       weights,
+      metrics,
       status: 'pending',
       progress: 0,
       created_at: now,
@@ -74,6 +79,7 @@ class EvaluationJobsRepositoryImpl implements EvaluationJobsRepository {
           dataset_id: { S: job.dataset_id },
           models: { S: JSON.stringify(job.models) },
           weights: { S: JSON.stringify(job.weights) },
+          metrics: { S: JSON.stringify(job.metrics) },
           status: { S: job.status },
           progress: { N: job.progress.toString() },
           created_at: { S: job.created_at },
@@ -176,6 +182,9 @@ class EvaluationJobsRepositoryImpl implements EvaluationJobsRepository {
       dataset_id: item['dataset_id'].S!,
       models: JSON.parse(item['models'].S!) as ModelConfig[],
       weights: JSON.parse(item['weights'].S!) as WeightConfig,
+      metrics: item['metrics']?.S
+        ? (JSON.parse(item['metrics'].S) as MetricsConfig)
+        : { ...DEFAULT_METRICS_CONFIG },
       status: item['status'].S! as JobStatus,
       progress: Number(item['progress'].N ?? '0'),
       current_model: item['current_model']?.S,
diff --git a/backend/src/useCases/EvaluationLaunch/EvaluationLaunchUseCase.test.ts b/backend/src/useCases/EvaluationLaunch/EvaluationLaunchUseCase.test.ts
index 045f104..056afe0 100644
--- a/backend/src/useCases/EvaluationLaunch/EvaluationLaunchUseCase.test.ts
+++ b/backend/src/useCases/EvaluationLaunch/EvaluationLaunchUseCase.test.ts
@@ -1,8 +1,13 @@
 import { beforeEach, describe, expect, it, vi } from 'vitest';
-import type { EvaluationRequest } from '../../models/Evaluation.js';
+import {
+  DEFAULT_METRICS_CONFIG,
+  type EvaluationRequest,
+} from '../../models/Evaluation.js';
 import { FakeBedrockModelValidationService } from '../../services/BedrockModelValidationService/FakeBedrockModelValidationService.js';
 import { FakeEvaluationLaunchUseCase } from './FakeEvaluationLaunchUseCase';
 
+const ALL_METRICS_ENABLED = { ...DEFAULT_METRICS_CONFIG };
+
 // Mock dependencies
 const mockEvaluationJobsRepository = {
   createEvaluation: vi.fn(),
@@ -64,6 +69,7 @@ describe('EvaluationLaunchUseCase - Weight Configuration', () => {
           latency: 0.33,
           cost: 0.34,
         },
+        ALL_METRICS_ENABLED,
       );
     });
 
@@ -91,10 +97,81 @@ describe('EvaluationLaunchUseCase - Weight Configuration', () => {
           latency: 0.33,
           cost: 0.34,
         },
+        ALL_METRICS_ENABLED,
       );
     });
   });
 
+  describe('Metrics toggles', () => {
+    it('should default to all metrics enabled when metrics not provided', async () => {
+      const request: EvaluationRequest = {
+        dataset_id: 'test-dataset-id',
+        models: [{ type: 'default', identifier: 'claude-sonnet' }],
+      };
+
+      await useCase.launchEvaluation(request);
+
+      const metricsArg =
+        mockEvaluationJobsRepository.createEvaluation.mock.calls[0][3];
+      expect(metricsArg).toEqual(ALL_METRICS_ENABLED);
+    });
+
+    it('should respect explicitly disabled metrics', async () => {
+      const request: EvaluationRequest = {
+        dataset_id: 'test-dataset-id',
+        models: [{ type: 'default', identifier: 'claude-sonnet' }],
+        metrics: {
+          bertscore: false,
+          geval_reasoning: false,
+          geval_faithfulness: false,
+        },
+      };
+
+      await useCase.launchEvaluation(request);
+
+      const metricsArg =
+        mockEvaluationJobsRepository.createEvaluation.mock.calls[0][3];
+      expect(metricsArg).toEqual({
+        ...ALL_METRICS_ENABLED,
+        bertscore: false,
+        geval_reasoning: false,
+        geval_faithfulness: false,
+      });
+    });
+
+    it('should fill in defaults for partially provided metrics', async () => {
+      const request: EvaluationRequest = {
+        dataset_id: 'test-dataset-id',
+        models: [{ type: 'default', identifier: 'claude-sonnet' }],
+        metrics: { geval_reasoning: false, geval_faithfulness: false },
+      };
+
+      await useCase.launchEvaluation(request);
+
+      const metricsArg =
+        mockEvaluationJobsRepository.createEvaluation.mock.calls[0][3];
+      expect(metricsArg).toEqual({
+        ...ALL_METRICS_ENABLED,
+        geval_reasoning: false,
+        geval_faithfulness: false,
+      });
+    });
+
+    it('should support disabling a single algorithmic metric', async () => {
+      const request: EvaluationRequest = {
+        dataset_id: 'test-dataset-id',
+        models: [{ type: 'default', identifier: 'claude-sonnet' }],
+        metrics: { bleu: false },
+      };
+
+      await useCase.launchEvaluation(request);
+
+      const metricsArg =
+        mockEvaluationJobsRepository.createEvaluation.mock.calls[0][3];
+      expect(metricsArg).toEqual({ ...ALL_METRICS_ENABLED, bleu: false });
+    });
+  });
+
   describe('Negative weight rejection', () => {
     it('should reject negative accuracy weight', async () => {
       const request: EvaluationRequest = {
@@ -216,6 +293,7 @@ describe('EvaluationLaunchUseCase - Weight Configuration', () => {
           latency: 0.33,
           cost: 0.34,
         },
+        ALL_METRICS_ENABLED,
       );
     });
 
@@ -349,11 +427,16 @@ describe('EvaluationLaunchUseCase - Weight Configuration', () => {
 
       expect(
         mockEvaluationJobsRepository.createEvaluation,
-      ).toHaveBeenCalledWith('test-dataset-id', request.models, {
-        accuracy: 0.33,
-        latency: 0.33,
-        cost: 0.34,
-      });
+      ).toHaveBeenCalledWith(
+        'test-dataset-id',
+        request.models,
+        {
+          accuracy: 0.33,
+          latency: 0.33,
+          cost: 0.34,
+        },
+        ALL_METRICS_ENABLED,
+      );
     });
 
     it('should accept a mix of default and custom models', async () => {
@@ -386,6 +469,7 @@ describe('EvaluationLaunchUseCase - Weight Configuration', () => {
           latency: 0.33,
           cost: 0.34,
         },
+        ALL_METRICS_ENABLED,
       );
     });
 
diff --git a/backend/src/useCases/EvaluationLaunch/EvaluationLaunchUseCase.ts b/backend/src/useCases/EvaluationLaunch/EvaluationLaunchUseCase.ts
index b715277..79ea6b6 100644
--- a/backend/src/useCases/EvaluationLaunch/EvaluationLaunchUseCase.ts
+++ b/backend/src/useCases/EvaluationLaunch/EvaluationLaunchUseCase.ts
@@ -5,6 +5,7 @@ import {
   EvaluationJob,
   EvaluationRequest,
   ModelConfig,
+  resolveMetricsConfig,
   WeightConfig,
 } from '../../models/Evaluation';
 import { tokenBedrockModelValidationService } from '../../services/BedrockModelValidationService/BedrockModelValidationService';
@@ -33,11 +34,13 @@ class EvaluationLaunchUseCaseImpl implements EvaluationLaunchUseCase {
       );
 
     const normalizedWeights = this.normalizeWeights(request.weights);
+    const metrics = resolveMetricsConfig(request.metrics);
 
     const job = await this.evaluationJobsRepository.createEvaluation(
       request.dataset_id,
       modelsToPersist,
       normalizedWeights,
+      metrics,
     );
 
     await this.fargateService.launchTask(job.evaluation_id);
diff --git a/backend/src/useCases/EvaluationLaunch/FakeEvaluationLaunchUseCase.ts b/backend/src/useCases/EvaluationLaunch/FakeEvaluationLaunchUseCase.ts
index 60865a0..5fb8371 100644
--- a/backend/src/useCases/EvaluationLaunch/FakeEvaluationLaunchUseCase.ts
+++ b/backend/src/useCases/EvaluationLaunch/FakeEvaluationLaunchUseCase.ts
@@ -1,8 +1,9 @@
-import type {
-  EvaluationJob,
-  EvaluationRequest,
-  ModelConfig,
-  WeightConfig,
+import {
+  resolveMetricsConfig,
+  type EvaluationJob,
+  type EvaluationRequest,
+  type ModelConfig,
+  type WeightConfig,
 } from '../../models/Evaluation.js';
 import { type BedrockModelValidationService } from '../../services/BedrockModelValidationService/BedrockModelValidationService.js';
 import { FakeBedrockModelValidationService } from '../../services/BedrockModelValidationService/FakeBedrockModelValidationService.js';
@@ -40,11 +41,13 @@ export class FakeEvaluationLaunchUseCase implements EvaluationLaunchUseCase {
       );
 
     const normalizedWeights = this.normalizeWeights(request.weights);
+    const metrics = resolveMetricsConfig(request.metrics);
 
     const job = await this.evaluationJobsRepository.createEvaluation(
       request.dataset_id,
       modelsToPersist,
       normalizedWeights,
+      metrics,
     );
 
     await this.fargateService.launchTask(job.evaluation_id);
diff --git a/frontend/src/components/evaluator/MetricsWeights.tsx b/frontend/src/components/evaluator/MetricsWeights.tsx
index 122e11a..39d4fa8 100644
--- a/frontend/src/components/evaluator/MetricsWeights.tsx
+++ b/frontend/src/components/evaluator/MetricsWeights.tsx
@@ -1,12 +1,138 @@
 import { cn } from '@/lib/utils';
-import type { MetricsWeights as Weights } from '@/types/evaluation';
+import type {
+  MetricsToggles,
+  MetricsWeights as Weights,
+} from '@/types/evaluation';
 import { motion } from 'framer-motion';
-import { Info } from 'lucide-react';
+import { Info, Zap } from 'lucide-react';
 import { useEffect, useState } from 'react';
 
 interface MetricsWeightsProps {
   value: Weights;
   onChange: (weights: Weights) => void;
+  metrics: MetricsToggles;
+  onMetricsChange: (metrics: MetricsToggles) => void;
+}
+
+interface ToggleableMetric {
+  key: keyof MetricsToggles;
+  label: string;
+  description: string;
+  /** Free-form badge text (e.g. "Summarization", "Classification"). */
+  taskBadge?: string;
+}
+
+interface MetricGroup {
+  id: string;
+  title: string;
+  subtitle: string;
+  metrics: ToggleableMetric[];
+}
+
+const METRIC_GROUPS: MetricGroup[] = [
+  {
+    id: 'programmatic',
+    title: 'Programmatic metrics',
+    subtitle:
+      'Deterministic scoring computed locally. Fast and cheap — only BERTScore has a noticeable load cost.',
+    metrics: [
+      {
+        key: 'bleu',
+        label: 'BLEU',
+        description: 'N-gram overlap with the reference.',
+        taskBadge: 'Summarization',
+      },
+      {
+        key: 'rouge',
+        label: 'ROUGE',
+        description: 'Recall-oriented n-gram overlap.',
+        taskBadge: 'Summarization',
+      },
+      {
+        key: 'meteor',
+        label: 'METEOR',
+        description: 'Stem-aware overlap, more lenient than BLEU.',
+        taskBadge: 'Summarization',
+      },
+      {
+        key: 'levenshtein',
+        label: 'Levenshtein similarity',
+        description: 'Character-level edit-distance similarity.',
+        taskBadge: 'Summarization',
+      },
+      {
+        key: 'bertscore',
+        label: 'BERTScore',
+        description:
+          'Embedding-based semantic similarity. Loads a transformer model once per run.',
+        taskBadge: 'Summarization',
+      },
+      {
+        key: 'classification_accuracy',
+        label: 'Accuracy',
+        description: 'Fraction of predictions that match the reference label.',
+        taskBadge: 'Classification',
+      },
+      {
+        key: 'precision_macro',
+        label: 'Precision (macro)',
+        description: 'Per-class precision averaged across labels.',
+        taskBadge: 'Classification',
+      },
+      {
+        key: 'recall_macro',
+        label: 'Recall (macro)',
+        description: 'Per-class recall averaged across labels.',
+        taskBadge: 'Classification',
+      },
+      {
+        key: 'f1_macro',
+        label: 'F1 (macro)',
+        description: 'Harmonic mean of precision and recall, unweighted.',
+        taskBadge: 'Classification',
+      },
+      {
+        key: 'f1_weighted',
+        label: 'F1 (weighted)',
+        description: 'F1 weighted by class support.',
+        taskBadge: 'Classification',
+      },
+    ],
+  },
+  {
+    id: 'llm-judge',
+    title: 'LLM-as-judge metrics',
+    subtitle:
+      'Quality scores produced by an extra Bedrock model per sample. Most accurate, but the slowest and most expensive metrics.',
+    metrics: [
+      {
+        key: 'geval_reasoning',
+        label: 'G-Eval — Reasoning',
+        description: 'How coherent and well-justified the output is.',
+      },
+      {
+        key: 'geval_faithfulness',
+        label: 'G-Eval — Faithfulness',
+        description: 'Whether the output sticks to the input (no hallucination).',
+      },
+    ],
+  },
+];
+
+function setGroupSelection(
+  current: MetricsToggles,
+  group: MetricGroup,
+  value: boolean,
+): MetricsToggles {
+  const next = { ...current };
+  for (const m of group.metrics) {
+    next[m.key] = value;
+  }
+  return next;
+}
+
+function countSelected(metrics: MetricsToggles, group: MetricGroup): number {
+  return group.metrics.reduce((n, m) => (metrics[m.key] ? n + 1 : n), 0);
 }
 
 const METRICS: { key: keyof Weights; label: string; description: string }[] = [
@@ -58,7 +184,12 @@ function syncDraftFromValue(prev: string, num: number): string {
   return String(num);
 }
 
-export function MetricsWeights({ value, onChange }: MetricsWeightsProps) {
+export function MetricsWeights({
+  value,
+  onChange,
+  metrics,
+  onMetricsChange,
+}: MetricsWeightsProps) {
   const [drafts, setDrafts] = useState({
     accuracy: String(value.accuracy),
     cost: String(value.cost),
@@ -176,6 +307,115 @@ export function MetricsWeights({ value, onChange }: MetricsWeightsProps) {
           is {total}%.
         </p>
       )}
+
+      <div className="mt-8">
+        <div className="flex items-center gap-2">
+          <Zap className="h-4 w-4 text-primary" />
+          <h2 className="text-sm font-semibold tracking-tight">
+            Pick the metrics to compute
+          </h2>
+        </div>
+        <p className="text-xs text-muted-foreground mt-1 mb-4">
+          Latency and cost are always computed. Metrics that don't apply to
+          your dataset (e.g. classification metrics on a summarization run)
+          are skipped automatically.
+        </p>
+
+        <div className="space-y-4">
+          {METRIC_GROUPS.map((group) => {
+            const selectedCount = countSelected(metrics, group);
+            const totalCount = group.metrics.length;
+            return (
+              <fieldset
+                key={group.id}
+                className="rounded-xl border border-border bg-surface p-3"
+              >
+                <legend className="px-1 text-sm font-semibold">
+                  {group.title}
+                </legend>
+                <p className="text-xs text-muted-foreground mt-1 mb-2">
+                  {group.subtitle}
+                </p>
+                <div className="flex items-center justify-between mb-2 text-xs">
+                  <span className="text-muted-foreground tabular-nums">
+                    {selectedCount} / {totalCount} selected
+                  </span>
+                  <div className="flex gap-3">
+                    <button
+                      type="button"
+                      onClick={() =>
+                        onMetricsChange(setGroupSelection(metrics, group, true))
+                      }
+                      className="text-primary hover:underline"
+                    >
+                      Select all
+                    </button>
+                    <button
+                      type="button"
+                      onClick={() =>
+                        onMetricsChange(
+                          setGroupSelection(metrics, group, false),
+                        )
+                      }
+                      className="text-muted-foreground hover:underline"
+                    >
+                      Clear
+                    </button>
+                  </div>
+                </div>
+
+                <div className="space-y-1.5">
+                  {group.metrics.map((metric) => {
+                    const enabled = metrics[metric.key];
+                    return (
+                      <label
+                        key={metric.key}
+                        htmlFor={`metric-${metric.key}`}
+                        className={cn(
+                          'flex items-start justify-between gap-3 rounded-lg border p-2.5 cursor-pointer transition-colors',
+                          enabled
+                            ? 'border-border bg-background'
+                            : 'border-border bg-muted/30',
+                        )}
+                      >
+                        <div className="min-w-0">
+                          <div className="flex items-center gap-2 flex-wrap">
+                            <p className="text-sm font-medium">
+                              {metric.label}
+                            </p>
+                            {metric.taskBadge && (
+                              <span className="rounded-full bg-muted px-2 py-0.5 text-[10px] font-medium text-muted-foreground">
+                                {metric.taskBadge}
+                              </span>
+                            )}
+                          </div>
+                          <p className="text-xs text-muted-foreground mt-0.5">
+                            {metric.description}
+                          </p>
+                        </div>
+                        <input
+                          id={`metric-${metric.key}`}
+                          type="checkbox"
+                          role="switch"
+                          aria-checked={enabled}
+                          checked={enabled}
+                          onChange={(e) =>
+                            onMetricsChange({
+                              ...metrics,
+                              [metric.key]: e.target.checked,
+                            })
+                          }
+                          className="mt-1 h-4 w-4 shrink-0 cursor-pointer accent-primary"
+                        />
+                      </label>
+                    );
+                  })}
+                </div>
+              </fieldset>
+            );
+          })}
+        </div>
+      </div>
     </motion.div>
   );
 }
diff --git a/frontend/src/pages/Index.tsx b/frontend/src/pages/Index.tsx
index bf70b23..7dc00f6 100644
--- a/frontend/src/pages/Index.tsx
+++ b/frontend/src/pages/Index.tsx
@@ -11,6 +11,7 @@ import {
   useEvaluationStatus,
 } from '@/hooks/useEvaluation';
 import type { EvaluationConfig } from '@/types/evaluation';
+import { DEFAULT_METRICS_TOGGLES } from '@/types/evaluation';
 import { AlertCircle, ArrowLeft, ArrowRight, RotateCcw } from 'lucide-react';
 import { useCallback, useEffect, useState } from 'react';
 
@@ -21,6 +22,7 @@ export default function Index() {
   const [step, setStep] = useState(0);
   const [config, setConfig] = useState<EvaluationConfig>({
     weights: { accuracy: 40, cost: 30, latency: 30 },
+    metrics: { ...DEFAULT_METRICS_TOGGLES },
     selectedModels: [],
     datasetFile: null,
   });
@@ -74,6 +76,7 @@ export default function Index() {
           latency: config.weights.latency / 100,
           cost: config.weights.cost / 100,
         },
+        metrics: config.metrics,
       },
       {
         onSuccess: (data) => {
@@ -90,6 +93,7 @@ export default function Index() {
     datasetId,
     config.selectedModels,
     config.weights,
+    config.metrics,
     createEvaluationMutation,
   ]);
 
@@ -105,6 +109,7 @@ export default function Index() {
     setStep(0);
     setConfig({
       weights: { accuracy: 40, cost: 30, latency: 30 },
+      metrics: { ...DEFAULT_METRICS_TOGGLES },
       selectedModels: [],
       datasetFile: null,
     });
@@ -138,6 +143,10 @@ export default function Index() {
                 <MetricsWeights
                   value={config.weights}
                   onChange={(w) => setConfig({ ...config, weights: w })}
+                  metrics={config.metrics}
+                  onMetricsChange={(m) =>
+                    setConfig({ ...config, metrics: m })
+                  }
                 />
               )}
               {step === 1 && (
diff --git a/frontend/src/types/evaluation.ts b/frontend/src/types/evaluation.ts
index 996d431..6fe06e4 100644
--- a/frontend/src/types/evaluation.ts
+++ b/frontend/src/types/evaluation.ts
@@ -4,6 +4,47 @@ export interface MetricsWeights {
   latency: number;
 }
 
+/**
+ * Canonical list of every metric the engine knows how to compute, in display
+ * order. Keep in sync with the backend `METRIC_KEYS` in
+ * `backend/src/models/Evaluation.ts` and `backend/python-eval-function/src/metrics.py`.
+ */
+export const METRIC_KEYS = [
+  // Algorithmic — summarization
+  'bleu',
+  'rouge',
+  'meteor',
+  'levenshtein',
+  'bertscore',
+  // Algorithmic — classification
+  'classification_accuracy',
+  'precision_macro',
+  'recall_macro',
+  'f1_macro',
+  'f1_weighted',
+  // LLM-as-judge
+  'geval_reasoning',
+  'geval_faithfulness',
+] as const;
+
+export type MetricKey = (typeof METRIC_KEYS)[number];
+
+/**
+ * Per-metric opt-in flags.
+ *
+ * - Algorithmic / programmatic metrics are deterministic, fast, and run locally
+ *   (BERTScore is the only one with a meaningful load cost).
+ * - LLM-as-judge metrics call an extra Bedrock model per sample — the main
+ *   lever for speed and spend.
+ *
+ * Metrics that don't apply to the uploaded dataset are skipped automatically.
+ */
+export type MetricsToggles = Record<MetricKey, boolean>;
+
+export const DEFAULT_METRICS_TOGGLES: MetricsToggles = Object.fromEntries(
+  METRIC_KEYS.map((k) => [k, true]),
+) as MetricsToggles;
+
 export interface ModelOption {
   id: string;
   name: string;
@@ -14,6 +55,7 @@ export interface ModelOption {
 
 export interface EvaluationConfig {
   weights: MetricsWeights;
+  metrics: MetricsToggles;
   selectedModels: string[];
   datasetFile: File | null;
 }
@@ -37,21 +79,21 @@ export interface EvaluationResult {
 export const AVAILABLE_MODELS: ModelOption[] = [
   // Amazon Nova
   {
-    id: 'us.amazon.nova-pro-v1:0',
+    id: 'amazon.nova-pro-v1:0',
     name: 'Nova Pro',
     provider: 'Amazon',
     contextWindow: '300K',
     costPer1kTokens: 0.0008,
   },
   {
-    id: 'us.amazon.nova-lite-v1:0',
+    id: 'amazon.nova-lite-v1:0',
     name: 'Nova Lite',
     provider: 'Amazon',
     contextWindow: '300K',
     costPer1kTokens: 0.00006,
   },
   {
-    id: 'us.amazon.nova-micro-v1:0',
+    id: 'amazon.nova-micro-v1:0',
     name: 'Nova Micro',
     provider: 'Amazon',
     contextWindow: '128K',
@@ -59,21 +101,21 @@ export const AVAILABLE_MODELS: ModelOption[] = [
   },
   // Anthropic
   {
-    id: 'us.anthropic.claude-opus-4-6-v1',
+    id: 'anthropic.claude-opus-4-6-v1',
     name: 'Claude Opus 4.6',
     provider: 'Anthropic',
     contextWindow: '200K',
     costPer1kTokens: 0.005,
   },
   {
-    id: 'us.anthropic.claude-sonnet-4-5-20250929-v1:0',
+    id: 'anthropic.claude-sonnet-4-5-20250929-v1:0',
     name: 'Claude Sonnet 4.5',
     provider: 'Anthropic',
     contextWindow: '200K',
     costPer1kTokens: 0.003,
   },
   {
-    id: 'us.anthropic.claude-haiku-4-5-20251001-v1:0',
+    id: 'anthropic.claude-haiku-4-5-20251001-v1:0',
     name: 'Claude Haiku 4.5',
     provider: 'Anthropic',
     contextWindow: '200K',
@@ -85,6 +127,7 @@ export interface CreateEvaluationRequest {
   dataset_id: string;
   models: { type: 'default' | 'custom'; identifier: string }[];
   weights: { accuracy: number; latency: number; cost: number };
+  metrics?: MetricsToggles;
 }
 
 export interface DatasetUploadData {

From e1772785659a840da229d5fa0b4619281e41cdf6 Mon Sep 17 00:00:00 2001
From: LimRaymond <raymond.lim672@gmail.com>
Date: Fri, 15 May 2026 12:22:24 +0200
Subject: [PATCH 2/9] fix: remove inline type import modifiers and explanatory comments

---
 .../src/classification_evaluator.py           |  5 ----
 .../src/dynamodb_service.py                   |  2 --
 backend/python-eval-function/src/metrics.py   | 18 ------------
 backend/src/models/Evaluation.ts              | 29 -------------------
 .../FakeEvaluationLaunchUseCase.ts            |  8 ++---
 frontend/src/types/evaluation.ts              | 15 ----------
 6 files changed, 4 insertions(+), 73 deletions(-)

diff --git a/backend/python-eval-function/src/classification_evaluator.py b/backend/python-eval-function/src/classification_evaluator.py
index 0438870..69ea449 100644
--- a/backend/python-eval-function/src/classification_evaluator.py
+++ b/backend/python-eval-function/src/classification_evaluator.py
@@ -15,9 +15,6 @@ class ClassificationMetrics:
     f1_macro: Optional[float] = None
     f1_weighted: Optional[float] = None
 
-
-# Maps the external (request-level) metric keys to the internal
-# ClassificationMetrics field names.
 _CLASSIFICATION_KEYS: tuple[str, ...] = (
     'classification_accuracy',
     'precision_macro',
@@ -61,8 +58,6 @@ def calculate_classification_metrics(
             logger.info("All references are empty, skipping classification metrics")
             return None
 
-        # If the user opted out of every classification metric, skip the
-        # sklearn computation entirely.
         if selected is not None and not any(
             is_metric_enabled(selected, k) for k in _CLASSIFICATION_KEYS
         ):
diff --git a/backend/python-eval-function/src/dynamodb_service.py b/backend/python-eval-function/src/dynamodb_service.py
index cf8fc03..63504fa 100644
--- a/backend/python-eval-function/src/dynamodb_service.py
+++ b/backend/python-eval-function/src/dynamodb_service.py
@@ -30,8 +30,6 @@ def load_job(self, evaluation_id: str) -> Dict[str, Any]:
             item = response['Item']
             models = json.loads(item.get('models', '[]'))
             weights = json.loads(item.get('weights', '{}'))
-            # Older jobs may not have a `metrics` field; a malformed blob also
-            # falls back to all-enabled so a bad write can never wedge a job.
             raw_metrics = item.get('metrics')
             stored_metrics: Optional[Dict[str, Any]] = None
             if raw_metrics:
diff --git a/backend/python-eval-function/src/metrics.py b/backend/python-eval-function/src/metrics.py
index ac025b4..fad56c2 100644
--- a/backend/python-eval-function/src/metrics.py
+++ b/backend/python-eval-function/src/metrics.py
@@ -1,14 +1,5 @@
-"""Single source of truth for metric keys and config helpers used by the
-evaluation engine.
-
-Keep `METRIC_KEYS` in sync with the TypeScript counterpart at
-`backend/src/models/Evaluation.ts` (and the frontend at
-`frontend/src/types/evaluation.ts`). The shape stored in DynamoDB is just a
-JSON object keyed by these names with boolean values.
-"""
 from typing import Dict, Mapping, Optional
 
-
 METRIC_KEYS: tuple[str, ...] = (
     # Algorithmic — summarization
     'bleu',
@@ -32,11 +23,6 @@ def is_metric_enabled(
     selected: Optional[Mapping[str, bool]],
     key: str,
 ) -> bool:
-    """Return whether `key` should be computed.
-
-    Missing keys default to True so legacy jobs (and partial configs) keep the
-    pre-toggle behaviour of computing every metric.
-    """
     if selected is None:
         return True
     return bool(selected.get(key, True))
@@ -45,9 +31,5 @@ def is_metric_enabled(
 def normalize_metrics_config(
     stored: Optional[Mapping[str, object]],
 ) -> Dict[str, bool]:
-    """Coerce a stored metrics blob into a complete `{key: bool}` mapping.
-
-    Unknown keys in `stored` are dropped; missing keys default to True.
-    """
     source: Mapping[str, object] = stored or {}
     return {key: bool(source.get(key, True)) for key in METRIC_KEYS}
diff --git a/backend/src/models/Evaluation.ts b/backend/src/models/Evaluation.ts
index 6c5917e..92b4ab0 100644
--- a/backend/src/models/Evaluation.ts
+++ b/backend/src/models/Evaluation.ts
@@ -9,18 +9,6 @@ export interface WeightConfig {
   cost: number;
 }
 
-/**
- * Canonical list of every metric the engine knows how to compute, in display
- * order, grouped by semantic category.
- *
- * This is the SINGLE SOURCE OF TRUTH for metric keys in the backend. Adding a
- * new metric is a one-line change here — the type alias, the default config
- * and the Zod payload schema are all derived from this list.
- *
- * Keep this list in sync with the frontend `METRIC_KEYS` (and the Python
- * `metrics.METRIC_KEYS`); a cross-language enum would be ideal, but absent
- * that, mirror the additions manually.
- */
 export const METRIC_KEYS = [
   // Algorithmic — summarization
   'bleu',
@@ -41,29 +29,12 @@ export const METRIC_KEYS = [
 
 export type MetricKey = (typeof METRIC_KEYS)[number];
 
-/**
- * Per-metric opt-in flags.
- *
- * Latency and cost are always computed (they are free from inference data).
- *
- * - Algorithmic / programmatic metrics are deterministic and run locally.
- *   They are mostly fast; only BERTScore has a meaningful loading cost.
- * - LLM-as-judge metrics call an extra Bedrock model per sample — they are
- *   the main lever for both speed and spend.
- *
- * Metrics that are not applicable to the uploaded dataset (e.g. classification
- * metrics on a summarization run) are skipped automatically.
- */
 export type MetricsConfig = Record<MetricKey, boolean>;
 
 export const DEFAULT_METRICS_CONFIG: MetricsConfig = Object.fromEntries(
   METRIC_KEYS.map((k) => [k, true]),
 ) as MetricsConfig;
 
-/**
- * Merge a partial user-provided config with the defaults (all enabled).
- * Unknown keys are ignored; missing keys fall back to the default value.
- */
 export function resolveMetricsConfig(
   metrics?: Partial<MetricsConfig>,
 ): MetricsConfig {
diff --git a/backend/src/useCases/EvaluationLaunch/FakeEvaluationLaunchUseCase.ts b/backend/src/useCases/EvaluationLaunch/FakeEvaluationLaunchUseCase.ts
index 5fb8371..2b57f1a 100644
--- a/backend/src/useCases/EvaluationLaunch/FakeEvaluationLaunchUseCase.ts
+++ b/backend/src/useCases/EvaluationLaunch/FakeEvaluationLaunchUseCase.ts
@@ -1,9 +1,9 @@
 import {
   resolveMetricsConfig,
-  type EvaluationJob,
-  type EvaluationRequest,
-  type ModelConfig,
-  type WeightConfig,
+  EvaluationJob,
+  EvaluationRequest,
+  ModelConfig,
+  WeightConfig,
 } from '../../models/Evaluation.js';
 import { type BedrockModelValidationService } from '../../services/BedrockModelValidationService/BedrockModelValidationService.js';
 import { FakeBedrockModelValidationService } from '../../services/BedrockModelValidationService/FakeBedrockModelValidationService.js';
diff --git a/frontend/src/types/evaluation.ts b/frontend/src/types/evaluation.ts
index 6fe06e4..45c4833 100644
--- a/frontend/src/types/evaluation.ts
+++ b/frontend/src/types/evaluation.ts
@@ -4,11 +4,6 @@ export interface MetricsWeights {
   latency: number;
 }
 
-/**
- * Canonical list of every metric the engine knows how to compute, in display
- * order. Keep in sync with the backend `METRIC_KEYS` in
- * `backend/src/models/Evaluation.ts` and `backend/python-eval-function/src/metrics.py`.
- */
 export const METRIC_KEYS = [
   // Algorithmic — summarization
   'bleu',
@@ -29,16 +24,6 @@ export const METRIC_KEYS = [
 
 export type MetricKey = (typeof METRIC_KEYS)[number];
 
-/**
- * Per-metric opt-in flags.
- *
- * - Algorithmic / programmatic metrics are deterministic, fast, and run locally
- *   (BERTScore is the only one with a meaningful load cost).
- * - LLM-as-judge metrics call an extra Bedrock model per sample — the main
- *   lever for speed and spend.
- *
- * Metrics that don't apply to the uploaded dataset are skipped automatically.
- */
 export type MetricsToggles = Record<MetricKey, boolean>;
 
 export const DEFAULT_METRICS_TOGGLES: MetricsToggles = Object.fromEntries(

From 90eae833e6385a88fa800a301b27e654a303e821 Mon Sep 17 00:00:00 2001
From: LimRaymond <raymond.lim672@gmail.com>
Date: Fri, 15 May 2026 12:59:02 +0200
Subject: [PATCH 3/9] feat: move metrics toggles UI to the dataset step

---
 .../components/evaluator/DatasetUpload.tsx    |  28 ++
 .../components/evaluator/MetricsPicker.tsx    | 261 ++++++++++++++++++
 .../components/evaluator/MetricsWeights.tsx   | 246 +----------------
 frontend/src/pages/Index.tsx                  |   8 +-
 4 files changed, 296 insertions(+), 247 deletions(-)
 create mode 100644 frontend/src/components/evaluator/MetricsPicker.tsx

diff --git a/frontend/src/components/evaluator/DatasetUpload.tsx b/frontend/src/components/evaluator/DatasetUpload.tsx
index 72cb44f..c676b63 100644
--- a/frontend/src/components/evaluator/DatasetUpload.tsx
+++ b/frontend/src/components/evaluator/DatasetUpload.tsx
@@ -1,6 +1,11 @@
+import {
+  MetricsPicker,
+  type MetricsPickerTaskType,
+} from '@/components/evaluator/MetricsPicker';
 import { Button } from '@/components/ui/button';
 import { useUploadDataset } from '@/hooks/useEvaluation';
 import { cn } from '@/lib/utils';
+import type { MetricsToggles } from '@/types/evaluation';
 import { motion } from 'framer-motion';
 import {
   AlertCircle,
@@ -19,6 +24,8 @@ interface DatasetUploadProps {
   onStartEvaluation: () => void;
   onUploadSuccess: (data: { dataset_id: string; sample_count: number }) => void;
   isStarting?: boolean;
+  metrics: MetricsToggles;
+  onMetricsChange: (metrics: MetricsToggles) => void;
 }
 
 type TaskType = 'summarization' | 'classification';
@@ -76,12 +83,23 @@ const TASK_TYPES: {
 
 type FormatTab = 'csv' | 'jsonl';
 
+/** Picks the task for MetricsPicker; `has_summary` wins over `has_class`. */
+function resolveDetectedTask(data: {
+  has_summary: boolean;
+  has_class: boolean;
+}): MetricsPickerTaskType | undefined {
+  if (data.has_summary) return 'summarization';
+  return data.has_class ? 'classification' : undefined;
+}
+
 export function DatasetUpload({
   file,
   onChange,
   onStartEvaluation,
   onUploadSuccess,
   isStarting = false,
+  metrics,
+  onMetricsChange,
 }: DatasetUploadProps) {
   const [dragOver, setDragOver] = useState(false);
   const [activeTask, setActiveTask] = useState<TaskType>('summarization');
@@ -344,6 +362,16 @@ export function DatasetUpload({
         </div>
       )}
 
+      {uploadMutation.isSuccess && (
+        <div className="mt-6">
+          <MetricsPicker
+            metrics={metrics}
+            onChange={onMetricsChange}
+            taskType={resolveDetectedTask(uploadMutation.data)}
+          />
+        </div>
+      )}
+
       {uploadMutation.isSuccess && (
         <Button
           onClick={onStartEvaluation}
diff --git a/frontend/src/components/evaluator/MetricsPicker.tsx b/frontend/src/components/evaluator/MetricsPicker.tsx
new file mode 100644
index 0000000..f059916
--- /dev/null
+++ b/frontend/src/components/evaluator/MetricsPicker.tsx
@@ -0,0 +1,261 @@
+import { cn } from '@/lib/utils';
+import type { MetricsToggles } from '@/types/evaluation';
+import { Zap } from 'lucide-react';
+
+export type MetricsPickerTaskType = 'summarization' | 'classification';
+
+interface ToggleableMetric {
+  key: keyof MetricsToggles;
+  label: string;
+  description: string;
+  task?: MetricsPickerTaskType;
+  taskBadge?: string;
+}
+
+interface MetricGroup {
+  id: string;
+  title: string;
+  subtitle: string;
+  metrics: ToggleableMetric[];
+}
+
+const METRIC_GROUPS: MetricGroup[] = [
+  {
+    id: 'programmatic',
+    title: 'Programmatic metrics',
+    subtitle:
+      'Deterministic scoring computed locally. Fast and cheap — only BERTScore has a noticeable load cost.',
+    metrics: [
+      {
+        key: 'bleu',
+        label: 'BLEU',
+        description: 'N-gram overlap with the reference.',
+        task: 'summarization',
+        taskBadge: 'Summarization',
+      },
+      {
+        key: 'rouge',
+        label: 'ROUGE',
+        description: 'Recall-oriented n-gram overlap.',
+        task: 'summarization',
+        taskBadge: 'Summarization',
+      },
+      {
+        key: 'meteor',
+        label: 'METEOR',
+        description: 'Stem-aware overlap, more lenient than BLEU.',
+        task: 'summarization',
+        taskBadge: 'Summarization',
+      },
+      {
+        key: 'levenshtein',
+        label: 'Levenshtein similarity',
+        description: 'Character-level edit-distance similarity.',
+        task: 'summarization',
+        taskBadge: 'Summarization',
+      },
+      {
+        key: 'bertscore',
+        label: 'BERTScore',
+        description:
+          'Embedding-based semantic similarity. Loads a transformer model once per run.',
+        task: 'summarization',
+        taskBadge: 'Summarization',
+      },
+      {
+        key: 'classification_accuracy',
+        label: 'Accuracy',
+        description: 'Fraction of predictions that match the reference label.',
+        task: 'classification',
+        taskBadge: 'Classification',
+      },
+      {
+        key: 'precision_macro',
+        label: 'Precision (macro)',
+        description: 'Per-class precision averaged across labels.',
+        task: 'classification',
+        taskBadge: 'Classification',
+      },
+      {
+        key: 'recall_macro',
+        label: 'Recall (macro)',
+        description: 'Per-class recall averaged across labels.',
+        task: 'classification',
+        taskBadge: 'Classification',
+      },
+      {
+        key: 'f1_macro',
+        label: 'F1 (macro)',
+        description: 'Harmonic mean of precision and recall, unweighted.',
+        task: 'classification',
+        taskBadge: 'Classification',
+      },
+      {
+        key: 'f1_weighted',
+        label: 'F1 (weighted)',
+        description: 'F1 weighted by class support.',
+        task: 'classification',
+        taskBadge: 'Classification',
+      },
+    ],
+  },
+  {
+    id: 'llm-judge',
+    title: 'LLM-as-judge metrics',
+    subtitle:
+      'Quality scores produced by an extra Bedrock model per sample. Most accurate, but the slowest and most expensive metrics.',
+    metrics: [
+      {
+        key: 'geval_reasoning',
+        label: 'G-Eval — Reasoning',
+        description: 'How coherent and well-justified the output is.',
+      },
+      {
+        key: 'geval_faithfulness',
+        label: 'G-Eval — Faithfulness',
+        description: 'Whether the output sticks to the input (no hallucination).',
+      },
+    ],
+  },
+];
+
+function setGroupSelection(
+  current: MetricsToggles,
+  group: MetricGroup,
+  value: boolean,
+): MetricsToggles {
+  const next = { ...current };
+  for (const m of group.metrics) {
+    next[m.key] = value;
+  }
+  return next;
+}
+
+function countSelected(metrics: MetricsToggles, group: MetricGroup): number {
+  return group.metrics.reduce((n, m) => (metrics[m.key] ? n + 1 : n), 0);
+}
+
+interface MetricsPickerProps {
+  metrics: MetricsToggles;
+  onChange: (metrics: MetricsToggles) => void;
+  taskType?: MetricsPickerTaskType;
+}
+
+export function MetricsPicker({
+  metrics,
+  onChange,
+  taskType,
+}: MetricsPickerProps) {
+  const visibleGroups = METRIC_GROUPS.map((group) => ({
+    ...group,
+    metrics: taskType
+      ? group.metrics.filter((m) => !m.task || m.task === taskType)
+      : group.metrics,
+  })).filter((g) => g.metrics.length > 0);
+
+  return (
+    <div>
+      <div className="flex items-center gap-2">
+        <Zap className="h-4 w-4 text-primary" />
+        <h2 className="text-sm font-semibold tracking-tight">
+          Pick the metrics to compute
+        </h2>
+      </div>
+      <p className="text-xs text-muted-foreground mt-1 mb-4">
+        Latency and cost are always computed. Disable expensive metrics like
+        BERTScore or G-Eval to speed up evaluation.
+      </p>
+
+      <div className="space-y-4">
+        {visibleGroups.map((group) => {
+          const selectedCount = countSelected(metrics, group);
+          const totalCount = group.metrics.length;
+          return (
+            <fieldset
+              key={group.id}
+              className="rounded-xl border border-border bg-surface p-3"
+            >
+              <legend className="px-1 text-sm font-semibold">
+                {group.title}
+              </legend>
+              <p className="text-xs text-muted-foreground mt-1 mb-2">
+                {group.subtitle}
+              </p>
+              <div className="flex items-center justify-between mb-2 text-xs">
+                <span className="text-muted-foreground tabular-nums">
+                  {selectedCount} / {totalCount} selected
+                </span>
+                <div className="flex gap-3">
+                  <button
+                    type="button"
+                    onClick={() =>
+                      onChange(setGroupSelection(metrics, group, true))
+                    }
+                    className="text-primary hover:underline"
+                  >
+                    Select all
+                  </button>
+                  <button
+                    type="button"
+                    onClick={() =>
+                      onChange(setGroupSelection(metrics, group, false))
+                    }
+                    className="text-muted-foreground hover:underline"
+                  >
+                    Clear
+                  </button>
+                </div>
+              </div>
+
+              <div className="space-y-1.5">
+                {group.metrics.map((metric) => {
+                  const enabled = metrics[metric.key];
+                  return (
+                    <label
+                      key={metric.key}
+                      htmlFor={`metric-${metric.key}`}
+                      className={cn(
+                        'flex items-start justify-between gap-3 rounded-lg border p-2.5 cursor-pointer transition-colors',
+                        enabled
+                          ? 'border-border bg-background'
+                          : 'border-border bg-muted/30',
+                      )}
+                    >
+                      <div className="min-w-0">
+                        <div className="flex items-center gap-2 flex-wrap">
+                          <p className="text-sm font-medium">{metric.label}</p>
+                          {metric.taskBadge && (
+                            <span className="rounded-full bg-muted px-2 py-0.5 text-[10px] font-medium text-muted-foreground">
+                              {metric.taskBadge}
+                            </span>
+                          )}
+                        </div>
+                        <p className="text-xs text-muted-foreground mt-0.5">
+                          {metric.description}
+                        </p>
+                      </div>
+                      <input
+                        id={`metric-${metric.key}`}
+                        type="checkbox"
+                        role="switch"
+                        aria-checked={enabled}
+                        checked={enabled}
+                        onChange={(e) =>
+                          onChange({
+                            ...metrics,
+                            [metric.key]: e.target.checked,
+                          })
+                        }
+                        className="mt-1 h-4 w-4 shrink-0 cursor-pointer accent-primary"
+                      />
+                    </label>
+                  );
+                })}
+              </div>
+            </fieldset>
+          );
+        })}
+      </div>
+    </div>
+  );
+}
diff --git a/frontend/src/components/evaluator/MetricsWeights.tsx b/frontend/src/components/evaluator/MetricsWeights.tsx
index 39d4fa8..122e11a 100644
--- a/frontend/src/components/evaluator/MetricsWeights.tsx
+++ b/frontend/src/components/evaluator/MetricsWeights.tsx
@@ -1,138 +1,12 @@
 import { cn } from '@/lib/utils';
-import type {
-  MetricsToggles,
-  MetricsWeights as Weights,
-} from '@/types/evaluation';
+import type { MetricsWeights as Weights } from '@/types/evaluation';
 import { motion } from 'framer-motion';
-import { Info, Zap } from 'lucide-react';
+import { Info } from 'lucide-react';
 import { useEffect, useState } from 'react';
 
 interface MetricsWeightsProps {
   value: Weights;
   onChange: (weights: Weights) => void;
-  metrics: MetricsToggles;
-  onMetricsChange: (metrics: MetricsToggles) => void;
-}
-
-interface ToggleableMetric {
-  key: keyof MetricsToggles;
-  label: string;
-  description: string;
-  /** Free-form badge text (e.g. "Summarization", "Classification"). */
-  taskBadge?: string;
-}
-
-interface MetricGroup {
-  id: string;
-  title: string;
-  subtitle: string;
-  metrics: ToggleableMetric[];
-}
-
-const METRIC_GROUPS: MetricGroup[] = [
-  {
-    id: 'programmatic',
-    title: 'Programmatic metrics',
-    subtitle:
-      'Deterministic scoring computed locally. Fast and cheap — only BERTScore has a noticeable load cost.',
-    metrics: [
-      {
-        key: 'bleu',
-        label: 'BLEU',
-        description: 'N-gram overlap with the reference.',
-        taskBadge: 'Summarization',
-      },
-      {
-        key: 'rouge',
-        label: 'ROUGE',
-        description: 'Recall-oriented n-gram overlap.',
-        taskBadge: 'Summarization',
-      },
-      {
-        key: 'meteor',
-        label: 'METEOR',
-        description: 'Stem-aware overlap, more lenient than BLEU.',
-        taskBadge: 'Summarization',
-      },
-      {
-        key: 'levenshtein',
-        label: 'Levenshtein similarity',
-        description: 'Character-level edit-distance similarity.',
-        taskBadge: 'Summarization',
-      },
-      {
-        key: 'bertscore',
-        label: 'BERTScore',
-        description:
-          'Embedding-based semantic similarity. Loads a transformer model once per run.',
-        taskBadge: 'Summarization',
-      },
-      {
-        key: 'classification_accuracy',
-        label: 'Accuracy',
-        description: 'Fraction of predictions that match the reference label.',
-        taskBadge: 'Classification',
-      },
-      {
-        key: 'precision_macro',
-        label: 'Precision (macro)',
-        description: 'Per-class precision averaged across labels.',
-        taskBadge: 'Classification',
-      },
-      {
-        key: 'recall_macro',
-        label: 'Recall (macro)',
-        description: 'Per-class recall averaged across labels.',
-        taskBadge: 'Classification',
-      },
-      {
-        key: 'f1_macro',
-        label: 'F1 (macro)',
-        description: 'Harmonic mean of precision and recall, unweighted.',
-        taskBadge: 'Classification',
-      },
-      {
-        key: 'f1_weighted',
-        label: 'F1 (weighted)',
-        description: 'F1 weighted by class support.',
-        taskBadge: 'Classification',
-      },
-    ],
-  },
-  {
-    id: 'llm-judge',
-    title: 'LLM-as-judge metrics',
-    subtitle:
-      'Quality scores produced by an extra Bedrock model per sample. Most accurate, but the slowest and most expensive metrics.',
-    metrics: [
-      {
-        key: 'geval_reasoning',
-        label: 'G-Eval — Reasoning',
-        description: 'How coherent and well-justified the output is.',
-      },
-      {
-        key: 'geval_faithfulness',
-        label: 'G-Eval — Faithfulness',
-        description: 'Whether the output sticks to the input (no hallucination).',
-      },
-    ],
-  },
-];
-
-function setGroupSelection(
-  current: MetricsToggles,
-  group: MetricGroup,
-  value: boolean,
-): MetricsToggles {
-  const next = { ...current };
-  for (const m of group.metrics) {
-    next[m.key] = value;
-  }
-  return next;
-}
-
-function countSelected(metrics: MetricsToggles, group: MetricGroup): number {
-  return group.metrics.reduce((n, m) => (metrics[m.key] ? n + 1 : n), 0);
 }
 
 const METRICS: { key: keyof Weights; label: string; description: string }[] = [
@@ -184,12 +58,7 @@ function syncDraftFromValue(prev: string, num: number): string {
   return String(num);
 }
 
-export function MetricsWeights({
-  value,
-  onChange,
-  metrics,
-  onMetricsChange,
-}: MetricsWeightsProps) {
+export function MetricsWeights({ value, onChange }: MetricsWeightsProps) {
   const [drafts, setDrafts] = useState({
     accuracy: String(value.accuracy),
     cost: String(value.cost),
@@ -307,115 +176,6 @@ export function MetricsWeights({
           is {total}%.
         </p>
       )}
-
-      <div className="mt-8">
-        <div className="flex items-center gap-2">
-          <Zap className="h-4 w-4 text-primary" />
-          <h2 className="text-sm font-semibold tracking-tight">
-            Pick the metrics to compute
-          </h2>
-        </div>
-        <p className="text-xs text-muted-foreground mt-1 mb-4">
-          Latency and cost are always computed. Metrics that don't apply to
-          your dataset (e.g. classification metrics on a summarization run)
-          are skipped automatically.
-        </p>
-
-        <div className="space-y-4">
-          {METRIC_GROUPS.map((group) => {
-            const selectedCount = countSelected(metrics, group);
-            const totalCount = group.metrics.length;
-            return (
-              <fieldset
-                key={group.id}
-                className="rounded-xl border border-border bg-surface p-3"
-              >
-                <legend className="px-1 text-sm font-semibold">
-                  {group.title}
-                </legend>
-                <p className="text-xs text-muted-foreground mt-1 mb-2">
-                  {group.subtitle}
-                </p>
-                <div className="flex items-center justify-between mb-2 text-xs">
-                  <span className="text-muted-foreground tabular-nums">
-                    {selectedCount} / {totalCount} selected
-                  </span>
-                  <div className="flex gap-3">
-                    <button
-                      type="button"
-                      onClick={() =>
-                        onMetricsChange(setGroupSelection(metrics, group, true))
-                      }
-                      className="text-primary hover:underline"
-                    >
-                      Select all
-                    </button>
-                    <button
-                      type="button"
-                      onClick={() =>
-                        onMetricsChange(
-                          setGroupSelection(metrics, group, false),
-                        )
-                      }
-                      className="text-muted-foreground hover:underline"
-                    >
-                      Clear
-                    </button>
-                  </div>
-                </div>
-
-                <div className="space-y-1.5">
-                  {group.metrics.map((metric) => {
-                    const enabled = metrics[metric.key];
-                    return (
-                      <label
-                        key={metric.key}
-                        htmlFor={`metric-${metric.key}`}
-                        className={cn(
-                          'flex items-start justify-between gap-3 rounded-lg border p-2.5 cursor-pointer transition-colors',
-                          enabled
-                            ? 'border-border bg-background'
-                            : 'border-border bg-muted/30',
-                        )}
-                      >
-                        <div className="min-w-0">
-                          <div className="flex items-center gap-2 flex-wrap">
-                            <p className="text-sm font-medium">
-                              {metric.label}
-                            </p>
-                            {metric.taskBadge && (
-                              <span className="rounded-full bg-muted px-2 py-0.5 text-[10px] font-medium text-muted-foreground">
-                                {metric.taskBadge}
-                              </span>
-                            )}
-                          </div>
-                          <p className="text-xs text-muted-foreground mt-0.5">
-                            {metric.description}
-                          </p>
-                        </div>
-                        <input
-                          id={`metric-${metric.key}`}
-                          type="checkbox"
-                          role="switch"
-                          aria-checked={enabled}
-                          checked={enabled}
-                          onChange={(e) =>
-                            onMetricsChange({
-                              ...metrics,
-                              [metric.key]: e.target.checked,
-                            })
-                          }
-                          className="mt-1 h-4 w-4 shrink-0 cursor-pointer accent-primary"
-                        />
-                      </label>
-                    );
-                  })}
-                </div>
-              </fieldset>
-            );
-          })}
-        </div>
-      </div>
     </motion.div>
   );
 }
diff --git a/frontend/src/pages/Index.tsx b/frontend/src/pages/Index.tsx
index 7dc00f6..e02af82 100644
--- a/frontend/src/pages/Index.tsx
+++ b/frontend/src/pages/Index.tsx
@@ -143,10 +143,6 @@ export default function Index() {
                 <MetricsWeights
                   value={config.weights}
                   onChange={(w) => setConfig({ ...config, weights: w })}
-                  metrics={config.metrics}
-                  onMetricsChange={(m) =>
-                    setConfig({ ...config, metrics: m })
-                  }
                 />
               )}
               {step === 1 && (
@@ -162,6 +158,10 @@ export default function Index() {
                   onStartEvaluation={handleStartEvaluation}
                   onUploadSuccess={handleUploadSuccess}
                   isStarting={createEvaluationMutation.isPending}
+                  metrics={config.metrics}
+                  onMetricsChange={(m) =>
+                    setConfig({ ...config, metrics: m })
+                  }
                 />
               )}
 

From b59d229131f491f30710fce2278bb200b540ed48 Mon Sep 17 00:00:00 2001
From: LimRaymond <raymond.lim672@gmail.com>
Date: Fri, 15 May 2026 12:59:14 +0200
Subject: [PATCH 4/9] feat: Update README for the new MetricsPicker file

---
 frontend/README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/frontend/README.md b/frontend/README.md
index bfb7321..172796e 100644
--- a/frontend/README.md
+++ b/frontend/README.md
@@ -26,7 +26,8 @@ frontend/
 │   │   ├── evaluator/          # Step components for the evaluation workflow
 │   │   │   ├── MetricsWeights.tsx    # Step 1 — tune accuracy / latency / cost weights
 │   │   │   ├── ModelSelection.tsx    # Step 2 — pick Bedrock models to evaluate
-│   │   │   ├── DatasetUpload.tsx     # Step 3 — upload CSV or JSONL dataset
+│   │   │   ├── DatasetUpload.tsx     # Step 3 — upload dataset and pick metrics to compute
+│   │   │   ├── MetricsPicker.tsx     # Task-aware picker for which metrics to compute
 │   │   │   ├── ProgressView.tsx      # Step 4a — real-time job progress
 │   │   │   ├── ResultsView.tsx       # Step 4b — ranked results and recommendation
 │   │   │   └── StepIndicator.tsx     # Navigation breadcrumb for the stepper

From 3326065de52e709f9472e54264c8a28ee041e379 Mon Sep 17 00:00:00 2001
From: LimRaymond <raymond.lim672@gmail.com>
Date: Mon, 18 May 2026 11:37:26 +0200
Subject: [PATCH 5/9] feat(backend): remove G-Eval logic from main to keep it
 as pure orchestration and move it into the GEvalEvaluator class

---
 .../src/geval_evaluator.py                    | 30 ++++++++++++++-----
 backend/python-eval-function/src/main.py      | 18 ++++-------
 2 files changed, 28 insertions(+), 20 deletions(-)

diff --git a/backend/python-eval-function/src/geval_evaluator.py b/backend/python-eval-function/src/geval_evaluator.py
index 8a6d581..41d55c2 100644
--- a/backend/python-eval-function/src/geval_evaluator.py
+++ b/backend/python-eval-function/src/geval_evaluator.py
@@ -1,8 +1,10 @@
 import os
 import logging
-from typing import Optional, List
+from typing import Optional, List, Mapping
 from dataclasses import dataclass
 
+from metrics import is_metric_enabled
+
 logger = logging.getLogger(__name__)
 
 JUDGE_MODEL_DEFAULT = "us.meta.llama4-maverick-17b-instruct-v1:0"
@@ -15,10 +17,20 @@ class GEvalMetrics:
 
 
 class GEvalEvaluator:
-    def __init__(self, judge_model_id: Optional[str] = None, region: Optional[str] = None):
+    def __init__(
+        self,
+        metrics_config: Optional[Mapping[str, bool]] = None,
+        judge_model_id: Optional[str] = None,
+        region: Optional[str] = None,
+    ):
         self.judge_model_id = judge_model_id or os.environ.get("GEVAL_JUDGE_MODEL", JUDGE_MODEL_DEFAULT)
         self.region = region or os.environ.get("AWS_REGION", "us-west-2")
-        logger.info(f"GEvalEvaluator initialized with judge={self.judge_model_id}, region={self.region}")
+        self.compute_reasoning = is_metric_enabled(metrics_config, 'geval_reasoning')
+        self.compute_faithfulness = is_metric_enabled(metrics_config, 'geval_faithfulness')
+        logger.info(
+            f"GEvalEvaluator initialized with judge={self.judge_model_id}, region={self.region}, "
+            f"reasoning={self.compute_reasoning}, faithfulness={self.compute_faithfulness}"
+        )
 
     def _build_judge_model(self):
         from deepeval.models import AmazonBedrockModel
@@ -93,14 +105,16 @@ def _build_faithfulness_metric(self, model, task_type: str = "summarization"):
             async_mode=False,
         )
 
+    @property
+    def enabled(self) -> bool:
+        return self.compute_reasoning or self.compute_faithfulness
+
     def evaluate(
         self,
         inputs: List[str],
         predictions: List[str],
         references: Optional[List[str]] = None,
         task_type: str = "summarization",
-        compute_reasoning: bool = True,
-        compute_faithfulness: bool = True,
     ) -> GEvalMetrics:
         if not inputs or not predictions:
             logger.warning("Empty inputs or predictions, skipping G-Eval")
@@ -110,7 +124,7 @@ def evaluate(
             logger.error("inputs and predictions length mismatch, skipping G-Eval")
             return GEvalMetrics()
 
-        if not compute_reasoning and not compute_faithfulness:
+        if not self.enabled:
             logger.info("Both G-Eval metrics disabled, skipping")
             return GEvalMetrics()
 
@@ -118,12 +132,12 @@ def evaluate(
             model = self._build_judge_model()
             reasoning_metric = (
                 self._build_reasoning_metric(model, task_type)
-                if compute_reasoning
+                if self.compute_reasoning
                 else None
             )
             faithfulness_metric = (
                 self._build_faithfulness_metric(model, task_type)
-                if compute_faithfulness
+                if self.compute_faithfulness
                 else None
             )
         except Exception as e:
diff --git a/backend/python-eval-function/src/main.py b/backend/python-eval-function/src/main.py
index ebfdd3f..619eabc 100644
--- a/backend/python-eval-function/src/main.py
+++ b/backend/python-eval-function/src/main.py
@@ -78,10 +78,6 @@ def main():
         # every key present, all bool.
         metrics_config = job_config.get('metrics') or {}
 
-        from metrics import is_metric_enabled
-        compute_geval_reasoning = is_metric_enabled(metrics_config, 'geval_reasoning')
-        compute_geval_faithfulness = is_metric_enabled(metrics_config, 'geval_faithfulness')
-
         logger.info(
             f"Job config loaded - dataset: {dataset_id}, models: {len(models)}, "
             f"metrics={metrics_config or 'all'}"
@@ -176,10 +172,10 @@ def main():
 
             logger.info(f"Classification accuracy complete for {len(accuracy_results)} models")
 
-        if compute_geval_reasoning or compute_geval_faithfulness:
-            from geval_evaluator import GEvalEvaluator
-            geval_evaluator = GEvalEvaluator()
+        from geval_evaluator import GEvalEvaluator
+        geval_evaluator = GEvalEvaluator(metrics_config=metrics_config)
 
+        if geval_evaluator.enabled:
             for model_id, invocation_results in results_by_model.items():
                 successful_indices = [i for i, r in enumerate(invocation_results) if r.error is None]
                 predictions = [invocation_results[i].response_text for i in successful_indices]
@@ -187,12 +183,10 @@ def main():
 
                 logger.info(
                     f"Running G-Eval for model {model_id} on {len(predictions)} samples "
-                    f"(reasoning={compute_geval_reasoning}, faithfulness={compute_geval_faithfulness})"
+                    f"(reasoning={geval_evaluator.compute_reasoning}, faithfulness={geval_evaluator.compute_faithfulness})"
                 )
                 geval_metrics = geval_evaluator.evaluate(
                     inputs, predictions, task_type=task_type,
-                    compute_reasoning=compute_geval_reasoning,
-                    compute_faithfulness=compute_geval_faithfulness,
                 )
 
                 acc = accuracy_results.get(model_id)
@@ -200,9 +194,9 @@ def main():
                     acc = AccuracyMetrics()
                     accuracy_results[model_id] = acc
 
-                if compute_geval_reasoning:
+                if geval_evaluator.compute_reasoning:
                     acc.geval_reasoning = geval_metrics.reasoning
-                if compute_geval_faithfulness:
+                if geval_evaluator.compute_faithfulness:
                     acc.geval_faithfulness = geval_metrics.faithfulness
 
                 logger.info(f"G-Eval complete for {model_id}")

From 5cc3787f0063d544c0b04d9ad89701e977b77a59 Mon Sep 17 00:00:00 2001
From: Raymond LIM <raymond.lim672@gmail.com>
Date: Mon, 18 May 2026 11:38:22 +0200
Subject: [PATCH 6/9] Apply suggestion from @DaoudAA

Co-authored-by: DaoudAA <127060080+DaoudAA@users.noreply.github.com>
---
 frontend/src/components/evaluator/MetricsPicker.tsx | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/frontend/src/components/evaluator/MetricsPicker.tsx b/frontend/src/components/evaluator/MetricsPicker.tsx
index f059916..b6af133 100644
--- a/frontend/src/components/evaluator/MetricsPicker.tsx
+++ b/frontend/src/components/evaluator/MetricsPicker.tsx
@@ -162,8 +162,7 @@ export function MetricsPicker({
         </h2>
       </div>
       <p className="text-xs text-muted-foreground mt-1 mb-4">
-        Latency and cost are always computed. Disable expensive metrics like
-        BERTScore or G-Eval to speed up evaluation.
+        Latency and cost are always computed regardless of choice of accuracy metrics.
       </p>
 
       <div className="space-y-4">

From ec87000337832b2e11f72103b04b8b5274de9c9a Mon Sep 17 00:00:00 2001
From: LimRaymond <raymond.lim672@gmail.com>
Date: Mon, 18 May 2026 15:53:41 +0200
Subject: [PATCH 7/9] feat: Remove unnecessary key in the API payload sent by
 the frontend + omitted key will be set to false

---
 backend/python-eval-function/src/metrics.py   |   6 +-
 backend/src/models/Evaluation.ts              |   2 +-
 .../components/evaluator/DatasetUpload.tsx    |  18 +--
 .../components/evaluator/MetricsPicker.tsx    | 122 +-----------------
 frontend/src/pages/Index.tsx                  |  15 ++-
 frontend/src/types/evaluation.ts              | 121 ++++++++++++++++-
 frontend/src/utils/metrics.ts                 |  35 +++++
 7 files changed, 182 insertions(+), 137 deletions(-)
 create mode 100644 frontend/src/utils/metrics.ts

diff --git a/backend/python-eval-function/src/metrics.py b/backend/python-eval-function/src/metrics.py
index fad56c2..61c2616 100644
--- a/backend/python-eval-function/src/metrics.py
+++ b/backend/python-eval-function/src/metrics.py
@@ -24,12 +24,12 @@ def is_metric_enabled(
     key: str,
 ) -> bool:
     if selected is None:
-        return True
-    return bool(selected.get(key, True))
+        return False
+    return bool(selected.get(key, False))
 
 
 def normalize_metrics_config(
     stored: Optional[Mapping[str, object]],
 ) -> Dict[str, bool]:
     source: Mapping[str, object] = stored or {}
-    return {key: bool(source.get(key, True)) for key in METRIC_KEYS}
+    return {key: bool(source.get(key, False)) for key in METRIC_KEYS}
diff --git a/backend/src/models/Evaluation.ts b/backend/src/models/Evaluation.ts
index 92b4ab0..f5eee37 100644
--- a/backend/src/models/Evaluation.ts
+++ b/backend/src/models/Evaluation.ts
@@ -32,7 +32,7 @@ export type MetricKey = (typeof METRIC_KEYS)[number];
 export type MetricsConfig = Record<MetricKey, boolean>;
 
 export const DEFAULT_METRICS_CONFIG: MetricsConfig = Object.fromEntries(
-  METRIC_KEYS.map((k) => [k, true]),
+  METRIC_KEYS.map((k) => [k, false]),
 ) as MetricsConfig;
 
 export function resolveMetricsConfig(
diff --git a/frontend/src/components/evaluator/DatasetUpload.tsx b/frontend/src/components/evaluator/DatasetUpload.tsx
index c676b63..de9ecc5 100644
--- a/frontend/src/components/evaluator/DatasetUpload.tsx
+++ b/frontend/src/components/evaluator/DatasetUpload.tsx
@@ -1,11 +1,8 @@
-import {
-  MetricsPicker,
-  type MetricsPickerTaskType,
-} from '@/components/evaluator/MetricsPicker';
+import { MetricsPicker } from '@/components/evaluator/MetricsPicker';
 import { Button } from '@/components/ui/button';
 import { useUploadDataset } from '@/hooks/useEvaluation';
 import { cn } from '@/lib/utils';
-import type { MetricsToggles } from '@/types/evaluation';
+import type { MetricsToggles, TaskType } from '@/types/evaluation';
 import { motion } from 'framer-motion';
 import {
   AlertCircle,
@@ -22,14 +19,16 @@ interface DatasetUploadProps {
   file: File | null;
   onChange: (file: File | null) => void;
   onStartEvaluation: () => void;
-  onUploadSuccess: (data: { dataset_id: string; sample_count: number }) => void;
+  onUploadSuccess: (data: {
+    dataset_id: string;
+    sample_count: number;
+    taskType: TaskType | undefined;
+  }) => void;
   isStarting?: boolean;
   metrics: MetricsToggles;
   onMetricsChange: (metrics: MetricsToggles) => void;
 }
 
-type TaskType = 'summarization' | 'classification';
-
 const TASK_TYPES: {
   id: TaskType;
   label: string;
@@ -86,7 +85,7 @@ type FormatTab = 'csv' | 'jsonl';
 function resolveDetectedTask(data: {
   has_summary: boolean;
   has_class: boolean;
-}): MetricsPickerTaskType | undefined {
+}): TaskType | undefined {
   if (data.has_summary) return 'summarization';
   if (data.has_class) return 'classification';
   return undefined;
@@ -114,6 +113,7 @@ export function DatasetUpload({
         onUploadSuccess({
           dataset_id: data.dataset_id,
           sample_count: data.sample_count,
+          taskType: resolveDetectedTask(data),
         });
       },
     });
diff --git a/frontend/src/components/evaluator/MetricsPicker.tsx b/frontend/src/components/evaluator/MetricsPicker.tsx
index b6af133..350a6ea 100644
--- a/frontend/src/components/evaluator/MetricsPicker.tsx
+++ b/frontend/src/components/evaluator/MetricsPicker.tsx
@@ -1,124 +1,8 @@
 import { cn } from '@/lib/utils';
-import type { MetricsToggles } from '@/types/evaluation';
+import type { MetricGroup, MetricsToggles, TaskType } from '@/types/evaluation';
+import { METRIC_GROUPS } from '@/types/evaluation';
 import { Zap } from 'lucide-react';
 
-export type MetricsPickerTaskType = 'summarization' | 'classification';
-
-interface ToggleableMetric {
-  key: keyof MetricsToggles;
-  label: string;
-  description: string;
-  task?: MetricsPickerTaskType;
-  taskBadge?: string;
-}
-
-interface MetricGroup {
-  id: string;
-  title: string;
-  subtitle: string;
-  metrics: ToggleableMetric[];
-}
-
-const METRIC_GROUPS: MetricGroup[] = [
-  {
-    id: 'programmatic',
-    title: 'Programmatic metrics',
-    subtitle:
-      'Deterministic scoring computed locally. Fast and cheap — only BERTScore has a noticeable load cost.',
-    metrics: [
-      {
-        key: 'bleu',
-        label: 'BLEU',
-        description: 'N-gram overlap with the reference.',
-        task: 'summarization',
-        taskBadge: 'Summarization',
-      },
-      {
-        key: 'rouge',
-        label: 'ROUGE',
-        description: 'Recall-oriented n-gram overlap.',
-        task: 'summarization',
-        taskBadge: 'Summarization',
-      },
-      {
-        key: 'meteor',
-        label: 'METEOR',
-        description: 'Stem-aware overlap, more lenient than BLEU.',
-        task: 'summarization',
-        taskBadge: 'Summarization',
-      },
-      {
-        key: 'levenshtein',
-        label: 'Levenshtein similarity',
-        description: 'Character-level edit-distance similarity.',
-        task: 'summarization',
-        taskBadge: 'Summarization',
-      },
-      {
-        key: 'bertscore',
-        label: 'BERTScore',
-        description:
-          'Embedding-based semantic similarity. Loads a transformer model once per run.',
-        task: 'summarization',
-        taskBadge: 'Summarization',
-      },
-      {
-        key: 'classification_accuracy',
-        label: 'Accuracy',
-        description: 'Fraction of predictions that match the reference label.',
-        task: 'classification',
-        taskBadge: 'Classification',
-      },
-      {
-        key: 'precision_macro',
-        label: 'Precision (macro)',
-        description: 'Per-class precision averaged across labels.',
-        task: 'classification',
-        taskBadge: 'Classification',
-      },
-      {
-        key: 'recall_macro',
-        label: 'Recall (macro)',
-        description: 'Per-class recall averaged across labels.',
-        task: 'classification',
-        taskBadge: 'Classification',
-      },
-      {
-        key: 'f1_macro',
-        label: 'F1 (macro)',
-        description: 'Harmonic mean of precision and recall, unweighted.',
-        task: 'classification',
-        taskBadge: 'Classification',
-      },
-      {
-        key: 'f1_weighted',
-        label: 'F1 (weighted)',
-        description: 'F1 weighted by class support.',
-        task: 'classification',
-        taskBadge: 'Classification',
-      },
-    ],
-  },
-  {
-    id: 'llm-judge',
-    title: 'LLM-as-judge metrics',
-    subtitle:
-      'Quality scores produced by an extra Bedrock model per sample. Most accurate, but the slowest and most expensive metrics.',
-    metrics: [
-      {
-        key: 'geval_reasoning',
-        label: 'G-Eval — Reasoning',
-        description: 'How coherent and well-justified the output is.',
-      },
-      {
-        key: 'geval_faithfulness',
-        label: 'G-Eval — Faithfulness',
-        description: 'Whether the output sticks to the input (no hallucination).',
-      },
-    ],
-  },
-];
-
 function setGroupSelection(
   current: MetricsToggles,
   group: MetricGroup,
@@ -138,7 +22,7 @@ function countSelected(metrics: MetricsToggles, group: MetricGroup): number {
 interface MetricsPickerProps {
   metrics: MetricsToggles;
   onChange: (metrics: MetricsToggles) => void;
-  taskType?: MetricsPickerTaskType;
+  taskType?: TaskType;
 }
 
 export function MetricsPicker({
diff --git a/frontend/src/pages/Index.tsx b/frontend/src/pages/Index.tsx
index e02af82..d071b43 100644
--- a/frontend/src/pages/Index.tsx
+++ b/frontend/src/pages/Index.tsx
@@ -10,8 +10,9 @@ import {
   useEvaluationResults,
   useEvaluationStatus,
 } from '@/hooks/useEvaluation';
-import type { EvaluationConfig } from '@/types/evaluation';
+import type { EvaluationConfig, TaskType } from '@/types/evaluation';
 import { DEFAULT_METRICS_TOGGLES } from '@/types/evaluation';
+import { buildDefaultsForTask, pickEnabledMetrics } from '@/utils/metrics';
 import { AlertCircle, ArrowLeft, ArrowRight, RotateCcw } from 'lucide-react';
 import { useCallback, useEffect, useState } from 'react';
 
@@ -76,7 +77,7 @@ export default function Index() {
           latency: config.weights.latency / 100,
           cost: config.weights.cost / 100,
         },
-        metrics: config.metrics,
+        metrics: pickEnabledMetrics(config.metrics),
       },
       {
         onSuccess: (data) => {
@@ -98,8 +99,16 @@ export default function Index() {
   ]);
 
   const handleUploadSuccess = useCallback(
-    (data: { dataset_id: string; sample_count: number }) => {
+    (data: {
+      dataset_id: string;
+      sample_count: number;
+      taskType: TaskType | undefined;
+    }) => {
       setDatasetId(data.dataset_id);
+      setConfig((prev) => ({
+        ...prev,
+        metrics: buildDefaultsForTask(data.taskType),
+      }));
     },
     [],
   );
diff --git a/frontend/src/types/evaluation.ts b/frontend/src/types/evaluation.ts
index 45c4833..1d5b0e9 100644
--- a/frontend/src/types/evaluation.ts
+++ b/frontend/src/types/evaluation.ts
@@ -27,9 +27,126 @@ export type MetricKey = (typeof METRIC_KEYS)[number];
 export type MetricsToggles = Record<MetricKey, boolean>;
 
 export const DEFAULT_METRICS_TOGGLES: MetricsToggles = Object.fromEntries(
-  METRIC_KEYS.map((k) => [k, true]),
+  METRIC_KEYS.map((k) => [k, false]),
 ) as MetricsToggles;
 
+export type TaskType = 'summarization' | 'classification';
+
+export interface ToggleableMetric {
+  key: MetricKey;
+  label: string;
+  description: string;
+  task?: TaskType;
+  taskBadge?: string;
+}
+
+export interface MetricGroup {
+  id: string;
+  title: string;
+  subtitle: string;
+  metrics: ToggleableMetric[];
+}
+
+export const METRIC_GROUPS: MetricGroup[] = [
+  {
+    id: 'programmatic',
+    title: 'Programmatic metrics',
+    subtitle:
+      'Deterministic scoring computed locally. Fast and cheap — only BERTScore has a noticeable load cost.',
+    metrics: [
+      {
+        key: 'bleu',
+        label: 'BLEU',
+        description: 'N-gram overlap with the reference.',
+        task: 'summarization',
+        taskBadge: 'Summarization',
+      },
+      {
+        key: 'rouge',
+        label: 'ROUGE',
+        description: 'Recall-oriented n-gram overlap.',
+        task: 'summarization',
+        taskBadge: 'Summarization',
+      },
+      {
+        key: 'meteor',
+        label: 'METEOR',
+        description: 'Stem-aware overlap, more lenient than BLEU.',
+        task: 'summarization',
+        taskBadge: 'Summarization',
+      },
+      {
+        key: 'levenshtein',
+        label: 'Levenshtein similarity',
+        description: 'Character-level edit-distance similarity.',
+        task: 'summarization',
+        taskBadge: 'Summarization',
+      },
+      {
+        key: 'bertscore',
+        label: 'BERTScore',
+        description:
+          'Embedding-based semantic similarity. Loads a transformer model once per run.',
+        task: 'summarization',
+        taskBadge: 'Summarization',
+      },
+      {
+        key: 'classification_accuracy',
+        label: 'Accuracy',
+        description: 'Fraction of predictions that match the reference label.',
+        task: 'classification',
+        taskBadge: 'Classification',
+      },
+      {
+        key: 'precision_macro',
+        label: 'Precision (macro)',
+        description: 'Per-class precision averaged across labels.',
+        task: 'classification',
+        taskBadge: 'Classification',
+      },
+      {
+        key: 'recall_macro',
+        label: 'Recall (macro)',
+        description: 'Per-class recall averaged across labels.',
+        task: 'classification',
+        taskBadge: 'Classification',
+      },
+      {
+        key: 'f1_macro',
+        label: 'F1 (macro)',
+        description: 'Harmonic mean of precision and recall, unweighted.',
+        task: 'classification',
+        taskBadge: 'Classification',
+      },
+      {
+        key: 'f1_weighted',
+        label: 'F1 (weighted)',
+        description: 'F1 weighted by class support.',
+        task: 'classification',
+        taskBadge: 'Classification',
+      },
+    ],
+  },
+  {
+    id: 'llm-judge',
+    title: 'LLM-as-judge metrics',
+    subtitle:
+      'Quality scores produced by an extra Bedrock model per sample. Most accurate, but the slowest and most expensive metrics.',
+    metrics: [
+      {
+        key: 'geval_reasoning',
+        label: 'G-Eval — Reasoning',
+        description: 'How coherent and well-justified the output is.',
+      },
+      {
+        key: 'geval_faithfulness',
+        label: 'G-Eval — Faithfulness',
+        description: 'Whether the output sticks to the input (no hallucination).',
+      },
+    ],
+  },
+];
+
 export interface ModelOption {
   id: string;
   name: string;
@@ -112,7 +229,7 @@ export interface CreateEvaluationRequest {
   dataset_id: string;
   models: { type: 'default' | 'custom'; identifier: string }[];
   weights: { accuracy: number; latency: number; cost: number };
-  metrics?: MetricsToggles;
+  metrics?: Partial<Record<MetricKey, boolean>>;
 }
 
 export interface DatasetUploadData {
diff --git a/frontend/src/utils/metrics.ts b/frontend/src/utils/metrics.ts
new file mode 100644
index 0000000..53c8c7a
--- /dev/null
+++ b/frontend/src/utils/metrics.ts
@@ -0,0 +1,35 @@
+import {
+  METRIC_GROUPS,
+  type MetricKey,
+  type MetricsToggles,
+  type TaskType,
+  METRIC_KEYS,
+} from '@/types/evaluation';
+
+export function buildDefaultsForTask(
+  taskType: TaskType | undefined,
+): MetricsToggles {
+  const base = Object.fromEntries(
+    METRIC_KEYS.map((k) => [k, false]),
+  ) as MetricsToggles;
+
+  if (!taskType) return base;
+
+  for (const group of METRIC_GROUPS) {
+    for (const m of group.metrics) {
+      if (!m.task || m.task === taskType) {
+        base[m.key] = true;
+      }
+    }
+  }
+
+  return base;
+}
+
+export function pickEnabledMetrics(
+  toggles: MetricsToggles,
+): Partial<Record<MetricKey, true>> {
+  return Object.fromEntries(
+    Object.entries(toggles).filter(([, v]) => v),
+  ) as Partial<Record<MetricKey, true>>;
+}

From 4c84eb8bf2a1f9c13dfcdb093bdf9bc6a65cc107 Mon Sep 17 00:00:00 2001
From: LimRaymond <raymond.lim672@gmail.com>
Date: Wed, 20 May 2026 14:20:44 +0200
Subject: [PATCH 8/9] feat: revert model IDs to previous ones

---
 frontend/src/types/evaluation.ts | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/frontend/src/types/evaluation.ts b/frontend/src/types/evaluation.ts
index 1d5b0e9..c490d8d 100644
--- a/frontend/src/types/evaluation.ts
+++ b/frontend/src/types/evaluation.ts
@@ -181,21 +181,21 @@ export interface EvaluationResult {
 export const AVAILABLE_MODELS: ModelOption[] = [
   // Amazon Nova
   {
-    id: 'amazon.nova-pro-v1:0',
+    id: 'us.amazon.nova-pro-v1:0',
     name: 'Nova Pro',
     provider: 'Amazon',
     contextWindow: '300K',
     costPer1kTokens: 0.0008,
   },
   {
-    id: 'amazon.nova-lite-v1:0',
+    id: 'us.amazon.nova-lite-v1:0',
     name: 'Nova Lite',
     provider: 'Amazon',
     contextWindow: '300K',
     costPer1kTokens: 0.00006,
   },
   {
-    id: 'amazon.nova-micro-v1:0',
+    id: 'us.amazon.nova-micro-v1:0',
     name: 'Nova Micro',
     provider: 'Amazon',
     contextWindow: '128K',
@@ -203,21 +203,21 @@ export const AVAILABLE_MODELS: ModelOption[] = [
   },
   // Anthropic
   {
-    id: 'anthropic.claude-opus-4-6-v1',
+    id: 'us.anthropic.claude-opus-4-6-v1',
     name: 'Claude Opus 4.6',
     provider: 'Anthropic',
     contextWindow: '200K',
     costPer1kTokens: 0.005,
   },
   {
-    id: 'anthropic.claude-sonnet-4-5-20250929-v1:0',
+    id: 'us.anthropic.claude-sonnet-4-5-20250929-v1:0',
     name: 'Claude Sonnet 4.5',
     provider: 'Anthropic',
     contextWindow: '200K',
     costPer1kTokens: 0.003,
   },
   {
-    id: 'anthropic.claude-haiku-4-5-20251001-v1:0',
+    id: 'us.anthropic.claude-haiku-4-5-20251001-v1:0',
     name: 'Claude Haiku 4.5',
     provider: 'Anthropic',
     contextWindow: '200K',

From ec1c87429299ed56c4ac60f35df6985497631e5e Mon Sep 17 00:00:00 2001
From: LimRaymond <raymond.lim672@gmail.com>
Date: Wed, 20 May 2026 15:27:10 +0200
Subject: [PATCH 9/9] feat: Enforce that at least 1 metric is chosen before
 launching an evaluation

---
 .../EvaluationLaunchUseCase.test.ts           | 26 ++++++++++++++++---
 .../EvaluationLaunchUseCase.ts                | 11 ++++++++
 .../FakeEvaluationLaunchUseCase.ts            |  4 +++
 .../components/evaluator/DatasetUpload.tsx    |  8 +++++-
 frontend/src/pages/Index.tsx                  |  7 ++++-
 frontend/src/utils/metrics.ts                 |  4 +++
 6 files changed, 55 insertions(+), 5 deletions(-)

diff --git a/backend/src/useCases/EvaluationLaunch/EvaluationLaunchUseCase.test.ts b/backend/src/useCases/EvaluationLaunch/EvaluationLaunchUseCase.test.ts
index 056afe0..8dd6fd6 100644
--- a/backend/src/useCases/EvaluationLaunch/EvaluationLaunchUseCase.test.ts
+++ b/backend/src/useCases/EvaluationLaunch/EvaluationLaunchUseCase.test.ts
@@ -121,6 +121,7 @@ describe('EvaluationLaunchUseCase - Weight Configuration', () => {
         dataset_id: 'test-dataset-id',
         models: [{ type: 'default', identifier: 'claude-sonnet' }],
         metrics: {
+          bleu: true,
           bertscore: false,
           geval_reasoning: false,
           geval_faithfulness: false,
@@ -133,6 +134,7 @@ describe('EvaluationLaunchUseCase - Weight Configuration', () => {
         mockEvaluationJobsRepository.createEvaluation.mock.calls[0][3];
       expect(metricsArg).toEqual({
         ...ALL_METRICS_ENABLED,
+        bleu: true,
         bertscore: false,
         geval_reasoning: false,
         geval_faithfulness: false,
@@ -143,7 +145,7 @@ describe('EvaluationLaunchUseCase - Weight Configuration', () => {
       const request: EvaluationRequest = {
         dataset_id: 'test-dataset-id',
         models: [{ type: 'default', identifier: 'claude-sonnet' }],
-        metrics: { geval_reasoning: false, geval_faithfulness: false },
+        metrics: { rouge: true, geval_reasoning: false, geval_faithfulness: false },
       };
 
       await useCase.launchEvaluation(request);
@@ -152,6 +154,7 @@ describe('EvaluationLaunchUseCase - Weight Configuration', () => {
         mockEvaluationJobsRepository.createEvaluation.mock.calls[0][3];
       expect(metricsArg).toEqual({
         ...ALL_METRICS_ENABLED,
+        rouge: true,
         geval_reasoning: false,
         geval_faithfulness: false,
       });
@@ -161,14 +164,31 @@ describe('EvaluationLaunchUseCase - Weight Configuration', () => {
       const request: EvaluationRequest = {
         dataset_id: 'test-dataset-id',
         models: [{ type: 'default', identifier: 'claude-sonnet' }],
-        metrics: { bleu: false },
+        metrics: { bleu: false, rouge: true },
       };
 
       await useCase.launchEvaluation(request);
 
       const metricsArg =
         mockEvaluationJobsRepository.createEvaluation.mock.calls[0][3];
-      expect(metricsArg).toEqual({ ...ALL_METRICS_ENABLED, bleu: false });
+      expect(metricsArg).toEqual({ ...ALL_METRICS_ENABLED, bleu: false, rouge: true });
+    });
+
+    it('should reject when all metrics are explicitly disabled', async () => {
+      const allDisabled: Partial<Record<string, boolean>> = {};
+      for (const key of Object.keys(ALL_METRICS_ENABLED)) {
+        allDisabled[key] = false;
+      }
+
+      const request: EvaluationRequest = {
+        dataset_id: 'test-dataset-id',
+        models: [{ type: 'default', identifier: 'claude-sonnet' }],
+        metrics: allDisabled,
+      };
+
+      await expect(useCase.launchEvaluation(request)).rejects.toThrow(
+        'At least one accuracy metric must be selected',
+      );
     });
   });
 
diff --git a/backend/src/useCases/EvaluationLaunch/EvaluationLaunchUseCase.ts b/backend/src/useCases/EvaluationLaunch/EvaluationLaunchUseCase.ts
index 79ea6b6..35494df 100644
--- a/backend/src/useCases/EvaluationLaunch/EvaluationLaunchUseCase.ts
+++ b/backend/src/useCases/EvaluationLaunch/EvaluationLaunchUseCase.ts
@@ -36,6 +36,17 @@ class EvaluationLaunchUseCaseImpl implements EvaluationLaunchUseCase {
     const normalizedWeights = this.normalizeWeights(request.weights);
     const metrics = resolveMetricsConfig(request.metrics);
 
+    if (
+      request.metrics &&
+      !Object.values(request.metrics).some((v) => v === true)
+    ) {
+      throw new BasicError(
+        BasicErrorType.BAD_REQUEST,
+        'NO_METRICS_SELECTED',
+        'At least one accuracy metric must be selected',
+      );
+    }
+
     const job = await this.evaluationJobsRepository.createEvaluation(
       request.dataset_id,
       modelsToPersist,
diff --git a/backend/src/useCases/EvaluationLaunch/FakeEvaluationLaunchUseCase.ts b/backend/src/useCases/EvaluationLaunch/FakeEvaluationLaunchUseCase.ts
index 2b57f1a..b30d82d 100644
--- a/backend/src/useCases/EvaluationLaunch/FakeEvaluationLaunchUseCase.ts
+++ b/backend/src/useCases/EvaluationLaunch/FakeEvaluationLaunchUseCase.ts
@@ -43,6 +43,10 @@ export class FakeEvaluationLaunchUseCase implements EvaluationLaunchUseCase {
     const normalizedWeights = this.normalizeWeights(request.weights);
     const metrics = resolveMetricsConfig(request.metrics);
 
+    if (request.metrics && !Object.values(request.metrics).some((v) => v === true)) {
+      throw new Error('At least one accuracy metric must be selected');
+    }
+
     const job = await this.evaluationJobsRepository.createEvaluation(
       request.dataset_id,
       modelsToPersist,
diff --git a/frontend/src/components/evaluator/DatasetUpload.tsx b/frontend/src/components/evaluator/DatasetUpload.tsx
index de9ecc5..e074076 100644
--- a/frontend/src/components/evaluator/DatasetUpload.tsx
+++ b/frontend/src/components/evaluator/DatasetUpload.tsx
@@ -3,6 +3,7 @@ import { Button } from '@/components/ui/button';
 import { useUploadDataset } from '@/hooks/useEvaluation';
 import { cn } from '@/lib/utils';
 import type { MetricsToggles, TaskType } from '@/types/evaluation';
+import { hasAtLeastOneMetric } from '@/utils/metrics';
 import { motion } from 'framer-motion';
 import {
   AlertCircle,
@@ -369,6 +370,11 @@ export function DatasetUpload({
             onChange={onMetricsChange}
             taskType={resolveDetectedTask(uploadMutation.data)}
           />
+          {!hasAtLeastOneMetric(metrics) && (
+            <p className="mt-2 text-xs text-destructive" role="alert">
+              Select at least one accuracy metric to continue.
+            </p>
+          )}
         </div>
       )}
 
@@ -377,7 +383,7 @@ export function DatasetUpload({
           onClick={onStartEvaluation}
           className="mt-6 w-full"
           size="lg"
-          disabled={isStarting}
+          disabled={isStarting || !hasAtLeastOneMetric(metrics)}
         >
           {isStarting ? (
             <>
diff --git a/frontend/src/pages/Index.tsx b/frontend/src/pages/Index.tsx
index d071b43..64d274a 100644
--- a/frontend/src/pages/Index.tsx
+++ b/frontend/src/pages/Index.tsx
@@ -12,7 +12,7 @@ import {
 } from '@/hooks/useEvaluation';
 import type { EvaluationConfig, TaskType } from '@/types/evaluation';
 import { DEFAULT_METRICS_TOGGLES } from '@/types/evaluation';
-import { buildDefaultsForTask, pickEnabledMetrics } from '@/utils/metrics';
+import { buildDefaultsForTask, hasAtLeastOneMetric, pickEnabledMetrics } from '@/utils/metrics';
 import { AlertCircle, ArrowLeft, ArrowRight, RotateCcw } from 'lucide-react';
 import { useCallback, useEffect, useState } from 'react';
 
@@ -64,6 +64,11 @@ export default function Index() {
       config.weights.accuracy + config.weights.cost + config.weights.latency;
     if (sum !== 100) return;
 
+    if (!hasAtLeastOneMetric(config.metrics)) {
+      setError('At least one accuracy metric must be selected.');
+      return;
+    }
+
     setError(null);
     createEvaluationMutation.mutate(
       {
diff --git a/frontend/src/utils/metrics.ts b/frontend/src/utils/metrics.ts
index 53c8c7a..5752450 100644
--- a/frontend/src/utils/metrics.ts
+++ b/frontend/src/utils/metrics.ts
@@ -26,6 +26,10 @@ export function buildDefaultsForTask(
   return base;
 }
 
+export function hasAtLeastOneMetric(toggles: MetricsToggles): boolean {
+  return Object.values(toggles).some(Boolean);
+}
+
 export function pickEnabledMetrics(
   toggles: MetricsToggles,
 ): Partial<Record<MetricKey, true>> {