From 4ab84b3246c3cf48f348730477f944f096ff37fc Mon Sep 17 00:00:00 2001 From: LimRaymond Date: Wed, 13 May 2026 23:38:06 +0200 Subject: [PATCH 1/9] feat: Initial toggle for metrics --- .../src/accuracy_evaluator.py | 39 ++- .../src/classification_evaluator.py | 46 +++- .../src/dynamodb_service.py | 24 +- .../src/geval_evaluator.py | 54 ++-- backend/python-eval-function/src/main.py | 77 ++++-- backend/python-eval-function/src/metrics.py | 53 ++++ .../EvaluationLaunchAdapter.ts | 12 +- backend/src/models/Evaluation.ts | 69 +++++ .../EvaluationJobsRepository.ts | 9 + .../EvaluationLaunchUseCase.test.ts | 96 ++++++- .../EvaluationLaunchUseCase.ts | 3 + .../FakeEvaluationLaunchUseCase.ts | 13 +- .../components/evaluator/MetricsWeights.tsx | 246 +++++++++++++++++- frontend/src/pages/Index.tsx | 9 + frontend/src/types/evaluation.ts | 55 +++- 15 files changed, 715 insertions(+), 90 deletions(-) create mode 100644 backend/python-eval-function/src/metrics.py diff --git a/backend/python-eval-function/src/accuracy_evaluator.py b/backend/python-eval-function/src/accuracy_evaluator.py index a2785a3..a73538b 100644 --- a/backend/python-eval-function/src/accuracy_evaluator.py +++ b/backend/python-eval-function/src/accuracy_evaluator.py @@ -1,7 +1,9 @@ import logging -from typing import Optional, List, Dict +from typing import Optional, List, Mapping from dataclasses import dataclass +from metrics import is_metric_enabled + logger = logging.getLogger(__name__) @@ -29,7 +31,8 @@ def __init__(self): def calculate_accuracy_metrics( self, predictions: List[str], - references: Optional[List[str]] + references: Optional[List[str]], + selected: Optional[Mapping[str, bool]] = None, ) -> Optional[AccuracyMetrics]: if references is None or len(references) == 0: logger.info("No reference outputs available, skipping accuracy metrics") @@ -38,18 +41,26 @@ def calculate_accuracy_metrics( if all(ref is None or ref == "" for ref in references): logger.info("All reference outputs are empty, skipping 
accuracy metrics") return None - - logger.info(f"Calculating accuracy metrics for {len(predictions)} predictions") - + + logger.info( + f"Calculating accuracy metrics for {len(predictions)} predictions " + f"(selected={dict(selected) if selected is not None else 'all'})" + ) + try: metrics = AccuracyMetrics() - - metrics.bleu = self._calculate_bleu(predictions, references) - metrics.rouge = self._calculate_rouge(predictions, references) - metrics.meteor = self._calculate_meteor(predictions, references) - metrics.levenshtein = self._calculate_levenshtein(predictions, references) - metrics.bertscore = self._calculate_bertscore(predictions, references) - + + if is_metric_enabled(selected, 'bleu'): + metrics.bleu = self._calculate_bleu(predictions, references) + if is_metric_enabled(selected, 'rouge'): + metrics.rouge = self._calculate_rouge(predictions, references) + if is_metric_enabled(selected, 'meteor'): + metrics.meteor = self._calculate_meteor(predictions, references) + if is_metric_enabled(selected, 'levenshtein'): + metrics.levenshtein = self._calculate_levenshtein(predictions, references) + if is_metric_enabled(selected, 'bertscore'): + metrics.bertscore = self._calculate_bertscore(predictions, references) + def fmt(v): return f"{v:.4f}" if v is not None else "N/A" logger.info( f"Accuracy metrics calculated - " @@ -57,9 +68,9 @@ def fmt(v): return f"{v:.4f}" if v is not None else "N/A" f"METEOR={fmt(metrics.meteor)}, Levenshtein={fmt(metrics.levenshtein)}, " f"BERTScore={fmt(metrics.bertscore)}" ) - + return metrics - + except Exception as e: logger.error(f"Error calculating accuracy metrics: {e}", exc_info=True) return None diff --git a/backend/python-eval-function/src/classification_evaluator.py b/backend/python-eval-function/src/classification_evaluator.py index 44a64d7..0438870 100644 --- a/backend/python-eval-function/src/classification_evaluator.py +++ b/backend/python-eval-function/src/classification_evaluator.py @@ -1,7 +1,9 @@ import logging -from 
typing import Optional, List +from typing import Optional, List, Mapping from dataclasses import dataclass +from metrics import is_metric_enabled + logger = logging.getLogger(__name__) @@ -14,6 +16,17 @@ class ClassificationMetrics: f1_weighted: Optional[float] = None +# Maps the external (request-level) metric keys to the internal +# ClassificationMetrics field names. +_CLASSIFICATION_KEYS: tuple[str, ...] = ( + 'classification_accuracy', + 'precision_macro', + 'recall_macro', + 'f1_macro', + 'f1_weighted', +) + + def normalize_prediction(prediction: str, valid_classes: List[str]) -> str: cleaned = prediction.strip() @@ -38,6 +51,7 @@ def calculate_classification_metrics( predictions: List[str], references: List[str], valid_classes: Optional[List[str]] = None, + selected: Optional[Mapping[str, bool]] = None, ) -> Optional[ClassificationMetrics]: if not references or not predictions: logger.info("No references or predictions, skipping classification metrics") @@ -47,6 +61,14 @@ def calculate_classification_metrics( logger.info("All references are empty, skipping classification metrics") return None + # If the user opted out of every classification metric, skip the + # sklearn computation entirely. 
+ if selected is not None and not any( + is_metric_enabled(selected, k) for k in _CLASSIFICATION_KEYS + ): + logger.info("All classification metrics disabled, skipping computation") + return ClassificationMetrics() + if valid_classes is None: valid_classes = list(set(references)) @@ -56,7 +78,7 @@ def calculate_classification_metrics( logger.info( f"Calculating classification metrics for {len(normalized_preds)} predictions " - f"across {len(valid_classes)} classes" + f"across {len(valid_classes)} classes (selected={dict(selected) if selected is not None else 'all'})" ) try: @@ -86,11 +108,21 @@ def calculate_classification_metrics( ) metrics = ClassificationMetrics( - accuracy=round(acc, 4), - precision_macro=round(precision, 4), - recall_macro=round(recall, 4), - f1_macro=round(f1, 4), - f1_weighted=round(f1_w, 4), + accuracy=round(acc, 4) + if is_metric_enabled(selected, 'classification_accuracy') + else None, + precision_macro=round(precision, 4) + if is_metric_enabled(selected, 'precision_macro') + else None, + recall_macro=round(recall, 4) + if is_metric_enabled(selected, 'recall_macro') + else None, + f1_macro=round(f1, 4) + if is_metric_enabled(selected, 'f1_macro') + else None, + f1_weighted=round(f1_w, 4) + if is_metric_enabled(selected, 'f1_weighted') + else None, ) logger.info( diff --git a/backend/python-eval-function/src/dynamodb_service.py b/backend/python-eval-function/src/dynamodb_service.py index 4841e83..cf8fc03 100644 --- a/backend/python-eval-function/src/dynamodb_service.py +++ b/backend/python-eval-function/src/dynamodb_service.py @@ -7,6 +7,8 @@ import boto3 from botocore.exceptions import ClientError +from metrics import normalize_metrics_config + logger = logging.getLogger(__name__) @@ -28,18 +30,34 @@ def load_job(self, evaluation_id: str) -> Dict[str, Any]: item = response['Item'] models = json.loads(item.get('models', '[]')) weights = json.loads(item.get('weights', '{}')) - + # Older jobs may not have a `metrics` field; a malformed blob 
also + # falls back to all-enabled so a bad write can never wedge a job. + raw_metrics = item.get('metrics') + stored_metrics: Optional[Dict[str, Any]] = None + if raw_metrics: + try: + stored_metrics = json.loads(raw_metrics) + except (TypeError, ValueError) as parse_err: + logger.warning( + f"Failed to parse stored metrics config " + f"({parse_err}); defaulting to all metrics enabled" + ) + metrics = normalize_metrics_config(stored_metrics) + job_config = { 'evaluation_id': item['evaluation_id'], 'dataset_id': item['dataset_id'], 'models': models, 'weights': weights, + 'metrics': metrics, 'status': item.get('status', 'pending'), 'created_at': item.get('created_at', ''), 'total_samples': item.get('total_samples') } - - logger.info(f"Loaded job config: {len(models)} models, weights: {weights}") + + logger.info( + f"Loaded job config: {len(models)} models, weights: {weights}, metrics: {metrics}" + ) return job_config except ClientError as e: diff --git a/backend/python-eval-function/src/geval_evaluator.py b/backend/python-eval-function/src/geval_evaluator.py index 0665200..8a6d581 100644 --- a/backend/python-eval-function/src/geval_evaluator.py +++ b/backend/python-eval-function/src/geval_evaluator.py @@ -99,6 +99,8 @@ def evaluate( predictions: List[str], references: Optional[List[str]] = None, task_type: str = "summarization", + compute_reasoning: bool = True, + compute_faithfulness: bool = True, ) -> GEvalMetrics: if not inputs or not predictions: logger.warning("Empty inputs or predictions, skipping G-Eval") @@ -108,10 +110,22 @@ def evaluate( logger.error("inputs and predictions length mismatch, skipping G-Eval") return GEvalMetrics() + if not compute_reasoning and not compute_faithfulness: + logger.info("Both G-Eval metrics disabled, skipping") + return GEvalMetrics() + try: model = self._build_judge_model() - reasoning_metric = self._build_reasoning_metric(model, task_type) - faithfulness_metric = self._build_faithfulness_metric(model, task_type) + 
reasoning_metric = ( + self._build_reasoning_metric(model, task_type) + if compute_reasoning + else None + ) + faithfulness_metric = ( + self._build_faithfulness_metric(model, task_type) + if compute_faithfulness + else None + ) except Exception as e: logger.error(f"Failed to initialize G-Eval components: {e}", exc_info=True) return GEvalMetrics() @@ -131,23 +145,25 @@ def evaluate( actual_output=pred, ) - try: - reasoning_metric.measure(test_case) - reasoning_scores.append(reasoning_metric.score) - logger.debug( - f"[{idx}] Reasoning score={reasoning_metric.score:.4f} reason={reasoning_metric.reason}" - ) - except Exception as e: - logger.warning(f"[{idx}] Reasoning metric failed: {e}") - - try: - faithfulness_metric.measure(test_case) - faithfulness_scores.append(faithfulness_metric.score) - logger.debug( - f"[{idx}] Faithfulness score={faithfulness_metric.score:.4f} reason={faithfulness_metric.reason}" - ) - except Exception as e: - logger.warning(f"[{idx}] Faithfulness metric failed: {e}") + if reasoning_metric is not None: + try: + reasoning_metric.measure(test_case) + reasoning_scores.append(reasoning_metric.score) + logger.debug( + f"[{idx}] Reasoning score={reasoning_metric.score:.4f} reason={reasoning_metric.reason}" + ) + except Exception as e: + logger.warning(f"[{idx}] Reasoning metric failed: {e}") + + if faithfulness_metric is not None: + try: + faithfulness_metric.measure(test_case) + faithfulness_scores.append(faithfulness_metric.score) + logger.debug( + f"[{idx}] Faithfulness score={faithfulness_metric.score:.4f} reason={faithfulness_metric.reason}" + ) + except Exception as e: + logger.warning(f"[{idx}] Faithfulness metric failed: {e}") result = GEvalMetrics() diff --git a/backend/python-eval-function/src/main.py b/backend/python-eval-function/src/main.py index 662f58a..ebfdd3f 100644 --- a/backend/python-eval-function/src/main.py +++ b/backend/python-eval-function/src/main.py @@ -74,8 +74,18 @@ def main(): dataset_id = job_config['dataset_id'] 
models = job_config['models'] weights = job_config['weights'] - - logger.info(f"Job config loaded - dataset: {dataset_id}, models: {len(models)}") + # `metrics_config` is already normalized by DynamoDBService.load_job — + # every key present, all bool. + metrics_config = job_config.get('metrics') or {} + + from metrics import is_metric_enabled + compute_geval_reasoning = is_metric_enabled(metrics_config, 'geval_reasoning') + compute_geval_faithfulness = is_metric_enabled(metrics_config, 'geval_faithfulness') + + logger.info( + f"Job config loaded - dataset: {dataset_id}, models: {len(models)}, " + f"metrics={metrics_config or 'all'}" + ) from dataset_loader import DatasetLoader dataset_loader = DatasetLoader() @@ -134,7 +144,9 @@ def main(): predictions = [invocation_results[i].response_text for i in successful_indices] references = [all_references[i] for i in successful_indices] if all_references else None - accuracy_metrics = accuracy_evaluator.calculate_accuracy_metrics(predictions, references) + accuracy_metrics = accuracy_evaluator.calculate_accuracy_metrics( + predictions, references, selected=metrics_config + ) accuracy_results[model_id] = accuracy_metrics or AccuracyMetrics() logger.info(f"Summarization accuracy complete for {len(accuracy_results)} models") @@ -149,7 +161,8 @@ def main(): references = [all_references[i] for i in successful_indices] if all_references else None cls_metrics = classification_evaluator.calculate_classification_metrics( - predictions, references, valid_classes=unique_classes + predictions, references, valid_classes=unique_classes, + selected=metrics_config, ) acc = AccuracyMetrics() @@ -163,28 +176,40 @@ def main(): logger.info(f"Classification accuracy complete for {len(accuracy_results)} models") - from geval_evaluator import GEvalEvaluator - geval_evaluator = GEvalEvaluator() - - for model_id, invocation_results in results_by_model.items(): - successful_indices = [i for i, r in enumerate(invocation_results) if r.error is None] 
- predictions = [invocation_results[i].response_text for i in successful_indices] - inputs = [dataset.documents[i] for i in successful_indices] - - logger.info(f"Running G-Eval for model {model_id} on {len(predictions)} samples") - geval_metrics = geval_evaluator.evaluate(inputs, predictions, task_type=task_type) - - acc = accuracy_results.get(model_id) - if acc is None: - acc = AccuracyMetrics() - accuracy_results[model_id] = acc - - acc.geval_reasoning = geval_metrics.reasoning - acc.geval_faithfulness = geval_metrics.faithfulness - - logger.info(f"G-Eval complete for {model_id}") - - logger.info(f"G-Eval evaluation complete for all models") + if compute_geval_reasoning or compute_geval_faithfulness: + from geval_evaluator import GEvalEvaluator + geval_evaluator = GEvalEvaluator() + + for model_id, invocation_results in results_by_model.items(): + successful_indices = [i for i, r in enumerate(invocation_results) if r.error is None] + predictions = [invocation_results[i].response_text for i in successful_indices] + inputs = [dataset.documents[i] for i in successful_indices] + + logger.info( + f"Running G-Eval for model {model_id} on {len(predictions)} samples " + f"(reasoning={compute_geval_reasoning}, faithfulness={compute_geval_faithfulness})" + ) + geval_metrics = geval_evaluator.evaluate( + inputs, predictions, task_type=task_type, + compute_reasoning=compute_geval_reasoning, + compute_faithfulness=compute_geval_faithfulness, + ) + + acc = accuracy_results.get(model_id) + if acc is None: + acc = AccuracyMetrics() + accuracy_results[model_id] = acc + + if compute_geval_reasoning: + acc.geval_reasoning = geval_metrics.reasoning + if compute_geval_faithfulness: + acc.geval_faithfulness = geval_metrics.faithfulness + + logger.info(f"G-Eval complete for {model_id}") + + logger.info("G-Eval evaluation complete for all models") + else: + logger.info("G-Eval disabled via metrics config, skipping") from cost_calculator import CostCalculator cost_calculator = 
CostCalculator() diff --git a/backend/python-eval-function/src/metrics.py b/backend/python-eval-function/src/metrics.py new file mode 100644 index 0000000..ac025b4 --- /dev/null +++ b/backend/python-eval-function/src/metrics.py @@ -0,0 +1,53 @@ +"""Single source of truth for metric keys and config helpers used by the +evaluation engine. + +Keep `METRIC_KEYS` in sync with the TypeScript counterpart at +`backend/src/models/Evaluation.ts` (and the frontend at +`frontend/src/types/evaluation.ts`). The shape stored in DynamoDB is just a +JSON object keyed by these names with boolean values. +""" +from typing import Dict, Mapping, Optional + + +METRIC_KEYS: tuple[str, ...] = ( + # Algorithmic — summarization + 'bleu', + 'rouge', + 'meteor', + 'levenshtein', + 'bertscore', + # Algorithmic — classification + 'classification_accuracy', + 'precision_macro', + 'recall_macro', + 'f1_macro', + 'f1_weighted', + # LLM-as-judge + 'geval_reasoning', + 'geval_faithfulness', +) + + +def is_metric_enabled( + selected: Optional[Mapping[str, bool]], + key: str, +) -> bool: + """Return whether `key` should be computed. + + Missing keys default to True so legacy jobs (and partial configs) keep the + pre-toggle behaviour of computing every metric. + """ + if selected is None: + return True + return bool(selected.get(key, True)) + + +def normalize_metrics_config( + stored: Optional[Mapping[str, object]], +) -> Dict[str, bool]: + """Coerce a stored metrics blob into a complete `{key: bool}` mapping. + + Unknown keys in `stored` are dropped; missing keys default to True. 
+ """ + source: Mapping[str, object] = stored or {} + return {key: bool(source.get(key, True)) for key in METRIC_KEYS} diff --git a/backend/src/handlers/EvaluationLaunch/EvaluationLaunchAdapter.ts b/backend/src/handlers/EvaluationLaunch/EvaluationLaunchAdapter.ts index 283a89e..c130c7f 100644 --- a/backend/src/handlers/EvaluationLaunch/EvaluationLaunchAdapter.ts +++ b/backend/src/handlers/EvaluationLaunch/EvaluationLaunchAdapter.ts @@ -3,12 +3,21 @@ import type { APIGatewayProxyEventV2, APIGatewayProxyResultV2, } from 'aws-lambda'; -import { z } from 'zod'; +import { z, ZodRawShape } from 'zod'; +import { METRIC_KEYS } from '../../models/Evaluation'; import { tokenEvaluationLaunchUseCase } from '../../useCases/EvaluationLaunch/EvaluationLaunchUseCase'; import { handleHttpRequest } from '../api/handleHttpRequest'; import { parseApiEvent } from '../api/parseApiEvent'; +const MetricsConfigSchema = z + .object( + Object.fromEntries( + METRIC_KEYS.map((key) => [key, z.boolean().optional()]), + ) as ZodRawShape, + ) + .optional(); + const EvaluationRequestSchema = z.object({ dataset_id: z.string().min(1), models: z @@ -32,6 +41,7 @@ const EvaluationRequestSchema = z.object({ cost: z.number().optional(), }) .optional(), + metrics: MetricsConfigSchema, }); export class EvaluationLaunchAdapter { diff --git a/backend/src/models/Evaluation.ts b/backend/src/models/Evaluation.ts index 60e7d2e..6c5917e 100644 --- a/backend/src/models/Evaluation.ts +++ b/backend/src/models/Evaluation.ts @@ -9,10 +9,78 @@ export interface WeightConfig { cost: number; } +/** + * Canonical list of every metric the engine knows how to compute, in display + * order, grouped by semantic category. + * + * This is the SINGLE SOURCE OF TRUTH for metric keys in the backend. Adding a + * new metric is a one-line change here — the type alias, the default config + * and the Zod payload schema are all derived from this list. 
+ * + * Keep this list in sync with the frontend `METRIC_KEYS` (and the Python + * `metrics.METRIC_KEYS`); a cross-language enum would be ideal, but absent + * that, mirror the additions manually. + */ +export const METRIC_KEYS = [ + // Algorithmic — summarization + 'bleu', + 'rouge', + 'meteor', + 'levenshtein', + 'bertscore', + // Algorithmic — classification + 'classification_accuracy', + 'precision_macro', + 'recall_macro', + 'f1_macro', + 'f1_weighted', + // LLM-as-judge + 'geval_reasoning', + 'geval_faithfulness', +] as const; + +export type MetricKey = (typeof METRIC_KEYS)[number]; + +/** + * Per-metric opt-in flags. + * + * Latency and cost are always computed (they are free from inference data). + * + * - Algorithmic / programmatic metrics are deterministic and run locally. + * They are mostly fast; only BERTScore has a meaningful loading cost. + * - LLM-as-judge metrics call an extra Bedrock model per sample — they are + * the main lever for both speed and spend. + * + * Metrics that are not applicable to the uploaded dataset (e.g. classification + * metrics on a summarization run) are skipped automatically. + */ +export type MetricsConfig = Record; + +export const DEFAULT_METRICS_CONFIG: MetricsConfig = Object.fromEntries( + METRIC_KEYS.map((k) => [k, true]), +) as MetricsConfig; + +/** + * Merge a partial user-provided config with the defaults (all enabled). + * Unknown keys are ignored; missing keys fall back to the default value. 
+ */ +export function resolveMetricsConfig( + metrics?: Partial, +): MetricsConfig { + const result = { ...DEFAULT_METRICS_CONFIG }; + if (!metrics) return result; + for (const key of METRIC_KEYS) { + const provided = metrics[key]; + if (provided !== undefined) result[key] = provided; + } + return result; +} + export interface EvaluationRequest { dataset_id: string; models: ModelConfig[]; weights?: Partial; + metrics?: Partial; } export interface EvaluationJob { @@ -20,6 +88,7 @@ export interface EvaluationJob { dataset_id: string; models: ModelConfig[]; weights: WeightConfig; + metrics: MetricsConfig; status: JobStatus; progress: number; current_model?: string; diff --git a/backend/src/services/EvaluationJobsRepository/EvaluationJobsRepository.ts b/backend/src/services/EvaluationJobsRepository/EvaluationJobsRepository.ts index 446cc54..a911806 100644 --- a/backend/src/services/EvaluationJobsRepository/EvaluationJobsRepository.ts +++ b/backend/src/services/EvaluationJobsRepository/EvaluationJobsRepository.ts @@ -8,8 +8,10 @@ import { import { createInjectionToken, inject } from '@trackit.io/di-container'; import { randomUUID } from 'crypto'; import { + DEFAULT_METRICS_CONFIG, EvaluationJob, JobStatus, + MetricsConfig, ModelConfig, ModelResult, Recommendation, @@ -21,6 +23,7 @@ export type EvaluationJobsRepository = { datasetId: string, models: ModelConfig[], weights: WeightConfig, + metrics: MetricsConfig, ): Promise; updateEvaluation( @@ -51,6 +54,7 @@ class EvaluationJobsRepositoryImpl implements EvaluationJobsRepository { datasetId: string, models: ModelConfig[], weights: WeightConfig, + metrics: MetricsConfig, ): Promise { const evaluationId = randomUUID(); const now = new Date().toISOString(); @@ -60,6 +64,7 @@ class EvaluationJobsRepositoryImpl implements EvaluationJobsRepository { dataset_id: datasetId, models, weights, + metrics, status: 'pending', progress: 0, created_at: now, @@ -74,6 +79,7 @@ class EvaluationJobsRepositoryImpl implements 
EvaluationJobsRepository { dataset_id: { S: job.dataset_id }, models: { S: JSON.stringify(job.models) }, weights: { S: JSON.stringify(job.weights) }, + metrics: { S: JSON.stringify(job.metrics) }, status: { S: job.status }, progress: { N: job.progress.toString() }, created_at: { S: job.created_at }, @@ -176,6 +182,9 @@ class EvaluationJobsRepositoryImpl implements EvaluationJobsRepository { dataset_id: item['dataset_id'].S!, models: JSON.parse(item['models'].S!) as ModelConfig[], weights: JSON.parse(item['weights'].S!) as WeightConfig, + metrics: item['metrics']?.S + ? (JSON.parse(item['metrics'].S) as MetricsConfig) + : { ...DEFAULT_METRICS_CONFIG }, status: item['status'].S! as JobStatus, progress: Number(item['progress'].N ?? '0'), current_model: item['current_model']?.S, diff --git a/backend/src/useCases/EvaluationLaunch/EvaluationLaunchUseCase.test.ts b/backend/src/useCases/EvaluationLaunch/EvaluationLaunchUseCase.test.ts index 045f104..056afe0 100644 --- a/backend/src/useCases/EvaluationLaunch/EvaluationLaunchUseCase.test.ts +++ b/backend/src/useCases/EvaluationLaunch/EvaluationLaunchUseCase.test.ts @@ -1,8 +1,13 @@ import { beforeEach, describe, expect, it, vi } from 'vitest'; -import type { EvaluationRequest } from '../../models/Evaluation.js'; +import { + DEFAULT_METRICS_CONFIG, + type EvaluationRequest, +} from '../../models/Evaluation.js'; import { FakeBedrockModelValidationService } from '../../services/BedrockModelValidationService/FakeBedrockModelValidationService.js'; import { FakeEvaluationLaunchUseCase } from './FakeEvaluationLaunchUseCase'; +const ALL_METRICS_ENABLED = { ...DEFAULT_METRICS_CONFIG }; + // Mock dependencies const mockEvaluationJobsRepository = { createEvaluation: vi.fn(), @@ -64,6 +69,7 @@ describe('EvaluationLaunchUseCase - Weight Configuration', () => { latency: 0.33, cost: 0.34, }, + ALL_METRICS_ENABLED, ); }); @@ -91,10 +97,81 @@ describe('EvaluationLaunchUseCase - Weight Configuration', () => { latency: 0.33, cost: 0.34, }, + 
ALL_METRICS_ENABLED, ); }); }); + describe('Metrics toggles', () => { + it('should default to all metrics enabled when metrics not provided', async () => { + const request: EvaluationRequest = { + dataset_id: 'test-dataset-id', + models: [{ type: 'default', identifier: 'claude-sonnet' }], + }; + + await useCase.launchEvaluation(request); + + const metricsArg = + mockEvaluationJobsRepository.createEvaluation.mock.calls[0][3]; + expect(metricsArg).toEqual(ALL_METRICS_ENABLED); + }); + + it('should respect explicitly disabled metrics', async () => { + const request: EvaluationRequest = { + dataset_id: 'test-dataset-id', + models: [{ type: 'default', identifier: 'claude-sonnet' }], + metrics: { + bertscore: false, + geval_reasoning: false, + geval_faithfulness: false, + }, + }; + + await useCase.launchEvaluation(request); + + const metricsArg = + mockEvaluationJobsRepository.createEvaluation.mock.calls[0][3]; + expect(metricsArg).toEqual({ + ...ALL_METRICS_ENABLED, + bertscore: false, + geval_reasoning: false, + geval_faithfulness: false, + }); + }); + + it('should fill in defaults for partially provided metrics', async () => { + const request: EvaluationRequest = { + dataset_id: 'test-dataset-id', + models: [{ type: 'default', identifier: 'claude-sonnet' }], + metrics: { geval_reasoning: false, geval_faithfulness: false }, + }; + + await useCase.launchEvaluation(request); + + const metricsArg = + mockEvaluationJobsRepository.createEvaluation.mock.calls[0][3]; + expect(metricsArg).toEqual({ + ...ALL_METRICS_ENABLED, + geval_reasoning: false, + geval_faithfulness: false, + }); + }); + + it('should support disabling a single algorithmic metric', async () => { + const request: EvaluationRequest = { + dataset_id: 'test-dataset-id', + models: [{ type: 'default', identifier: 'claude-sonnet' }], + metrics: { bleu: false }, + }; + + await useCase.launchEvaluation(request); + + const metricsArg = + mockEvaluationJobsRepository.createEvaluation.mock.calls[0][3]; + 
expect(metricsArg).toEqual({ ...ALL_METRICS_ENABLED, bleu: false }); + }); + }); + describe('Negative weight rejection', () => { it('should reject negative accuracy weight', async () => { const request: EvaluationRequest = { @@ -216,6 +293,7 @@ describe('EvaluationLaunchUseCase - Weight Configuration', () => { latency: 0.33, cost: 0.34, }, + ALL_METRICS_ENABLED, ); }); @@ -349,11 +427,16 @@ describe('EvaluationLaunchUseCase - Weight Configuration', () => { expect( mockEvaluationJobsRepository.createEvaluation, - ).toHaveBeenCalledWith('test-dataset-id', request.models, { - accuracy: 0.33, - latency: 0.33, - cost: 0.34, - }); + ).toHaveBeenCalledWith( + 'test-dataset-id', + request.models, + { + accuracy: 0.33, + latency: 0.33, + cost: 0.34, + }, + ALL_METRICS_ENABLED, + ); }); it('should accept a mix of default and custom models', async () => { @@ -386,6 +469,7 @@ describe('EvaluationLaunchUseCase - Weight Configuration', () => { latency: 0.33, cost: 0.34, }, + ALL_METRICS_ENABLED, ); }); diff --git a/backend/src/useCases/EvaluationLaunch/EvaluationLaunchUseCase.ts b/backend/src/useCases/EvaluationLaunch/EvaluationLaunchUseCase.ts index b715277..79ea6b6 100644 --- a/backend/src/useCases/EvaluationLaunch/EvaluationLaunchUseCase.ts +++ b/backend/src/useCases/EvaluationLaunch/EvaluationLaunchUseCase.ts @@ -5,6 +5,7 @@ import { EvaluationJob, EvaluationRequest, ModelConfig, + resolveMetricsConfig, WeightConfig, } from '../../models/Evaluation'; import { tokenBedrockModelValidationService } from '../../services/BedrockModelValidationService/BedrockModelValidationService'; @@ -33,11 +34,13 @@ class EvaluationLaunchUseCaseImpl implements EvaluationLaunchUseCase { ); const normalizedWeights = this.normalizeWeights(request.weights); + const metrics = resolveMetricsConfig(request.metrics); const job = await this.evaluationJobsRepository.createEvaluation( request.dataset_id, modelsToPersist, normalizedWeights, + metrics, ); await 
this.fargateService.launchTask(job.evaluation_id); diff --git a/backend/src/useCases/EvaluationLaunch/FakeEvaluationLaunchUseCase.ts b/backend/src/useCases/EvaluationLaunch/FakeEvaluationLaunchUseCase.ts index 60865a0..5fb8371 100644 --- a/backend/src/useCases/EvaluationLaunch/FakeEvaluationLaunchUseCase.ts +++ b/backend/src/useCases/EvaluationLaunch/FakeEvaluationLaunchUseCase.ts @@ -1,8 +1,9 @@ -import type { - EvaluationJob, - EvaluationRequest, - ModelConfig, - WeightConfig, +import { + resolveMetricsConfig, + type EvaluationJob, + type EvaluationRequest, + type ModelConfig, + type WeightConfig, } from '../../models/Evaluation.js'; import { type BedrockModelValidationService } from '../../services/BedrockModelValidationService/BedrockModelValidationService.js'; import { FakeBedrockModelValidationService } from '../../services/BedrockModelValidationService/FakeBedrockModelValidationService.js'; @@ -40,11 +41,13 @@ export class FakeEvaluationLaunchUseCase implements EvaluationLaunchUseCase { ); const normalizedWeights = this.normalizeWeights(request.weights); + const metrics = resolveMetricsConfig(request.metrics); const job = await this.evaluationJobsRepository.createEvaluation( request.dataset_id, modelsToPersist, normalizedWeights, + metrics, ); await this.fargateService.launchTask(job.evaluation_id); diff --git a/frontend/src/components/evaluator/MetricsWeights.tsx b/frontend/src/components/evaluator/MetricsWeights.tsx index 122e11a..39d4fa8 100644 --- a/frontend/src/components/evaluator/MetricsWeights.tsx +++ b/frontend/src/components/evaluator/MetricsWeights.tsx @@ -1,12 +1,138 @@ import { cn } from '@/lib/utils'; -import type { MetricsWeights as Weights } from '@/types/evaluation'; +import type { + MetricsToggles, + MetricsWeights as Weights, +} from '@/types/evaluation'; import { motion } from 'framer-motion'; -import { Info } from 'lucide-react'; +import { Info, Zap } from 'lucide-react'; import { useEffect, useState } from 'react'; interface 
MetricsWeightsProps { value: Weights; onChange: (weights: Weights) => void; + metrics: MetricsToggles; + onMetricsChange: (metrics: MetricsToggles) => void; +} + +interface ToggleableMetric { + key: keyof MetricsToggles; + label: string; + description: string; + /** Free-form badge text (e.g. "Summarization", "Classification"). */ + taskBadge?: string; +} + +interface MetricGroup { + id: string; + title: string; + subtitle: string; + metrics: ToggleableMetric[]; +} + +const METRIC_GROUPS: MetricGroup[] = [ + { + id: 'programmatic', + title: 'Programmatic metrics', + subtitle: + 'Deterministic scoring computed locally. Fast and cheap — only BERTScore has a noticeable load cost.', + metrics: [ + { + key: 'bleu', + label: 'BLEU', + description: 'N-gram overlap with the reference.', + taskBadge: 'Summarization', + }, + { + key: 'rouge', + label: 'ROUGE', + description: 'Recall-oriented n-gram overlap.', + taskBadge: 'Summarization', + }, + { + key: 'meteor', + label: 'METEOR', + description: 'Stem-aware overlap, more lenient than BLEU.', + taskBadge: 'Summarization', + }, + { + key: 'levenshtein', + label: 'Levenshtein similarity', + description: 'Character-level edit-distance similarity.', + taskBadge: 'Summarization', + }, + { + key: 'bertscore', + label: 'BERTScore', + description: + 'Embedding-based semantic similarity. 
Loads a transformer model once per run.', + taskBadge: 'Summarization', + }, + { + key: 'classification_accuracy', + label: 'Accuracy', + description: 'Fraction of predictions that match the reference label.', + taskBadge: 'Classification', + }, + { + key: 'precision_macro', + label: 'Precision (macro)', + description: 'Per-class precision averaged across labels.', + taskBadge: 'Classification', + }, + { + key: 'recall_macro', + label: 'Recall (macro)', + description: 'Per-class recall averaged across labels.', + taskBadge: 'Classification', + }, + { + key: 'f1_macro', + label: 'F1 (macro)', + description: 'Harmonic mean of precision and recall, unweighted.', + taskBadge: 'Classification', + }, + { + key: 'f1_weighted', + label: 'F1 (weighted)', + description: 'F1 weighted by class support.', + taskBadge: 'Classification', + }, + ], + }, + { + id: 'llm-judge', + title: 'LLM-as-judge metrics', + subtitle: + 'Quality scores produced by an extra Bedrock model per sample. Most accurate, but the slowest and most expensive metrics.', + metrics: [ + { + key: 'geval_reasoning', + label: 'G-Eval — Reasoning', + description: 'How coherent and well-justified the output is.', + }, + { + key: 'geval_faithfulness', + label: 'G-Eval — Faithfulness', + description: 'Whether the output sticks to the input (no hallucination).', + }, + ], + }, +]; + +function setGroupSelection( + current: MetricsToggles, + group: MetricGroup, + value: boolean, +): MetricsToggles { + const next = { ...current }; + for (const m of group.metrics) { + next[m.key] = value; + } + return next; +} + +function countSelected(metrics: MetricsToggles, group: MetricGroup): number { + return group.metrics.reduce((n, m) => (metrics[m.key] ? 
n + 1 : n), 0); } const METRICS: { key: keyof Weights; label: string; description: string }[] = [ @@ -58,7 +184,12 @@ function syncDraftFromValue(prev: string, num: number): string { return String(num); } -export function MetricsWeights({ value, onChange }: MetricsWeightsProps) { +export function MetricsWeights({ + value, + onChange, + metrics, + onMetricsChange, +}: MetricsWeightsProps) { const [drafts, setDrafts] = useState({ accuracy: String(value.accuracy), cost: String(value.cost), @@ -176,6 +307,115 @@ export function MetricsWeights({ value, onChange }: MetricsWeightsProps) { is {total}%.

)} + +
+
+ +

+ Pick the metrics to compute +

+
+

+ Latency and cost are always computed. Metrics that don't apply to + your dataset (e.g. classification metrics on a summarization run) + are skipped automatically. +

+ +
+ {METRIC_GROUPS.map((group) => { + const selectedCount = countSelected(metrics, group); + const totalCount = group.metrics.length; + return ( +
+ + {group.title} + +

+ {group.subtitle} +

+
+ + {selectedCount} / {totalCount} selected + +
+ + +
+
+ +
+ {group.metrics.map((metric) => { + const enabled = metrics[metric.key]; + return ( + + ); + })} +
+
+ ); + })} +
+
); } diff --git a/frontend/src/pages/Index.tsx b/frontend/src/pages/Index.tsx index bf70b23..7dc00f6 100644 --- a/frontend/src/pages/Index.tsx +++ b/frontend/src/pages/Index.tsx @@ -11,6 +11,7 @@ import { useEvaluationStatus, } from '@/hooks/useEvaluation'; import type { EvaluationConfig } from '@/types/evaluation'; +import { DEFAULT_METRICS_TOGGLES } from '@/types/evaluation'; import { AlertCircle, ArrowLeft, ArrowRight, RotateCcw } from 'lucide-react'; import { useCallback, useEffect, useState } from 'react'; @@ -21,6 +22,7 @@ export default function Index() { const [step, setStep] = useState(0); const [config, setConfig] = useState({ weights: { accuracy: 40, cost: 30, latency: 30 }, + metrics: { ...DEFAULT_METRICS_TOGGLES }, selectedModels: [], datasetFile: null, }); @@ -74,6 +76,7 @@ export default function Index() { latency: config.weights.latency / 100, cost: config.weights.cost / 100, }, + metrics: config.metrics, }, { onSuccess: (data) => { @@ -90,6 +93,7 @@ export default function Index() { datasetId, config.selectedModels, config.weights, + config.metrics, createEvaluationMutation, ]); @@ -105,6 +109,7 @@ export default function Index() { setStep(0); setConfig({ weights: { accuracy: 40, cost: 30, latency: 30 }, + metrics: { ...DEFAULT_METRICS_TOGGLES }, selectedModels: [], datasetFile: null, }); @@ -138,6 +143,10 @@ export default function Index() { setConfig({ ...config, weights: w })} + metrics={config.metrics} + onMetricsChange={(m) => + setConfig({ ...config, metrics: m }) + } /> )} {step === 1 && ( diff --git a/frontend/src/types/evaluation.ts b/frontend/src/types/evaluation.ts index 996d431..6fe06e4 100644 --- a/frontend/src/types/evaluation.ts +++ b/frontend/src/types/evaluation.ts @@ -4,6 +4,47 @@ export interface MetricsWeights { latency: number; } +/** + * Canonical list of every metric the engine knows how to compute, in display + * order. 
Keep in sync with the backend `METRIC_KEYS` in + * `backend/src/models/Evaluation.ts` and `backend/python-eval-function/src/metrics.py`. + */ +export const METRIC_KEYS = [ + // Algorithmic — summarization + 'bleu', + 'rouge', + 'meteor', + 'levenshtein', + 'bertscore', + // Algorithmic — classification + 'classification_accuracy', + 'precision_macro', + 'recall_macro', + 'f1_macro', + 'f1_weighted', + // LLM-as-judge + 'geval_reasoning', + 'geval_faithfulness', +] as const; + +export type MetricKey = (typeof METRIC_KEYS)[number]; + +/** + * Per-metric opt-in flags. + * + * - Algorithmic / programmatic metrics are deterministic, fast, and run locally + * (BERTScore is the only one with a meaningful load cost). + * - LLM-as-judge metrics call an extra Bedrock model per sample — the main + * lever for speed and spend. + * + * Metrics that don't apply to the uploaded dataset are skipped automatically. + */ +export type MetricsToggles = Record; + +export const DEFAULT_METRICS_TOGGLES: MetricsToggles = Object.fromEntries( + METRIC_KEYS.map((k) => [k, true]), +) as MetricsToggles; + export interface ModelOption { id: string; name: string; @@ -14,6 +55,7 @@ export interface ModelOption { export interface EvaluationConfig { weights: MetricsWeights; + metrics: MetricsToggles; selectedModels: string[]; datasetFile: File | null; } @@ -37,21 +79,21 @@ export interface EvaluationResult { export const AVAILABLE_MODELS: ModelOption[] = [ // Amazon Nova { - id: 'us.amazon.nova-pro-v1:0', + id: 'amazon.nova-pro-v1:0', name: 'Nova Pro', provider: 'Amazon', contextWindow: '300K', costPer1kTokens: 0.0008, }, { - id: 'us.amazon.nova-lite-v1:0', + id: 'amazon.nova-lite-v1:0', name: 'Nova Lite', provider: 'Amazon', contextWindow: '300K', costPer1kTokens: 0.00006, }, { - id: 'us.amazon.nova-micro-v1:0', + id: 'amazon.nova-micro-v1:0', name: 'Nova Micro', provider: 'Amazon', contextWindow: '128K', @@ -59,21 +101,21 @@ export const AVAILABLE_MODELS: ModelOption[] = [ }, // Anthropic { - id: 
'us.anthropic.claude-opus-4-6-v1', + id: 'anthropic.claude-opus-4-6-v1', name: 'Claude Opus 4.6', provider: 'Anthropic', contextWindow: '200K', costPer1kTokens: 0.005, }, { - id: 'us.anthropic.claude-sonnet-4-5-20250929-v1:0', + id: 'anthropic.claude-sonnet-4-5-20250929-v1:0', name: 'Claude Sonnet 4.5', provider: 'Anthropic', contextWindow: '200K', costPer1kTokens: 0.003, }, { - id: 'us.anthropic.claude-haiku-4-5-20251001-v1:0', + id: 'anthropic.claude-haiku-4-5-20251001-v1:0', name: 'Claude Haiku 4.5', provider: 'Anthropic', contextWindow: '200K', @@ -85,6 +127,7 @@ export interface CreateEvaluationRequest { dataset_id: string; models: { type: 'default' | 'custom'; identifier: string }[]; weights: { accuracy: number; latency: number; cost: number }; + metrics?: MetricsToggles; } export interface DatasetUploadData { From e1772785659a840da229d5fa0b4619281e41cdf6 Mon Sep 17 00:00:00 2001 From: LimRaymond Date: Fri, 15 May 2026 12:22:24 +0200 Subject: [PATCH 2/9] fix: unused type --- .../src/classification_evaluator.py | 5 ---- .../src/dynamodb_service.py | 2 -- backend/python-eval-function/src/metrics.py | 18 ------------ backend/src/models/Evaluation.ts | 29 ------------------- .../FakeEvaluationLaunchUseCase.ts | 8 ++--- frontend/src/types/evaluation.ts | 15 ---------- 6 files changed, 4 insertions(+), 73 deletions(-) diff --git a/backend/python-eval-function/src/classification_evaluator.py b/backend/python-eval-function/src/classification_evaluator.py index 0438870..69ea449 100644 --- a/backend/python-eval-function/src/classification_evaluator.py +++ b/backend/python-eval-function/src/classification_evaluator.py @@ -15,9 +15,6 @@ class ClassificationMetrics: f1_macro: Optional[float] = None f1_weighted: Optional[float] = None - -# Maps the external (request-level) metric keys to the internal -# ClassificationMetrics field names. _CLASSIFICATION_KEYS: tuple[str, ...] 
= ( 'classification_accuracy', 'precision_macro', @@ -61,8 +58,6 @@ def calculate_classification_metrics( logger.info("All references are empty, skipping classification metrics") return None - # If the user opted out of every classification metric, skip the - # sklearn computation entirely. if selected is not None and not any( is_metric_enabled(selected, k) for k in _CLASSIFICATION_KEYS ): diff --git a/backend/python-eval-function/src/dynamodb_service.py b/backend/python-eval-function/src/dynamodb_service.py index cf8fc03..63504fa 100644 --- a/backend/python-eval-function/src/dynamodb_service.py +++ b/backend/python-eval-function/src/dynamodb_service.py @@ -30,8 +30,6 @@ def load_job(self, evaluation_id: str) -> Dict[str, Any]: item = response['Item'] models = json.loads(item.get('models', '[]')) weights = json.loads(item.get('weights', '{}')) - # Older jobs may not have a `metrics` field; a malformed blob also - # falls back to all-enabled so a bad write can never wedge a job. raw_metrics = item.get('metrics') stored_metrics: Optional[Dict[str, Any]] = None if raw_metrics: diff --git a/backend/python-eval-function/src/metrics.py b/backend/python-eval-function/src/metrics.py index ac025b4..fad56c2 100644 --- a/backend/python-eval-function/src/metrics.py +++ b/backend/python-eval-function/src/metrics.py @@ -1,14 +1,5 @@ -"""Single source of truth for metric keys and config helpers used by the -evaluation engine. - -Keep `METRIC_KEYS` in sync with the TypeScript counterpart at -`backend/src/models/Evaluation.ts` (and the frontend at -`frontend/src/types/evaluation.ts`). The shape stored in DynamoDB is just a -JSON object keyed by these names with boolean values. -""" from typing import Dict, Mapping, Optional - METRIC_KEYS: tuple[str, ...] = ( # Algorithmic — summarization 'bleu', @@ -32,11 +23,6 @@ def is_metric_enabled( selected: Optional[Mapping[str, bool]], key: str, ) -> bool: - """Return whether `key` should be computed. 
- - Missing keys default to True so legacy jobs (and partial configs) keep the - pre-toggle behaviour of computing every metric. - """ if selected is None: return True return bool(selected.get(key, True)) @@ -45,9 +31,5 @@ def is_metric_enabled( def normalize_metrics_config( stored: Optional[Mapping[str, object]], ) -> Dict[str, bool]: - """Coerce a stored metrics blob into a complete `{key: bool}` mapping. - - Unknown keys in `stored` are dropped; missing keys default to True. - """ source: Mapping[str, object] = stored or {} return {key: bool(source.get(key, True)) for key in METRIC_KEYS} diff --git a/backend/src/models/Evaluation.ts b/backend/src/models/Evaluation.ts index 6c5917e..92b4ab0 100644 --- a/backend/src/models/Evaluation.ts +++ b/backend/src/models/Evaluation.ts @@ -9,18 +9,6 @@ export interface WeightConfig { cost: number; } -/** - * Canonical list of every metric the engine knows how to compute, in display - * order, grouped by semantic category. - * - * This is the SINGLE SOURCE OF TRUTH for metric keys in the backend. Adding a - * new metric is a one-line change here — the type alias, the default config - * and the Zod payload schema are all derived from this list. - * - * Keep this list in sync with the frontend `METRIC_KEYS` (and the Python - * `metrics.METRIC_KEYS`); a cross-language enum would be ideal, but absent - * that, mirror the additions manually. - */ export const METRIC_KEYS = [ // Algorithmic — summarization 'bleu', @@ -41,29 +29,12 @@ export const METRIC_KEYS = [ export type MetricKey = (typeof METRIC_KEYS)[number]; -/** - * Per-metric opt-in flags. - * - * Latency and cost are always computed (they are free from inference data). - * - * - Algorithmic / programmatic metrics are deterministic and run locally. - * They are mostly fast; only BERTScore has a meaningful loading cost. - * - LLM-as-judge metrics call an extra Bedrock model per sample — they are - * the main lever for both speed and spend. 
- * - * Metrics that are not applicable to the uploaded dataset (e.g. classification - * metrics on a summarization run) are skipped automatically. - */ export type MetricsConfig = Record; export const DEFAULT_METRICS_CONFIG: MetricsConfig = Object.fromEntries( METRIC_KEYS.map((k) => [k, true]), ) as MetricsConfig; -/** - * Merge a partial user-provided config with the defaults (all enabled). - * Unknown keys are ignored; missing keys fall back to the default value. - */ export function resolveMetricsConfig( metrics?: Partial, ): MetricsConfig { diff --git a/backend/src/useCases/EvaluationLaunch/FakeEvaluationLaunchUseCase.ts b/backend/src/useCases/EvaluationLaunch/FakeEvaluationLaunchUseCase.ts index 5fb8371..2b57f1a 100644 --- a/backend/src/useCases/EvaluationLaunch/FakeEvaluationLaunchUseCase.ts +++ b/backend/src/useCases/EvaluationLaunch/FakeEvaluationLaunchUseCase.ts @@ -1,9 +1,9 @@ import { resolveMetricsConfig, - type EvaluationJob, - type EvaluationRequest, - type ModelConfig, - type WeightConfig, + EvaluationJob, + EvaluationRequest, + ModelConfig, + WeightConfig, } from '../../models/Evaluation.js'; import { type BedrockModelValidationService } from '../../services/BedrockModelValidationService/BedrockModelValidationService.js'; import { FakeBedrockModelValidationService } from '../../services/BedrockModelValidationService/FakeBedrockModelValidationService.js'; diff --git a/frontend/src/types/evaluation.ts b/frontend/src/types/evaluation.ts index 6fe06e4..45c4833 100644 --- a/frontend/src/types/evaluation.ts +++ b/frontend/src/types/evaluation.ts @@ -4,11 +4,6 @@ export interface MetricsWeights { latency: number; } -/** - * Canonical list of every metric the engine knows how to compute, in display - * order. Keep in sync with the backend `METRIC_KEYS` in - * `backend/src/models/Evaluation.ts` and `backend/python-eval-function/src/metrics.py`. 
- */ export const METRIC_KEYS = [ // Algorithmic — summarization 'bleu', @@ -29,16 +24,6 @@ export const METRIC_KEYS = [ export type MetricKey = (typeof METRIC_KEYS)[number]; -/** - * Per-metric opt-in flags. - * - * - Algorithmic / programmatic metrics are deterministic, fast, and run locally - * (BERTScore is the only one with a meaningful load cost). - * - LLM-as-judge metrics call an extra Bedrock model per sample — the main - * lever for speed and spend. - * - * Metrics that don't apply to the uploaded dataset are skipped automatically. - */ export type MetricsToggles = Record; export const DEFAULT_METRICS_TOGGLES: MetricsToggles = Object.fromEntries( From 90eae833e6385a88fa800a301b27e654a303e821 Mon Sep 17 00:00:00 2001 From: LimRaymond Date: Fri, 15 May 2026 12:59:02 +0200 Subject: [PATCH 3/9] feat: UI move metrics toggle to data set step --- .../components/evaluator/DatasetUpload.tsx | 28 ++ .../components/evaluator/MetricsPicker.tsx | 261 ++++++++++++++++++ .../components/evaluator/MetricsWeights.tsx | 246 +---------------- frontend/src/pages/Index.tsx | 8 +- 4 files changed, 296 insertions(+), 247 deletions(-) create mode 100644 frontend/src/components/evaluator/MetricsPicker.tsx diff --git a/frontend/src/components/evaluator/DatasetUpload.tsx b/frontend/src/components/evaluator/DatasetUpload.tsx index 72cb44f..c676b63 100644 --- a/frontend/src/components/evaluator/DatasetUpload.tsx +++ b/frontend/src/components/evaluator/DatasetUpload.tsx @@ -1,6 +1,11 @@ +import { + MetricsPicker, + type MetricsPickerTaskType, +} from '@/components/evaluator/MetricsPicker'; import { Button } from '@/components/ui/button'; import { useUploadDataset } from '@/hooks/useEvaluation'; import { cn } from '@/lib/utils'; +import type { MetricsToggles } from '@/types/evaluation'; import { motion } from 'framer-motion'; import { AlertCircle, @@ -19,6 +24,8 @@ interface DatasetUploadProps { onStartEvaluation: () => void; onUploadSuccess: (data: { dataset_id: string; sample_count: 
number }) => void; isStarting?: boolean; + metrics: MetricsToggles; + onMetricsChange: (metrics: MetricsToggles) => void; } type TaskType = 'summarization' | 'classification'; @@ -76,12 +83,23 @@ const TASK_TYPES: { type FormatTab = 'csv' | 'jsonl'; +function resolveDetectedTask(data: { + has_summary: boolean; + has_class: boolean; +}): MetricsPickerTaskType | undefined { + if (data.has_summary) return 'summarization'; + if (data.has_class) return 'classification'; + return undefined; +} + export function DatasetUpload({ file, onChange, onStartEvaluation, onUploadSuccess, isStarting = false, + metrics, + onMetricsChange, }: DatasetUploadProps) { const [dragOver, setDragOver] = useState(false); const [activeTask, setActiveTask] = useState('summarization'); @@ -344,6 +362,16 @@ export function DatasetUpload({ )} + {uploadMutation.isSuccess && ( +
+ +
+ )} + {uploadMutation.isSuccess && ( + + + + +
+ {group.metrics.map((metric) => { + const enabled = metrics[metric.key]; + return ( + + ); + })} +
+ + ); + })} + + + ); +} diff --git a/frontend/src/components/evaluator/MetricsWeights.tsx b/frontend/src/components/evaluator/MetricsWeights.tsx index 39d4fa8..122e11a 100644 --- a/frontend/src/components/evaluator/MetricsWeights.tsx +++ b/frontend/src/components/evaluator/MetricsWeights.tsx @@ -1,138 +1,12 @@ import { cn } from '@/lib/utils'; -import type { - MetricsToggles, - MetricsWeights as Weights, -} from '@/types/evaluation'; +import type { MetricsWeights as Weights } from '@/types/evaluation'; import { motion } from 'framer-motion'; -import { Info, Zap } from 'lucide-react'; +import { Info } from 'lucide-react'; import { useEffect, useState } from 'react'; interface MetricsWeightsProps { value: Weights; onChange: (weights: Weights) => void; - metrics: MetricsToggles; - onMetricsChange: (metrics: MetricsToggles) => void; -} - -interface ToggleableMetric { - key: keyof MetricsToggles; - label: string; - description: string; - /** Free-form badge text (e.g. "Summarization", "Classification"). */ - taskBadge?: string; -} - -interface MetricGroup { - id: string; - title: string; - subtitle: string; - metrics: ToggleableMetric[]; -} - -const METRIC_GROUPS: MetricGroup[] = [ - { - id: 'programmatic', - title: 'Programmatic metrics', - subtitle: - 'Deterministic scoring computed locally. 
Fast and cheap — only BERTScore has a noticeable load cost.', - metrics: [ - { - key: 'bleu', - label: 'BLEU', - description: 'N-gram overlap with the reference.', - taskBadge: 'Summarization', - }, - { - key: 'rouge', - label: 'ROUGE', - description: 'Recall-oriented n-gram overlap.', - taskBadge: 'Summarization', - }, - { - key: 'meteor', - label: 'METEOR', - description: 'Stem-aware overlap, more lenient than BLEU.', - taskBadge: 'Summarization', - }, - { - key: 'levenshtein', - label: 'Levenshtein similarity', - description: 'Character-level edit-distance similarity.', - taskBadge: 'Summarization', - }, - { - key: 'bertscore', - label: 'BERTScore', - description: - 'Embedding-based semantic similarity. Loads a transformer model once per run.', - taskBadge: 'Summarization', - }, - { - key: 'classification_accuracy', - label: 'Accuracy', - description: 'Fraction of predictions that match the reference label.', - taskBadge: 'Classification', - }, - { - key: 'precision_macro', - label: 'Precision (macro)', - description: 'Per-class precision averaged across labels.', - taskBadge: 'Classification', - }, - { - key: 'recall_macro', - label: 'Recall (macro)', - description: 'Per-class recall averaged across labels.', - taskBadge: 'Classification', - }, - { - key: 'f1_macro', - label: 'F1 (macro)', - description: 'Harmonic mean of precision and recall, unweighted.', - taskBadge: 'Classification', - }, - { - key: 'f1_weighted', - label: 'F1 (weighted)', - description: 'F1 weighted by class support.', - taskBadge: 'Classification', - }, - ], - }, - { - id: 'llm-judge', - title: 'LLM-as-judge metrics', - subtitle: - 'Quality scores produced by an extra Bedrock model per sample. 
Most accurate, but the slowest and most expensive metrics.', - metrics: [ - { - key: 'geval_reasoning', - label: 'G-Eval — Reasoning', - description: 'How coherent and well-justified the output is.', - }, - { - key: 'geval_faithfulness', - label: 'G-Eval — Faithfulness', - description: 'Whether the output sticks to the input (no hallucination).', - }, - ], - }, -]; - -function setGroupSelection( - current: MetricsToggles, - group: MetricGroup, - value: boolean, -): MetricsToggles { - const next = { ...current }; - for (const m of group.metrics) { - next[m.key] = value; - } - return next; -} - -function countSelected(metrics: MetricsToggles, group: MetricGroup): number { - return group.metrics.reduce((n, m) => (metrics[m.key] ? n + 1 : n), 0); } const METRICS: { key: keyof Weights; label: string; description: string }[] = [ @@ -184,12 +58,7 @@ function syncDraftFromValue(prev: string, num: number): string { return String(num); } -export function MetricsWeights({ - value, - onChange, - metrics, - onMetricsChange, -}: MetricsWeightsProps) { +export function MetricsWeights({ value, onChange }: MetricsWeightsProps) { const [drafts, setDrafts] = useState({ accuracy: String(value.accuracy), cost: String(value.cost), @@ -307,115 +176,6 @@ export function MetricsWeights({ is {total}%.

)} - -
-
- -

- Pick the metrics to compute -

-
-

- Latency and cost are always computed. Metrics that don't apply to - your dataset (e.g. classification metrics on a summarization run) - are skipped automatically. -

- -
- {METRIC_GROUPS.map((group) => { - const selectedCount = countSelected(metrics, group); - const totalCount = group.metrics.length; - return ( -
- - {group.title} - -

- {group.subtitle} -

-
- - {selectedCount} / {totalCount} selected - -
- - -
-
- -
- {group.metrics.map((metric) => { - const enabled = metrics[metric.key]; - return ( - - ); - })} -
-
- ); - })} -
-
); } diff --git a/frontend/src/pages/Index.tsx b/frontend/src/pages/Index.tsx index 7dc00f6..e02af82 100644 --- a/frontend/src/pages/Index.tsx +++ b/frontend/src/pages/Index.tsx @@ -143,10 +143,6 @@ export default function Index() { setConfig({ ...config, weights: w })} - metrics={config.metrics} - onMetricsChange={(m) => - setConfig({ ...config, metrics: m }) - } /> )} {step === 1 && ( @@ -162,6 +158,10 @@ export default function Index() { onStartEvaluation={handleStartEvaluation} onUploadSuccess={handleUploadSuccess} isStarting={createEvaluationMutation.isPending} + metrics={config.metrics} + onMetricsChange={(m) => + setConfig({ ...config, metrics: m }) + } /> )} From b59d229131f491f30710fce2278bb200b540ed48 Mon Sep 17 00:00:00 2001 From: LimRaymond Date: Fri, 15 May 2026 12:59:14 +0200 Subject: [PATCH 4/9] feat: Update readme for metricpicker new file --- frontend/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/frontend/README.md b/frontend/README.md index bfb7321..172796e 100644 --- a/frontend/README.md +++ b/frontend/README.md @@ -26,7 +26,8 @@ frontend/ │ │ ├── evaluator/ # Step components for the evaluation workflow │ │ │ ├── MetricsWeights.tsx # Step 1 — tune accuracy / latency / cost weights │ │ │ ├── ModelSelection.tsx # Step 2 — pick Bedrock models to evaluate -│ │ │ ├── DatasetUpload.tsx # Step 3 — upload CSV or JSONL dataset +│ │ │ ├── DatasetUpload.tsx # Step 3 — upload dataset and pick metrics to compute +│ │ │ ├── MetricsPicker.tsx # Task-aware picker for which metrics to compute │ │ │ ├── ProgressView.tsx # Step 4a — real-time job progress │ │ │ ├── ResultsView.tsx # Step 4b — ranked results and recommendation │ │ │ └── StepIndicator.tsx # Navigation breadcrumb for the stepper From 3326065de52e709f9472e54264c8a28ee041e379 Mon Sep 17 00:00:00 2001 From: LimRaymond Date: Mon, 18 May 2026 11:37:26 +0200 Subject: [PATCH 5/9] feat(backend): remove gEval logic from main to keep it as pure orchestration and moved it on the 
geval class --- .../src/geval_evaluator.py | 30 ++++++++++++++----- backend/python-eval-function/src/main.py | 18 ++++------- 2 files changed, 28 insertions(+), 20 deletions(-) diff --git a/backend/python-eval-function/src/geval_evaluator.py b/backend/python-eval-function/src/geval_evaluator.py index 8a6d581..41d55c2 100644 --- a/backend/python-eval-function/src/geval_evaluator.py +++ b/backend/python-eval-function/src/geval_evaluator.py @@ -1,8 +1,10 @@ import os import logging -from typing import Optional, List +from typing import Optional, List, Mapping from dataclasses import dataclass +from metrics import is_metric_enabled + logger = logging.getLogger(__name__) JUDGE_MODEL_DEFAULT = "us.meta.llama4-maverick-17b-instruct-v1:0" @@ -15,10 +17,20 @@ class GEvalMetrics: class GEvalEvaluator: - def __init__(self, judge_model_id: Optional[str] = None, region: Optional[str] = None): + def __init__( + self, + metrics_config: Optional[Mapping[str, bool]] = None, + judge_model_id: Optional[str] = None, + region: Optional[str] = None, + ): self.judge_model_id = judge_model_id or os.environ.get("GEVAL_JUDGE_MODEL", JUDGE_MODEL_DEFAULT) self.region = region or os.environ.get("AWS_REGION", "us-west-2") - logger.info(f"GEvalEvaluator initialized with judge={self.judge_model_id}, region={self.region}") + self.compute_reasoning = is_metric_enabled(metrics_config, 'geval_reasoning') + self.compute_faithfulness = is_metric_enabled(metrics_config, 'geval_faithfulness') + logger.info( + f"GEvalEvaluator initialized with judge={self.judge_model_id}, region={self.region}, " + f"reasoning={self.compute_reasoning}, faithfulness={self.compute_faithfulness}" + ) def _build_judge_model(self): from deepeval.models import AmazonBedrockModel @@ -93,14 +105,16 @@ def _build_faithfulness_metric(self, model, task_type: str = "summarization"): async_mode=False, ) + @property + def enabled(self) -> bool: + return self.compute_reasoning or self.compute_faithfulness + def evaluate( self, inputs: 
List[str], predictions: List[str], references: Optional[List[str]] = None, task_type: str = "summarization", - compute_reasoning: bool = True, - compute_faithfulness: bool = True, ) -> GEvalMetrics: if not inputs or not predictions: logger.warning("Empty inputs or predictions, skipping G-Eval") @@ -110,7 +124,7 @@ def evaluate( logger.error("inputs and predictions length mismatch, skipping G-Eval") return GEvalMetrics() - if not compute_reasoning and not compute_faithfulness: + if not self.enabled: logger.info("Both G-Eval metrics disabled, skipping") return GEvalMetrics() @@ -118,12 +132,12 @@ def evaluate( model = self._build_judge_model() reasoning_metric = ( self._build_reasoning_metric(model, task_type) - if compute_reasoning + if self.compute_reasoning else None ) faithfulness_metric = ( self._build_faithfulness_metric(model, task_type) - if compute_faithfulness + if self.compute_faithfulness else None ) except Exception as e: diff --git a/backend/python-eval-function/src/main.py b/backend/python-eval-function/src/main.py index ebfdd3f..619eabc 100644 --- a/backend/python-eval-function/src/main.py +++ b/backend/python-eval-function/src/main.py @@ -78,10 +78,6 @@ def main(): # every key present, all bool. 
metrics_config = job_config.get('metrics') or {} - from metrics import is_metric_enabled - compute_geval_reasoning = is_metric_enabled(metrics_config, 'geval_reasoning') - compute_geval_faithfulness = is_metric_enabled(metrics_config, 'geval_faithfulness') - logger.info( f"Job config loaded - dataset: {dataset_id}, models: {len(models)}, " f"metrics={metrics_config or 'all'}" @@ -176,10 +172,10 @@ def main(): logger.info(f"Classification accuracy complete for {len(accuracy_results)} models") - if compute_geval_reasoning or compute_geval_faithfulness: - from geval_evaluator import GEvalEvaluator - geval_evaluator = GEvalEvaluator() + from geval_evaluator import GEvalEvaluator + geval_evaluator = GEvalEvaluator(metrics_config=metrics_config) + if geval_evaluator.enabled: for model_id, invocation_results in results_by_model.items(): successful_indices = [i for i, r in enumerate(invocation_results) if r.error is None] predictions = [invocation_results[i].response_text for i in successful_indices] @@ -187,12 +183,10 @@ def main(): logger.info( f"Running G-Eval for model {model_id} on {len(predictions)} samples " - f"(reasoning={compute_geval_reasoning}, faithfulness={compute_geval_faithfulness})" + f"(reasoning={geval_evaluator.compute_reasoning}, faithfulness={geval_evaluator.compute_faithfulness})" ) geval_metrics = geval_evaluator.evaluate( inputs, predictions, task_type=task_type, - compute_reasoning=compute_geval_reasoning, - compute_faithfulness=compute_geval_faithfulness, ) acc = accuracy_results.get(model_id) @@ -200,9 +194,9 @@ def main(): acc = AccuracyMetrics() accuracy_results[model_id] = acc - if compute_geval_reasoning: + if geval_evaluator.compute_reasoning: acc.geval_reasoning = geval_metrics.reasoning - if compute_geval_faithfulness: + if geval_evaluator.compute_faithfulness: acc.geval_faithfulness = geval_metrics.faithfulness logger.info(f"G-Eval complete for {model_id}") From 5cc3787f0063d544c0b04d9ad89701e977b77a59 Mon Sep 17 00:00:00 2001 From: 
Raymond LIM Date: Mon, 18 May 2026 11:38:22 +0200 Subject: [PATCH 6/9] Apply suggestion from @DaoudAA Co-authored-by: DaoudAA <127060080+DaoudAA@users.noreply.github.com> --- frontend/src/components/evaluator/MetricsPicker.tsx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/frontend/src/components/evaluator/MetricsPicker.tsx b/frontend/src/components/evaluator/MetricsPicker.tsx index f059916..b6af133 100644 --- a/frontend/src/components/evaluator/MetricsPicker.tsx +++ b/frontend/src/components/evaluator/MetricsPicker.tsx @@ -162,8 +162,7 @@ export function MetricsPicker({

- Latency and cost are always computed. Disable expensive metrics like - BERTScore or G-Eval to speed up evaluation. + Latency and cost are always computed regardless of choice of accuracy metrics.

From ec87000337832b2e11f72103b04b8b5274de9c9a Mon Sep 17 00:00:00 2001 From: LimRaymond Date: Mon, 18 May 2026 15:53:41 +0200 Subject: [PATCH 7/9] feat: Remove unecessary key in the API payload sent by the frontend + omitted key will be set to false --- backend/python-eval-function/src/metrics.py | 6 +- backend/src/models/Evaluation.ts | 2 +- .../components/evaluator/DatasetUpload.tsx | 18 +-- .../components/evaluator/MetricsPicker.tsx | 122 +----------------- frontend/src/pages/Index.tsx | 15 ++- frontend/src/types/evaluation.ts | 121 ++++++++++++++++- frontend/src/utils/metrics.ts | 35 +++++ 7 files changed, 182 insertions(+), 137 deletions(-) create mode 100644 frontend/src/utils/metrics.ts diff --git a/backend/python-eval-function/src/metrics.py b/backend/python-eval-function/src/metrics.py index fad56c2..61c2616 100644 --- a/backend/python-eval-function/src/metrics.py +++ b/backend/python-eval-function/src/metrics.py @@ -24,12 +24,12 @@ def is_metric_enabled( key: str, ) -> bool: if selected is None: - return True - return bool(selected.get(key, True)) + return False + return bool(selected.get(key, False)) def normalize_metrics_config( stored: Optional[Mapping[str, object]], ) -> Dict[str, bool]: source: Mapping[str, object] = stored or {} - return {key: bool(source.get(key, True)) for key in METRIC_KEYS} + return {key: bool(source.get(key, False)) for key in METRIC_KEYS} diff --git a/backend/src/models/Evaluation.ts b/backend/src/models/Evaluation.ts index 92b4ab0..f5eee37 100644 --- a/backend/src/models/Evaluation.ts +++ b/backend/src/models/Evaluation.ts @@ -32,7 +32,7 @@ export type MetricKey = (typeof METRIC_KEYS)[number]; export type MetricsConfig = Record; export const DEFAULT_METRICS_CONFIG: MetricsConfig = Object.fromEntries( - METRIC_KEYS.map((k) => [k, true]), + METRIC_KEYS.map((k) => [k, false]), ) as MetricsConfig; export function resolveMetricsConfig( diff --git a/frontend/src/components/evaluator/DatasetUpload.tsx 
b/frontend/src/components/evaluator/DatasetUpload.tsx index c676b63..de9ecc5 100644 --- a/frontend/src/components/evaluator/DatasetUpload.tsx +++ b/frontend/src/components/evaluator/DatasetUpload.tsx @@ -1,11 +1,8 @@ -import { - MetricsPicker, - type MetricsPickerTaskType, -} from '@/components/evaluator/MetricsPicker'; +import { MetricsPicker } from '@/components/evaluator/MetricsPicker'; import { Button } from '@/components/ui/button'; import { useUploadDataset } from '@/hooks/useEvaluation'; import { cn } from '@/lib/utils'; -import type { MetricsToggles } from '@/types/evaluation'; +import type { MetricsToggles, TaskType } from '@/types/evaluation'; import { motion } from 'framer-motion'; import { AlertCircle, @@ -22,14 +19,16 @@ interface DatasetUploadProps { file: File | null; onChange: (file: File | null) => void; onStartEvaluation: () => void; - onUploadSuccess: (data: { dataset_id: string; sample_count: number }) => void; + onUploadSuccess: (data: { + dataset_id: string; + sample_count: number; + taskType: TaskType | undefined; + }) => void; isStarting?: boolean; metrics: MetricsToggles; onMetricsChange: (metrics: MetricsToggles) => void; } -type TaskType = 'summarization' | 'classification'; - const TASK_TYPES: { id: TaskType; label: string; @@ -86,7 +85,7 @@ type FormatTab = 'csv' | 'jsonl'; function resolveDetectedTask(data: { has_summary: boolean; has_class: boolean; -}): MetricsPickerTaskType | undefined { +}): TaskType | undefined { if (data.has_summary) return 'summarization'; if (data.has_class) return 'classification'; return undefined; @@ -114,6 +113,7 @@ export function DatasetUpload({ onUploadSuccess({ dataset_id: data.dataset_id, sample_count: data.sample_count, + taskType: resolveDetectedTask(data), }); }, }); diff --git a/frontend/src/components/evaluator/MetricsPicker.tsx b/frontend/src/components/evaluator/MetricsPicker.tsx index b6af133..350a6ea 100644 --- a/frontend/src/components/evaluator/MetricsPicker.tsx +++ 
b/frontend/src/components/evaluator/MetricsPicker.tsx @@ -1,124 +1,8 @@ import { cn } from '@/lib/utils'; -import type { MetricsToggles } from '@/types/evaluation'; +import type { MetricGroup, MetricsToggles, TaskType } from '@/types/evaluation'; +import { METRIC_GROUPS } from '@/types/evaluation'; import { Zap } from 'lucide-react'; -export type MetricsPickerTaskType = 'summarization' | 'classification'; - -interface ToggleableMetric { - key: keyof MetricsToggles; - label: string; - description: string; - task?: MetricsPickerTaskType; - taskBadge?: string; -} - -interface MetricGroup { - id: string; - title: string; - subtitle: string; - metrics: ToggleableMetric[]; -} - -const METRIC_GROUPS: MetricGroup[] = [ - { - id: 'programmatic', - title: 'Programmatic metrics', - subtitle: - 'Deterministic scoring computed locally. Fast and cheap — only BERTScore has a noticeable load cost.', - metrics: [ - { - key: 'bleu', - label: 'BLEU', - description: 'N-gram overlap with the reference.', - task: 'summarization', - taskBadge: 'Summarization', - }, - { - key: 'rouge', - label: 'ROUGE', - description: 'Recall-oriented n-gram overlap.', - task: 'summarization', - taskBadge: 'Summarization', - }, - { - key: 'meteor', - label: 'METEOR', - description: 'Stem-aware overlap, more lenient than BLEU.', - task: 'summarization', - taskBadge: 'Summarization', - }, - { - key: 'levenshtein', - label: 'Levenshtein similarity', - description: 'Character-level edit-distance similarity.', - task: 'summarization', - taskBadge: 'Summarization', - }, - { - key: 'bertscore', - label: 'BERTScore', - description: - 'Embedding-based semantic similarity. 
Loads a transformer model once per run.', - task: 'summarization', - taskBadge: 'Summarization', - }, - { - key: 'classification_accuracy', - label: 'Accuracy', - description: 'Fraction of predictions that match the reference label.', - task: 'classification', - taskBadge: 'Classification', - }, - { - key: 'precision_macro', - label: 'Precision (macro)', - description: 'Per-class precision averaged across labels.', - task: 'classification', - taskBadge: 'Classification', - }, - { - key: 'recall_macro', - label: 'Recall (macro)', - description: 'Per-class recall averaged across labels.', - task: 'classification', - taskBadge: 'Classification', - }, - { - key: 'f1_macro', - label: 'F1 (macro)', - description: 'Harmonic mean of precision and recall, unweighted.', - task: 'classification', - taskBadge: 'Classification', - }, - { - key: 'f1_weighted', - label: 'F1 (weighted)', - description: 'F1 weighted by class support.', - task: 'classification', - taskBadge: 'Classification', - }, - ], - }, - { - id: 'llm-judge', - title: 'LLM-as-judge metrics', - subtitle: - 'Quality scores produced by an extra Bedrock model per sample. 
Most accurate, but the slowest and most expensive metrics.', - metrics: [ - { - key: 'geval_reasoning', - label: 'G-Eval — Reasoning', - description: 'How coherent and well-justified the output is.', - }, - { - key: 'geval_faithfulness', - label: 'G-Eval — Faithfulness', - description: 'Whether the output sticks to the input (no hallucination).', - }, - ], - }, -]; - function setGroupSelection( current: MetricsToggles, group: MetricGroup, @@ -138,7 +22,7 @@ function countSelected(metrics: MetricsToggles, group: MetricGroup): number { interface MetricsPickerProps { metrics: MetricsToggles; onChange: (metrics: MetricsToggles) => void; - taskType?: MetricsPickerTaskType; + taskType?: TaskType; } export function MetricsPicker({ diff --git a/frontend/src/pages/Index.tsx b/frontend/src/pages/Index.tsx index e02af82..d071b43 100644 --- a/frontend/src/pages/Index.tsx +++ b/frontend/src/pages/Index.tsx @@ -10,8 +10,9 @@ import { useEvaluationResults, useEvaluationStatus, } from '@/hooks/useEvaluation'; -import type { EvaluationConfig } from '@/types/evaluation'; +import type { EvaluationConfig, TaskType } from '@/types/evaluation'; import { DEFAULT_METRICS_TOGGLES } from '@/types/evaluation'; +import { buildDefaultsForTask, pickEnabledMetrics } from '@/utils/metrics'; import { AlertCircle, ArrowLeft, ArrowRight, RotateCcw } from 'lucide-react'; import { useCallback, useEffect, useState } from 'react'; @@ -76,7 +77,7 @@ export default function Index() { latency: config.weights.latency / 100, cost: config.weights.cost / 100, }, - metrics: config.metrics, + metrics: pickEnabledMetrics(config.metrics), }, { onSuccess: (data) => { @@ -98,8 +99,16 @@ export default function Index() { ]); const handleUploadSuccess = useCallback( - (data: { dataset_id: string; sample_count: number }) => { + (data: { + dataset_id: string; + sample_count: number; + taskType: TaskType | undefined; + }) => { setDatasetId(data.dataset_id); + setConfig((prev) => ({ + ...prev, + metrics: 
buildDefaultsForTask(data.taskType), + })); }, [], ); diff --git a/frontend/src/types/evaluation.ts b/frontend/src/types/evaluation.ts index 45c4833..1d5b0e9 100644 --- a/frontend/src/types/evaluation.ts +++ b/frontend/src/types/evaluation.ts @@ -27,9 +27,126 @@ export type MetricKey = (typeof METRIC_KEYS)[number]; export type MetricsToggles = Record; export const DEFAULT_METRICS_TOGGLES: MetricsToggles = Object.fromEntries( - METRIC_KEYS.map((k) => [k, true]), + METRIC_KEYS.map((k) => [k, false]), ) as MetricsToggles; +export type TaskType = 'summarization' | 'classification'; + +export interface ToggleableMetric { + key: MetricKey; + label: string; + description: string; + task?: TaskType; + taskBadge?: string; +} + +export interface MetricGroup { + id: string; + title: string; + subtitle: string; + metrics: ToggleableMetric[]; +} + +export const METRIC_GROUPS: MetricGroup[] = [ + { + id: 'programmatic', + title: 'Programmatic metrics', + subtitle: + 'Deterministic scoring computed locally. Fast and cheap — only BERTScore has a noticeable load cost.', + metrics: [ + { + key: 'bleu', + label: 'BLEU', + description: 'N-gram overlap with the reference.', + task: 'summarization', + taskBadge: 'Summarization', + }, + { + key: 'rouge', + label: 'ROUGE', + description: 'Recall-oriented n-gram overlap.', + task: 'summarization', + taskBadge: 'Summarization', + }, + { + key: 'meteor', + label: 'METEOR', + description: 'Stem-aware overlap, more lenient than BLEU.', + task: 'summarization', + taskBadge: 'Summarization', + }, + { + key: 'levenshtein', + label: 'Levenshtein similarity', + description: 'Character-level edit-distance similarity.', + task: 'summarization', + taskBadge: 'Summarization', + }, + { + key: 'bertscore', + label: 'BERTScore', + description: + 'Embedding-based semantic similarity. 
Loads a transformer model once per run.', + task: 'summarization', + taskBadge: 'Summarization', + }, + { + key: 'classification_accuracy', + label: 'Accuracy', + description: 'Fraction of predictions that match the reference label.', + task: 'classification', + taskBadge: 'Classification', + }, + { + key: 'precision_macro', + label: 'Precision (macro)', + description: 'Per-class precision averaged across labels.', + task: 'classification', + taskBadge: 'Classification', + }, + { + key: 'recall_macro', + label: 'Recall (macro)', + description: 'Per-class recall averaged across labels.', + task: 'classification', + taskBadge: 'Classification', + }, + { + key: 'f1_macro', + label: 'F1 (macro)', + description: 'Harmonic mean of precision and recall, unweighted.', + task: 'classification', + taskBadge: 'Classification', + }, + { + key: 'f1_weighted', + label: 'F1 (weighted)', + description: 'F1 weighted by class support.', + task: 'classification', + taskBadge: 'Classification', + }, + ], + }, + { + id: 'llm-judge', + title: 'LLM-as-judge metrics', + subtitle: + 'Quality scores produced by an extra Bedrock model per sample. 
Most accurate, but the slowest and most expensive metrics.', + metrics: [ + { + key: 'geval_reasoning', + label: 'G-Eval — Reasoning', + description: 'How coherent and well-justified the output is.', + }, + { + key: 'geval_faithfulness', + label: 'G-Eval — Faithfulness', + description: 'Whether the output sticks to the input (no hallucination).', + }, + ], + }, +]; + export interface ModelOption { id: string; name: string; @@ -112,7 +229,7 @@ export interface CreateEvaluationRequest { dataset_id: string; models: { type: 'default' | 'custom'; identifier: string }[]; weights: { accuracy: number; latency: number; cost: number }; - metrics?: MetricsToggles; + metrics?: Partial>; } export interface DatasetUploadData { diff --git a/frontend/src/utils/metrics.ts b/frontend/src/utils/metrics.ts new file mode 100644 index 0000000..53c8c7a --- /dev/null +++ b/frontend/src/utils/metrics.ts @@ -0,0 +1,35 @@ +import { + METRIC_GROUPS, + type MetricKey, + type MetricsToggles, + type TaskType, + METRIC_KEYS, +} from '@/types/evaluation'; + +export function buildDefaultsForTask( + taskType: TaskType | undefined, +): MetricsToggles { + const base = Object.fromEntries( + METRIC_KEYS.map((k) => [k, false]), + ) as MetricsToggles; + + if (!taskType) return base; + + for (const group of METRIC_GROUPS) { + for (const m of group.metrics) { + if (!m.task || m.task === taskType) { + base[m.key] = true; + } + } + } + + return base; +} + +export function pickEnabledMetrics( + toggles: MetricsToggles, +): Partial> { + return Object.fromEntries( + Object.entries(toggles).filter(([, v]) => v), + ) as Partial>; +} From 4c84eb8bf2a1f9c13dfcdb093bdf9bc6a65cc107 Mon Sep 17 00:00:00 2001 From: LimRaymond Date: Wed, 20 May 2026 14:20:44 +0200 Subject: [PATCH 8/9] feat: reverted model id to previous one --- frontend/src/types/evaluation.ts | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/frontend/src/types/evaluation.ts b/frontend/src/types/evaluation.ts index 
1d5b0e9..c490d8d 100644 --- a/frontend/src/types/evaluation.ts +++ b/frontend/src/types/evaluation.ts @@ -181,21 +181,21 @@ export interface EvaluationResult { export const AVAILABLE_MODELS: ModelOption[] = [ // Amazon Nova { - id: 'amazon.nova-pro-v1:0', + id: 'us.amazon.nova-pro-v1:0', name: 'Nova Pro', provider: 'Amazon', contextWindow: '300K', costPer1kTokens: 0.0008, }, { - id: 'amazon.nova-lite-v1:0', + id: 'us.amazon.nova-lite-v1:0', name: 'Nova Lite', provider: 'Amazon', contextWindow: '300K', costPer1kTokens: 0.00006, }, { - id: 'amazon.nova-micro-v1:0', + id: 'us.amazon.nova-micro-v1:0', name: 'Nova Micro', provider: 'Amazon', contextWindow: '128K', @@ -203,21 +203,21 @@ export const AVAILABLE_MODELS: ModelOption[] = [ }, // Anthropic { - id: 'anthropic.claude-opus-4-6-v1', + id: 'us.anthropic.claude-opus-4-6-v1', name: 'Claude Opus 4.6', provider: 'Anthropic', contextWindow: '200K', costPer1kTokens: 0.005, }, { - id: 'anthropic.claude-sonnet-4-5-20250929-v1:0', + id: 'us.anthropic.claude-sonnet-4-5-20250929-v1:0', name: 'Claude Sonnet 4.5', provider: 'Anthropic', contextWindow: '200K', costPer1kTokens: 0.003, }, { - id: 'anthropic.claude-haiku-4-5-20251001-v1:0', + id: 'us.anthropic.claude-haiku-4-5-20251001-v1:0', name: 'Claude Haiku 4.5', provider: 'Anthropic', contextWindow: '200K', From ec1c87429299ed56c4ac60f35df6985497631e5e Mon Sep 17 00:00:00 2001 From: LimRaymond Date: Wed, 20 May 2026 15:27:10 +0200 Subject: [PATCH 9/9] feat: Enforce that at least one metric is chosen before launching an evaluation --- .../EvaluationLaunchUseCase.test.ts | 26 ++++++++++++++++--- .../EvaluationLaunchUseCase.ts | 11 ++++++++ .../FakeEvaluationLaunchUseCase.ts | 4 +++ .../components/evaluator/DatasetUpload.tsx | 8 +++++- frontend/src/pages/Index.tsx | 7 ++++- frontend/src/utils/metrics.ts | 4 +++ 6 files changed, 55 insertions(+), 5 deletions(-) diff --git a/backend/src/useCases/EvaluationLaunch/EvaluationLaunchUseCase.test.ts 
b/backend/src/useCases/EvaluationLaunch/EvaluationLaunchUseCase.test.ts index 056afe0..8dd6fd6 100644 --- a/backend/src/useCases/EvaluationLaunch/EvaluationLaunchUseCase.test.ts +++ b/backend/src/useCases/EvaluationLaunch/EvaluationLaunchUseCase.test.ts @@ -121,6 +121,7 @@ describe('EvaluationLaunchUseCase - Weight Configuration', () => { dataset_id: 'test-dataset-id', models: [{ type: 'default', identifier: 'claude-sonnet' }], metrics: { + bleu: true, bertscore: false, geval_reasoning: false, geval_faithfulness: false, @@ -133,6 +134,7 @@ describe('EvaluationLaunchUseCase - Weight Configuration', () => { mockEvaluationJobsRepository.createEvaluation.mock.calls[0][3]; expect(metricsArg).toEqual({ ...ALL_METRICS_ENABLED, + bleu: true, bertscore: false, geval_reasoning: false, geval_faithfulness: false, @@ -143,7 +145,7 @@ describe('EvaluationLaunchUseCase - Weight Configuration', () => { const request: EvaluationRequest = { dataset_id: 'test-dataset-id', models: [{ type: 'default', identifier: 'claude-sonnet' }], - metrics: { geval_reasoning: false, geval_faithfulness: false }, + metrics: { rouge: true, geval_reasoning: false, geval_faithfulness: false }, }; await useCase.launchEvaluation(request); @@ -152,6 +154,7 @@ describe('EvaluationLaunchUseCase - Weight Configuration', () => { mockEvaluationJobsRepository.createEvaluation.mock.calls[0][3]; expect(metricsArg).toEqual({ ...ALL_METRICS_ENABLED, + rouge: true, geval_reasoning: false, geval_faithfulness: false, }); @@ -161,14 +164,31 @@ describe('EvaluationLaunchUseCase - Weight Configuration', () => { const request: EvaluationRequest = { dataset_id: 'test-dataset-id', models: [{ type: 'default', identifier: 'claude-sonnet' }], - metrics: { bleu: false }, + metrics: { bleu: false, rouge: true }, }; await useCase.launchEvaluation(request); const metricsArg = mockEvaluationJobsRepository.createEvaluation.mock.calls[0][3]; - expect(metricsArg).toEqual({ ...ALL_METRICS_ENABLED, bleu: false }); + 
expect(metricsArg).toEqual({ ...ALL_METRICS_ENABLED, bleu: false, rouge: true }); + }); + + it('should reject when all metrics are explicitly disabled', async () => { + const allDisabled: Partial> = {}; + for (const key of Object.keys(ALL_METRICS_ENABLED)) { + allDisabled[key] = false; + } + + const request: EvaluationRequest = { + dataset_id: 'test-dataset-id', + models: [{ type: 'default', identifier: 'claude-sonnet' }], + metrics: allDisabled, + }; + + await expect(useCase.launchEvaluation(request)).rejects.toThrow( + 'At least one accuracy metric must be selected', + ); }); }); diff --git a/backend/src/useCases/EvaluationLaunch/EvaluationLaunchUseCase.ts b/backend/src/useCases/EvaluationLaunch/EvaluationLaunchUseCase.ts index 79ea6b6..35494df 100644 --- a/backend/src/useCases/EvaluationLaunch/EvaluationLaunchUseCase.ts +++ b/backend/src/useCases/EvaluationLaunch/EvaluationLaunchUseCase.ts @@ -36,6 +36,17 @@ class EvaluationLaunchUseCaseImpl implements EvaluationLaunchUseCase { const normalizedWeights = this.normalizeWeights(request.weights); const metrics = resolveMetricsConfig(request.metrics); + if ( + request.metrics && + !Object.values(request.metrics).some((v) => v === true) + ) { + throw new BasicError( + BasicErrorType.BAD_REQUEST, + 'NO_METRICS_SELECTED', + 'At least one accuracy metric must be selected', + ); + } + const job = await this.evaluationJobsRepository.createEvaluation( request.dataset_id, modelsToPersist, diff --git a/backend/src/useCases/EvaluationLaunch/FakeEvaluationLaunchUseCase.ts b/backend/src/useCases/EvaluationLaunch/FakeEvaluationLaunchUseCase.ts index 2b57f1a..b30d82d 100644 --- a/backend/src/useCases/EvaluationLaunch/FakeEvaluationLaunchUseCase.ts +++ b/backend/src/useCases/EvaluationLaunch/FakeEvaluationLaunchUseCase.ts @@ -43,6 +43,10 @@ export class FakeEvaluationLaunchUseCase implements EvaluationLaunchUseCase { const normalizedWeights = this.normalizeWeights(request.weights); const metrics = 
resolveMetricsConfig(request.metrics); + if (request.metrics && !Object.values(request.metrics).some((v) => v === true)) { + throw new Error('At least one accuracy metric must be selected'); + } + const job = await this.evaluationJobsRepository.createEvaluation( request.dataset_id, modelsToPersist, diff --git a/frontend/src/components/evaluator/DatasetUpload.tsx b/frontend/src/components/evaluator/DatasetUpload.tsx index de9ecc5..e074076 100644 --- a/frontend/src/components/evaluator/DatasetUpload.tsx +++ b/frontend/src/components/evaluator/DatasetUpload.tsx @@ -3,6 +3,7 @@ import { Button } from '@/components/ui/button'; import { useUploadDataset } from '@/hooks/useEvaluation'; import { cn } from '@/lib/utils'; import type { MetricsToggles, TaskType } from '@/types/evaluation'; +import { hasAtLeastOneMetric } from '@/utils/metrics'; import { motion } from 'framer-motion'; import { AlertCircle, @@ -369,6 +370,11 @@ export function DatasetUpload({ onChange={onMetricsChange} taskType={resolveDetectedTask(uploadMutation.data)} /> + {!hasAtLeastOneMetric(metrics) && ( +

+ Select at least one accuracy metric to continue. +

+ )}
)} @@ -377,7 +383,7 @@ export function DatasetUpload({ onClick={onStartEvaluation} className="mt-6 w-full" size="lg" - disabled={isStarting} + disabled={isStarting || !hasAtLeastOneMetric(metrics)} > {isStarting ? ( <> diff --git a/frontend/src/pages/Index.tsx b/frontend/src/pages/Index.tsx index d071b43..64d274a 100644 --- a/frontend/src/pages/Index.tsx +++ b/frontend/src/pages/Index.tsx @@ -12,7 +12,7 @@ import { } from '@/hooks/useEvaluation'; import type { EvaluationConfig, TaskType } from '@/types/evaluation'; import { DEFAULT_METRICS_TOGGLES } from '@/types/evaluation'; -import { buildDefaultsForTask, pickEnabledMetrics } from '@/utils/metrics'; +import { buildDefaultsForTask, hasAtLeastOneMetric, pickEnabledMetrics } from '@/utils/metrics'; import { AlertCircle, ArrowLeft, ArrowRight, RotateCcw } from 'lucide-react'; import { useCallback, useEffect, useState } from 'react'; @@ -64,6 +64,11 @@ export default function Index() { config.weights.accuracy + config.weights.cost + config.weights.latency; if (sum !== 100) return; + if (!hasAtLeastOneMetric(config.metrics)) { + setError('At least one accuracy metric must be selected.'); + return; + } + setError(null); createEvaluationMutation.mutate( { diff --git a/frontend/src/utils/metrics.ts b/frontend/src/utils/metrics.ts index 53c8c7a..5752450 100644 --- a/frontend/src/utils/metrics.ts +++ b/frontend/src/utils/metrics.ts @@ -26,6 +26,10 @@ export function buildDefaultsForTask( return base; } +export function hasAtLeastOneMetric(toggles: MetricsToggles): boolean { + return Object.values(toggles).some(Boolean); +} + export function pickEnabledMetrics( toggles: MetricsToggles, ): Partial> {