39 changes: 25 additions & 14 deletions backend/python-eval-function/src/accuracy_evaluator.py
@@ -1,7 +1,9 @@
import logging
from typing import Optional, List, Dict
from typing import Optional, List, Mapping
from dataclasses import dataclass

from metrics import is_metric_enabled

logger = logging.getLogger(__name__)


@@ -29,7 +31,8 @@ def __init__(self):
def calculate_accuracy_metrics(
self,
predictions: List[str],
references: Optional[List[str]]
references: Optional[List[str]],
selected: Optional[Mapping[str, bool]] = None,
) -> Optional[AccuracyMetrics]:
if references is None or len(references) == 0:
logger.info("No reference outputs available, skipping accuracy metrics")
@@ -38,28 +41,36 @@ def calculate_accuracy_metrics(
if all(ref is None or ref == "" for ref in references):
logger.info("All reference outputs are empty, skipping accuracy metrics")
return None

logger.info(f"Calculating accuracy metrics for {len(predictions)} predictions")


logger.info(
f"Calculating accuracy metrics for {len(predictions)} predictions "
f"(selected={dict(selected) if selected is not None else 'all'})"
)

try:
metrics = AccuracyMetrics()

metrics.bleu = self._calculate_bleu(predictions, references)
metrics.rouge = self._calculate_rouge(predictions, references)
metrics.meteor = self._calculate_meteor(predictions, references)
metrics.levenshtein = self._calculate_levenshtein(predictions, references)
metrics.bertscore = self._calculate_bertscore(predictions, references)


if is_metric_enabled(selected, 'bleu'):
metrics.bleu = self._calculate_bleu(predictions, references)
if is_metric_enabled(selected, 'rouge'):
metrics.rouge = self._calculate_rouge(predictions, references)
if is_metric_enabled(selected, 'meteor'):
metrics.meteor = self._calculate_meteor(predictions, references)
if is_metric_enabled(selected, 'levenshtein'):
metrics.levenshtein = self._calculate_levenshtein(predictions, references)
if is_metric_enabled(selected, 'bertscore'):
metrics.bertscore = self._calculate_bertscore(predictions, references)

def fmt(v): return f"{v:.4f}" if v is not None else "N/A"
logger.info(
f"Accuracy metrics calculated - "
f"BLEU={fmt(metrics.bleu)}, ROUGE={fmt(metrics.rouge)}, "
f"METEOR={fmt(metrics.meteor)}, Levenshtein={fmt(metrics.levenshtein)}, "
f"BERTScore={fmt(metrics.bertscore)}"
)

return metrics

except Exception as e:
logger.error(f"Error calculating accuracy metrics: {e}", exc_info=True)
return None
41 changes: 34 additions & 7 deletions backend/python-eval-function/src/classification_evaluator.py
@@ -1,7 +1,9 @@
import logging
from typing import Optional, List
from typing import Optional, List, Mapping
from dataclasses import dataclass

from metrics import is_metric_enabled

logger = logging.getLogger(__name__)


@@ -13,6 +15,14 @@ class ClassificationMetrics:
f1_macro: Optional[float] = None
f1_weighted: Optional[float] = None

_CLASSIFICATION_KEYS: tuple[str, ...] = (
'classification_accuracy',
'precision_macro',
'recall_macro',
'f1_macro',
'f1_weighted',
)


def normalize_prediction(prediction: str, valid_classes: List[str]) -> str:
cleaned = prediction.strip()
@@ -38,6 +48,7 @@ def calculate_classification_metrics(
predictions: List[str],
references: List[str],
valid_classes: Optional[List[str]] = None,
selected: Optional[Mapping[str, bool]] = None,
) -> Optional[ClassificationMetrics]:
if not references or not predictions:
logger.info("No references or predictions, skipping classification metrics")
@@ -47,6 +58,12 @@ def calculate_classification_metrics(
logger.info("All references are empty, skipping classification metrics")
return None

if selected is not None and not any(
is_metric_enabled(selected, k) for k in _CLASSIFICATION_KEYS
):
logger.info("All classification metrics disabled, skipping computation")
return ClassificationMetrics()

if valid_classes is None:
valid_classes = list(set(references))

@@ -56,7 +73,7 @@

logger.info(
f"Calculating classification metrics for {len(normalized_preds)} predictions "
f"across {len(valid_classes)} classes"
f"across {len(valid_classes)} classes (selected={dict(selected) if selected is not None else 'all'})"
)

try:
@@ -86,11 +103,21 @@
)

metrics = ClassificationMetrics(
accuracy=round(acc, 4),
precision_macro=round(precision, 4),
recall_macro=round(recall, 4),
f1_macro=round(f1, 4),
f1_weighted=round(f1_w, 4),
accuracy=round(acc, 4)
if is_metric_enabled(selected, 'classification_accuracy')
else None,
precision_macro=round(precision, 4)
if is_metric_enabled(selected, 'precision_macro')
else None,
recall_macro=round(recall, 4)
if is_metric_enabled(selected, 'recall_macro')
else None,
f1_macro=round(f1, 4)
if is_metric_enabled(selected, 'f1_macro')
else None,
f1_weighted=round(f1_w, 4)
if is_metric_enabled(selected, 'f1_weighted')
else None,
)

logger.info(
22 changes: 19 additions & 3 deletions backend/python-eval-function/src/dynamodb_service.py
@@ -7,6 +7,8 @@
import boto3
from botocore.exceptions import ClientError

from metrics import normalize_metrics_config

logger = logging.getLogger(__name__)


@@ -28,18 +30,32 @@ def load_job(self, evaluation_id: str) -> Dict[str, Any]:
item = response['Item']
models = json.loads(item.get('models', '[]'))
weights = json.loads(item.get('weights', '{}'))

raw_metrics = item.get('metrics')
stored_metrics: Optional[Dict[str, Any]] = None
if raw_metrics:
try:
stored_metrics = json.loads(raw_metrics)
except (TypeError, ValueError) as parse_err:
logger.warning(
f"Failed to parse stored metrics config "
f"({parse_err}); defaulting to all metrics enabled"
)
metrics = normalize_metrics_config(stored_metrics)

job_config = {
'evaluation_id': item['evaluation_id'],
'dataset_id': item['dataset_id'],
'models': models,
'weights': weights,
'metrics': metrics,
'status': item.get('status', 'pending'),
'created_at': item.get('created_at', ''),
'total_samples': item.get('total_samples')
}

logger.info(f"Loaded job config: {len(models)} models, weights: {weights}")

logger.info(
f"Loaded job config: {len(models)} models, weights: {weights}, metrics: {metrics}"
)
return job_config

except ClientError as e:
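Side note for reviewers: these files import normalize_metrics_config and is_metric_enabled from a metrics module that is not part of this diff. Based purely on how they are called here (a None or missing config means every metric is enabled, and load_job is expected to hand back a dict with one bool per known metric key), a minimal sketch of those helpers could look like the following. The key list, the default-to-enabled behaviour for absent keys, and the function bodies are assumptions, not the module's actual code.

from typing import Any, Mapping, Optional

# Assumed catalogue of metric keys, collected from the call sites in this PR.
METRIC_KEYS = (
    'bleu', 'rouge', 'meteor', 'levenshtein', 'bertscore',
    'classification_accuracy', 'precision_macro', 'recall_macro',
    'f1_macro', 'f1_weighted',
    'geval_reasoning', 'geval_faithfulness',
)

def normalize_metrics_config(raw: Optional[Mapping[str, Any]]) -> dict:
    # No stored config (or an unparsable one) -> every metric enabled,
    # matching the "defaulting to all metrics enabled" warning above.
    if not raw:
        return {key: True for key in METRIC_KEYS}
    # Otherwise coerce to "every key present, all bool", as the comment
    # in main.py describes.
    return {key: bool(raw.get(key, True)) for key in METRIC_KEYS}

def is_metric_enabled(selected: Optional[Mapping[str, bool]], key: str) -> bool:
    # selected=None is treated as "compute everything", which is what the
    # evaluators rely on when no config is passed.
    if selected is None:
        return True
    return bool(selected.get(key, True))

Whether keys missing from a stored config should default to enabled or disabled is a judgement call; the sketch defaults to enabled so that older jobs with no metrics attribute keep their previous behaviour.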
54 changes: 35 additions & 19 deletions backend/python-eval-function/src/geval_evaluator.py
@@ -99,6 +99,8 @@ def evaluate(
predictions: List[str],
references: Optional[List[str]] = None,
task_type: str = "summarization",
compute_reasoning: bool = True,
compute_faithfulness: bool = True,
) -> GEvalMetrics:
if not inputs or not predictions:
logger.warning("Empty inputs or predictions, skipping G-Eval")
@@ -108,10 +110,22 @@
logger.error("inputs and predictions length mismatch, skipping G-Eval")
return GEvalMetrics()

if not compute_reasoning and not compute_faithfulness:
logger.info("Both G-Eval metrics disabled, skipping")
return GEvalMetrics()

try:
model = self._build_judge_model()
reasoning_metric = self._build_reasoning_metric(model, task_type)
faithfulness_metric = self._build_faithfulness_metric(model, task_type)
reasoning_metric = (
self._build_reasoning_metric(model, task_type)
if compute_reasoning
else None
)
faithfulness_metric = (
self._build_faithfulness_metric(model, task_type)
if compute_faithfulness
else None
)
except Exception as e:
logger.error(f"Failed to initialize G-Eval components: {e}", exc_info=True)
return GEvalMetrics()
@@ -131,23 +145,25 @@
actual_output=pred,
)

try:
reasoning_metric.measure(test_case)
reasoning_scores.append(reasoning_metric.score)
logger.debug(
f"[{idx}] Reasoning score={reasoning_metric.score:.4f} reason={reasoning_metric.reason}"
)
except Exception as e:
logger.warning(f"[{idx}] Reasoning metric failed: {e}")

try:
faithfulness_metric.measure(test_case)
faithfulness_scores.append(faithfulness_metric.score)
logger.debug(
f"[{idx}] Faithfulness score={faithfulness_metric.score:.4f} reason={faithfulness_metric.reason}"
)
except Exception as e:
logger.warning(f"[{idx}] Faithfulness metric failed: {e}")
if reasoning_metric is not None:
try:
reasoning_metric.measure(test_case)
reasoning_scores.append(reasoning_metric.score)
logger.debug(
f"[{idx}] Reasoning score={reasoning_metric.score:.4f} reason={reasoning_metric.reason}"
)
except Exception as e:
logger.warning(f"[{idx}] Reasoning metric failed: {e}")

if faithfulness_metric is not None:
try:
faithfulness_metric.measure(test_case)
faithfulness_scores.append(faithfulness_metric.score)
logger.debug(
f"[{idx}] Faithfulness score={faithfulness_metric.score:.4f} reason={faithfulness_metric.reason}"
)
except Exception as e:
logger.warning(f"[{idx}] Faithfulness metric failed: {e}")

result = GEvalMetrics()

77 changes: 51 additions & 26 deletions backend/python-eval-function/src/main.py
@@ -74,8 +74,18 @@ def main():
dataset_id = job_config['dataset_id']
models = job_config['models']
weights = job_config['weights']

logger.info(f"Job config loaded - dataset: {dataset_id}, models: {len(models)}")
# `metrics_config` is already normalized by DynamoDBService.load_job —
# every key present, all bool.
metrics_config = job_config.get('metrics') or {}

from metrics import is_metric_enabled
compute_geval_reasoning = is_metric_enabled(metrics_config, 'geval_reasoning')
compute_geval_faithfulness = is_metric_enabled(metrics_config, 'geval_faithfulness')

logger.info(
f"Job config loaded - dataset: {dataset_id}, models: {len(models)}, "
f"metrics={metrics_config or 'all'}"
)

from dataset_loader import DatasetLoader
dataset_loader = DatasetLoader()
@@ -134,7 +144,9 @@ def main():
predictions = [invocation_results[i].response_text for i in successful_indices]
references = [all_references[i] for i in successful_indices] if all_references else None

accuracy_metrics = accuracy_evaluator.calculate_accuracy_metrics(predictions, references)
accuracy_metrics = accuracy_evaluator.calculate_accuracy_metrics(
predictions, references, selected=metrics_config
)
accuracy_results[model_id] = accuracy_metrics or AccuracyMetrics()

logger.info(f"Summarization accuracy complete for {len(accuracy_results)} models")
@@ -149,7 +161,8 @@
references = [all_references[i] for i in successful_indices] if all_references else None

cls_metrics = classification_evaluator.calculate_classification_metrics(
predictions, references, valid_classes=unique_classes
predictions, references, valid_classes=unique_classes,
selected=metrics_config,
)

acc = AccuracyMetrics()
@@ -163,28 +176,40 @@

logger.info(f"Classification accuracy complete for {len(accuracy_results)} models")

from geval_evaluator import GEvalEvaluator
geval_evaluator = GEvalEvaluator()

for model_id, invocation_results in results_by_model.items():
successful_indices = [i for i, r in enumerate(invocation_results) if r.error is None]
predictions = [invocation_results[i].response_text for i in successful_indices]
inputs = [dataset.documents[i] for i in successful_indices]

logger.info(f"Running G-Eval for model {model_id} on {len(predictions)} samples")
geval_metrics = geval_evaluator.evaluate(inputs, predictions, task_type=task_type)

acc = accuracy_results.get(model_id)
if acc is None:
acc = AccuracyMetrics()
accuracy_results[model_id] = acc

acc.geval_reasoning = geval_metrics.reasoning
acc.geval_faithfulness = geval_metrics.faithfulness

logger.info(f"G-Eval complete for {model_id}")

logger.info(f"G-Eval evaluation complete for all models")
if compute_geval_reasoning or compute_geval_faithfulness:
from geval_evaluator import GEvalEvaluator
geval_evaluator = GEvalEvaluator()

for model_id, invocation_results in results_by_model.items():
successful_indices = [i for i, r in enumerate(invocation_results) if r.error is None]
predictions = [invocation_results[i].response_text for i in successful_indices]
inputs = [dataset.documents[i] for i in successful_indices]

logger.info(
f"Running G-Eval for model {model_id} on {len(predictions)} samples "
f"(reasoning={compute_geval_reasoning}, faithfulness={compute_geval_faithfulness})"
)
geval_metrics = geval_evaluator.evaluate(
inputs, predictions, task_type=task_type,
compute_reasoning=compute_geval_reasoning,
compute_faithfulness=compute_geval_faithfulness,
)

acc = accuracy_results.get(model_id)
if acc is None:
acc = AccuracyMetrics()
accuracy_results[model_id] = acc

if compute_geval_reasoning:
acc.geval_reasoning = geval_metrics.reasoning
if compute_geval_faithfulness:
acc.geval_faithfulness = geval_metrics.faithfulness

logger.info(f"G-Eval complete for {model_id}")

logger.info("G-Eval evaluation complete for all models")
else:
logger.info("G-Eval disabled via metrics config, skipping")

from cost_calculator import CostCalculator
cost_calculator = CostCalculator()
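To illustrate the end-to-end flow this enables, here is a small usage sketch built on the assumed helpers from the note above (so the exact behaviour is an assumption rather than something this diff guarantees): a job whose stored metrics attribute selects only BLEU, ROUGE and the faithfulness judge would skip METEOR, Levenshtein, BERTScore, every classification metric and the reasoning judge.

import json

# Hypothetical value of the DynamoDB item's 'metrics' attribute (a JSON string);
# any key left out would fall back to the sketch's default of "enabled".
raw_metrics = json.dumps({
    'bleu': True, 'rouge': True, 'geval_faithfulness': True,
    'meteor': False, 'levenshtein': False, 'bertscore': False,
    'classification_accuracy': False, 'precision_macro': False,
    'recall_macro': False, 'f1_macro': False, 'f1_weighted': False,
    'geval_reasoning': False,
})

# What load_job would hand to main() as job_config['metrics']:
metrics_config = normalize_metrics_config(json.loads(raw_metrics))

# main() then gates the expensive G-Eval pass on these two flags:
assert is_metric_enabled(metrics_config, 'geval_faithfulness') is True
assert is_metric_enabled(metrics_config, 'geval_reasoning') is False

# And calculate_accuracy_metrics(..., selected=metrics_config) would compute
# BLEU and ROUGE while leaving meteor/levenshtein/bertscore as None.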