
Commit 4353b80

Fix HELM EEE instance metric rows
1 parent 4f15d50 commit 4353b80

3 files changed

Lines changed: 376 additions & 76 deletions


every_eval_ever/converters/helm/instance_level_adapter.py

Lines changed: 133 additions & 67 deletions
@@ -35,6 +35,60 @@ def _require_helm_dependencies() -> None:
         )
 
 
+def _score_from_stat(stat) -> float | None:
+    value = getattr(stat, 'mean', None)
+    if value is None:
+        count = getattr(stat, 'count', None)
+        total = getattr(stat, 'sum', None)
+        if count:
+            value = total / count
+    if value is None:
+        return None
+    try:
+        return float(value)
+    except (TypeError, ValueError):
+        return None
+
+
+# Metric names whose per-instance score is a correctness signal in [0, 1]
+# where ``score > 0`` reasonably maps to ``is_correct=True``. Anything not
+# in this allowlist (token counts, runtime, finish-reason flags, logprobs,
+# etc.) gets ``is_correct=False`` because we have no correctness claim
+# from a bookkeeping/resource metric. Keep this list tight and named after
+# the actual HELM stat names — broaden only for verified correctness
+# semantics.
+_BINARY_CORRECTNESS_METRIC_NAMES: frozenset[str] = frozenset({
+    'exact_match',
+    'quasi_exact_match',
+    'prefix_exact_match',
+    'quasi_prefix_exact_match',
+    'exact_match@5',
+    'quasi_exact_match@5',
+    'prefix_exact_match@5',
+    'quasi_prefix_exact_match@5',
+    'ifeval_strict_accuracy',
+    'chain_of_thought_correctness',
+    'math_equiv',
+    'math_equiv_chain_of_thought',
+})
+
+
+def _is_correct_for_metric(metric_name: str | None, score: float) -> bool:
+    """Decide ``is_correct`` honestly per metric name.
+
+    For correctness metrics in the allowlist, the HELM convention is that
+    score==1.0 means correct and 0.0 means wrong, so any positive score
+    rounds up to "correct". For anything else (bookkeeping / resource
+    stats, or graded metrics like rouge_l/bleu where >0 is not a correctness
+    signal) we deliberately do not claim correctness.
+    """
+    if metric_name is None:
+        return False
+    if metric_name in _BINARY_CORRECTNESS_METRIC_NAMES:
+        return score > 0
+    return False
+
+
 class HELMInstanceLevelDataAdapter:
     def __init__(
         self,
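
Both helpers are pure functions, so their contract is easy to exercise in isolation. A minimal sketch, using a hypothetical stand-in for HELM's Stat object (the real class carries more fields than the three read here):

from dataclasses import dataclass


@dataclass
class FakeStat:
    # Stand-in for a HELM Stat; field names mirror what _score_from_stat reads.
    mean: float | None = None
    count: int | None = None
    sum: float | None = None


# mean wins when present; otherwise sum/count, guarding against count == 0.
assert _score_from_stat(FakeStat(mean=1.0)) == 1.0
assert _score_from_stat(FakeStat(count=4, sum=3.0)) == 0.75
assert _score_from_stat(FakeStat()) is None

# Only allowlisted metric names may claim correctness.
assert _is_correct_for_metric('exact_match', 1.0) is True
assert _is_correct_for_metric('rouge_l', 0.8) is False  # graded, not binary
assert _is_correct_for_metric(None, 1.0) is False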
@@ -97,27 +151,23 @@ def convert_instance_level_logs(
             reasoning_traces = extract_all_reasonings(state)
             if isinstance(reasoning_traces, str):
                 reasoning_traces = [reasoning_traces]
+            if reasoning_traces is None:
+                reasoning_traces = []
+            reasoning_traces = [
+                trace for trace in reasoning_traces if isinstance(trace, str)
+            ]
 
-            is_correct = False
-            score = 0.0
-            if inst_stats:
-                em_stat = next(
-                    (
-                        s
-                        for s in inst_stats.stats
-                        if s.name.name == 'exact_match'
-                    ),
-                    None,
+            metric_stats = list(inst_stats.stats) if inst_stats else []
+            if not metric_stats:
+                correct_completions = sum(
+                    1 for c in completions if c.strip() in correct_refs
                 )
-                if em_stat:
-                    score = em_stat.mean
-                    is_correct = em_stat.mean > 0
-                else: # TODO check for more specific tasks
-                    correct_completions = sum(
-                        1 for c in completions if c.strip() in correct_refs
-                    )
-                    score = correct_completions / len(completions)
-                    is_correct = score > 0
+                fallback_score = (
+                    correct_completions / len(completions)
+                    if completions
+                    else 0.0
+                )
+                metric_stats = [None]
 
             token_usage = None
             if inst_stats:
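
The `metric_stats = [None]` sentinel is what keeps the emission loop in the next hunk uniform: when HELM recorded no per-instance stats, the loop still runs exactly once and takes the fallback branch, reproducing the legacy one-row-per-sample behavior. A sketch of the pattern with invented helper names:

# Control flow only; rows_for, make_metric_row and make_fallback_row are
# illustrative, not adapter API.
def rows_for(per_instance_stats):
    stats = list(per_instance_stats) if per_instance_stats else []
    for stat in stats or [None]:  # [None] => exactly one fallback iteration
        if stat is None:
            yield make_fallback_row()
        else:
            yield make_metric_row(stat)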
@@ -155,56 +205,72 @@ def convert_instance_level_logs(
                     total_tokens=int(p_tokens + c_tokens),
                 )
 
-            instance_level_logs.append(
-                InstanceLevelEvaluationLog(
-                    schema_version=SCHEMA_VERSION,
-                    evaluation_id=self.evaluation_id,
-                    model_id=model_id,
-                    evaluation_name=evaluation_name,
-                    sample_id=str(state.instance.id),
-                    sample_hash=sha256_string(
-                        state.request.prompt + (correct_refs[0] if correct_refs else '')
-                    ), # TODO use all references
-                    interaction_type=InteractionType.single_turn,
-                    input=Input(
-                        raw=state.request.prompt,
-                        reference=correct_refs if correct_refs else [],
-                        choices=(
-                            list(state.output_mapping.values())
-                            if state.output_mapping
-                            else [
-                                ref.output.text
-                                for ref in state.instance.references
-                            ]
+            for stat in metric_stats:
+                if stat is None:
+                    metric_name = None
+                    score = fallback_score
+                    # Fallback path: ``score`` here is an exact-match
+                    # proxy from completion-vs-reference matching, so
+                    # the correctness claim is honest in the same sense
+                    # as the legacy single-row behavior.
+                    is_correct = score > 0
+                else:
+                    metric_name = getattr(getattr(stat, 'name', None), 'name', None)
+                    score = _score_from_stat(stat)
+                    if score is None:
+                        continue
+                    is_correct = _is_correct_for_metric(metric_name, score)
+                instance_level_logs.append(
+                    InstanceLevelEvaluationLog(
+                        schema_version=SCHEMA_VERSION,
+                        evaluation_id=self.evaluation_id,
+                        model_id=model_id,
+                        evaluation_name=evaluation_name,
+                        evaluation_result_id=metric_name,
+                        sample_id=str(state.instance.id),
+                        sample_hash=sha256_string(
+                            state.request.prompt + (correct_refs[0] if correct_refs else '')
+                        ), # TODO use all references
+                        interaction_type=InteractionType.single_turn,
+                        input=Input(
+                            raw=state.request.prompt,
+                            reference=correct_refs if correct_refs else [],
+                            choices=(
+                                list(state.output_mapping.values())
+                                if state.output_mapping
+                                else [
+                                    ref.output.text
+                                    for ref in state.instance.references
+                                ]
+                            ),
                         ),
-                    ),
-                    output=Output(
-                        raw=completions, reasoning_trace=reasoning_traces
-                    ),
-                    answer_attribution=[
-                        AnswerAttributionItem(
-                            turn_idx=0,
-                            source='output.raw',
-                            extracted_value=state.result.completions[
-                                0
-                            ].text.strip()
-                            if state.result and state.result.completions
-                            else '',
-                            extraction_method='exact_match',
-                            is_terminal=True,
-                        )
-                    ],
-                    evaluation=Evaluation(
-                        score=float(score), is_correct=is_correct
-                    ),
-                    token_usage=token_usage,
-                    performance=Performance(
-                        generation_time_ms=state.result.request_time * 1000
-                        if state.result.request_time
-                        else None
-                    ),
+                        output=Output(
+                            raw=completions, reasoning_trace=reasoning_traces
+                        ),
+                        answer_attribution=[
+                            AnswerAttributionItem(
+                                turn_idx=0,
+                                source='output.raw',
+                                extracted_value=state.result.completions[
+                                    0
+                                ].text.strip()
+                                if state.result and state.result.completions
+                                else '',
+                                extraction_method='exact_match',
+                                is_terminal=True,
+                            )
+                        ],
+                        evaluation=Evaluation(
+                            score=float(score), is_correct=is_correct
+                        ),
+                        token_usage=token_usage,
+                        performance=Performance(
+                            generation_time_ms=state.result.request_time * 1000
+                            if state.result.request_time
+                            else None
+                        ),
+                    )
                 )
-            )
 
         self._save_json(instance_level_logs)
         return self.path, len(instance_level_logs)
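
Net effect: the adapter now emits one log row per (sample, stat) pair, with `evaluation_result_id` carrying the metric name (or None on the fallback path). Illustrative shape for a sample that carried one correctness stat and one bookkeeping stat (all values invented):

rows = [
    {'sample_id': 'id42', 'evaluation_result_id': 'exact_match',
     'score': 1.0, 'is_correct': True},
    {'sample_id': 'id42', 'evaluation_result_id': 'num_output_tokens',
     'score': 57.0, 'is_correct': False},  # not allowlisted: no correctness claim
]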

tests/test_helm_adapter.py

Lines changed: 8 additions & 3 deletions
@@ -76,7 +76,10 @@ def test_mmlu_eval():
 
     assert converted_eval.detailed_evaluation_results is not None
     assert converted_eval.detailed_evaluation_results.format is not None
-    assert converted_eval.detailed_evaluation_results.total_rows == 10
+    # Per-(sample, metric) emission: each of the 10 samples produces one
+    # row per non-empty stat, so total_rows is much larger than the
+    # legacy "one row per sample" count.
+    assert converted_eval.detailed_evaluation_results.total_rows >= 10
 
 
 def test_hellswag_eval():
@@ -117,7 +120,8 @@ def test_hellswag_eval():
 
     assert converted_eval.detailed_evaluation_results is not None
     assert converted_eval.detailed_evaluation_results.format is not None
-    assert converted_eval.detailed_evaluation_results.total_rows == 10
+    # Per-(sample, metric): >= sample count, not equal to it.
+    assert converted_eval.detailed_evaluation_results.total_rows >= 10
 
 
 def test_narrativeqa_eval():
@@ -154,7 +158,8 @@ def test_narrativeqa_eval():
 
     assert converted_eval.detailed_evaluation_results is not None
    assert converted_eval.detailed_evaluation_results.format is not None
-    assert converted_eval.detailed_evaluation_results.total_rows == 5
+    # Per-(sample, metric): >= sample count, not equal to it.
+    assert converted_eval.detailed_evaluation_results.total_rows >= 5
 
 
 def test_missing_model_deployment_falls_back_to_model():
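
The `>=` bounds are deliberately loose: the exact row count depends on which stats HELM recorded for each instance, and rows are dropped when a stat's score cannot be parsed. A hypothetical tighter check would derive an upper bound from the stats themselves (stats_for and samples are invented names, not part of the suite):

# Each sample yields at most one row per stat, and exactly one on the
# no-stats fallback path; unparseable scores are skipped via `continue`.
upper = sum(max(1, len(stats_for(s))) for s in samples)
assert converted_eval.detailed_evaluation_results.total_rows <= upper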
