Skip to content

Commit 0f6380c

Browse files
authored
Preserve explicit passed when handling _result/_label keys (#46836)
* Preserve explicit `passed` when handling `_result`/`_label` keys

  In `_update_metric_value`, the `_result`/`result`/`_label` branch unconditionally derived `passed` from the label value for `azure_ai_evaluator` criteria, overwriting any `passed` already set by an earlier `metric_key == "passed"` iteration. Guard the derivation with `"passed" not in metric_dict` so that an explicitly provided `passed` value is preserved.

* Add unit tests
* Reformat
1 parent 48b4a0f commit 0f6380c

2 files changed

Lines changed: 176 additions & 1 deletion

File tree

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2818,7 +2818,7 @@ def _update_metric_value(
28182818
elif metric_key.endswith("_result") or metric_key == "result" or metric_key.endswith("_label"):
28192819
metric_dict["label"] = metric_value
28202820
result_name = "label"
2821-
if criteria_type == "azure_ai_evaluator":
2821+
if criteria_type == "azure_ai_evaluator" and "passed" not in metric_dict:
28222822
if metric_value is None:
28232823
passed = False
28242824
else:

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py

Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
_process_rows,
4242
_aggregate_label_defect_metrics,
4343
_update_metric_value,
44+
_extract_metric_values,
4445
_build_internal_log_attributes,
4546
_extract_testing_criteria_metadata,
4647
_process_criteria_metrics,
@@ -3367,6 +3368,180 @@ def test_fallback_to_criteria_name(self):
33673368

33683369

33693370
@pytest.mark.unittest
class TestUpdateMetricValuePreservesExplicitPassed:
    """Check that an explicit ``passed`` entry survives label processing.

    ``_update_metric_value`` only derives ``passed`` from a ``_result`` /
    ``result`` / ``_label`` value for ``azure_ai_evaluator`` criteria when
    ``"passed"`` is not already a key of ``metric_dict``.  These tests pin
    that guard, the unchanged derivation when the key is absent, and the
    resulting behavior one and two levels up the call chain.
    """

    # Metric keys that are routed to the label-handling branch.
    _LABEL_KEYS = ("test_metric_result", "result", "test_metric_label")

    # (label value, ``passed`` expected to be derived from it).
    _DERIVATION_CASES = [
        ("Pass", True),
        ("pass", True),
        ("True", True),
        ("true", True),
        ("Fail", False),
        ("fail", False),
        ("False", False),
        (None, False),
    ]

    @staticmethod
    def _call(metric_dict, metric_key, metric_value, criteria_type="azure_ai_evaluator"):
        """Invoke ``_update_metric_value`` for the fixed metric ``test_metric``."""
        test_logger = logging.getLogger("test")
        return _update_metric_value(
            criteria_type=criteria_type,
            metric_dict=metric_dict,
            metric_key=metric_key,
            metric="test_metric",
            metric_value=metric_value,
            logger=test_logger,
        )

    @staticmethod
    def _criteria_metadata():
        """Build fresh testing-criteria metadata describing ``test_metric``."""
        return {
            "test_metric": {
                "metrics": ["test_metric"],
                "type": "azure_ai_evaluator",
                "is_inverse": False,
            }
        }

    @staticmethod
    def _extract(metrics):
        """Run ``_extract_metric_values`` on ``metrics`` as an azure_ai_evaluator criterion."""
        return _extract_metric_values(
            criteria_name="test_metric",
            criteria_type="azure_ai_evaluator",
            metrics=metrics,
            expected_metrics=["test_metric"],
            logger=logging.getLogger("test"),
        )

    def _process(self, metrics):
        """Run ``_process_criteria_metrics`` on ``metrics`` and return its result list."""
        results, _ = _process_criteria_metrics(
            criteria_name="test_metric",
            metrics=metrics,
            testing_criteria_metadata=self._criteria_metadata(),
            logger=logging.getLogger("test"),
            eval_id=None,
            eval_run_id=None,
        )
        return results

    # ------------------------------------------------------------------
    # The guard keeps whatever ``passed`` entry already exists.
    # ------------------------------------------------------------------
    @pytest.mark.parametrize("label_key", _LABEL_KEYS)
    def test_explicit_passed_true_preserved_when_label_says_fail(self, label_key):
        """A 'Fail' label must leave an existing passed=True untouched."""
        state = {"passed": True}
        result_name, _, _, derived = self._call(state, label_key, "Fail")
        assert state["passed"] is True
        assert state["label"] == "Fail"
        assert result_name == "label"
        # The guard skipped derivation, so nothing derived is returned.
        assert derived is None

    @pytest.mark.parametrize("label_key", _LABEL_KEYS)
    def test_explicit_passed_false_preserved_when_label_says_pass(self, label_key):
        """A 'Pass' label must leave an existing passed=False untouched."""
        state = {"passed": False}
        _, _, _, derived = self._call(state, label_key, "Pass")
        assert state["passed"] is False
        assert state["label"] == "Pass"
        assert derived is None

    def test_explicit_passed_none_preserved_when_label_present(self):
        """passed=None counts as explicit: the key is present, so it is kept."""
        state = {"passed": None}
        _, _, _, derived = self._call(state, "test_metric_result", "Pass")
        assert state["passed"] is None
        assert state["label"] == "Pass"
        assert derived is None

    # ------------------------------------------------------------------
    # With no ``passed`` key, the value is still derived from the label.
    # ------------------------------------------------------------------
    @pytest.mark.parametrize("label_value,expected_passed", _DERIVATION_CASES)
    def test_passed_derived_when_not_present(self, label_value, expected_passed):
        """An empty metric dict gets ``passed`` derived from the label value."""
        state = {}
        _, _, _, derived = self._call(state, "test_metric_result", label_value)
        assert state["passed"] is expected_passed
        assert derived is expected_passed

    # ------------------------------------------------------------------
    # Criteria types other than azure_ai_evaluator never derive ``passed``.
    # ------------------------------------------------------------------
    def test_non_azure_ai_evaluator_does_not_derive_passed(self):
        """A python_grader criterion stores the label but adds no ``passed`` key."""
        state = {}
        _, _, _, derived = self._call(state, "test_metric_result", "Pass", criteria_type="python_grader")
        assert "passed" not in state
        assert derived is None

    def test_non_azure_ai_evaluator_preserves_explicit_passed(self):
        """A python_grader criterion leaves an existing ``passed`` entry as it was."""
        state = {"passed": True}
        _, _, _, derived = self._call(state, "test_metric_result", "Fail", criteria_type="python_grader")
        assert state["passed"] is True
        assert state["label"] == "Fail"
        assert derived is None

    # ------------------------------------------------------------------
    # Key order must not matter when going through _extract_metric_values.
    # ------------------------------------------------------------------
    def test_extract_metric_values_passed_before_result(self):
        """'passed' listed before '_result': the explicit value must win.

        Per the commit description, this is the ordering in which a later
        label iteration could overwrite a previously stored ``passed``.
        """
        extracted = self._extract(
            {
                "passed": True,
                "test_metric_result": "Fail",
            }
        )
        entry = extracted["test_metric"]
        assert entry["passed"] is True
        assert entry["label"] == "Fail"

    def test_extract_metric_values_result_before_passed(self):
        """'_result' listed before 'passed': the explicit value must still win."""
        extracted = self._extract(
            {
                "test_metric_result": "Fail",
                "passed": True,
            }
        )
        entry = extracted["test_metric"]
        assert entry["passed"] is True
        assert entry["label"] == "Fail"

    # ------------------------------------------------------------------
    # One level further up: _process_criteria_metrics.
    # ------------------------------------------------------------------
    def test_process_criteria_metrics_preserves_explicit_passed(self):
        """Explicit ``passed`` reaches the final result object unchanged."""
        results = self._process(
            {
                "passed": True,
                "test_metric_result": "Fail",
                "test_metric_score": 0.2,
                "test_metric_reason": "Did not meet bar",
            }
        )
        assert len(results) == 1
        only = results[0]
        # passed=True is kept even though the label is "Fail" and the score is low.
        assert only["passed"] is True
        assert only["label"] == "Fail"
        assert only["score"] == 0.2

    def test_process_criteria_metrics_derives_passed_when_absent(self):
        """Without an explicit ``passed``, the result object carries the derived one."""
        results = self._process(
            {
                "test_metric_result": "Pass",
                "test_metric_score": 0.9,
            }
        )
        assert len(results) == 1
        only = results[0]
        assert only["passed"] is True
        assert only["label"] == "Pass"
3543+
3544+
33703545
@pytest.mark.skipif(MISSING_OPENTELEMETRY, reason="This test requires the opentelemetry package")
33713546
class TestEmitEvalResultShutdown:
33723547
"""Tests that emit_eval_result_events_to_app_insights shuts down the LoggerProvider."""

0 commit comments

Comments
 (0)