|
41 | 41 | _process_rows, |
42 | 42 | _aggregate_label_defect_metrics, |
43 | 43 | _update_metric_value, |
| 44 | + _extract_metric_values, |
44 | 45 | _build_internal_log_attributes, |
45 | 46 | _extract_testing_criteria_metadata, |
46 | 47 | _process_criteria_metrics, |
@@ -3367,6 +3368,180 @@ def test_fallback_to_criteria_name(self): |
3367 | 3368 |
|
3368 | 3369 |
|
3369 | 3370 | @pytest.mark.unittest |
| 3371 | +class TestUpdateMetricValuePreservesExplicitPassed: |
| 3372 | + """Tests that _update_metric_value preserves an explicit ``passed`` value when |
| 3373 | + processing ``_result`` / ``result`` / ``_label`` keys. |
| 3374 | +
|
| 3375 | + The fix gates the ``passed`` derivation for ``azure_ai_evaluator`` criteria |
| 3376 | + behind ``"passed" not in metric_dict`` so a previously set explicit value |
| 3377 | + (e.g. from an earlier ``metric_key == "passed"`` iteration) is not overwritten. |
| 3378 | + """ |
| 3379 | + |
| 3380 | + @staticmethod |
| 3381 | + def _call(metric_dict, metric_key, metric_value, criteria_type="azure_ai_evaluator"): |
| 3382 | + return _update_metric_value( |
| 3383 | + criteria_type=criteria_type, |
| 3384 | + metric_dict=metric_dict, |
| 3385 | + metric_key=metric_key, |
| 3386 | + metric="test_metric", |
| 3387 | + metric_value=metric_value, |
| 3388 | + logger=logging.getLogger("test"), |
| 3389 | + ) |
| 3390 | + |
| 3391 | + # --- guard preserves explicit value --- |
| 3392 | + @pytest.mark.parametrize("label_key", ["test_metric_result", "result", "test_metric_label"]) |
| 3393 | + def test_explicit_passed_true_preserved_when_label_says_fail(self, label_key): |
| 3394 | + """Explicit passed=True must NOT be overwritten by a 'Fail' label.""" |
| 3395 | + metric_dict = {"passed": True} |
| 3396 | + result_name, _, _, derived_passed = self._call(metric_dict, label_key, "Fail") |
| 3397 | + assert metric_dict["passed"] is True |
| 3398 | + assert metric_dict["label"] == "Fail" |
| 3399 | + assert result_name == "label" |
| 3400 | + assert derived_passed is None # guard fired; no derived value returned |
| 3401 | + |
| 3402 | + @pytest.mark.parametrize("label_key", ["test_metric_result", "result", "test_metric_label"]) |
| 3403 | + def test_explicit_passed_false_preserved_when_label_says_pass(self, label_key): |
| 3404 | + """Explicit passed=False must NOT be overwritten by a 'Pass' label.""" |
| 3405 | + metric_dict = {"passed": False} |
| 3406 | + _, _, _, derived_passed = self._call(metric_dict, label_key, "Pass") |
| 3407 | + assert metric_dict["passed"] is False |
| 3408 | + assert metric_dict["label"] == "Pass" |
| 3409 | + assert derived_passed is None |
| 3410 | + |
| 3411 | + def test_explicit_passed_none_preserved_when_label_present(self): |
| 3412 | + """Even an explicit passed=None must be preserved (key already in dict).""" |
| 3413 | + metric_dict = {"passed": None} |
| 3414 | + _, _, _, derived_passed = self._call(metric_dict, "test_metric_result", "Pass") |
| 3415 | + assert metric_dict["passed"] is None |
| 3416 | + assert metric_dict["label"] == "Pass" |
| 3417 | + assert derived_passed is None |
| 3418 | + |
| 3419 | + # --- derivation still happens when no explicit passed --- |
| 3420 | + @pytest.mark.parametrize( |
| 3421 | + "label_value,expected_passed", |
| 3422 | + [ |
| 3423 | + ("Pass", True), |
| 3424 | + ("pass", True), |
| 3425 | + ("True", True), |
| 3426 | + ("true", True), |
| 3427 | + ("Fail", False), |
| 3428 | + ("fail", False), |
| 3429 | + ("False", False), |
| 3430 | + (None, False), |
| 3431 | + ], |
| 3432 | + ) |
| 3433 | + def test_passed_derived_when_not_present(self, label_value, expected_passed): |
| 3434 | + """Without an existing passed key, original derivation logic still applies.""" |
| 3435 | + metric_dict = {} |
| 3436 | + _, _, _, derived_passed = self._call(metric_dict, "test_metric_result", label_value) |
| 3437 | + assert metric_dict["passed"] is expected_passed |
| 3438 | + assert derived_passed is expected_passed |
| 3439 | + |
| 3440 | + # --- non azure_ai_evaluator criteria never derive passed --- |
| 3441 | + def test_non_azure_ai_evaluator_does_not_derive_passed(self): |
| 3442 | + """Other criteria types must not derive passed from label, regardless of guard.""" |
| 3443 | + metric_dict = {} |
| 3444 | + _, _, _, derived_passed = self._call(metric_dict, "test_metric_result", "Pass", criteria_type="python_grader") |
| 3445 | + assert "passed" not in metric_dict |
| 3446 | + assert derived_passed is None |
| 3447 | + |
| 3448 | + def test_non_azure_ai_evaluator_preserves_explicit_passed(self): |
| 3449 | + """Other criteria types must also preserve explicit passed (no derivation path runs).""" |
| 3450 | + metric_dict = {"passed": True} |
| 3451 | + _, _, _, derived_passed = self._call(metric_dict, "test_metric_result", "Fail", criteria_type="python_grader") |
| 3452 | + assert metric_dict["passed"] is True |
| 3453 | + assert metric_dict["label"] == "Fail" |
| 3454 | + assert derived_passed is None |
| 3455 | + |
| 3456 | + # --- explicit passed wins regardless of iteration order via _extract_metric_values --- |
| 3457 | + def test_extract_metric_values_passed_before_result(self): |
| 3458 | + """When 'passed' precedes '_result', the guard keeps the explicit value (the ordering this fix addresses).""" |
| 3459 | + metrics = { |
| 3460 | + "passed": True, |
| 3461 | + "test_metric_result": "Fail", |
| 3462 | + } |
| 3463 | + result = _extract_metric_values( |
| 3464 | + criteria_name="test_metric", |
| 3465 | + criteria_type="azure_ai_evaluator", |
| 3466 | + metrics=metrics, |
| 3467 | + expected_metrics=["test_metric"], |
| 3468 | + logger=logging.getLogger("test"), |
| 3469 | + ) |
| 3470 | + assert result["test_metric"]["passed"] is True |
| 3471 | + assert result["test_metric"]["label"] == "Fail" |
| 3472 | + |
| 3473 | + def test_extract_metric_values_result_before_passed(self): |
| 3474 | + """When '_result' precedes 'passed', the explicit value still wins (unchanged branch; regression check).""" |
| 3475 | + metrics = { |
| 3476 | + "test_metric_result": "Fail", |
| 3477 | + "passed": True, |
| 3478 | + } |
| 3479 | + result = _extract_metric_values( |
| 3480 | + criteria_name="test_metric", |
| 3481 | + criteria_type="azure_ai_evaluator", |
| 3482 | + metrics=metrics, |
| 3483 | + expected_metrics=["test_metric"], |
| 3484 | + logger=logging.getLogger("test"), |
| 3485 | + ) |
| 3486 | + assert result["test_metric"]["passed"] is True |
| 3487 | + assert result["test_metric"]["label"] == "Fail" |
| 3488 | + |
| 3489 | + # --- end-to-end through _process_criteria_metrics (one level up) --- |
| 3490 | + def test_process_criteria_metrics_preserves_explicit_passed(self): |
| 3491 | + """End-to-end: explicit passed must survive through the full result-object pipeline.""" |
| 3492 | + metrics = { |
| 3493 | + "passed": True, |
| 3494 | + "test_metric_result": "Fail", |
| 3495 | + "test_metric_score": 0.2, |
| 3496 | + "test_metric_reason": "Did not meet bar", |
| 3497 | + } |
| 3498 | + testing_criteria_metadata = { |
| 3499 | + "test_metric": { |
| 3500 | + "metrics": ["test_metric"], |
| 3501 | + "type": "azure_ai_evaluator", |
| 3502 | + "is_inverse": False, |
| 3503 | + } |
| 3504 | + } |
| 3505 | + results, _ = _process_criteria_metrics( |
| 3506 | + criteria_name="test_metric", |
| 3507 | + metrics=metrics, |
| 3508 | + testing_criteria_metadata=testing_criteria_metadata, |
| 3509 | + logger=logging.getLogger("test"), |
| 3510 | + eval_id=None, |
| 3511 | + eval_run_id=None, |
| 3512 | + ) |
| 3513 | + assert len(results) == 1 |
| 3514 | + # Explicit passed=True wins despite label="Fail" and low score |
| 3515 | + assert results[0]["passed"] is True |
| 3516 | + assert results[0]["label"] == "Fail" |
| 3517 | + assert results[0]["score"] == 0.2 |
| 3518 | + |
| 3519 | + def test_process_criteria_metrics_derives_passed_when_absent(self): |
| 3520 | + """End-to-end regression: when no explicit passed is provided, derivation still happens.""" |
| 3521 | + metrics = { |
| 3522 | + "test_metric_result": "Pass", |
| 3523 | + "test_metric_score": 0.9, |
| 3524 | + } |
| 3525 | + testing_criteria_metadata = { |
| 3526 | + "test_metric": { |
| 3527 | + "metrics": ["test_metric"], |
| 3528 | + "type": "azure_ai_evaluator", |
| 3529 | + "is_inverse": False, |
| 3530 | + } |
| 3531 | + } |
| 3532 | + results, _ = _process_criteria_metrics( |
| 3533 | + criteria_name="test_metric", |
| 3534 | + metrics=metrics, |
| 3535 | + testing_criteria_metadata=testing_criteria_metadata, |
| 3536 | + logger=logging.getLogger("test"), |
| 3537 | + eval_id=None, |
| 3538 | + eval_run_id=None, |
| 3539 | + ) |
| 3540 | + assert len(results) == 1 |
| 3541 | + assert results[0]["passed"] is True |
| 3542 | + assert results[0]["label"] == "Pass" |
| 3543 | + |
| 3544 | + |
3370 | 3545 | @pytest.mark.skipif(MISSING_OPENTELEMETRY, reason="This test requires the opentelemetry package") |
3371 | 3546 | class TestEmitEvalResultShutdown: |
3372 | 3547 | """Tests that emit_eval_result_events_to_app_insights shuts down the LoggerProvider.""" |
|
0 commit comments