
Commit a4541c2

Fix inverse metric adjustment to skip string labels from code-based evaluators (#46663)
Code-based evaluators like deflection_rate return string pass/fail labels that already reflect direction-aware semantics. The inverse metric adjustment was incorrectly treating these strings as boolean False (since isinstance('fail', bool) is False), flipping 'fail' to 'pass'.

Fix: skip _adjust_for_inverse_metric entirely when the label is a string, since a string label means the evaluator already computed the correct direction-aware pass/fail. Boolean labels (from safety evaluators) are still inverted as before.

Fixes Bug #5240742

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 3067d65 commit a4541c2
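
As a minimal sketch of the failure mode described in the commit message, the snippet below uses a simplified stand-in for the private helper in _evaluate.py (the name and body are illustrative, not the actual implementation):

# Hypothetical simplification of the inverse adjustment: boolean True
# (violation detected) maps to fail, anything else maps to pass.
def adjust_for_inverse_metric(label):
    if isinstance(label, bool) and label:
        return 0.0, "fail", False  # violation detected
    return 1.0, "pass", True       # no violation (False or None)

# Boolean labels from safety evaluators come out right:
assert adjust_for_inverse_metric(True) == (0.0, "fail", False)
assert adjust_for_inverse_metric(False) == (1.0, "pass", True)

# A string label from a code-based evaluator is neither boolean True nor False,
# so before the fix it fell through to the "pass" branch, flipping "fail":
assert adjust_for_inverse_metric("fail") == (1.0, "pass", True)  # the bug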

2 files changed: 159 additions & 2 deletions


sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py

Lines changed: 5 additions & 2 deletions
@@ -2931,8 +2931,11 @@ def _create_result_object(
     sample = metric_values.get("sample")
     properties = metric_values.get("properties")

-    # Handle decrease boolean metrics
-    if is_inverse:
+    # Handle decrease boolean metrics — only apply inverse adjustment for
+    # boolean labels (from safety evaluators like indirect_attack). String
+    # labels like "pass"/"fail" (from code-based evaluators like deflection_rate)
+    # indicate the evaluator already computed direction-aware pass/fail.
+    if is_inverse and not (label is not None and isinstance(label, str)):
         score, label, passed = _adjust_for_inverse_metric(label)

     # Create result object
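
To make the new guard concrete, here is a small illustration of how the condition behaves for the label types involved (a sketch for illustration, not the production code):

def should_adjust(is_inverse, label):
    # Mirrors the fixed condition in the diff above.
    return is_inverse and not (label is not None and isinstance(label, str))

assert should_adjust(True, "fail") is False   # string label: evaluator already decided pass/fail
assert should_adjust(True, "pass") is False
assert should_adjust(True, True) is True      # boolean label: still inverted
assert should_adjust(True, None) is True      # None: still handled by _adjust_for_inverse_metric
assert should_adjust(False, "fail") is False  # non-inverse metrics are never adjusted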

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py

Lines changed: 154 additions & 0 deletions
@@ -45,6 +45,9 @@
     _extract_testing_criteria_metadata,
     _process_criteria_metrics,
     _log_events_to_app_insights,
+    _adjust_for_inverse_metric,
+    _is_inverse_metric,
+    _create_result_object,
 )
 from azure.ai.evaluation._evaluate._utils import _convert_name_map_into_property_entries
 from azure.ai.evaluation._evaluate._utils import _apply_column_mapping, _trace_destination_from_project_scope
@@ -2279,3 +2282,154 @@ def test_token_usage_partial_only_prompt(self):
         attrs = emitted[0].attributes
         assert attrs["gen_ai.evaluation.usage.input_tokens"] == "42"
         assert "gen_ai.evaluation.usage.output_tokens" not in attrs
+
+
+class TestAdjustForInverseMetric:
+    """Tests for _adjust_for_inverse_metric handling of boolean labels."""
+
+    def test_boolean_true_returns_fail(self):
+        """Boolean True (violation detected) should map to fail."""
+        score, label, passed = _adjust_for_inverse_metric(True)
+        assert score == 0.0
+        assert label == "fail"
+        assert passed is False
+
+    def test_boolean_false_returns_pass(self):
+        """Boolean False (no violation) should map to pass."""
+        score, label, passed = _adjust_for_inverse_metric(False)
+        assert score == 1.0
+        assert label == "pass"
+        assert passed is True
+
+    def test_none_returns_pass(self):
+        """None label should default to pass (no violation detected)."""
+        score, label, passed = _adjust_for_inverse_metric(None)
+        assert score == 1.0
+        assert label == "pass"
+        assert passed is True
+
+
+class TestIsInverseMetric:
+    """Tests for _is_inverse_metric identifying decrease boolean metrics."""
+
+    def test_hardcoded_inverse_metric(self):
+        """Metrics in the hardcoded inverse lists should return True."""
+        logger = logging.getLogger("test")
+        # indirect_attack maps to metric names like "xpia"
+        assert _is_inverse_metric("xpia", [], logger, None, None) is True
+
+    def test_explicitly_configured_inverse_metric(self):
+        """Metrics in the explicit inverse_metric list should return True."""
+        logger = logging.getLogger("test")
+        assert _is_inverse_metric("deflection_rate", ["deflection_rate"], logger, None, None) is True
+
+    def test_non_inverse_metric(self):
+        """Regular metrics should return False."""
+        logger = logging.getLogger("test")
+        assert _is_inverse_metric("coherence", [], logger, None, None) is False
+
+    def test_deflection_rate_not_in_hardcoded_list(self):
+        """deflection_rate is not in the hardcoded list (only in dynamic config)."""
+        logger = logging.getLogger("test")
+        assert _is_inverse_metric("deflection_rate", [], logger, None, None) is False
+
+
+class TestCreateResultObjectInverseMetric:
+    """Integration tests for _create_result_object with is_inverse=True.
+
+    Verifies that inverse adjustment is skipped for string labels (from
+    code-based evaluators like deflection_rate that already compute
+    direction-aware pass/fail) and applied for boolean labels (from
+    safety evaluators).
+    """
+
+    def test_inverse_metric_string_fail_label_preserved(self):
+        """deflection_rate: score=1, label='fail' should be preserved (not adjusted)."""
+        logger = logging.getLogger("test")
+        metric_values = {
+            "score": 1,
+            "label": "fail",
+            "reason": "AI deflected the query",
+            "threshold": 0,
+            "passed": False,
+        }
+        result = _create_result_object(
+            criteria_name="deflection_rate",
+            metric="deflection_rate",
+            metric_values=metric_values,
+            criteria_type="azure_ai_evaluator",
+            is_inverse=True,
+            logger=logger,
+            eval_id=None,
+            eval_run_id=None,
+        )
+        assert result["label"] == "fail"
+        assert result["passed"] is False
+        assert result["score"] == 1
+
+    def test_inverse_metric_string_pass_label_preserved(self):
+        """deflection_rate: score=0, label='pass' should be preserved (not adjusted)."""
+        logger = logging.getLogger("test")
+        metric_values = {
+            "score": 0,
+            "label": "pass",
+            "reason": "AI resolved the query",
+            "threshold": 0,
+            "passed": True,
+        }
+        result = _create_result_object(
+            criteria_name="deflection_rate",
+            metric="deflection_rate",
+            metric_values=metric_values,
+            criteria_type="azure_ai_evaluator",
+            is_inverse=True,
+            logger=logger,
+            eval_id=None,
+            eval_run_id=None,
+        )
+        assert result["label"] == "pass"
+        assert result["passed"] is True
+        assert result["score"] == 0
+
+    def test_inverse_metric_boolean_true_label_adjusted(self):
+        """Safety evaluator: boolean True with is_inverse=True should be adjusted."""
+        logger = logging.getLogger("test")
+        metric_values = {
+            "score": True,
+            "label": True,
+        }
+        result = _create_result_object(
+            criteria_name="indirect_attack",
+            metric="xpia",
+            metric_values=metric_values,
+            criteria_type="azure_ai_evaluator",
+            is_inverse=True,
+            logger=logger,
+            eval_id=None,
+            eval_run_id=None,
+        )
+        assert result["label"] == "fail"
+        assert result["passed"] is False
+        assert result["score"] == 0.0
+
+    def test_non_inverse_metric_preserves_values(self):
+        """Non-inverse metric should not modify label/score."""
+        logger = logging.getLogger("test")
+        metric_values = {
+            "score": 4,
+            "label": "pass",
+            "passed": True,
+        }
+        result = _create_result_object(
+            criteria_name="coherence",
+            metric="coherence",
+            metric_values=metric_values,
+            criteria_type="azure_ai_evaluator",
+            is_inverse=False,
+            logger=logger,
+            eval_id=None,
+            eval_run_id=None,
+        )
+        assert result["label"] == "pass"
+        assert result["passed"] is True
+        assert result["score"] == 4
