@@ -45,6 +45,9 @@
     _extract_testing_criteria_metadata,
     _process_criteria_metrics,
     _log_events_to_app_insights,
+    _adjust_for_inverse_metric,
+    _is_inverse_metric,
+    _create_result_object,
 )
 from azure.ai.evaluation._evaluate._utils import _convert_name_map_into_property_entries
 from azure.ai.evaluation._evaluate._utils import _apply_column_mapping, _trace_destination_from_project_scope
@@ -2279,3 +2282,154 @@ def test_token_usage_partial_only_prompt(self):
         attrs = emitted[0].attributes
         assert attrs["gen_ai.evaluation.usage.input_tokens"] == "42"
         assert "gen_ai.evaluation.usage.output_tokens" not in attrs
+
+
+class TestAdjustForInverseMetric:
+    """Tests for _adjust_for_inverse_metric handling of boolean labels."""
+
+    def test_boolean_true_returns_fail(self):
+        """Boolean True (violation detected) should map to fail."""
+        score, label, passed = _adjust_for_inverse_metric(True)
+        assert score == 0.0
+        assert label == "fail"
+        assert passed is False
+
+    def test_boolean_false_returns_pass(self):
+        """Boolean False (no violation) should map to pass."""
+        score, label, passed = _adjust_for_inverse_metric(False)
+        assert score == 1.0
+        assert label == "pass"
+        assert passed is True
+
+    def test_none_returns_pass(self):
+        """None label should default to pass (no violation detected)."""
+        score, label, passed = _adjust_for_inverse_metric(None)
+        assert score == 1.0
+        assert label == "pass"
+        assert passed is True
+
+
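For orientation, here is a minimal sketch of the behavior those three cases pin down for _adjust_for_inverse_metric. It is inferred from the assertions alone, not copied from the module under test (which is not visible in this hunk), and the real helper may carry extra parameters or logging:

    def _adjust_for_inverse_metric(label):
        """Map a boolean violation label to (score, label, passed).

        Sketch inferred from the tests above: True means a violation
        was detected, so the inverse metric fails; False and None both
        mean no violation was detected, so it passes.
        """
        if label is True:
            return 0.0, "fail", False
        # False and None are treated identically: no violation detected.
        return 1.0, "pass", True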
+class TestIsInverseMetric:
+    """Tests for _is_inverse_metric identifying inverse ("lower is better") boolean metrics."""
+
+    def test_hardcoded_inverse_metric(self):
+        """Metrics in the hardcoded inverse lists should return True."""
+        logger = logging.getLogger("test")
+        # indirect_attack maps to metric names like "xpia"
+        assert _is_inverse_metric("xpia", [], logger, None, None) is True
+
+    def test_explicitly_configured_inverse_metric(self):
+        """Metrics in the explicit inverse_metric list should return True."""
+        logger = logging.getLogger("test")
+        assert _is_inverse_metric("deflection_rate", ["deflection_rate"], logger, None, None) is True
+
+    def test_non_inverse_metric(self):
+        """Regular metrics should return False."""
+        logger = logging.getLogger("test")
+        assert _is_inverse_metric("coherence", [], logger, None, None) is False
+
+    def test_deflection_rate_not_in_hardcoded_list(self):
+        """deflection_rate is not in the hardcoded list (only in dynamic config)."""
+        logger = logging.getLogger("test")
+        assert _is_inverse_metric("deflection_rate", [], logger, None, None) is False
+
+
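These four assertions fix the dispatch order but not the full signature: the last two arguments are passed as None throughout and their purpose is not visible in this diff. A hedged sketch, with those two arguments kept as opaque placeholders and the hardcoded set assumed to contain "xpia":

    # Assumed contents; the real hardcoded list lives in the module under
    # test and likely names more safety metrics than just "xpia".
    _HARDCODED_INVERSE_METRICS = {"xpia"}

    def _is_inverse_metric(metric, inverse_metrics, logger, _arg4=None, _arg5=None):
        """Return True for inverse ("lower is better") boolean metrics.

        Sketch inferred from the tests above: the hardcoded list is
        checked first, then the explicitly configured inverse_metrics
        list; anything else (e.g. "coherence", or "deflection_rate"
        without explicit config) is a regular metric. The logger and
        trailing arguments are unused in this sketch.
        """
        if metric in _HARDCODED_INVERSE_METRICS:
            return True
        return metric in inverse_metrics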
+class TestCreateResultObjectInverseMetric:
+    """Integration tests for _create_result_object with is_inverse=True.
+
+    Verifies that inverse adjustment is skipped for string labels (from
+    code-based evaluators like deflection_rate that already compute
+    direction-aware pass/fail) and applied for boolean labels (from
+    safety evaluators).
+    """
+
+    def test_inverse_metric_string_fail_label_preserved(self):
+        """deflection_rate: score=1, label='fail' should be preserved (not adjusted)."""
+        logger = logging.getLogger("test")
+        metric_values = {
+            "score": 1,
+            "label": "fail",
+            "reason": "AI deflected the query",
+            "threshold": 0,
+            "passed": False,
+        }
+        result = _create_result_object(
+            criteria_name="deflection_rate",
+            metric="deflection_rate",
+            metric_values=metric_values,
+            criteria_type="azure_ai_evaluator",
+            is_inverse=True,
+            logger=logger,
+            eval_id=None,
+            eval_run_id=None,
+        )
+        assert result["label"] == "fail"
+        assert result["passed"] is False
+        assert result["score"] == 1
+
+    def test_inverse_metric_string_pass_label_preserved(self):
+        """deflection_rate: score=0, label='pass' should be preserved (not adjusted)."""
+        logger = logging.getLogger("test")
+        metric_values = {
+            "score": 0,
+            "label": "pass",
+            "reason": "AI resolved the query",
+            "threshold": 0,
+            "passed": True,
+        }
+        result = _create_result_object(
+            criteria_name="deflection_rate",
+            metric="deflection_rate",
+            metric_values=metric_values,
+            criteria_type="azure_ai_evaluator",
+            is_inverse=True,
+            logger=logger,
+            eval_id=None,
+            eval_run_id=None,
+        )
+        assert result["label"] == "pass"
+        assert result["passed"] is True
+        assert result["score"] == 0
+
+    def test_inverse_metric_boolean_true_label_adjusted(self):
+        """Safety evaluator: boolean True with is_inverse=True should be adjusted."""
+        logger = logging.getLogger("test")
+        metric_values = {
+            "score": True,
+            "label": True,
+        }
+        result = _create_result_object(
+            criteria_name="indirect_attack",
+            metric="xpia",
+            metric_values=metric_values,
+            criteria_type="azure_ai_evaluator",
+            is_inverse=True,
+            logger=logger,
+            eval_id=None,
+            eval_run_id=None,
+        )
+        assert result["label"] == "fail"
+        assert result["passed"] is False
+        assert result["score"] == 0.0
+
+    def test_non_inverse_metric_preserves_values(self):
+        """Non-inverse metric should not modify label/score."""
+        logger = logging.getLogger("test")
+        metric_values = {
+            "score": 4,
+            "label": "pass",
+            "passed": True,
+        }
+        result = _create_result_object(
+            criteria_name="coherence",
+            metric="coherence",
+            metric_values=metric_values,
+            criteria_type="azure_ai_evaluator",
+            is_inverse=False,
+            logger=logger,
+            eval_id=None,
+            eval_run_id=None,
+        )
+        assert result["label"] == "pass"
+        assert result["passed"] is True
+        assert result["score"] == 4
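Taken together, the four cases imply a type-directed branch inside _create_result_object: string labels pass through, boolean labels get adjusted. A sketch of just that branch, reusing the _adjust_for_inverse_metric sketch above; the helper name _apply_inverse_handling is invented here for illustration, and the real function builds a much fuller result object than this:

    def _apply_inverse_handling(metric_values, is_inverse):
        # Hypothetical helper; shows only the branch exercised by the
        # tests above, not the full result-object construction.
        score = metric_values.get("score")
        label = metric_values.get("label")
        passed = metric_values.get("passed")
        # Code-based evaluators such as deflection_rate already emit
        # direction-aware string labels, so they pass through untouched;
        # only boolean labels (safety evaluators) are adjusted.
        if is_inverse and not isinstance(label, str):
            score, label, passed = _adjust_for_inverse_metric(label)
        return {"score": score, "label": label, "passed": passed}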