1 change: 1 addition & 0 deletions sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
@@ -14,6 +14,7 @@

### Bugs Fixed

- Fixed error blame attribution in `_get_single_run_results` to perform a case-insensitive comparison when checking the AOAI error code for `UserError`, ensuring failed evaluation runs are correctly classified as user errors regardless of server-side casing.
- Fixed row classification double-counting in `_calculate_aoai_evaluation_summary` where errored rows were counted separately and could also be counted as passed/failed. Rows are now classified into mutually exclusive buckets with priority: passed > failed > errored > skipped.
- Fixed row classification where rows with empty or missing results lists were incorrectly counted as "passed" (the condition `passed_count == len(results) - error_count` evaluated `0 == 0` as True).
- Fixed `_get_metric_result` prefix matching where shorter metric names (e.g., `xpia`) could match before longer, more-specific ones (e.g., `xpia_manipulated_content`). Now sorts by length descending for correct longest-prefix matching.
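
`_calculate_aoai_evaluation_summary` itself is not part of this diff, so the following is only a rough sketch of the mutually exclusive bucketing described in the two entries above. The helper name `classify_row` and the per-row shape (a list of grader results with a boolean `passed` and an optional error marker) are assumptions for illustration, not the shipped code:

```python
# Hypothetical sketch -- not the SDK implementation. Assumes each row carries a list of
# grader results, each with a boolean "passed" and an optional "error" marker.
def classify_row(results):
    """Place a row in exactly one bucket, priority: passed > failed > errored > skipped."""
    if not results:
        # Guard for empty/missing results: previously `passed_count == len(results) - error_count`
        # evaluated 0 == 0 and wrongly counted such rows as "passed".
        return "skipped"
    error_count = sum(1 for r in results if r.get("error"))
    passed_count = sum(1 for r in results if r.get("passed") is True)
    failed_count = sum(1 for r in results if r.get("passed") is False)
    if passed_count == len(results):  # every grader result passed
        return "passed"
    if failed_count:                  # at least one explicit failure
        return "failed"
    if error_count:                   # no failures, but something errored
        return "errored"
    return "skipped"
```

The exact predicates in the SDK may differ; the point is that each row lands in one and only one bucket, so errored rows can no longer be double-counted.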
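Similarly, the longest-prefix fix in `_get_metric_result` (last entry above) comes down to trying longer metric names before shorter ones. A minimal sketch under that reading, with an assumed helper name and inputs:

```python
# Hypothetical sketch of longest-prefix matching; the real _get_metric_result takes
# different arguments, but the ordering idea is the same.
def match_metric_prefix(result_name, metric_names):
    # Sort descending by length so "xpia_manipulated_content" is tried before "xpia".
    for metric in sorted(metric_names, key=len, reverse=True):
        if result_name.startswith(metric):
            return metric
    return None

# match_metric_prefix("xpia_manipulated_content_score", ["xpia", "xpia_manipulated_content"])
# returns "xpia_manipulated_content" instead of shadowing it with "xpia".
```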
@@ -307,10 +307,16 @@ def _get_single_run_results(

LOGGER.info(f"AOAI: Eval run {run_info['eval_run_id']} completed with status: {run_results.status}")
if run_results.status != "completed":
error_code = getattr(getattr(run_results, "error", None), "code", None)
blame = (
ErrorBlame.USER_ERROR
if isinstance(error_code, str) and error_code.lower() == "usererror"
else ErrorBlame.UNKNOWN
)
raise EvaluationException(
message=f"AOAI evaluation run {run_info['eval_group_id']}/{run_info['eval_run_id']}"
+ f" failed with status {run_results.status}.",
blame=ErrorBlame.UNKNOWN,
blame=blame,
category=ErrorCategory.FAILED_EXECUTION,
target=ErrorTarget.AOAI_GRADER,
)
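
A note on the hunk above: the nested `getattr` calls keep the check safe when the run result has no `error` object, or the error has no `code`, and the lowercase comparison tolerates whatever casing the service returns. The same decision pulled out as a standalone snippet for illustration (the `ErrorBlame` enum here is a stand-in, not the SDK type):

```python
from enum import Enum, auto

class ErrorBlame(Enum):  # stand-in for azure.ai.evaluation._exceptions.ErrorBlame
    USER_ERROR = auto()
    UNKNOWN = auto()

def resolve_blame(run_results):
    # Mirrors the added lines: a missing error/code or a non-string code falls back to UNKNOWN.
    error_code = getattr(getattr(run_results, "error", None), "code", None)
    if isinstance(error_code, str) and error_code.lower() == "usererror":
        return ErrorBlame.USER_ERROR
    return ErrorBlame.UNKNOWN

# "UserError", "usererror", "USERERROR"  -> ErrorBlame.USER_ERROR
# "SystemError", None, or no error field -> ErrorBlame.UNKNOWN
```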
@@ -1,6 +1,8 @@
 import pytest
 import copy
-from azure.ai.evaluation._evaluate._evaluate_aoai import _combine_item_schemas
+from unittest.mock import MagicMock, patch
+from azure.ai.evaluation._evaluate._evaluate_aoai import _combine_item_schemas, _get_single_run_results
+from azure.ai.evaluation._exceptions import ErrorBlame, EvaluationException


@pytest.fixture
@@ -119,3 +121,85 @@ def test_combine_item_schemas_with_external_properties_without_required(self, de

        assert data_source_config["item_schema"]["properties"] == expected_properties
        assert data_source_config["item_schema"]["required"] == expected_required


class TestGetSingleRunResultsBlame:
    """Unit tests for blame attribution in _get_single_run_results."""

    def _make_run_info(self, client):
        return {
            "client": client,
            "eval_group_id": "group-1",
            "eval_run_id": "run-1",
            "grader_name_map": {},
        }

    @patch("azure.ai.evaluation._evaluate._evaluate_aoai._wait_for_run_conclusion")
    @pytest.mark.parametrize("code", ["UserError", "usererror", "USERERROR", "uSeReRrOr"])
    def test_user_error_code_sets_user_blame(self, mock_wait, code):
        """When run fails with error.code matching 'usererror' (case-insensitive), blame should be USER_ERROR."""
        run_result = MagicMock()
        run_result.status = "failed"
        run_result.error.code = code
        mock_wait.return_value = run_result
        client = MagicMock()

        with pytest.raises(EvaluationException) as exc_info:
            _get_single_run_results(self._make_run_info(client))

        assert exc_info.value.blame == ErrorBlame.USER_ERROR

    @patch("azure.ai.evaluation._evaluate._evaluate_aoai._wait_for_run_conclusion")
    def test_non_user_error_code_sets_unknown_blame(self, mock_wait):
        """When run fails with a non-UserError code, blame should be UNKNOWN."""
        run_result = MagicMock()
        run_result.status = "failed"
        run_result.error.code = "SystemError"
        mock_wait.return_value = run_result
        client = MagicMock()

        with pytest.raises(EvaluationException) as exc_info:
            _get_single_run_results(self._make_run_info(client))

        assert exc_info.value.blame == ErrorBlame.UNKNOWN

    @patch("azure.ai.evaluation._evaluate._evaluate_aoai._wait_for_run_conclusion")
    def test_missing_error_attribute_sets_unknown_blame(self, mock_wait):
        """When run fails and the error attribute is absent, blame should be UNKNOWN."""
        run_result = MagicMock(spec=["status"])
        run_result.status = "failed"
        mock_wait.return_value = run_result
        client = MagicMock()

        with pytest.raises(EvaluationException) as exc_info:
            _get_single_run_results(self._make_run_info(client))

        assert exc_info.value.blame == ErrorBlame.UNKNOWN

    @patch("azure.ai.evaluation._evaluate._evaluate_aoai._wait_for_run_conclusion")
    def test_error_present_but_code_missing_sets_unknown_blame(self, mock_wait):
        """When the error object exists but has no code attribute, blame should be UNKNOWN."""
        run_result = MagicMock()
        run_result.status = "failed"
        run_result.error = MagicMock(spec=[])  # error object without 'code'
        mock_wait.return_value = run_result
        client = MagicMock()

        with pytest.raises(EvaluationException) as exc_info:
            _get_single_run_results(self._make_run_info(client))

        assert exc_info.value.blame == ErrorBlame.UNKNOWN

    @patch("azure.ai.evaluation._evaluate._evaluate_aoai._wait_for_run_conclusion")
    def test_error_is_none_sets_unknown_blame(self, mock_wait):
        """When the error attribute is None, blame should be UNKNOWN."""
        run_result = MagicMock()
        run_result.status = "failed"
        run_result.error = None
        mock_wait.return_value = run_result
        client = MagicMock()

        with pytest.raises(EvaluationException) as exc_info:
            _get_single_run_results(self._make_run_info(client))

        assert exc_info.value.blame == ErrorBlame.UNKNOWN
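
These cases can be run in isolation with pytest's keyword filter, e.g. `pytest -k TestGetSingleRunResultsBlame`, from the package's unit-test directory (the test file's path is not shown in this diff).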