Skip to content

Commit 31160ea

Browse files
slister1001 and Copilot committed
Fix App Insights emission silently dropping events when evaluator definition has no metrics
Rubric evaluators registered without metric metadata produce an evaluator definition payload of `{"type": "rubric"}` (RAISvc's `RubricBasedEvaluatorDefinition.Metrics` defaults to empty and validation does not require it). In `_build_internal_log_attributes` the helper called `evaluator_definition.get("metrics").get(metric_name)`, which raises `AttributeError: 'NoneType' object has no attribute 'get'` when `metrics` is missing or set to None.

That exception is caught by the per-event `try/except` in `_log_events_to_app_insights` and silently swallowed, so `event_logger.emit()` is never called for any event in the run. The net effect is that App Insights receives zero `gen_ai.evaluation.result` events for the entire eval — observed in production for `rubric-manual-260526043804-e45a09` in the westus2 bug bash project, while other rubric evaluators in the same workspace that had populated metric metadata continued to emit normally.

Guard the metrics lookup so a single misshapen definition does not abort emission for the whole run.

Added regression tests for `_build_internal_log_attributes` covering: missing `metrics` key, `metrics: None`, `metrics: {}`, `metrics: [...]` (malformed type), and the happy path where metric metadata is preserved.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 5da28b4 commit 31160ea

3 files changed

Lines changed: 96 additions & 1 deletion

File tree

sdk/evaluation/azure-ai-evaluation/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@
1919
- Fixed row classification where rows with empty or missing results lists were incorrectly counted as "passed" (the condition `passed_count == len(results) - error_count` evaluated `0 == 0` as True).
2020
- Fixed `_get_metric_result` prefix matching where shorter metric names (e.g., `xpia`) could match before longer, more-specific ones (e.g., `xpia_manipulated_content`). Now sorts by length descending for correct longest-prefix matching.
2121
- Fixed non-dict `_properties` values from evaluators causing downstream issues. Values that are not dicts are now logged and dropped gracefully.
22+
- Fixed App Insights emission silently dropping every `gen_ai.evaluation.result` event when an evaluator definition (e.g., a rubric evaluator registered without metric metadata, sent as `{"type": "rubric"}`) lacked a `metrics` dict. `_build_internal_log_attributes` raised `AttributeError: 'NoneType' object has no attribute 'get'`, which was swallowed by the per-event try/except in `_log_events_to_app_insights`, resulting in zero events being emitted to App Insights for the affected run. The helper now tolerates missing or non-dict `metrics` sections and still emits the event with the base evaluator attributes.
2223

2324
### Other Changes
2425

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1136,7 +1136,11 @@ def _build_internal_log_attributes(
11361136
internal_log_attributes["gen_ai.evaluator.id"] = str(evaluator_id)
11371137

11381138
if evaluator_definition := testing_criteria_config.get("_evaluator_definition"):
1139-
metric_config_detail = evaluator_definition.get("metrics").get(metric_name)
1139+
metrics_section = evaluator_definition.get("metrics")
1140+
if isinstance(metrics_section, dict):
1141+
metric_config_detail = metrics_section.get(metric_name)
1142+
else:
1143+
metric_config_detail = None
11401144

11411145
if metric_config_detail:
11421146
if metric_config_detail.get("min_value") is not None:

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py

Lines changed: 90 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2161,6 +2161,96 @@ def test_threshold_positive_value(self):
21612161
assert attrs["gen_ai.evaluation.threshold"] == "3.5"
21622162

21632163

2164+
@pytest.mark.unittest
2165+
class TestBuildInternalLogAttributesEvaluatorDefinition:
2166+
"""Tests for _build_internal_log_attributes handling of malformed/missing evaluator metrics.
2167+
2168+
Regression: rubric evaluators registered without a `metrics` field on the evaluator
2169+
definition (RAISvc sends bare ``{"type": "rubric"}``) used to crash this helper with
2170+
``AttributeError: 'NoneType' object has no attribute 'get'``. That exception was
2171+
silently swallowed by the per-event try/except in ``_log_events_to_app_insights``
2172+
and resulted in zero events being emitted to App Insights for the entire run.
2173+
"""
2174+
2175+
def test_definition_without_metrics_key_does_not_raise(self):
2176+
"""Evaluator definition lacking 'metrics' (e.g. rubric ``{"type": "rubric"}``)
2177+
must not raise. Base evaluator attributes should still be populated."""
2178+
event_data = {"name": "rubric-manual-260526043804-e45a09"}
2179+
evaluator_config = {
2180+
"rubric-manual-260526043804-e45a09": {
2181+
"_evaluator_name": "rubric-manual-260526043804-e45a09",
2182+
"_evaluator_version": "1",
2183+
"_evaluator_definition": {"type": "rubric"},
2184+
}
2185+
}
2186+
attrs = _build_internal_log_attributes(event_data, "rubric-manual-260526043804-e45a09", evaluator_config, {})
2187+
assert attrs["gen_ai.evaluation.testing_criteria.name"] == "rubric-manual-260526043804-e45a09"
2188+
assert attrs["gen_ai.evaluator.name"] == "rubric-manual-260526043804-e45a09"
2189+
assert attrs["gen_ai.evaluator.version"] == "1"
2190+
assert "gen_ai.evaluation.min_value" not in attrs
2191+
assert "gen_ai.evaluation.max_value" not in attrs
2192+
assert "gen_ai.evaluation.desirable_direction" not in attrs
2193+
assert "gen_ai.evaluation.type" not in attrs
2194+
2195+
def test_definition_with_metrics_none_does_not_raise(self):
2196+
"""Evaluator definition with metrics=None must not raise."""
2197+
event_data = {"name": "my_grader"}
2198+
evaluator_config = {
2199+
"my_grader": {
2200+
"_evaluator_definition": {"type": "rubric", "metrics": None},
2201+
}
2202+
}
2203+
attrs = _build_internal_log_attributes(event_data, "my_grader", evaluator_config, {})
2204+
assert "gen_ai.evaluation.min_value" not in attrs
2205+
2206+
def test_definition_with_metrics_empty_dict_does_not_raise(self):
2207+
"""Evaluator definition with metrics={} must not raise."""
2208+
event_data = {"name": "my_grader"}
2209+
evaluator_config = {
2210+
"my_grader": {
2211+
"_evaluator_definition": {"type": "rubric", "metrics": {}},
2212+
}
2213+
}
2214+
attrs = _build_internal_log_attributes(event_data, "my_grader", evaluator_config, {})
2215+
assert "gen_ai.evaluation.min_value" not in attrs
2216+
2217+
def test_definition_with_metrics_list_does_not_raise(self):
2218+
"""Evaluator definition with malformed metrics (e.g. list) must not raise."""
2219+
event_data = {"name": "my_grader"}
2220+
evaluator_config = {
2221+
"my_grader": {
2222+
"_evaluator_definition": {"type": "rubric", "metrics": ["score"]},
2223+
}
2224+
}
2225+
attrs = _build_internal_log_attributes(event_data, "my_grader", evaluator_config, {})
2226+
assert "gen_ai.evaluation.min_value" not in attrs
2227+
2228+
def test_definition_with_metric_metadata_still_populates_attributes(self):
2229+
"""When evaluator definition does contain matching metric metadata, the
2230+
min/max/desirable_direction/type attributes should still be emitted."""
2231+
event_data = {"name": "my_grader"}
2232+
evaluator_config = {
2233+
"my_grader": {
2234+
"_evaluator_definition": {
2235+
"type": "prompt",
2236+
"metrics": {
2237+
"score": {
2238+
"min_value": 1.0,
2239+
"max_value": 5.0,
2240+
"desirable_direction": "increase",
2241+
"type": "ordinal",
2242+
}
2243+
},
2244+
},
2245+
}
2246+
}
2247+
attrs = _build_internal_log_attributes(event_data, "score", evaluator_config, {})
2248+
assert attrs["gen_ai.evaluation.min_value"] == "1.0"
2249+
assert attrs["gen_ai.evaluation.max_value"] == "5.0"
2250+
assert attrs["gen_ai.evaluation.desirable_direction"] == "increase"
2251+
assert attrs["gen_ai.evaluation.type"] == "ordinal"
2252+
2253+
21642254
@pytest.mark.unittest
21652255
class TestExtractTestingCriteriaMetadataPassThreshold:
21662256
"""Tests for pass_threshold propagation in _extract_testing_criteria_metadata."""

0 commit comments

Comments
 (0)