Fix App Insights emission silently dropping events when evaluator definition has no metrics

slister1001 · Copilot · slister1001 · commit 31160ea70470 · 2026-05-27T15:27:10.000-04:00
Rubric evaluators registered without metric metadata produce an evaluator
definition payload of `{"type": "rubric"}` (RAISvc's `RubricBasedEvaluatorDefinition.Metrics`
defaults to empty and validation does not require it). The
`_build_internal_log_attributes` helper called
`evaluator_definition.get("metrics").get(metric_name)`, which raises
`AttributeError: 'NoneType' object has no attribute 'get'` when `metrics`
is missing or set to None. That exception is caught by the per-event
`try/except` in `_log_events_to_app_insights` and silently swallowed, so
`event_logger.emit()` is never called for any event in the run. The net
effect is that App Insights receives zero `gen_ai.evaluation.result` events
for the entire eval — observed in production for `rubric-manual-260526043804-e45a09`
in the westus2 bug bash project, while other rubric evaluators in the same
workspace that had populated metric metadata continued to emit normally.

Guard the metrics lookup so a single misshapen definition does not abort
emission for the whole run.

Added regression tests for `_build_internal_log_attributes` covering:
missing `metrics` key, `metrics: None`, `metrics: {}`, `metrics: [...]`
(malformed type), and the happy path where metric metadata is preserved.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
diff --git a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
@@ -19,6 +19,7 @@
 - Fixed row classification where rows with empty or missing results lists were incorrectly counted as "passed" (the condition `passed_count == len(results) - error_count` evaluated `0 == 0` as True).
 - Fixed `_get_metric_result` prefix matching where shorter metric names (e.g., `xpia`) could match before longer, more-specific ones (e.g., `xpia_manipulated_content`). Now sorts by length descending for correct longest-prefix matching.
 - Fixed non-dict `_properties` values from evaluators causing downstream issues. Values that are not dicts are now logged and dropped gracefully.
+- Fixed App Insights emission silently dropping every `gen_ai.evaluation.result` event when an evaluator definition (e.g., a rubric evaluator registered without metric metadata, sent as `{"type": "rubric"}`) lacked a `metrics` dict. `_build_internal_log_attributes` raised `AttributeError: 'NoneType' object has no attribute 'get'`, which was swallowed by the per-event try/except in `_log_events_to_app_insights`, resulting in zero events being emitted to App Insights for the affected run. The helper now tolerates missing or non-dict `metrics` sections and still emits the event with the base evaluator attributes.
 
 ### Other Changes
 
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py
@@ -1136,7 +1136,11 @@ def _build_internal_log_attributes(
                 internal_log_attributes["gen_ai.evaluator.id"] = str(evaluator_id)
 
             if evaluator_definition := testing_criteria_config.get("_evaluator_definition"):
-                metric_config_detail = evaluator_definition.get("metrics").get(metric_name)
+                metrics_section = evaluator_definition.get("metrics")
+                if isinstance(metrics_section, dict):
+                    metric_config_detail = metrics_section.get(metric_name)
+                else:
+                    metric_config_detail = None
 
                 if metric_config_detail:
                     if metric_config_detail.get("min_value") is not None:
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
@@ -2161,6 +2161,96 @@ def test_threshold_positive_value(self):
         assert attrs["gen_ai.evaluation.threshold"] == "3.5"
 
 
+@pytest.mark.unittest
+class TestBuildInternalLogAttributesEvaluatorDefinition:
+    """Tests for _build_internal_log_attributes handling of malformed/missing evaluator metrics.
+
+    Regression: rubric evaluators registered without a `metrics` field on the evaluator
+    definition (RAISvc sends bare ``{"type": "rubric"}``) used to crash this helper with
+    ``AttributeError: 'NoneType' object has no attribute 'get'``. That exception was
+    silently swallowed by the per-event try/except in ``_log_events_to_app_insights``
+    and resulted in zero events being emitted to App Insights for the entire run.
+    """
+
+    def test_definition_without_metrics_key_does_not_raise(self):
+        """Evaluator definition lacking 'metrics' (e.g. rubric ``{"type": "rubric"}``)
+        must not raise. Base evaluator attributes should still be populated."""
+        event_data = {"name": "rubric-manual-260526043804-e45a09"}
+        evaluator_config = {
+            "rubric-manual-260526043804-e45a09": {
+                "_evaluator_name": "rubric-manual-260526043804-e45a09",
+                "_evaluator_version": "1",
+                "_evaluator_definition": {"type": "rubric"},
+            }
+        }
+        attrs = _build_internal_log_attributes(event_data, "rubric-manual-260526043804-e45a09", evaluator_config, {})
+        assert attrs["gen_ai.evaluation.testing_criteria.name"] == "rubric-manual-260526043804-e45a09"
+        assert attrs["gen_ai.evaluator.name"] == "rubric-manual-260526043804-e45a09"
+        assert attrs["gen_ai.evaluator.version"] == "1"
+        assert "gen_ai.evaluation.min_value" not in attrs
+        assert "gen_ai.evaluation.max_value" not in attrs
+        assert "gen_ai.evaluation.desirable_direction" not in attrs
+        assert "gen_ai.evaluation.type" not in attrs
+
+    def test_definition_with_metrics_none_does_not_raise(self):
+        """Evaluator definition with metrics=None must not raise."""
+        event_data = {"name": "my_grader"}
+        evaluator_config = {
+            "my_grader": {
+                "_evaluator_definition": {"type": "rubric", "metrics": None},
+            }
+        }
+        attrs = _build_internal_log_attributes(event_data, "my_grader", evaluator_config, {})
+        assert "gen_ai.evaluation.min_value" not in attrs
+
+    def test_definition_with_metrics_empty_dict_does_not_raise(self):
+        """Evaluator definition with metrics={} must not raise."""
+        event_data = {"name": "my_grader"}
+        evaluator_config = {
+            "my_grader": {
+                "_evaluator_definition": {"type": "rubric", "metrics": {}},
+            }
+        }
+        attrs = _build_internal_log_attributes(event_data, "my_grader", evaluator_config, {})
+        assert "gen_ai.evaluation.min_value" not in attrs
+
+    def test_definition_with_metrics_list_does_not_raise(self):
+        """Evaluator definition with malformed metrics (e.g. list) must not raise."""
+        event_data = {"name": "my_grader"}
+        evaluator_config = {
+            "my_grader": {
+                "_evaluator_definition": {"type": "rubric", "metrics": ["score"]},
+            }
+        }
+        attrs = _build_internal_log_attributes(event_data, "my_grader", evaluator_config, {})
+        assert "gen_ai.evaluation.min_value" not in attrs
+
+    def test_definition_with_metric_metadata_still_populates_attributes(self):
+        """When evaluator definition does contain matching metric metadata, the
+        min/max/desirable_direction/type attributes should still be emitted."""
+        event_data = {"name": "my_grader"}
+        evaluator_config = {
+            "my_grader": {
+                "_evaluator_definition": {
+                    "type": "prompt",
+                    "metrics": {
+                        "score": {
+                            "min_value": 1.0,
+                            "max_value": 5.0,
+                            "desirable_direction": "increase",
+                            "type": "ordinal",
+                        }
+                    },
+                },
+            }
+        }
+        attrs = _build_internal_log_attributes(event_data, "score", evaluator_config, {})
+        assert attrs["gen_ai.evaluation.min_value"] == "1.0"
+        assert attrs["gen_ai.evaluation.max_value"] == "5.0"
+        assert attrs["gen_ai.evaluation.desirable_direction"] == "increase"
+        assert attrs["gen_ai.evaluation.type"] == "ordinal"
+
+
 @pytest.mark.unittest
 class TestExtractTestingCriteriaMetadataPassThreshold:
     """Tests for pass_threshold propagation in _extract_testing_criteria_metadata."""