
Commit 1fa00e1

[Evaluation] Fix unhashable list crash in binary aggregation (#46743)
* [Evaluation] Fix unhashable list crash in binary aggregation

  Wrap `value_counts().to_dict()` in `_aggregation_binary_output` in a try/except for `TypeError`. Columns matching `outputs.*_result` whose values are unhashable (e.g. lists) are now skipped with a warning instead of aborting the entire `evaluate()` call with `EvaluationException: (InternalError) unhashable type: 'list'`. Adds a unit test covering a mixed DataFrame (one valid pass/fail column plus one list-valued column) and a CHANGELOG entry under 1.16.7 (Unreleased).

* [Evaluation] Assert warning is emitted for unhashable result columns

Co-authored-by: Manas Kawale <manaskawale@microsoft.com>
1 parent fb275f9 commit 1fa00e1

3 files changed, 46 additions and 2 deletions


sdk/evaluation/azure-ai-evaluation/CHANGELOG.md

Lines changed: 1 addition & 0 deletions

```diff
@@ -14,6 +14,7 @@

 ### Bugs Fixed

+- Fixed `evaluate()` raising `EvaluationException: (InternalError) unhashable type: 'list'` when an evaluator emitted a list value under a `_result`-suffixed column. Binary aggregation now skips such columns with a warning instead of aborting the entire run.
 - Fixed row classification double-counting in `_calculate_aoai_evaluation_summary` where errored rows were counted separately and could also be counted as passed/failed. Rows are now classified into mutually exclusive buckets with priority: passed > failed > errored > skipped.
 - Fixed row classification where rows with empty or missing results lists were incorrectly counted as "passed" (the condition `passed_count == len(results) - error_count` evaluated `0 == 0` as True).
 - Fixed `_get_metric_result` prefix matching where shorter metric names (e.g., `xpia`) could match before longer, more-specific ones (e.g., `xpia_manipulated_content`). Now sorts by length descending for correct longest-prefix matching.
```
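The longest-prefix matching fix in the last changelog entry can be sketched in isolation. This is a hedged illustration, not the SDK's actual `_get_metric_result` code; the function and variable names here are hypothetical:

```python
# Illustrative sketch of longest-prefix matching (names hypothetical, not the
# SDK's implementation). Sorting candidate names by length descending ensures
# "xpia_manipulated_content" is tried before its shorter prefix "xpia".
metric_names = ["xpia", "xpia_manipulated_content"]

def match_metric(column: str, names: list) -> str:
    # Try the most specific (longest) name first so it wins over shorter prefixes.
    for name in sorted(names, key=len, reverse=True):
        if column.startswith(name):
            return name
    return ""

print(match_metric("xpia_manipulated_content_score", metric_names))
# -> "xpia_manipulated_content" (an unsorted scan could wrongly return "xpia")
```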

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py

Lines changed: 13 additions & 2 deletions
```diff
@@ -276,8 +276,19 @@ def _aggregation_binary_output(df: pd.DataFrame) -> Dict[str, float]:
             )
             continue
         if evaluator_name:
-            # Count the occurrences of each unique value (pass/fail)
-            value_counts = df[col].value_counts().to_dict()
+            try:
+                # Count the occurrences of each unique value (pass/fail)
+                value_counts = df[col].value_counts().to_dict()
+            except TypeError as ex:
+                # Column contains unhashable values (e.g., lists/dicts) and is therefore
+                # not a binary pass/fail result column. Skip it instead of aborting the
+                # entire evaluation aggregation.
+                LOGGER.warning(
+                    "Skipping column '%s' for binary aggregation due to unhashable values: %s",
+                    col,
+                    ex,
+                )
+                continue

             # Calculate the proportion of EVALUATION_PASS_FAIL_MAPPING[True] results
             total_rows = len(df)
```
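The failure mode this patch guards against is easy to reproduce outside the SDK: pandas counts values by hashing them, so a column of Python lists makes `value_counts()` raise `TypeError`. A minimal standalone sketch (column names illustrative) of the crash and the skip behavior:

```python
import pandas as pd

# A list-valued column cannot be hashed, so value_counts() raises
# TypeError: unhashable type: 'list'.
df = pd.DataFrame({
    "outputs.good_eval.metric_result": ["pass", "fail"],
    "outputs.bad_eval.metric_result": [["a"], ["b"]],
})

counts = {}
for col in df.columns:
    try:
        counts[col] = df[col].value_counts().to_dict()
    except TypeError:
        # Unhashable values: skip this column instead of failing, mirroring the fix.
        continue

print(sorted(counts))  # only the hashable pass/fail column survives
```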

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py

Lines changed: 32 additions & 0 deletions
```diff
@@ -740,6 +740,38 @@ def test_general_aggregation(self):
         assert "bad_thing.boolean_with_nan" not in aggregation
         assert "bad_thing.boolean_with_none" not in aggregation

+    def test_binary_aggregation_skips_unhashable_result_columns(self, caplog):
+        """A `_result` column containing list values must not crash binary aggregation."""
+        data = {
+            # Valid binary pass/fail column - should be aggregated.
+            "outputs.good_eval.metric_result": ["pass", "pass", "fail", "pass"],
+            # Malformed column whose values are lists (unhashable) - should be skipped
+            # with a warning instead of raising TypeError: unhashable type: 'list'.
+            "outputs.bad_eval.metric_result": [["a"], ["b"], ["c"], ["d"]],
+        }
+        data_df = pd.DataFrame(data)
+
+        with caplog.at_level(logging.WARNING, logger="azure.ai.evaluation._evaluate._evaluate"):
+            aggregation = _aggregate_metrics(data_df, {})
+
+        assert "good_eval.binary_aggregate" in aggregation
+        assert aggregation["good_eval.binary_aggregate"] == 0.75
+        assert "bad_eval.binary_aggregate" not in aggregation
+
+        # The malformed column must be reported via a warning so silent drops are
+        # caught by this regression test.
+        unhashable_warnings = [
+            record
+            for record in caplog.records
+            if record.levelno == logging.WARNING
+            and "outputs.bad_eval.metric_result" in record.getMessage()
+            and "unhashable" in record.getMessage()
+        ]
+        assert unhashable_warnings, (
+            "Expected a warning mentioning 'outputs.bad_eval.metric_result' and 'unhashable', "
+            f"got: {[r.getMessage() for r in caplog.records]}"
+        )
+
     def test_aggregate_label_defect_metrics_with_nan_in_details(self):
         """Test that NaN/None values in details column are properly ignored during aggregation."""
         data = {
```
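The `0.75` the test expects can be recomputed by hand from the same `value_counts()` call the aggregation wraps. A hedged sketch, assuming `EVALUATION_PASS_FAIL_MAPPING[True]` resolves to the string `"pass"` (an assumption about the SDK constant, not verified here):

```python
import pandas as pd

# Recompute the expected binary aggregate for the test's good column:
# 3 "pass" results out of 4 rows.
col = pd.Series(["pass", "pass", "fail", "pass"])
value_counts = col.value_counts().to_dict()

# Assumed pass label; the SDK reads it from EVALUATION_PASS_FAIL_MAPPING[True].
pass_label = "pass"
binary_aggregate = value_counts.get(pass_label, 0) / len(col)

print(binary_aggregate)  # 3 / 4 = 0.75
```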
