[Evaluation] Fix UTF-8 encoding for red team JSONL files on Windows (#45500)

slister1001 · Copilot · web-flow · commit 6a690955c5df · 2026-03-10T03:22:49.000Z
* Fix UTF-8 encoding for red team JSONL files on Windows

Add explicit encoding='utf-8' to all file open() calls in the PyRIT result
processing path. Without this, Windows defaults to the system locale encoding
(charmap/cp1252), causing UnicodeDecodeError when reading JSONL files containing
non-ASCII characters from UnicodeConfusable strategy or CJK languages.

Fixes: Tests 1.7 (UnicodeConfusable), 1.16 (Japanese/Chinese)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Add encoding regression tests for non-ASCII JSONL round-trip

Test CJK characters, Unicode confusables, and mixed scripts to prevent
future regressions of the charmap encoding bug on Windows.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Format with black

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Address review comments: test production code paths, consolidate CHANGELOG

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Apply black formatting

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

---------

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
diff --git a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
@@ -3,6 +3,7 @@
 ## 1.16.0 (Unreleased)
 
 ### Bugs Fixed
+- Fixed `UnicodeDecodeError` on Windows when reading red team JSONL files containing non-ASCII characters (UnicodeConfusable strategy, CJK languages) by adding explicit `encoding="utf-8"` to all file open calls in the result processing path.
 - Fixed `NotFoundError: 404` when using `model_config` dict target with Foundry-style endpoints (`*.services.ai.azure.com`) by appending `/openai/v1` to the endpoint URL for PyRIT compatibility.
 - Fixed red team scan status stuck at `in_progress` in results.json despite the scan completing, by treating leftover `pending` entries as `failed`.
 - Fixed `ungrounded_attributes` risk category being silently skipped due to a cache key mismatch (`isa` vs `ungrounded_attributes`) in the Foundry execution path.
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_result_processor.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_result_processor.py
@@ -199,7 +199,7 @@ def to_red_team_result(
                 # Process data file to extract conversations
                 if data_file and os.path.exists(data_file):
                     try:
-                        with open(data_file, "r") as f:
+                        with open(data_file, "r", encoding="utf-8") as f:
                             for line in f:
                                 try:
                                     conv_data = json.loads(line)
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_utils/formatting_utils.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_utils/formatting_utils.py
@@ -303,7 +303,7 @@ def write_pyrit_outputs_to_file(
     if os.path.exists(output_path):
         existing_line_count = 0
         try:
-            with open(output_path, "r") as existing_file:
+            with open(output_path, "r", encoding="utf-8") as existing_file:
                 existing_line_count = sum(1 for _ in existing_file)
 
             if len(conversations) > existing_line_count:
@@ -335,7 +335,7 @@ def write_pyrit_outputs_to_file(
                         if risk_sub_type:
                             conv_dict["risk_sub_type"] = risk_sub_type
                     json_lines += json.dumps(conv_dict) + "\n"
-                with Path(output_path).open("w") as f:
+                with Path(output_path).open("w", encoding="utf-8") as f:
                     f.writelines(json_lines)
                 logger.debug(
                     f"Successfully wrote {len(conversations)-existing_line_count} new conversation(s) to {output_path}"
@@ -375,7 +375,7 @@ def write_pyrit_outputs_to_file(
                 if risk_sub_type:
                     conv_dict["risk_sub_type"] = risk_sub_type
             json_lines += json.dumps(conv_dict) + "\n"
-        with Path(output_path).open("w") as f:
+        with Path(output_path).open("w", encoding="utf-8") as f:
             f.writelines(json_lines)
         logger.debug(f"Successfully wrote {len(conversations)} conversations to {output_path}")
     return str(output_path)
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_redteam/test_formatting_utils.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_redteam/test_formatting_utils.py
@@ -5,6 +5,7 @@
 import pytest
 import math
 import json
+import logging
 from unittest.mock import patch, MagicMock, mock_open
 from azure.ai.evaluation.red_team._utils.formatting_utils import (
     message_to_dict,
@@ -14,6 +15,7 @@
     format_scorecard,
     is_none_or_nan,
     list_mean_nan_safe,
+    write_pyrit_outputs_to_file,
 )
 from azure.ai.evaluation.red_team._attack_strategy import AttackStrategy
 from pyrit.models import ChatMessage
@@ -229,3 +231,111 @@ def test_list_mean_nan_safe_empty_after_filtering(self):
         """Test list_mean_nan_safe with a list that is empty after filtering."""
         result = list_mean_nan_safe([None, float("nan")])
         assert result == 0.0  # Default when no valid values
+
+
+def _make_mock_pieces(conversation_id, messages):
+    """Create mock prompt request pieces for a conversation.
+
+    :param conversation_id: The conversation ID to assign to all pieces
+    :param messages: List of (role, content) tuples
+    :return: List of mock PromptRequestPiece objects
+    """
+    pieces = []
+    for role, content in messages:
+        piece = MagicMock()
+        piece.conversation_id = conversation_id
+        piece.original_value = content
+        piece.labels = {
+            "context": "",
+            "tool_calls": [],
+            "risk_sub_type": None,
+            "token_usage": None,
+        }
+        chat_msg = MagicMock(spec=ChatMessage)
+        chat_msg.role = role
+        chat_msg.content = content
+        piece.to_chat_message.return_value = chat_msg
+        pieces.append(piece)
+    return pieces
+
+
+@pytest.mark.unittest
+class TestUnicodeJSONLRoundTrip:
+    """Test that JSONL files with non-ASCII content survive write/read round-trips.
+
+    Regression tests for the encoding bug where open() without encoding='utf-8'
+    caused UnicodeDecodeError on Windows for UnicodeConfusable and CJK content.
+    These tests exercise the production write_pyrit_outputs_to_file code path.
+    """
+
+    def test_jsonl_roundtrip_cjk_characters(self, tmp_path):
+        """Test JSONL round-trip with CJK characters (Japanese, Chinese)."""
+        output_path = str(tmp_path / "cjk_test.jsonl")
+        pieces = _make_mock_pieces(
+            "conv-cjk",
+            [("user", "これはテストです"), ("assistant", "这是一个测试")],
+        )
+        mock_memory = MagicMock()
+        mock_memory.get_prompt_request_pieces.return_value = pieces
+
+        with patch("azure.ai.evaluation.red_team._utils.formatting_utils.CentralMemory") as mock_cm:
+            mock_cm.get_memory_instance.return_value = mock_memory
+            write_pyrit_outputs_to_file(
+                output_path=output_path,
+                logger=logging.getLogger("test"),
+                prompt_to_context={},
+            )
+
+        with open(output_path, "r", encoding="utf-8") as f:
+            data = json.loads(f.readline())
+            assert data["conversation"]["messages"][0]["content"] == "これはテストです"
+            assert data["conversation"]["messages"][1]["content"] == "这是一个测试"
+
+    def test_jsonl_roundtrip_unicode_confusable(self, tmp_path):
+        """Test JSONL round-trip with Unicode confusable characters."""
+        output_path = str(tmp_path / "confusable_test.jsonl")
+        confusable_text = "Ⓗⓔⓛⓛⓞ ⓦⓞⓡⓛⓓ"
+        pieces = _make_mock_pieces(
+            "conv-confusable",
+            [("user", confusable_text), ("assistant", "I understand your request.")],
+        )
+        mock_memory = MagicMock()
+        mock_memory.get_prompt_request_pieces.return_value = pieces
+
+        with patch("azure.ai.evaluation.red_team._utils.formatting_utils.CentralMemory") as mock_cm:
+            mock_cm.get_memory_instance.return_value = mock_memory
+            write_pyrit_outputs_to_file(
+                output_path=output_path,
+                logger=logging.getLogger("test"),
+                prompt_to_context={},
+            )
+
+        with open(output_path, "r", encoding="utf-8") as f:
+            data = json.loads(f.readline())
+            assert data["conversation"]["messages"][0]["content"] == confusable_text
+
+    def test_jsonl_roundtrip_mixed_scripts(self, tmp_path):
+        """Test JSONL round-trip with mixed scripts (Arabic, Cyrillic, emoji)."""
+        output_path = str(tmp_path / "mixed_test.jsonl")
+        pieces = _make_mock_pieces(
+            "conv-mixed",
+            [
+                ("user", "مرحبا Привет 🔥 café"),
+                ("assistant", "Multi-script response: αβγ"),
+            ],
+        )
+        mock_memory = MagicMock()
+        mock_memory.get_prompt_request_pieces.return_value = pieces
+
+        with patch("azure.ai.evaluation.red_team._utils.formatting_utils.CentralMemory") as mock_cm:
+            mock_cm.get_memory_instance.return_value = mock_memory
+            write_pyrit_outputs_to_file(
+                output_path=output_path,
+                logger=logging.getLogger("test"),
+                prompt_to_context={},
+            )
+
+        with open(output_path, "r", encoding="utf-8") as f:
+            data = json.loads(f.readline())
+            assert "مرحبا" in data["conversation"]["messages"][0]["content"]
+            assert "αβγ" in data["conversation"]["messages"][1]["content"]