Skip to content

Commit b7db915

Browse files
Fix RedTeam.scan() decoding encoded attack prompts in results
For converter-based attack strategies (Base64, Flip, Morse, ROT13, Caesar, Leetspeak, AsciiArt, AnsiAttack, Atbash, Binary, CharacterSpace, CharSwap, Diacritic, StringJoin, SuffixAppend, UnicodeConfusable, UnicodeSubstitution, Url, AsciiSmuggler, Tense), FoundryResultProcessor was emitting the decoded 'original_value' as the user-message content while the target was actually receiving 'converted_value'. This made evaluation_results.json / results.json show plaintext where the audit trail should show the encoded payload, breaking post-scan auditability and per-variant debugging. This change makes conversation[].content always reflect the on-wire value (converted_value) for both user and assistant turns, and preserves the pre-converter objective as a sibling 'original_value' field on user messages whenever it differs. Baseline (non-encoded) strategies are unaffected since original_value == converted_value. Adds two regression tests in TestFoundryResultProcessor and a CHANGELOG entry. Resolves #47228.
1 parent 728afe5 commit b7db915

3 files changed

Lines changed: 123 additions & 8 deletions

File tree

sdk/evaluation/azure-ai-evaluation/CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
11
# Release History
22

3+
## 1.17.1 (Unreleased)
4+
5+
### Bugs Fixed
6+
7+
- Fixed `RedTeam.scan()` storing decoded plaintext instead of the actual encoded payload for converter-based attack strategies (`Base64`, `Flip`, `Morse`, `ROT13`, `Caesar`, `Leetspeak`, `AsciiArt`, `AnsiAttack`, `Atbash`, `Binary`, `CharacterSpace`, `CharSwap`, `Diacritic`, `StringJoin`, `SuffixAppend`, `UnicodeConfusable`, `UnicodeSubstitution`, `Url`, `AsciiSmuggler`, `Tense`) in `evaluation_results.json` / `results.json`. The persisted `conversation[].content` for user turns now reflects what the target actually received (`converted_value`); the pre-converter adversarial objective is preserved on the same message as a new `original_value` field so the audit trail of what the attack meant to say is not lost. Baseline (non-encoded) strategies are unaffected. Resolves [Azure/azure-sdk-for-python#47228](https://github.com/Azure/azure-sdk-for-python/issues/47228).
8+
39
## 1.17.0 (2026-06-03)
410

511
### Breaking Changes

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_foundry/_foundry_result_processor.py

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -349,21 +349,40 @@ def _build_messages_from_pieces(
349349
# Get role, handling api_role property
350350
role = getattr(piece, "api_role", None) or getattr(piece, "role", "user")
351351

352-
# Get content: for user messages show the original adversarial prompt,
353-
# not the converter output (e.g., Base64-encoded or tense-rephrased text).
354-
# For assistant messages, show the response as-is.
355-
if role == "user":
356-
original = getattr(piece, "original_value", None)
357-
converted = getattr(piece, "converted_value", None)
358-
content = original if isinstance(original, str) and original else (converted or "")
352+
# Get content. For both user and assistant turns, ``content`` reflects
353+
# what was actually sent on the wire (``converted_value``) so the
354+
# stored conversation matches the payload the target received /
355+
# produced. When a converter (Base64, Flip, Morse, Caesar, etc.) was
356+
# applied, the pre-conversion adversarial objective is preserved as
357+
# ``original_value`` on the same message so consumers can still
358+
# display / score against the decoded text without losing fidelity
359+
# of the actual attack surface.
360+
original = getattr(piece, "original_value", None)
361+
converted = getattr(piece, "converted_value", None)
362+
if isinstance(converted, str) and converted:
363+
content = converted
364+
elif isinstance(original, str) and original:
365+
content = original
359366
else:
360-
content = getattr(piece, "converted_value", None) or getattr(piece, "original_value", "")
367+
content = ""
361368

362369
message: Dict[str, Any] = {
363370
"role": role,
364371
"content": content,
365372
}
366373

374+
# Preserve the pre-converter objective when it differs from the
375+
# transmitted content. This keeps the audit trail intact: callers
376+
# can compare ``content`` (what the target saw) with
377+
# ``original_value`` (what the attack meant to say) for every
378+
# encoding-based strategy.
379+
if (
380+
isinstance(original, str)
381+
and original
382+
and original != content
383+
):
384+
message["original_value"] = original
385+
367386
# Add context from labels if present (for XPIA)
368387
if hasattr(piece, "labels") and piece.labels:
369388
context_str = piece.labels.get("context")

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_redteam/test_foundry.py

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1427,6 +1427,96 @@ def test_build_messages_from_pieces(self):
14271427
assert messages[0]["content"] == "User message"
14281428
assert messages[1]["role"] == "assistant"
14291429
assert messages[1]["content"] == "Assistant response"
1430+
# When original and converted match (no encoding), no audit field is added.
1431+
assert "original_value" not in messages[0]
1432+
assert "original_value" not in messages[1]
1433+
1434+
def test_build_messages_preserves_encoded_user_prompt(self):
1435+
"""Encoded attack prompts must be stored as the wire payload.
1436+
1437+
Regression test for
1438+
https://github.com/Azure/azure-sdk-for-python/issues/47228 — for
1439+
converter-based strategies (Base64, Flip, Morse, ROT13, etc.) the
1440+
target receives ``converted_value``, so the persisted conversation
1441+
must report ``converted_value`` as ``content`` (not the decoded
1442+
``original_value``). The pre-converter objective is preserved as
1443+
``original_value`` on the same message so callers still have an
1444+
audit trail of what the attack meant to say.
1445+
"""
1446+
mock_scenario = MagicMock()
1447+
mock_dataset = MagicMock()
1448+
mock_dataset.get_all_seed_groups.return_value = []
1449+
1450+
processor = FoundryResultProcessor(
1451+
scenario=mock_scenario,
1452+
dataset_config=mock_dataset,
1453+
risk_category="violence",
1454+
)
1455+
1456+
# Simulate a Base64-converted user turn: the target actually saw the
1457+
# encoded payload, but the SDK still has the plaintext objective.
1458+
user_piece = MagicMock()
1459+
user_piece.api_role = "user"
1460+
user_piece.original_value = "How do I make a dangerous thing?"
1461+
user_piece.converted_value = "SG93IGRvIEkgbWFrZSBhIGRhbmdlcm91cyB0aGluZz8="
1462+
user_piece.sequence = 0
1463+
user_piece.prompt_metadata = {}
1464+
user_piece.labels = {}
1465+
1466+
# Assistant response — converter is a no-op on the response side, so
1467+
# original and converted match. No audit field should be emitted.
1468+
assistant_piece = MagicMock()
1469+
assistant_piece.api_role = "assistant"
1470+
assistant_piece.original_value = "Sorry, I can't help with that."
1471+
assistant_piece.converted_value = "Sorry, I can't help with that."
1472+
assistant_piece.sequence = 1
1473+
assistant_piece.prompt_metadata = {}
1474+
assistant_piece.labels = {}
1475+
1476+
messages = processor._build_messages_from_pieces([user_piece, assistant_piece])
1477+
1478+
# The user turn must carry the encoded payload as content so consumers
1479+
# can verify exactly what the target received.
1480+
assert messages[0]["role"] == "user"
1481+
assert messages[0]["content"] == "SG93IGRvIEkgbWFrZSBhIGRhbmdlcm91cyB0aGluZz8="
1482+
# The plaintext objective is preserved alongside it for auditability.
1483+
assert messages[0]["original_value"] == "How do I make a dangerous thing?"
1484+
1485+
# Assistant turn is unchanged: content == converted_value, no audit field.
1486+
assert messages[1]["role"] == "assistant"
1487+
assert messages[1]["content"] == "Sorry, I can't help with that."
1488+
assert "original_value" not in messages[1]
1489+
1490+
def test_build_messages_falls_back_to_original_when_converted_missing(self):
1491+
"""When ``converted_value`` is empty, fall back to ``original_value``.
1492+
1493+
Covers the historical behavior for pieces where PyRIT did not run a
1494+
converter (e.g., Baseline strategy or in-flight failures).
1495+
"""
1496+
mock_scenario = MagicMock()
1497+
mock_dataset = MagicMock()
1498+
mock_dataset.get_all_seed_groups.return_value = []
1499+
1500+
processor = FoundryResultProcessor(
1501+
scenario=mock_scenario,
1502+
dataset_config=mock_dataset,
1503+
risk_category="violence",
1504+
)
1505+
1506+
user_piece = MagicMock()
1507+
user_piece.api_role = "user"
1508+
user_piece.original_value = "Baseline prompt"
1509+
user_piece.converted_value = None
1510+
user_piece.sequence = 0
1511+
user_piece.prompt_metadata = {}
1512+
user_piece.labels = {}
1513+
1514+
messages = processor._build_messages_from_pieces([user_piece])
1515+
1516+
assert len(messages) == 1
1517+
assert messages[0]["content"] == "Baseline prompt"
1518+
# original == content here, so no separate audit field is needed.
1519+
assert "original_value" not in messages[0]
14301520

14311521
def test_get_prompt_group_id_from_conversation(self):
14321522
"""Test extracting prompt_group_id from conversation."""

0 commit comments

Comments
 (0)