Skip to content

Commit 9c360eb

Browse files
authored
feat(ai): Redact base64 data URLs in image_url content blocks (#5953)
Extend `redact_blob_message_parts` to detect and redact base64 data URLs inside `image_url` content blocks (e.g. `data:image/jpeg;base64,...`), in addition to the existing `blob` type handling. Some AI integrations send image content as `image_url` items with inline base64 data URLs rather than the `blob` content type. Without this change, those base64 payloads are sent as span data, which inflates event size and can leak image content. Also moves `DATA_URL_BASE64_REGEX` from `sentry_sdk/integrations/pydantic_ai/consts.py` to `sentry_sdk/ai/consts.py` since it's now shared across AI monitoring code beyond pydantic_ai. Fixes PY-2280 and #5948.
1 parent fc83474 commit 9c360eb

File tree

5 files changed

+95
-11
lines changed

5 files changed

+95
-11
lines changed

sentry_sdk/ai/consts.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
import re
2+
3+
# Matches data URLs with base64-encoded content, e.g. "data:image/png;base64,iVBORw0K..."
#
# Pattern layout, left to right:
#   "data:"  +  "<type>/<subtype>"  +  zero or more ";name=value" parameters
#   +  ";base64,"  +  the payload.
# The payload character class accepts both the standard ("+", "/") and the
# URL-safe ("-", "_") base64 alphabets, followed by at most two "=" padding
# characters. The pattern is anchored at both ends, so a data URL embedded in
# a longer string does not match.
DATA_URL_BASE64_REGEX = re.compile(
    r"^data:(?:[a-zA-Z0-9][a-zA-Z0-9.+\-]*/[a-zA-Z0-9][a-zA-Z0-9.+\-]*)(?:;[a-zA-Z0-9\-]+=[^;,]*)*;base64,(?:[A-Za-z0-9+/\-_]+={0,2})$"
)

sentry_sdk/ai/utils.py

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from typing import TYPE_CHECKING
55

66
from sentry_sdk._types import BLOB_DATA_SUBSTITUTE
7+
from sentry_sdk.ai.consts import DATA_URL_BASE64_REGEX
78

89
if TYPE_CHECKING:
910
from typing import Any, Callable, Dict, List, Optional, Tuple
@@ -588,6 +589,20 @@ def _find_truncation_index(messages: "List[Dict[str, Any]]", max_bytes: int) ->
588589
return 0
589590

590591

592+
def _is_image_type_with_blob_content(item: "Dict[str, Any]") -> bool:
593+
"""
594+
Some content blocks contain an image_url property with base64 content as its value.
595+
This is used to identify those while not leading to unnecessary copying of data when the image URL does not contain base64 content.
596+
"""
597+
if item.get("type") != "image_url":
598+
return False
599+
600+
image_url = item.get("image_url", {}).get("url", "")
601+
data_url_match = DATA_URL_BASE64_REGEX.match(image_url)
602+
603+
return bool(data_url_match)
604+
605+
591606
def redact_blob_message_parts(
592607
messages: "List[Dict[str, Any]]",
593608
) -> "List[Dict[str, Any]]":
@@ -640,7 +655,9 @@ def redact_blob_message_parts(
640655
content = message.get("content")
641656
if isinstance(content, list):
642657
for item in content:
643-
if isinstance(item, dict) and item.get("type") == "blob":
658+
if isinstance(item, dict) and (
659+
item.get("type") == "blob" or _is_image_type_with_blob_content(item)
660+
):
644661
has_blobs = True
645662
break
646663
if has_blobs:
@@ -661,8 +678,11 @@ def redact_blob_message_parts(
661678
content = message.get("content")
662679
if isinstance(content, list):
663680
for item in content:
664-
if isinstance(item, dict) and item.get("type") == "blob":
665-
item["content"] = BLOB_DATA_SUBSTITUTE
681+
if isinstance(item, dict):
682+
if item.get("type") == "blob":
683+
item["content"] = BLOB_DATA_SUBSTITUTE
684+
elif _is_image_type_with_blob_content(item):
685+
item["image_url"]["url"] = BLOB_DATA_SUBSTITUTE
666686

667687
return messages_copy
668688

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1 @@
1-
import re
2-
31
SPAN_ORIGIN = "auto.ai.pydantic_ai"
4-
5-
# Matches data URLs with base64-encoded content, e.g. "data:image/png;base64,iVBORw0K..."
6-
DATA_URL_BASE64_REGEX = re.compile(
7-
r"^data:(?:[a-zA-Z0-9][a-zA-Z0-9.+\-]*/[a-zA-Z0-9][a-zA-Z0-9.+\-]*)(?:;[a-zA-Z0-9\-]+=[^;,]*)*;base64,(?:[A-Za-z0-9+/\-_]+={0,2})$"
8-
)

sentry_sdk/integrations/pydantic_ai/spans/utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from sentry_sdk.ai.utils import get_modality_from_mime_type
66
from sentry_sdk.consts import SPANDATA
77

8-
from ..consts import DATA_URL_BASE64_REGEX
8+
from sentry_sdk.ai.consts import DATA_URL_BASE64_REGEX
99

1010
from typing import TYPE_CHECKING
1111

tests/test_ai_monitoring.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -814,6 +814,71 @@ def test_redacts_blobs_in_multiple_messages(self):
814814
assert result[1]["content"] == "I see the image." # Unchanged
815815
assert result[2]["content"][1]["content"] == BLOB_DATA_SUBSTITUTE
816816

817+
def test_redacts_single_blob_within_image_url_content(self):
    """A base64 data URL inside an ``image_url`` block is redacted in the result while the input stays untouched."""
    data_url = "data:image/jpeg;base64,/9j/4AAQSkZJRg=="
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "text": "How many ponies do you see in the image?",
                    "type": "text",
                },
                {
                    "type": "image_url",
                    "image_url": {"url": data_url},
                },
            ],
        }
    ]

    result = redact_blob_message_parts(messages)

    # Compare the input against an independent literal. Comparing it with a
    # reference taken from `messages` itself would pass even if the function
    # had mutated the caller's data in place.
    assert messages[0]["content"][1] == {
        "type": "image_url",
        "image_url": {"url": data_url},
    }

    assert (
        result[0]["content"][0]["text"]
        == "How many ponies do you see in the image?"
    )
    assert result[0]["content"][0]["type"] == "text"
    assert result[0]["content"][1]["type"] == "image_url"
    assert result[0]["content"][1]["image_url"]["url"] == BLOB_DATA_SUBSTITUTE
847+
848+
def test_does_not_redact_image_url_content_with_non_blobs(self):
    """An ``image_url`` block pointing at a regular (non-data) URL is passed through unredacted."""
    regular_url = "https://example.com/image.jpg"
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "text": "How many ponies do you see in the image?",
                    "type": "text",
                },
                {
                    "type": "image_url",
                    "image_url": {"url": regular_url},
                },
            ],
        }
    ]

    result = redact_blob_message_parts(messages)

    # Compare the input against an independent literal. Comparing it with a
    # reference taken from `messages` itself could never detect an in-place
    # mutation of the caller's data.
    assert messages[0]["content"][1] == {
        "type": "image_url",
        "image_url": {"url": regular_url},
    }

    assert (
        result[0]["content"][0]["text"]
        == "How many ponies do you see in the image?"
    )
    assert result[0]["content"][0]["type"] == "text"
    assert result[0]["content"][1]["type"] == "image_url"
    assert (
        result[0]["content"][1]["image_url"]["url"]
        == "https://example.com/image.jpg"
    )
881+
817882
def test_no_blobs_returns_original_list(self):
818883
"""Test that messages without blobs are returned as-is (performance optimization)"""
819884
messages = [

0 commit comments

Comments
 (0)