Skip to content

Commit ae7e4d7

Browse files
merge master
2 parents 571ddf6 + 45abbe3 commit ae7e4d7

File tree

8 files changed

+218
-28
lines changed

8 files changed

+218
-28
lines changed

sentry_sdk/_werkzeug.py

Lines changed: 22 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
from typing import Dict
3939
from typing import Iterator
4040
from typing import Tuple
41+
from typing import Optional
4142

4243

4344
#
@@ -62,35 +63,41 @@ def _get_headers(environ: "Dict[str, str]") -> "Iterator[Tuple[str, str]]":
6263
yield key.replace("_", "-").title(), value
6364

6465

65-
#
66+
def _strip_default_port(host: str, scheme: "Optional[str]") -> str:
67+
"""Strip the port from the host if it's the default for the scheme."""
68+
if scheme == "http" and host.endswith(":80"):
69+
return host[:-3]
70+
if scheme == "https" and host.endswith(":443"):
71+
return host[:-4]
72+
return host
73+
74+
6675
# `get_host` comes from `werkzeug.wsgi.get_host`
6776
# https://github.com/pallets/werkzeug/blob/1.0.1/src/werkzeug/wsgi.py#L145
68-
#
77+
78+
6979
def get_host(environ: "Dict[str, str]", use_x_forwarded_for: bool = False) -> str:
    """
    Return the host for the given WSGI environment.

    Lookup order:

    1. ``HTTP_X_FORWARDED_HOST`` (only when ``use_x_forwarded_for`` is true),
    2. ``HTTP_HOST``,
    3. ``SERVER_NAME`` (plus ``SERVER_PORT`` when it is not the default port),
    4. the literal string ``"unknown"``.

    For the two header-based sources the default port is stripped according
    to the effective scheme, which is ``HTTP_X_FORWARDED_PROTO`` when
    ``use_x_forwarded_for`` is true and that header is present, otherwise
    ``wsgi.url_scheme``.

    :param environ: WSGI environ mapping.
    :param use_x_forwarded_for: Whether to trust the ``X-Forwarded-*`` headers.
    :return: Host name, including a port only when it is non-default.
    """
    scheme = environ.get("wsgi.url_scheme")
    if use_x_forwarded_for:
        scheme = environ.get("HTTP_X_FORWARDED_PROTO", scheme)

    if use_x_forwarded_for and "HTTP_X_FORWARDED_HOST" in environ:
        return _strip_default_port(environ["HTTP_X_FORWARDED_HOST"], scheme)

    if environ.get("HTTP_HOST"):
        return _strip_default_port(environ["HTTP_HOST"], scheme)

    server_name = environ.get("SERVER_NAME")
    if not server_name:
        # In spite of the WSGI spec, SERVER_NAME might not be present.
        return "unknown"

    # SERVER_NAME/SERVER_PORT describe the internal server, so use
    # wsgi.url_scheme (not the forwarded scheme) for port decisions.
    # Both keys are read with .get(): a non-compliant environ must not make
    # host detection raise KeyError.
    server_port = environ.get("SERVER_PORT")
    if not server_port:
        return server_name

    if (environ.get("wsgi.url_scheme"), server_port) in (
        ("https", "443"),
        ("http", "80"),
    ):
        return server_name
    return f"{server_name}:{server_port}"

sentry_sdk/ai/consts.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
import re
2+
3+
# Matches data URLs with base64-encoded content, e.g. "data:image/png;base64,iVBORw0K..."
# The pattern is assembled from adjacent raw-string pieces, one per URL component.
DATA_URL_BASE64_REGEX = re.compile(
    r"^data:"
    # MIME type: "<type>/<subtype>"
    r"(?:[a-zA-Z0-9][a-zA-Z0-9.+\-]*/[a-zA-Z0-9][a-zA-Z0-9.+\-]*)"
    # Optional ";name=value" parameters
    r"(?:;[a-zA-Z0-9\-]+=[^;,]*)*"
    # Mandatory base64 marker
    r";base64,"
    # Payload: standard or URL-safe base64 alphabet with up to two "=" of padding
    r"(?:[A-Za-z0-9+/\-_]+={0,2})$"
)

sentry_sdk/ai/utils.py

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from typing import TYPE_CHECKING
55

66
from sentry_sdk._types import BLOB_DATA_SUBSTITUTE
7+
from sentry_sdk.ai.consts import DATA_URL_BASE64_REGEX
78

89
if TYPE_CHECKING:
910
from typing import Any, Callable, Dict, List, Optional, Tuple
@@ -588,6 +589,20 @@ def _find_truncation_index(messages: "List[Dict[str, Any]]", max_bytes: int) ->
588589
return 0
589590

590591

592+
def _is_image_type_with_blob_content(item: "Dict[str, Any]") -> bool:
593+
"""
594+
Some content blocks contain an image_url property with base64 content as its value.
595+
This is used to identify those while not leading to unnecessary copying of data when the image URL does not contain base64 content.
596+
"""
597+
if item.get("type") != "image_url":
598+
return False
599+
600+
image_url = item.get("image_url", {}).get("url", "")
601+
data_url_match = DATA_URL_BASE64_REGEX.match(image_url)
602+
603+
return bool(data_url_match)
604+
605+
591606
def redact_blob_message_parts(
592607
messages: "List[Dict[str, Any]]",
593608
) -> "List[Dict[str, Any]]":
@@ -640,7 +655,9 @@ def redact_blob_message_parts(
640655
content = message.get("content")
641656
if isinstance(content, list):
642657
for item in content:
643-
if isinstance(item, dict) and item.get("type") == "blob":
658+
if isinstance(item, dict) and (
659+
item.get("type") == "blob" or _is_image_type_with_blob_content(item)
660+
):
644661
has_blobs = True
645662
break
646663
if has_blobs:
@@ -661,8 +678,11 @@ def redact_blob_message_parts(
661678
content = message.get("content")
662679
if isinstance(content, list):
663680
for item in content:
664-
if isinstance(item, dict) and item.get("type") == "blob":
665-
item["content"] = BLOB_DATA_SUBSTITUTE
681+
if isinstance(item, dict):
682+
if item.get("type") == "blob":
683+
item["content"] = BLOB_DATA_SUBSTITUTE
684+
elif _is_image_type_with_blob_content(item):
685+
item["image_url"]["url"] = BLOB_DATA_SUBSTITUTE
666686

667687
return messages_copy
668688

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1 @@
1-
import re
2-
31
SPAN_ORIGIN = "auto.ai.pydantic_ai"
4-
5-
# Matches data URLs with base64-encoded content, e.g. "data:image/png;base64,iVBORw0K..."
6-
DATA_URL_BASE64_REGEX = re.compile(
7-
r"^data:(?:[a-zA-Z0-9][a-zA-Z0-9.+\-]*/[a-zA-Z0-9][a-zA-Z0-9.+\-]*)(?:;[a-zA-Z0-9\-]+=[^;,]*)*;base64,(?:[A-Za-z0-9+/\-_]+={0,2})$"
8-
)

sentry_sdk/integrations/pydantic_ai/spans/utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from sentry_sdk.ai.utils import get_modality_from_mime_type
66
from sentry_sdk.consts import SPANDATA
77

8-
from ..consts import DATA_URL_BASE64_REGEX
8+
from sentry_sdk.ai.consts import DATA_URL_BASE64_REGEX
99

1010
from typing import TYPE_CHECKING
1111

sentry_sdk/integrations/wsgi.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,12 @@ def get_request_url(
5757
path_info = environ.get("PATH_INFO", "").lstrip("/")
5858
path = f"{script_name}/{path_info}"
5959

60+
scheme = environ.get("wsgi.url_scheme")
61+
if use_x_forwarded_for:
62+
scheme = environ.get("HTTP_X_FORWARDED_PROTO", scheme)
63+
6064
return "%s://%s/%s" % (
61-
environ.get("wsgi.url_scheme"),
65+
scheme,
6266
get_host(environ, use_x_forwarded_for),
6367
wsgi_decoding_dance(path).lstrip("/"),
6468
)

tests/integrations/wsgi/test_wsgi.py

Lines changed: 96 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,11 @@
66

77
import sentry_sdk
88
from sentry_sdk import capture_message
9-
from sentry_sdk.integrations.wsgi import SentryWsgiMiddleware, _ScopedResponse
9+
from sentry_sdk.integrations.wsgi import (
10+
SentryWsgiMiddleware,
11+
_ScopedResponse,
12+
get_request_url,
13+
)
1014

1115

1216
@pytest.fixture
@@ -547,3 +551,94 @@ def app(environ, start_response):
547551
assert isinstance(result, _ScopedResponse)
548552
else:
549553
assert result is response_mock
554+
555+
556+
@pytest.mark.parametrize(
    "environ,use_x_forwarded_for,expected_url",
    [
        # Without use_x_forwarded_for, wsgi.url_scheme is used
        pytest.param(
            {
                "wsgi.url_scheme": "http",
                "SERVER_NAME": "example.com",
                "SERVER_PORT": "80",
                "PATH_INFO": "/test",
                "HTTP_X_FORWARDED_PROTO": "https",
            },
            False,
            "http://example.com/test",
            id="ignores_forwarded_proto_when_disabled",
        ),
        # With use_x_forwarded_for, HTTP_X_FORWARDED_PROTO is respected
        pytest.param(
            {
                "wsgi.url_scheme": "http",
                "SERVER_NAME": "example.com",
                "SERVER_PORT": "80",
                "PATH_INFO": "/test",
                "HTTP_X_FORWARDED_PROTO": "https",
            },
            True,
            "https://example.com/test",
            id="respects_forwarded_proto_when_enabled",
        ),
        # With use_x_forwarded_for but no forwarded proto, wsgi.url_scheme is used
        pytest.param(
            {
                "wsgi.url_scheme": "http",
                "SERVER_NAME": "example.com",
                "SERVER_PORT": "80",
                "PATH_INFO": "/test",
            },
            True,
            "http://example.com/test",
            id="falls_back_to_url_scheme_when_no_forwarded_proto",
        ),
        # Forwarded host with default https port is stripped using forwarded proto
        pytest.param(
            {
                "wsgi.url_scheme": "http",
                "SERVER_NAME": "internal",
                "SERVER_PORT": "80",
                "PATH_INFO": "/test",
                "HTTP_X_FORWARDED_PROTO": "https",
                "HTTP_X_FORWARDED_HOST": "example.com:443",
            },
            True,
            "https://example.com/test",
            id="strips_default_https_port_from_forwarded_host",
        ),
        # Forwarded host with non-default port is preserved
        pytest.param(
            {
                "wsgi.url_scheme": "http",
                "SERVER_NAME": "internal",
                "SERVER_PORT": "80",
                "PATH_INFO": "/test",
                "HTTP_X_FORWARDED_PROTO": "https",
                "HTTP_X_FORWARDED_HOST": "example.com:8443",
            },
            True,
            "https://example.com:8443/test",
            id="preserves_non_default_port_on_forwarded_host",
        ),
        # Forwarded proto with HTTP_HOST (no forwarded host) strips default port
        pytest.param(
            {
                "wsgi.url_scheme": "http",
                "HTTP_HOST": "example.com:443",
                "SERVER_NAME": "internal",
                "SERVER_PORT": "80",
                "PATH_INFO": "/test",
                "HTTP_X_FORWARDED_PROTO": "https",
            },
            True,
            "https://example.com/test",
            id="strips_default_port_from_http_host_with_forwarded_proto",
        ),
    ],
)
def test_get_request_url_x_forwarded_proto(environ, use_x_forwarded_for, expected_url):
    """Check the URL built by get_request_url for each forwarded-proto/host scenario."""
    actual_url = get_request_url(environ, use_x_forwarded_for)
    assert actual_url == expected_url

tests/test_ai_monitoring.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -814,6 +814,71 @@ def test_redacts_blobs_in_multiple_messages(self):
814814
assert result[1]["content"] == "I see the image." # Unchanged
815815
assert result[2]["content"][1]["content"] == BLOB_DATA_SUBSTITUTE
816816

817+
def test_redacts_single_blob_within_image_url_content(self):
    """A base64 data URL inside an image_url block is redacted in the result only."""
    import copy

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "text": "How many ponies do you see in the image?",
                    "type": "text",
                },
                {
                    "type": "image_url",
                    "image_url": {"url": "data:image/jpeg;base64,/9j/4AAQSkZJRg=="},
                },
            ],
        }
    ]

    # Snapshot by value: comparing against a plain reference to the same dict
    # would pass even if the input were mutated in place.
    original_blob_content = copy.deepcopy(messages[0]["content"][1])

    result = redact_blob_message_parts(messages)

    # The caller's input must be left untouched.
    assert messages[0]["content"][1] == original_blob_content

    assert (
        result[0]["content"][0]["text"]
        == "How many ponies do you see in the image?"
    )
    assert result[0]["content"][0]["type"] == "text"
    assert result[0]["content"][1]["type"] == "image_url"
    assert result[0]["content"][1]["image_url"]["url"] == BLOB_DATA_SUBSTITUTE
848+
def test_does_not_redact_image_url_content_with_non_blobs(self):
    """An image_url block holding a regular (non-data) URL is left as-is."""
    import copy

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "text": "How many ponies do you see in the image?",
                    "type": "text",
                },
                {
                    "type": "image_url",
                    "image_url": {"url": "https://example.com/image.jpg"},
                },
            ],
        }
    ]

    # Snapshot by value: comparing against a plain reference to the same dict
    # would pass even if the input were mutated in place.
    original_blob_content = copy.deepcopy(messages[0]["content"][1])

    result = redact_blob_message_parts(messages)

    # The caller's input must be left untouched.
    assert messages[0]["content"][1] == original_blob_content

    assert (
        result[0]["content"][0]["text"]
        == "How many ponies do you see in the image?"
    )
    assert result[0]["content"][0]["type"] == "text"
    assert result[0]["content"][1]["type"] == "image_url"
    assert (
        result[0]["content"][1]["image_url"]["url"]
        == "https://example.com/image.jpg"
    )
817882
def test_no_blobs_returns_original_list(self):
818883
"""Test that messages without blobs are returned as-is (performance optimization)"""
819884
messages = [

0 commit comments

Comments
 (0)