diff --git a/sdk/core/corehttp/CHANGELOG.md b/sdk/core/corehttp/CHANGELOG.md index 862d230cc92d..4b21b09f5872 100644 --- a/sdk/core/corehttp/CHANGELOG.md +++ b/sdk/core/corehttp/CHANGELOG.md @@ -4,14 +4,18 @@ ### Features Added +- Introduced the keyword argument `additional_allowed_query_params` to `DistributedHttpTracingPolicy` to allow users to specify additional URL query parameters that should not be redacted in span attributes. [#46657](https://github.com/Azure/azure-sdk-for-python/pull/46657) + ### Breaking Changes ### Bugs Fixed -- Fixed `PipelineClient.format_url` to avoid adding trailing slashes when the URL template contains only query parameters. #45113 +- Fixed `PipelineClient.format_url` to avoid adding trailing slashes when the URL template contains only query parameters. [#45113](https://github.com/Azure/azure-sdk-for-python/pull/45113) ### Other Changes +- URL attributes in HTTP tracing spans will now have query parameters sanitized by default. To add additional query parameters that should not be redacted, use the `additional_allowed_query_params` argument when constructing the `DistributedHttpTracingPolicy`. [#46657](https://github.com/Azure/azure-sdk-for-python/pull/46657) + ## 1.0.0b7 (2026-02-05) ### Features Added diff --git a/sdk/core/corehttp/corehttp/runtime/policies/_distributed_tracing.py b/sdk/core/corehttp/corehttp/runtime/policies/_distributed_tracing.py index 50ff9466d3a4..516816e97dfd 100644 --- a/sdk/core/corehttp/corehttp/runtime/policies/_distributed_tracing.py +++ b/sdk/core/corehttp/corehttp/runtime/policies/_distributed_tracing.py @@ -5,15 +5,17 @@ from __future__ import annotations import logging import urllib.parse -from typing import Any, Optional, Tuple, Union, Type, Mapping, Dict, TYPE_CHECKING +from typing import Any, Iterable, MutableSet, Optional, Tuple, Union, Type, Mapping, Dict, TYPE_CHECKING from types import TracebackType from ...rest import HttpRequest from ...rest._rest_py3 import _HttpResponseBase as SansIOHttpResponse from ._base import SansIOHTTPPolicy +from ._utils import sanitize_url from ...settings import settings from ...instrumentation.tracing._models import SpanKind, TracingOptions from ...instrumentation.tracing._tracer import get_tracer +from ...utils._utils import CaseInsensitiveSet if TYPE_CHECKING: from ..pipeline import PipelineRequest, PipelineResponse @@ -31,11 +33,17 @@ class DistributedHttpTracingPolicy(SansIOHTTPPolicy[HttpRequest, SansIOHttpRespo :keyword instrumentation_config: Configuration for the instrumentation providers. :type instrumentation_config: dict[str, Any] + :keyword additional_allowed_query_params: Query parameter names whose values are allowed in recorded URLs. + These are added to the default set which includes "api-version". + :type additional_allowed_query_params: Iterable[str] """ TRACING_CONTEXT = "TRACING_CONTEXT" _SUPPRESSION_TOKEN = "SUPPRESSION_TOKEN" + DEFAULT_QUERY_PARAMS_ALLOWLIST: frozenset[str] = frozenset(["api-version"]) + _REDACTED_PLACEHOLDER = "REDACTED" + # Attribute names _HTTP_RESEND_COUNT = "http.request.resend_count" _USER_AGENT_ORIGINAL = "user_agent.original" @@ -50,9 +58,13 @@ def __init__( # pylint: disable=unused-argument self, *, instrumentation_config: Optional[Mapping[str, Any]] = None, + additional_allowed_query_params: Optional[Iterable[str]] = None, **kwargs: Any, ) -> None: self._instrumentation_config = instrumentation_config + self.allowed_query_params: MutableSet[str] = CaseInsensitiveSet(self.__class__.DEFAULT_QUERY_PARAMS_ALLOWLIST) + if additional_allowed_query_params: + self.allowed_query_params.update(additional_allowed_query_params) def on_request(self, request: PipelineRequest[HttpRequest]) -> None: """Starts a span for the network call. @@ -149,7 +161,7 @@ def _set_http_client_span_attributes( """ attributes: Dict[str, Any] = { self._HTTP_REQUEST_METHOD: request.method, - self._URL_FULL: request.url, + self._URL_FULL: sanitize_url(request.url, self.allowed_query_params, self._REDACTED_PLACEHOLDER), } parsed_url = urllib.parse.urlparse(request.url) diff --git a/sdk/core/corehttp/corehttp/runtime/policies/_utils.py b/sdk/core/corehttp/corehttp/runtime/policies/_utils.py index 74226e7c2fa4..46ea4de99c3d 100644 --- a/sdk/core/corehttp/corehttp/runtime/policies/_utils.py +++ b/sdk/core/corehttp/corehttp/runtime/policies/_utils.py @@ -26,11 +26,12 @@ from __future__ import annotations import datetime import email.utils -from typing import Optional, cast, Union, TYPE_CHECKING +import urllib.parse +from typing import AbstractSet, Optional, cast, Union, TYPE_CHECKING from urllib.parse import urlparse from ...rest import HttpResponse, AsyncHttpResponse, HttpRequest -from ...utils._utils import case_insensitive_dict +from ...utils._utils import case_insensitive_dict, CaseInsensitiveSet if TYPE_CHECKING: from ...runtime.pipeline import PipelineResponse @@ -92,3 +93,36 @@ def get_domain(url: str) -> str: :return: The domain of the url. """ return str(urlparse(url).netloc).lower() + + +def sanitize_url(url: str, allowed_query_params: AbstractSet[str], redacted_placeholder: str = "REDACTED") -> str: + """Redact query parameter values not in the allowlist. + + :param str url: The URL to sanitize. + :param set[str] allowed_query_params: Set of query parameter names whose values should not be redacted. + If a :class:`~corehttp.utils._utils.CaseInsensitiveSet` is provided, lookups are case-insensitive + without per-call normalization. + :param str redacted_placeholder: The placeholder to use for redacted values. + :return: The sanitized URL with redacted query parameter values. + :rtype: str + """ + parsed_url = urllib.parse.urlparse(url) + if not parsed_url.query: + return url + + if not isinstance(allowed_query_params, CaseInsensitiveSet): + allowed_query_params = CaseInsensitiveSet(allowed_query_params) + + parts = [] + for param in parsed_url.query.split("&"): + eq_idx = param.find("=") + if eq_idx == -1: + # No value to redact, keep as-is. + parts.append(param) + else: + key = param[:eq_idx] + parts.append(param if key in allowed_query_params else f"{key}={redacted_placeholder}") + + sanitized_query = "&".join(parts) + sanitized_url = parsed_url._replace(query=sanitized_query) + return urllib.parse.urlunparse(sanitized_url) diff --git a/sdk/core/corehttp/corehttp/utils/_utils.py b/sdk/core/corehttp/corehttp/utils/_utils.py index df4e13cf9281..9e1e39d03ec8 100644 --- a/sdk/core/corehttp/corehttp/utils/_utils.py +++ b/sdk/core/corehttp/corehttp/utils/_utils.py @@ -11,6 +11,7 @@ Iterator, Mapping, MutableMapping, + MutableSet, Optional, Tuple, Union, @@ -134,6 +135,45 @@ def __repr__(self) -> str: return str(dict(self.items())) +class CaseInsensitiveSet(MutableSet[str]): + """A set that stores values in their original form but performs + case-insensitive lookups via a pre-computed lowercase cache. + + :param data: Initial values for the set. + :type data: Iterable[str] + """ + + def __init__(self, data: Optional[Iterable[str]] = None) -> None: + self._lower_to_original: Dict[str, str] = {} + if data: + for item in data: + self.add(item) + + def __contains__(self, item: object) -> bool: + if not isinstance(item, str): + return False + return item.lower() in self._lower_to_original + + def __iter__(self) -> Iterator[str]: + return iter(self._lower_to_original.values()) + + def __len__(self) -> int: + return len(self._lower_to_original) + + def add(self, value: str) -> None: + lower = value.lower() + if lower not in self._lower_to_original: + self._lower_to_original[lower] = value + + def discard(self, value: str) -> None: + self._lower_to_original.pop(value.lower(), None) + + def update(self, *others: Iterable[str]) -> None: + for other in others: + for item in other: + self.add(item) + + def get_file_items(files: "FilesType") -> Sequence[Tuple[str, "FileType"]]: if isinstance(files, Mapping): # casting because ItemsView technically isn't a Sequence, even diff --git a/sdk/core/corehttp/tests/test_tracing_policy.py b/sdk/core/corehttp/tests/test_tracing_policy.py index 8b4770107469..7ac474702ec7 100644 --- a/sdk/core/corehttp/tests/test_tracing_policy.py +++ b/sdk/core/corehttp/tests/test_tracing_policy.py @@ -51,7 +51,7 @@ def test_distributed_tracing_policy(tracing_helper, http_response): assert traceparent.split("-")[2] == format_span_id(span_context.span_id) assert finished_spans[0].attributes.get(policy._HTTP_REQUEST_METHOD) == "GET" - assert finished_spans[0].attributes.get(policy._URL_FULL) == "http://localhost/temp?query=query" + assert finished_spans[0].attributes.get(policy._URL_FULL) == "http://localhost/temp?query=REDACTED" assert finished_spans[0].attributes.get(policy._SERVER_ADDRESS) == "localhost" assert finished_spans[0].attributes.get(policy._USER_AGENT_ORIGINAL) is None assert finished_spans[0].attributes.get(policy._HTTP_RESPONSE_STATUS_CODE) == 202 @@ -138,7 +138,7 @@ def test_distributed_tracing_policy_with_user_agent_policy(tracing_helper, http_ assert traceparent.split("-")[2] == format_span_id(span_context.span_id) assert finished_spans[0].attributes.get(policy._HTTP_REQUEST_METHOD) == "GET" - assert finished_spans[0].attributes.get(policy._URL_FULL) == "http://localhost/temp?query=query" + assert finished_spans[0].attributes.get(policy._URL_FULL) == "http://localhost/temp?query=REDACTED" assert finished_spans[0].attributes.get(policy._SERVER_ADDRESS) == "localhost" assert finished_spans[0].attributes.get(policy._USER_AGENT_ORIGINAL) is not None assert finished_spans[0].attributes.get(policy._USER_AGENT_ORIGINAL).endswith("test-user-agent") @@ -221,7 +221,7 @@ def test_distributed_tracing_policy_with_tracing_options(tracing_helper, http_re assert finished_spans[0].parent is root_span.get_span_context() assert finished_spans[0].attributes.get(policy._HTTP_REQUEST_METHOD) == "GET" - assert finished_spans[0].attributes.get(policy._URL_FULL) == "http://localhost/temp?query=query" + assert finished_spans[0].attributes.get(policy._URL_FULL) == "http://localhost/temp?query=REDACTED" assert finished_spans[0].attributes.get(policy._SERVER_ADDRESS) == "localhost" assert finished_spans[0].attributes.get(policy._USER_AGENT_ORIGINAL) is None assert finished_spans[0].attributes.get(policy._HTTP_RESPONSE_STATUS_CODE) == 202 @@ -268,3 +268,98 @@ def test_suppress_http_auto_instrumentation(port, tracing_helper): assert finished_spans[0].attributes.get(policy._HTTP_RESPONSE_STATUS_CODE) == 200 requests_instrumentor.uninstrument() + + +@pytest.mark.parametrize("http_response", HTTP_RESPONSES) +def test_url_full_sanitized_default(tracing_helper, http_response): + """Test that url.full redacts query params not in the default allowlist.""" + with tracing_helper.tracer.start_as_current_span("Root"): + policy = DistributedHttpTracingPolicy() + + request = HttpRequest("GET", "http://localhost/temp?api-version=2024-01-01&secret=mysecret&token=abc") + pipeline_request = PipelineRequest(request, PipelineContext(None)) + policy.on_request(pipeline_request) + + response = create_http_response(http_response, request, None, headers=request.headers, status_code=200) + policy.on_response(pipeline_request, PipelineResponse(request, response, PipelineContext(None))) + + finished_spans = tracing_helper.exporter.get_finished_spans() + assert ( + finished_spans[0].attributes.get(policy._URL_FULL) + == "http://localhost/temp?api-version=2024-01-01&secret=REDACTED&token=REDACTED" + ) + + +@pytest.mark.parametrize("http_response", HTTP_RESPONSES) +def test_url_full_sanitized_custom_allowed(tracing_helper, http_response): + """Test that custom additional_allowed_query_params are additive to the default allowlist.""" + with tracing_helper.tracer.start_as_current_span("Root"): + policy = DistributedHttpTracingPolicy(additional_allowed_query_params=["token"]) + + request = HttpRequest("GET", "http://localhost/temp?api-version=2024-01-01&secret=mysecret&token=abc") + pipeline_request = PipelineRequest(request, PipelineContext(None)) + policy.on_request(pipeline_request) + + response = create_http_response(http_response, request, None, headers=request.headers, status_code=200) + policy.on_response(pipeline_request, PipelineResponse(request, response, PipelineContext(None))) + + finished_spans = tracing_helper.exporter.get_finished_spans() + assert ( + finished_spans[0].attributes.get(policy._URL_FULL) + == "http://localhost/temp?api-version=2024-01-01&secret=REDACTED&token=abc" + ) + + +@pytest.mark.parametrize("http_response", HTTP_RESPONSES) +def test_url_full_sanitized_case_insensitive(tracing_helper, http_response): + """Test that additional_allowed_query_params matching is case-insensitive.""" + with tracing_helper.tracer.start_as_current_span("Root"): + policy = DistributedHttpTracingPolicy(additional_allowed_query_params=["MyParam"]) + + request = HttpRequest("GET", "http://localhost/temp?myparam=value1&other=value2") + pipeline_request = PipelineRequest(request, PipelineContext(None)) + policy.on_request(pipeline_request) + + response = create_http_response(http_response, request, None, headers=request.headers, status_code=200) + policy.on_response(pipeline_request, PipelineResponse(request, response, PipelineContext(None))) + + finished_spans = tracing_helper.exporter.get_finished_spans() + assert finished_spans[0].attributes.get(policy._URL_FULL) == "http://localhost/temp?myparam=value1&other=REDACTED" + + +@pytest.mark.parametrize("http_response", HTTP_RESPONSES) +def test_url_full_no_query_params(tracing_helper, http_response): + """Test that URLs without query params are unchanged.""" + with tracing_helper.tracer.start_as_current_span("Root"): + policy = DistributedHttpTracingPolicy() + + request = HttpRequest("GET", "http://localhost/temp") + pipeline_request = PipelineRequest(request, PipelineContext(None)) + policy.on_request(pipeline_request) + + response = create_http_response(http_response, request, None, headers=request.headers, status_code=200) + policy.on_response(pipeline_request, PipelineResponse(request, response, PipelineContext(None))) + + finished_spans = tracing_helper.exporter.get_finished_spans() + assert finished_spans[0].attributes.get(policy._URL_FULL) == "http://localhost/temp" + + +@pytest.mark.parametrize("http_response", HTTP_RESPONSES) +def test_url_full_allowed_query_params_additive(tracing_helper, http_response): + """Test that allowed_query_params can be updated after construction.""" + with tracing_helper.tracer.start_as_current_span("Root"): + policy = DistributedHttpTracingPolicy() + policy.allowed_query_params.add("custom") + + request = HttpRequest("GET", "http://localhost/temp?api-version=v1&custom=val&other=secret") + pipeline_request = PipelineRequest(request, PipelineContext(None)) + policy.on_request(pipeline_request) + + response = create_http_response(http_response, request, None, headers=request.headers, status_code=200) + policy.on_response(pipeline_request, PipelineResponse(request, response, PipelineContext(None))) + + finished_spans = tracing_helper.exporter.get_finished_spans() + assert ( + finished_spans[0].attributes.get(policy._URL_FULL) + == "http://localhost/temp?api-version=v1&custom=val&other=REDACTED" + ) diff --git a/sdk/core/corehttp/tests/test_utils.py b/sdk/core/corehttp/tests/test_utils.py index 0a282b10317c..526a7ad2c189 100644 --- a/sdk/core/corehttp/tests/test_utils.py +++ b/sdk/core/corehttp/tests/test_utils.py @@ -7,8 +7,8 @@ import pytest from corehttp.utils import case_insensitive_dict -from corehttp.utils._utils import get_running_async_lock -from corehttp.runtime.policies._utils import parse_retry_after +from corehttp.utils._utils import get_running_async_lock, CaseInsensitiveSet +from corehttp.runtime.policies._utils import parse_retry_after, sanitize_url @pytest.fixture() @@ -136,3 +136,151 @@ def test_parse_retry_after(): assert ret == 0 ret = parse_retry_after("0.9") assert ret == 0.9 + + +def test_sanitize_url(): + url = "https://www.example.com?q1=value1&q2=value2" + assert sanitize_url(url, {"q1"}) == "https://www.example.com?q1=value1&q2=REDACTED" + assert sanitize_url(url, {"q1", "q2"}) == url + assert sanitize_url(url, {"q2", "q3"}) == "https://www.example.com?q1=REDACTED&q2=value2" + url = "https://www.example.com" + assert sanitize_url(url, {"q1", "q3"}) == url + url = "https://www.example.com?q1=value1" + assert sanitize_url(url, {"q1", "q3"}) == url + url = "https://www.example.com?q1=value1&q2=" + assert sanitize_url(url, {"q1", "q3"}) == "https://www.example.com?q1=value1&q2=REDACTED" + url = "https://www.example.com?q1=value1&q2" + assert sanitize_url(url, {"q1", "q3"}) == "https://www.example.com?q1=value1&q2" + url = "https://www.example.com?q1=value1&q2&q3=value3" + assert sanitize_url(url, {"q1", "q3"}) == "https://www.example.com?q1=value1&q2&q3=value3" + url = "https://www.example.com?q2=value1&q2=value2" + assert sanitize_url(url, {"q1", "q3"}) == "https://www.example.com?q2=REDACTED&q2=REDACTED" + + url = "https://www.example.com?REDACTED=foo" + assert sanitize_url(url, {"q1", "q3"}) == "https://www.example.com?REDACTED=REDACTED" + url = "https://www.example.com?REDACTED=foo&REDACTED=bar" + assert sanitize_url(url, {"redacted"}) == "https://www.example.com?REDACTED=foo&REDACTED=bar" + url = "https://www.example.com?Q1=value1" + assert sanitize_url(url, {"q3"}) == "https://www.example.com?Q1=REDACTED" + + # Test multiple of the same query parameters. + url = "https://example.com?q1=value1&q1=value2&q1=value3" + assert sanitize_url(url, {"q1"}) == "https://example.com?q1=value1&q1=value2&q1=value3" + assert sanitize_url(url, {"q2"}) == "https://example.com?q1=REDACTED&q1=REDACTED&q1=REDACTED" + + # Test query parameters in the path. + url = "https://www.example.com/q1=value1/foo" + assert sanitize_url(url, {"q1", "q3"}) == url + url = "https://www.example.com/q1=value1&q2=value2/foo" + assert sanitize_url(url, {"q1", "q3"}) == url + + # Test encoded query parameters. + url = "https://www.example.com?q1=value%201&q2=value%202&q3=value%203" + assert sanitize_url(url, {"q1", "q3"}) == "https://www.example.com?q1=value%201&q2=REDACTED&q3=value%203" + url = "https://www.example.com?q1=value%261&q2=value%262&q3=value%263" + assert sanitize_url(url, {"q1", "q3"}) == "https://www.example.com?q1=value%261&q2=REDACTED&q3=value%263" + url = "https://www.example.com?q1=value+1&q2=value+2&q3=value+3" + assert sanitize_url(url, {"q1", "q3"}) == "https://www.example.com?q1=value+1&q2=REDACTED&q3=value+3" + + # Test case insensitive query parameters. + url = "https://www.example.com?q1=value1&q2=value2" + assert sanitize_url(url, {"Q1", "q3"}) == "https://www.example.com?q1=value1&q2=REDACTED" + url = "https://www.example.com?q1=value1&Q2=value2" + assert sanitize_url(url, {"Q1", "q2"}) == url + + # Test empty allowed set (all params redacted). + url = "https://www.example.com?q1=value1&q2=value2" + assert sanitize_url(url, set()) == "https://www.example.com?q1=REDACTED&q2=REDACTED" + + # Test no-value params are always preserved (nothing to redact). + url = "https://www.example.com?q1&q2=value2" + assert sanitize_url(url, {"q1"}) == "https://www.example.com?q1&q2=REDACTED" + assert sanitize_url(url, {"q2"}) == "https://www.example.com?q1&q2=value2" + assert sanitize_url(url, set()) == "https://www.example.com?q1&q2=REDACTED" + + url = "https://www.example.com?q1" + assert sanitize_url(url, set()) == url + + # Test URL with fragment. + url = "https://www.example.com?q1=value1&q2=value2#fragment" + assert sanitize_url(url, {"q1"}) == "https://www.example.com?q1=value1&q2=REDACTED#fragment" + + # Test value containing '=' (only first '=' splits key/value). + url = "https://www.example.com?q1=a=b&q2=value2" + assert sanitize_url(url, {"q1"}) == "https://www.example.com?q1=a=b&q2=REDACTED" + assert sanitize_url(url, {"q2"}) == "https://www.example.com?q1=REDACTED&q2=value2" + + +def test_sanitize_url_with_different_placeholder(): + url = "https://www.example.com?q1=value1&q2=value2" + assert sanitize_url(url, {"q1"}, "SANITIZED") == "https://www.example.com?q1=value1&q2=SANITIZED" + + +class TestCaseInsensitiveSet: + def test_contains(self): + s = CaseInsensitiveSet(["Foo", "Bar"]) + assert "foo" in s + assert "FOO" in s + assert "Foo" in s + assert "bar" in s + assert "BAR" in s + assert "baz" not in s + + def test_add(self): + s = CaseInsensitiveSet() + s.add("Foo") + assert "foo" in s + assert "FOO" in s + s.add("bar") + assert "Bar" in s + assert len(s) == 2 + + def test_discard(self): + s = CaseInsensitiveSet(["Foo", "Bar"]) + # Discard with different casing should work. + s.discard("foo") + assert "foo" not in s + assert "Foo" not in s + assert "bar" in s + # Discard missing item is a no-op. + s.discard("baz") + assert "bar" in s + assert len(s) == 1 + + def test_remove(self): + s = CaseInsensitiveSet(["Foo", "Bar"]) + # Remove with different casing should work. + s.remove("foo") + assert "Foo" not in s + assert "bar" in s + with pytest.raises(KeyError): + s.remove("baz") + + def test_update(self): + s = CaseInsensitiveSet(["Foo"]) + s.update(["Bar", "Baz"]) + assert "bar" in s + assert "BAZ" in s + assert len(s) == 3 + + def test_clear(self): + s = CaseInsensitiveSet(["Foo", "Bar"]) + s.clear() + assert "foo" not in s + assert len(s) == 0 + + def test_pop(self): + s = CaseInsensitiveSet(["Foo"]) + result = s.pop() + assert result == "Foo" + assert "foo" not in s + assert len(s) == 0 + + def test_multiple(self): + s = CaseInsensitiveSet(["Foo", "foo", "FOO"]) + assert len(s) == 1 + assert "foo" in s + s.add("FOO") + assert len(s) == 1 + s.discard("foo") + assert len(s) == 0