Skip to content

Commit 49c9034

Browse files
authored
utf8 bug fix (#47008)
* utf8 bug fix - initial commit * addressing copilot comments * refactoring tests * addressing PR comments
1 parent 4c4386c commit 49c9034

11 files changed

Lines changed: 1136 additions & 8 deletions

sdk/cosmos/azure-cosmos/CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
#### Breaking Changes
99

1010
#### Bugs Fixed
11+
* Fixed bug where the `Content-Length` HTTP request header was computed from the character count of the request body instead of its UTF-8 byte count. See [PR 47008](https://github.com/Azure/azure-sdk-for-python/pull/47008)
12+
* Added an opt-in fallback for invalid UTF-8 in response bodies. Default behavior is unchanged (strict decode). Setting `AZURE_COSMOS_CHARSET_DECODER_ERROR_ACTION_ON_MALFORMED_INPUT` to `REPLACE` or `IGNORE` enables a permissive decode so reads, queries, and change-feed iteration can make progress past corrupt payloads. See [PR 47008](https://github.com/Azure/azure-sdk-for-python/pull/47008)
1113
* Fixed bug where `CosmosClient` construction with AAD credentials would crash at startup if the semantic reranking inference endpoint environment variable was not set, even when semantic reranking was not being used. The inference service is now lazily initialized on first use. See [PR 46243](https://github.com/Azure/azure-sdk-for-python/pull/46243)
1214
* Fixed bug where region names in `preferred_locations` and `excluded_locations` (client-level and per-request) were not matched tolerantly for differences in case, whitespace, hyphens, and underscores. See [PR 46937](https://github.com/Azure/azure-sdk-for-python/pull/46937)
1315

sdk/cosmos/azure-cosmos/azure/cosmos/_constants.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,12 @@ class _Constants:
9090
TIMEOUT_ERROR_THRESHOLD_PPAF_DEFAULT: int = 10
9191
# -------------------------------------------------------------------------
9292

93+
# Controls how the SDK handles invalid UTF-8 bytes in HTTP response bodies.
94+
# Accepted values: "REPLACE", "IGNORE". Anything else (including unset)
95+
# leaves strict decoding in effect, which is the historical default.
96+
CHARSET_DECODER_ERROR_ACTION_ON_MALFORMED_INPUT: str = \
97+
"AZURE_COSMOS_CHARSET_DECODER_ERROR_ACTION_ON_MALFORMED_INPUT"
98+
9399
# Error code translations
94100
ERROR_TRANSLATIONS: dict[int, str] = {
95101
400: "BAD_REQUEST - Request being sent is invalid.",

sdk/cosmos/azure-cosmos/azure/cosmos/_inference_service.py

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
from ._cosmos_http_logging_policy import CosmosHttpLoggingPolicy
3939
from ._cosmos_responses import CosmosDict
4040
from ._inference_auth_policy import InferenceServiceBearerTokenPolicy
41+
from ._response_decoding import decode_response_body_for_status
4142
from ._retry_utility import ConnectionRetryPolicy
4243
from .http_constants import HttpHeaders
4344

@@ -202,7 +203,23 @@ def rerank(
202203

203204
data = response.body()
204205
if data:
205-
data = data.decode("utf-8")
206+
try:
207+
data = decode_response_body_for_status(
208+
data, response.status_code, "inference_request"
209+
)
210+
except UnicodeDecodeError as decode_err:
211+
# Only reachable when status is < 400 and strict decode
212+
# is still in effect. ``decode_response_body_for_status``
213+
# never lets malformed UTF-8 escape on status >= 400, and
214+
# it honors REPLACE/IGNORE env fallback before this point.
215+
# Surface as a typed SDK decode exception so wire status
216+
# (e.g. 200) and response metadata are preserved verbatim;
217+
# the decoder error remains available via __cause__.
218+
raise DecodeError(
219+
message="Failed to decode response body as UTF-8: {0}".format(decode_err.reason),
220+
response=response,
221+
error=decode_err,
222+
) from decode_err
206223

207224
if response.status_code >= 400:
208225
raise exceptions.CosmosHttpResponseError(message=data, response=response)
@@ -226,7 +243,15 @@ def rerank(
226243
response=None
227244
) from e
228245
except Exception as e:
229-
if isinstance(e, (exceptions.CosmosHttpResponseError, exceptions.CosmosResourceNotFoundError)):
246+
# ``DecodeError`` is a typed SDK exception (raised by the
247+
# decode wrap a few lines up, or by ``json.loads`` failures
248+
# below it) that already carries the original response and
249+
# the underlying decoder error via ``__cause__``. Treat it
250+
# the same as the Cosmos-typed exceptions and let it pass
251+
# through unchanged so its diagnostic context is preserved.
252+
if isinstance(e, (exceptions.CosmosHttpResponseError,
253+
exceptions.CosmosResourceNotFoundError,
254+
DecodeError)):
230255
raise
231256
raise exceptions.CosmosHttpResponseError(
232257
message=f"Semantic reranking failed: {str(e)}",
Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
# The MIT License (MIT)
2+
# Copyright (c) Microsoft Corporation. All rights reserved.
3+
4+
"""UTF-8 decoding for HTTP response bodies, with an opt-in fallback for
5+
payloads containing bytes that are not valid UTF-8.
6+
7+
By default this module preserves the historical SDK behavior: strict
8+
decode, ``UnicodeDecodeError`` raised on the first invalid byte.
9+
Operators who need to read past corrupt payloads (for example, to
10+
unblock a stuck change-feed processor) can opt in to a permissive
11+
fallback by setting an environment variable.
12+
13+
The recognized environment variable is
14+
``AZURE_COSMOS_CHARSET_DECODER_ERROR_ACTION_ON_MALFORMED_INPUT``:
15+
16+
* ``REPLACE`` -> Python ``errors="replace"`` (substitute U+FFFD)
17+
* ``IGNORE`` -> Python ``errors="ignore"`` (drop the bad bytes)
18+
* anything else, including unset -> strict (raise on bad bytes)
19+
20+
The env var is consulted only on the decode-failure path, so operators
21+
can set or change it at any point during process lifetime and the next
22+
malformed payload will pick up the new value. This follows the Cosmos
23+
SDK's runtime-read pattern for environment-based controls.
24+
"""
25+
import logging
26+
import os
27+
from typing import Optional
28+
29+
from ._constants import _Constants
30+
31+
32+
# Module-level logger; used for the WARNING emitted when a permissive
# decode fallback is applied to a malformed response body.
_logger = logging.getLogger(__name__)

# Name of the environment variable that opts in to permissive decoding.
_MALFORMED_INPUT_ENV_VAR = _Constants.CHARSET_DECODER_ERROR_ACTION_ON_MALFORMED_INPUT

# Recognized env var values and the ``errors=`` argument each one selects
# for ``bytes.decode``. A value that is absent from this table (or an unset
# env var) means strict decoding, which is the historical default.
_ENV_VALUE_TO_DECODE_ERRORS_MODE = {
    "REPLACE": "replace",
    "IGNORE": "ignore",
}
43+
44+
45+
def _resolve_fallback_mode_from_env() -> Optional[str]:
    """Return the permissive ``errors=`` mode selected by the environment.

    The malformed-input environment variable is read on every call. Its
    value is stripped of surrounding whitespace and upper-cased before
    being looked up in the table of recognized values.

    :returns: ``"replace"`` or ``"ignore"`` when the operator has opted in
        with a recognized value; ``None`` when the variable is unset or
        holds an unrecognized value (strict decoding stays in effect).
    :rtype: Optional[str]
    """
    configured = os.environ.get(_MALFORMED_INPUT_ENV_VAR)
    if configured is not None:
        normalized = configured.strip().upper()
        # ``.get`` yields None for unrecognized values, i.e. "not opted in".
        return _ENV_VALUE_TO_DECODE_ERRORS_MODE.get(normalized)
    return None
53+
54+
55+
56+
def decode_response_body(data: bytes, operation_context: Optional[str] = None) -> str:
    """Decode an HTTP response body as UTF-8, optionally tolerating bad bytes.

    Well-formed input is decoded strictly with ``data.decode("utf-8")``.
    Only when that strict decode fails is the malformed-input environment
    variable consulted:

    * Opted in (``REPLACE`` / ``IGNORE``): a WARNING is logged with the byte
      offset, the decoder's reason, the chosen mode, the env var name and
      the operation context, and the body is decoded again in that mode.
    * Not opted in: a new ``UnicodeDecodeError`` is raised with the same
      encoding, object and offsets as the strict failure, but with a
      ``reason`` extended by a hint naming the env var. The strict failure
      is chained as ``__cause__``.

    :param data: Response body bytes.
    :type data: bytes
    :param operation_context: Optional short label for the call site; it is
        written to the WARNING log line (``"-"`` when empty or ``None``).
    :type operation_context: Optional[str]
    :returns: The decoded string.
    :rtype: str
    :raises UnicodeDecodeError: If the body is not valid UTF-8 and no
        permissive fallback has been configured.
    """
    try:
        decoded = data.decode("utf-8")
    except UnicodeDecodeError as strict_error:
        permissive_mode = _resolve_fallback_mode_from_env()

        if permissive_mode is not None:
            # Operator opted in: record what was wrong, then decode lossily.
            _logger.warning(
                "Cosmos response body contained invalid UTF-8 at byte offset %d "
                "(reason: %s); decoding with errors=%r per %s (operation: %s).",
                strict_error.start,
                strict_error.reason,
                permissive_mode,
                _MALFORMED_INPUT_ENV_VAR,
                operation_context or "-",
            )
            return data.decode("utf-8", errors=permissive_mode)

        # Strict mode: re-raise with a reason that tells the operator how
        # to opt in, keeping the original error reachable via __cause__.
        actionable_reason = (
            "{original}; set environment variable "
            "{env_var}=REPLACE (or IGNORE) to tolerate invalid UTF-8 "
            "in Cosmos response bodies"
        ).format(original=strict_error.reason, env_var=_MALFORMED_INPUT_ENV_VAR)
        raise UnicodeDecodeError(
            strict_error.encoding,
            strict_error.object,
            strict_error.start,
            strict_error.end,
            actionable_reason,
        ) from strict_error
    else:
        return decoded
113+
114+
115+
def decode_response_body_for_status(
    data: bytes,
    status_code: int,
    operation_context: Optional[str] = None,
) -> str:
    """Decode a response body without letting bad bytes mask an HTTP error.

    The body is first decoded through :func:`decode_response_body`, so the
    result is identical to that function whenever it succeeds (including
    when the operator's REPLACE/IGNORE opt-in applies).

    If that call raises ``UnicodeDecodeError``:

    * ``status_code >= 400``: the body is decoded with ``errors="replace"``
      and returned, so the caller can still build the exception that
      matches the status code. Retry/refresh logic and error handlers key
      off the status code, so a ``U+FFFD`` in the message is preferable to
      surfacing a decode error in place of a 404/410/429/503.
    * ``status_code < 400``: the ``UnicodeDecodeError`` is re-raised
      unchanged, because corrupt bytes in a non-error response are a
      data-integrity problem the caller must see.

    :param data: Response body bytes.
    :type data: bytes
    :param status_code: The HTTP status code of the response.
    :type status_code: int
    :param operation_context: Optional short label for the call site;
        forwarded to :func:`decode_response_body`.
    :type operation_context: Optional[str]
    :returns: The decoded string.
    :rtype: str
    :raises UnicodeDecodeError: If the body is not valid UTF-8, no
        permissive fallback is configured, and ``status_code`` is below 400.
    """
    try:
        text = decode_response_body(data, operation_context)
    except UnicodeDecodeError:
        is_http_error = status_code >= 400
        if not is_http_error:
            raise
        # Best-effort text for the error message; the status code is what matters.
        return data.decode("utf-8", errors="replace")
    return text

sdk/cosmos/azure-cosmos/azure/cosmos/_synchronized_request.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
from ._availability_strategy_config import CrossRegionHedgingStrategy
3434
from ._availability_strategy_handler import execute_with_hedging
3535
from ._constants import _Constants
36+
from ._response_decoding import decode_response_body_for_status
3637
from ._request_object import RequestObject
3738
from .documents import _OperationType
3839

@@ -177,7 +178,23 @@ def _Request(global_endpoint_manager, request_params, connection_policy, pipelin
177178

178179
data = response.body()
179180
if data:
180-
data = data.decode("utf-8")
181+
try:
182+
data = decode_response_body_for_status(
183+
data, response.status_code, request_params.operation_type
184+
)
185+
except UnicodeDecodeError as decode_err:
186+
# Only reachable when status is < 400 and strict decode is
187+
# still in effect. ``decode_response_body_for_status`` never
188+
# lets malformed UTF-8 escape on status >= 400, and it honors
189+
# REPLACE/IGNORE env fallback before this point. Surface as a
190+
# typed SDK decode exception so wire status (e.g. 200) and
191+
# response metadata are preserved verbatim; the decoder error
192+
# remains available via __cause__.
193+
raise DecodeError(
194+
message="Failed to decode response body as UTF-8: {0}".format(decode_err.reason),
195+
response=response,
196+
error=decode_err,
197+
) from decode_err
181198

182199
if response.status_code == 404:
183200
raise exceptions.CosmosResourceNotFoundError(message=data, response=response)
@@ -257,7 +274,10 @@ def SynchronizedRequest(
257274
"""
258275
request.data = _request_body_from_data(request_data)
259276
if request.data and isinstance(request.data, str):
260-
request.headers[http_constants.HttpHeaders.ContentLength] = len(request.data)
277+
# Use UTF-8 byte length, not str length (code-point count), so the
278+
# header matches the bytes the transport actually writes for any
279+
# non-ASCII payload.
280+
request.headers[http_constants.HttpHeaders.ContentLength] = len(request.data.encode("utf-8"))
261281
elif request.data is None:
262282
request.headers[http_constants.HttpHeaders.ContentLength] = 0
263283

sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
from .._availability_strategy_config import CrossRegionHedgingStrategy
3636
from .._constants import _Constants
3737
from .._request_object import RequestObject
38+
from .._response_decoding import decode_response_body_for_status
3839
from .._synchronized_request import _request_body_from_data, _replace_url_prefix
3940
from ..documents import _OperationType
4041

@@ -141,7 +142,23 @@ async def _Request(global_endpoint_manager, request_params, connection_policy, p
141142

142143
data = response.body()
143144
if data:
144-
data = data.decode("utf-8")
145+
try:
146+
data = decode_response_body_for_status(
147+
data, response.status_code, request_params.operation_type
148+
)
149+
except UnicodeDecodeError as decode_err:
150+
# Only reachable when status is < 400 and strict decode is
151+
# still in effect. ``decode_response_body_for_status`` never
152+
# lets malformed UTF-8 escape on status >= 400, and it honors
153+
# REPLACE/IGNORE env fallback before this point. Surface as a
154+
# typed SDK decode exception so wire status (e.g. 200) and
155+
# response metadata are preserved verbatim; the decoder error
156+
# remains available via __cause__.
157+
raise DecodeError(
158+
message="Failed to decode response body as UTF-8: {0}".format(decode_err.reason),
159+
response=response,
160+
error=decode_err,
161+
) from decode_err
145162

146163
if response.status_code == 404:
147164
raise exceptions.CosmosResourceNotFoundError(message=data, response=response)
@@ -210,7 +227,10 @@ async def AsynchronousRequest(
210227
"""
211228
request.data = _request_body_from_data(request_data)
212229
if request.data and isinstance(request.data, str):
213-
request.headers[http_constants.HttpHeaders.ContentLength] = len(request.data)
230+
# Use UTF-8 byte length, not str length (code-point count), so the
231+
# header matches the bytes the transport actually writes for any
232+
# non-ASCII payload.
233+
request.headers[http_constants.HttpHeaders.ContentLength] = len(request.data.encode("utf-8"))
214234
elif request.data is None:
215235
request.headers[http_constants.HttpHeaders.ContentLength] = 0
216236

sdk/cosmos/azure-cosmos/azure/cosmos/aio/_inference_service_async.py

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
from .._constants import _Constants as Constants
4141
from .._cosmos_http_logging_policy import CosmosHttpLoggingPolicy
4242
from .._cosmos_responses import CosmosDict
43+
from .._response_decoding import decode_response_body_for_status
4344
from ..http_constants import HttpHeaders
4445

4546

@@ -235,7 +236,23 @@ async def rerank(
235236

236237
data = response.body()
237238
if data:
238-
data = data.decode("utf-8")
239+
try:
240+
data = decode_response_body_for_status(
241+
data, response.status_code, "inference_request"
242+
)
243+
except UnicodeDecodeError as decode_err:
244+
# Only reachable when status is < 400 and strict decode
245+
# is still in effect. ``decode_response_body_for_status``
246+
# never lets malformed UTF-8 escape on status >= 400, and
247+
# it honors REPLACE/IGNORE env fallback before this point.
248+
# Surface as a typed SDK decode exception so wire status
249+
# (e.g. 200) and response metadata are preserved verbatim;
250+
# the decoder error remains available via __cause__.
251+
raise DecodeError(
252+
message="Failed to decode response body as UTF-8: {0}".format(decode_err.reason),
253+
response=response,
254+
error=decode_err,
255+
) from decode_err
239256

240257
if response.status_code >= 400:
241258
raise exceptions.CosmosHttpResponseError(message=data, response=response)
@@ -259,7 +276,15 @@ async def rerank(
259276
response=None
260277
) from e
261278
except Exception as e:
262-
if isinstance(e, (exceptions.CosmosHttpResponseError, exceptions.CosmosResourceNotFoundError)):
279+
# ``DecodeError`` is a typed SDK exception (raised by the
280+
# decode wrap a few lines up, or by ``json.loads`` failures
281+
# below it) that already carries the original response and
282+
# the underlying decoder error via ``__cause__``. Treat it
283+
# the same as the Cosmos-typed exceptions and let it pass
284+
# through unchanged so its diagnostic context is preserved.
285+
if isinstance(e, (exceptions.CosmosHttpResponseError,
286+
exceptions.CosmosResourceNotFoundError,
287+
DecodeError)):
263288
raise
264289
raise exceptions.CosmosHttpResponseError(
265290
message=f"Semantic reranking failed: {str(e)}",

0 commit comments

Comments
 (0)