|
| 1 | +# The MIT License (MIT) |
| 2 | +# Copyright (c) Microsoft Corporation. All rights reserved. |
| 3 | + |
| 4 | +"""UTF-8 decoding for HTTP response bodies, with an opt-in fallback for |
| 5 | +payloads containing bytes that are not valid UTF-8. |
| 6 | +
|
| 7 | +By default this module preserves the historical SDK behavior: strict |
| 8 | +decode, ``UnicodeDecodeError`` raised on the first invalid byte. |
| 9 | +Operators who need to read past corrupt payloads (for example, to |
| 10 | +unblock a stuck change-feed processor) can opt in to a permissive |
| 11 | +fallback by setting an environment variable. |
| 12 | +
|
| 13 | +The recognized environment variable is |
| 14 | +``AZURE_COSMOS_CHARSET_DECODER_ERROR_ACTION_ON_MALFORMED_INPUT``: |
| 15 | +
|
| 16 | +* ``REPLACE`` -> Python ``errors="replace"`` (substitute U+FFFD) |
| 17 | +* ``IGNORE`` -> Python ``errors="ignore"`` (drop the bad bytes) |
| 18 | +* anything else, including unset -> strict (raise on bad bytes) |
| 19 | +
|
| 20 | +The env var is consulted only on the decode-failure path, so operators |
| 21 | +can set or change it at any point during process lifetime and the next |
| 22 | +malformed payload will pick up the new value. This follows the Cosmos |
| 23 | +SDK's runtime-read pattern for environment-based controls. |
| 24 | +""" |
| 25 | +import logging |
| 26 | +import os |
| 27 | +from typing import Optional |
| 28 | + |
| 29 | +from ._constants import _Constants |
| 30 | + |
| 31 | + |
# Name of the environment variable that selects the malformed-input policy.
_MALFORMED_INPUT_ENV_VAR = _Constants.CHARSET_DECODER_ERROR_ACTION_ON_MALFORMED_INPUT

# Recognized env var values paired with the ``errors=`` argument each one
# selects for ``bytes.decode``. A value that is absent from this table (or an
# unset variable) means strict decoding, which is the historical default.
_ENV_VALUE_TO_DECODE_ERRORS_MODE = dict(REPLACE="replace", IGNORE="ignore")

_logger = logging.getLogger(__name__)
| 43 | + |
| 44 | + |
def _resolve_fallback_mode_from_env() -> Optional[str]:
    """Look up the operator's malformed-input policy in the environment.

    The variable is read on every call, so a change made while the process
    is running is picked up by the next call.

    :returns: The ``errors=`` mode to pass to ``bytes.decode`` as a
        permissive fallback, or ``None`` when the variable is unset or holds
        a value that is not recognized (strict decoding stays in effect).
    :rtype: Optional[str]
    """
    try:
        configured = os.environ[_MALFORMED_INPUT_ENV_VAR]
    except KeyError:
        # Variable not set: the operator has not opted in.
        return None

    # Surrounding whitespace and letter case are not significant.
    normalized = configured.strip().upper()
    return _ENV_VALUE_TO_DECODE_ERRORS_MODE.get(normalized)
| 53 | + |
| 54 | + |
| 55 | + |
def decode_response_body(data: bytes, operation_context: Optional[str] = None) -> str:
    """Decode an HTTP response body as UTF-8.

    Well-formed payloads go through a plain strict ``data.decode("utf-8")``.
    Only when that raises is the malformed-input environment variable
    consulted:

    * Opted in (``REPLACE`` or ``IGNORE``): a WARNING is logged with the byte
      offset of the failure, the decoder's reason, the selected mode, and
      the operation context; the body is then decoded again in that
      permissive mode.
    * Not opted in: a new ``UnicodeDecodeError`` is raised with the same
      encoding, object, start, and end as the original, and a ``reason``
      extended with a hint naming the environment variable. The original
      error is attached as ``__cause__``.

    :param data: Response body bytes.
    :type data: bytes
    :param operation_context: Optional short label for the call site; it
        appears in the WARNING log line (``"-"`` when missing or empty).
    :type operation_context: Optional[str]
    :returns: The decoded string.
    :rtype: str
    :raises UnicodeDecodeError: If the body is not valid UTF-8 and no
        permissive fallback has been configured.
    """
    try:
        decoded = data.decode("utf-8")
    except UnicodeDecodeError as decode_failure:
        # The environment is read only here, on the failure path.
        permissive_mode = _resolve_fallback_mode_from_env()

        if permissive_mode is not None:
            _logger.warning(
                "Cosmos response body contained invalid UTF-8 at byte offset %d "
                "(reason: %s); decoding with errors=%r per %s (operation: %s).",
                decode_failure.start,
                decode_failure.reason,
                permissive_mode,
                _MALFORMED_INPUT_ENV_VAR,
                operation_context or "-",
            )
            return data.decode("utf-8", errors=permissive_mode)

        # Strict mode: re-raise with the same position details, but with a
        # reason that tells the operator how to opt in to tolerant decoding.
        reason_with_hint = (
            f"{decode_failure.reason}; set environment variable "
            f"{_MALFORMED_INPUT_ENV_VAR}=REPLACE (or IGNORE) to tolerate invalid UTF-8 "
            "in Cosmos response bodies"
        )
        raise UnicodeDecodeError(
            decode_failure.encoding,
            decode_failure.object,
            decode_failure.start,
            decode_failure.end,
            reason_with_hint,
        ) from decode_failure
    else:
        return decoded
| 113 | + |
| 114 | + |
def decode_response_body_for_status(
    data: bytes,
    status_code: int,
    operation_context: Optional[str] = None,
) -> str:
    """Decode an HTTP response body, tolerating invalid UTF-8 on HTTP errors.

    Decoding is delegated to :func:`decode_response_body`. If that call
    raises ``UnicodeDecodeError`` and ``status_code`` is 400 or higher, the
    body is decoded with ``errors="replace"`` and returned instead of
    raising. For any lower status code the exception propagates unchanged.

    The intent is that an undecodable error body must not hide the HTTP
    status from the caller: code that branches on the status code (for
    example 404, 410, 429, or 503) should still see the real status-code
    exception, even if the message text contains ``U+FFFD``. A non-error
    response with corrupt bytes is still reported, because that is a
    data-integrity problem the caller needs to see.

    :param data: Response body bytes.
    :type data: bytes
    :param status_code: The HTTP status code of the response.
    :type status_code: int
    :param operation_context: Optional short label for the call site;
        forwarded to :func:`decode_response_body`.
    :type operation_context: Optional[str]
    :returns: The decoded string.
    :rtype: str
    :raises UnicodeDecodeError: If :func:`decode_response_body` raises it
        and the status code is below 400.
    """
    try:
        text = decode_response_body(data, operation_context)
    except UnicodeDecodeError:
        is_http_error = status_code >= 400
        if not is_http_error:
            # Corrupt bytes in a non-error response must stay visible.
            raise
        # Best effort for error bodies: keep the text readable enough for
        # the caller to build the real status-code exception.
        text = data.decode("utf-8", errors="replace")
    return text
0 commit comments