Skip to content

Commit 87069d9

Browse files
fix(smithy-json): escape control characters in write_string per RFC 8259 §7 (smithy-lang#647)
* fix(smithy-json): escape control characters in write_string per RFC 8259 §7.
  StreamingJSONEncoder.write_string() only escaped backslash and double quote. Control characters U+0000–U+001F (newline, tab, CR, etc.) were written as raw bytes, producing invalid JSON that causes SerializationException on API calls with multi-line string fields. Use a regex to escape all control characters: named escapes for common ones (\n, \r, \t, \b, \f) and \uXXXX for the rest.
* Add changelog entry.
1 parent 9c9b35d commit 87069d9

File tree

4 files changed

+93
-3
lines changed

4 files changed

+93
-3
lines changed
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
{
2+
"type": "enhancement",
3+
"description": "Fixed string serialization to escape all control characters (U+0000-U+001F) per [RFC 8259](https://www.rfc-editor.org/rfc/rfc8259#section-7), preventing invalid JSON output for multiline and other control-character-containing strings. ([#647](https://github.com/smithy-lang/smithy-python/pull/647))"
4+
}

packages/smithy-json/src/smithy_json/_private/serializers.py

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
22
# SPDX-License-Identifier: Apache-2.0
33

4+
import re
45
from base64 import b64encode
56
from collections.abc import Callable, Mapping, Sequence
67
from contextlib import AbstractContextManager
@@ -27,6 +28,30 @@
2728
_INF: float = float("inf")
2829
_NEG_INF: float = float("-inf")
2930

31+
# RFC 8259 §7: All control characters U+0000 through U+001F MUST be escaped.
# Backslash and double quote are escaped as well. This map holds the short
# two-character escapes; any other character matched by _CHARS_TO_ESCAPE is
# written in the generic \uXXXX form by _escape_char.
_ESCAPE_MAP: dict[str, str] = {
    "\\": "\\\\",
    '"': '\\"',
    "\n": "\\n",
    "\r": "\\r",
    "\t": "\\t",
    "\b": "\\b",
    "\f": "\\f",
}
# Matches a single backslash, double quote, or character in U+0000-U+001F.
_CHARS_TO_ESCAPE = re.compile(r'[\\"\x00-\x1f]')


def _escape_char(match: re.Match[str]) -> str:
    """Return the JSON escape sequence for one matched character.

    :param match: A match produced by ``_CHARS_TO_ESCAPE``; its text is the
        single character that needs escaping.
    :returns: The short escape from ``_ESCAPE_MAP`` when one exists, otherwise
        ``\\u`` followed by the code point as four lowercase hex digits.
    """
    c = match.group()
    # Single dictionary lookup instead of a membership test plus indexing.
    escaped = _ESCAPE_MAP.get(c)
    if escaped is None:
        escaped = f"\\u{ord(c):04x}"
    return escaped


def _escape_string(value: str) -> str:
    """Escape a string value per RFC 8259 §7.

    :param value: The text to escape. Surrounding quotes are not added.
    :returns: ``value`` with every backslash, double quote, and character in
        U+0000-U+001F replaced by its escape sequence; all other characters
        are left as they are.
    """
    return _CHARS_TO_ESCAPE.sub(_escape_char, value)
54+
3055

3156
class JSONShapeSerializer(ShapeSerializer):
3257
def __init__(self, sink: BytesWriter, settings: JSONSettings) -> None:
@@ -271,9 +296,7 @@ def write_document_value(
271296

272297
def write_string(self, value: str) -> None:
273298
self._sink.write(b'"')
274-
self._sink.write(
275-
value.replace("\\", "\\\\").replace('"', '\\"').encode("utf-8")
276-
)
299+
self._sink.write(_escape_string(value).encode("utf-8"))
277300
self._sink.write(b'"')
278301

279302
def write_int(self, value: int) -> None:

packages/smithy-json/tests/unit/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -349,6 +349,12 @@ def _read_optional_map(k: str, d: ShapeDeserializer):
349349
(Decimal("1.1"), b"1.1"),
350350
(b"foo", b'"Zm9v"'),
351351
("foo", b'"foo"'),
352+
# RFC 8259 §7: control characters must be escaped
353+
("line 1\nline 2", b'"line 1\\nline 2"'),
354+
("col 1\tcol 2", b'"col 1\\tcol 2"'),
355+
("a\rb", b'"a\\rb"'),
356+
("a\\b", b'"a\\\\b"'),
357+
('a"b', b'"a\\"b"'),
352358
(datetime(2024, 5, 15, tzinfo=UTC), b'"2024-05-15T00:00:00Z"'),
353359
(None, b"null"),
354360
(["foo"], b'["foo"]'),

packages/smithy-json/tests/unit/test_serializers.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
22
# SPDX-License-Identifier: Apache-2.0
3+
import json
34
from datetime import datetime
45
from decimal import Decimal
56
from io import BytesIO
@@ -82,3 +83,59 @@ def test_json_serializer(given: Any, expected: bytes) -> None:
8283
sink.seek(0)
8384
actual = sink.read()
8485
assert actual == expected
86+
87+
88+
def _serialize_string(value: str) -> bytes:
    """Write ``value`` as a string through a JSON codec serializer.

    Returns everything the serializer emitted into an in-memory sink.
    """
    buffer = BytesIO()
    codec_serializer = JSONCodec().create_serializer(buffer)
    codec_serializer.write_string(STRING, value)
    codec_serializer.flush()
    # getvalue() returns the whole buffer regardless of the current position.
    return buffer.getvalue()
96+
97+
98+
class TestStringControlCharEscaping:
    """Control characters U+0000-U+001F must be escaped (RFC 8259 §7)."""

    @pytest.mark.parametrize(
        "char, escaped",
        [
            ("\n", "\\n"),
            ("\r", "\\r"),
            ("\t", "\\t"),
            ("\b", "\\b"),
            ("\f", "\\f"),
        ],
    )
    def test_named_control_chars(self, char: str, escaped: str) -> None:
        """Characters with a short escape are written in that short form."""
        expected = f'"a{escaped}b"'.encode()
        assert _serialize_string(f"a{char}b") == expected

    def test_all_control_chars_produce_valid_json(self) -> None:
        """Every U+0000-U+001F character must be escaped so output is valid JSON."""
        for cp in range(0x20):
            original = f"before{chr(cp)}after"
            # The serialized bytes must parse as JSON and round-trip unchanged.
            decoded = json.loads(_serialize_string(original))
            assert decoded == original, f"Round-trip failed for U+{cp:04X}"

    def test_null_byte(self) -> None:
        """NUL has no short escape, so it is written as a \\u sequence."""
        assert _serialize_string("a\x00b") == b'"a\\u0000b"'

    def test_mixed_escapes(self) -> None:
        """Newlines, tabs, quotes and CRLF are all escaped in one string."""
        serialized = _serialize_string('line 1\nline 2\t"quoted"\r\n')
        assert serialized == b'"line 1\\nline 2\\t\\"quoted\\"\\r\\n"'

    def test_existing_backslash_and_quote_still_escaped(self) -> None:
        """Backslash and double quote keep their escapes."""
        serialized = _serialize_string('a\\b"c')
        assert serialized == b'"a\\\\b\\"c"'

    def test_serialized_output_is_valid_json(self) -> None:
        """Realistic multi-line prompt string produces valid JSON."""
        prompt = "System: You are helpful.\nUser: Hello\nAssistant:"
        decoded = json.loads(_serialize_string(prompt))
        assert decoded == prompt

0 commit comments

Comments
 (0)