Skip to content

Commit 4baf2ec

Browse files
committed
Fix crash in Frame.__str__ on partial UTF-8 sequences.
Fix #1695.
1 parent 8d2b133 commit 4baf2ec

3 files changed

Lines changed: 120 additions & 47 deletions

File tree

docs/project/changelog.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,12 @@ Improvements
4040

4141
* Added wheels for ARMv7, PowerPC, RISC-V, and S/390.
4242

43+
Bug fixes
44+
.........
45+
46+
* Prevented ``Frame.__str__`` from crashing when a text frame is fragmented in
47+
the middle of a UTF-8 sequence.
48+
4349
.. _16.0:
4450

4551
16.0

src/websockets/frames.py

Lines changed: 83 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -147,55 +147,104 @@ class Frame:
147147
# Configure if you want to see more in logs. Should be a multiple of 3.
148148
MAX_LOG_SIZE = int(os.environ.get("WEBSOCKETS_MAX_LOG_SIZE", "75"))
149149

150+
DEFAULT_IS_TEXT = {OP_TEXT: True, OP_BINARY: False, OP_CLOSE: True}
151+
150152
def __str__(self) -> str:
151153
"""
152154
Return a human-readable representation of a frame.
153155
156+
This function is intended for logging and debugging. It doesn't aim to
157+
support round-tripping because payloads can be too long for displaying
158+
conveniently. Instead, it shows the beginning and the end. It's robust
159+
to incorrect data.
160+
161+
It attempts to decode UTF-8 payloads whenever possible, even for binary
162+
frames and control frames, because those frequently contain UTF-8 data.
163+
It applies the same logic to continuation frames, because we don't know
164+
if they continue a text frame or a binary frame.
165+
154166
"""
155-
coding = None
167+
expect_text = self.DEFAULT_IS_TEXT.get(self.opcode)
168+
data_repr, is_text = self._data_repr()
169+
170+
data_type = "" if expect_text == is_text else ("text" if is_text else "binary")
156171
length = f"{len(self.data)} byte{'' if len(self.data) == 1 else 's'}"
157172
non_final = "" if self.fin else "continued"
173+
metadata = ", ".join(filter(None, [data_type, length, non_final]))
174+
175+
return f"{self.opcode.name} {data_repr} [{metadata}]"
176+
177+
def _data_repr(self) -> tuple[str, bool | None]:
178+
"""
179+
Return a human-readable representation of the payload.
180+
181+
Also returns whether the payload is text.
182+
183+
The representation is elided to fit ``MAX_LOG_SIZE``.
184+
185+
This is a helper for the __str__ method.
158186
159-
if self.opcode is OP_TEXT:
160-
# Decoding only the beginning and the end is needlessly hard.
161-
# Decode the entire payload then elide later if necessary.
162-
data = repr(bytes(self.data).decode())
163-
elif self.opcode is OP_BINARY:
164-
# We'll show at most the first 16 bytes and the last 8 bytes.
165-
# Encode just what we need, plus two dummy bytes to elide later.
187+
"""
188+
if not self.data:
189+
return "''", self.DEFAULT_IS_TEXT.get(self.opcode)
190+
191+
# Special case for close frames: parse close code and reason.
192+
# Fall back to the standard case if the payload is malformed.
193+
194+
if self.opcode is OP_CLOSE:
195+
try:
196+
return str(Close.parse(self.data)), True
197+
except (ProtocolError, UnicodeDecodeError):
198+
pass
199+
200+
# Guess whether the payload is UTF-8 or binary, regardless of opcode, to
201+
# display UTF-8 text in binary frames nicely and generally to be helpful
202+
# and robust. Also support frames fragmented within UTF-8 sequences.
203+
204+
if len(self.data) > 4 * self.MAX_LOG_SIZE:
205+
# Process only the start and the end, as the middle will be elided.
206+
# Cast to bytes because self.data could be a memoryview.
207+
data_start = bytes(self.data[: 8 * self.MAX_LOG_SIZE // 3])
208+
data_end = bytes(self.data[-4 * self.MAX_LOG_SIZE // 3 :])
209+
is_text = is_utf8_fragment(
210+
data_start,
211+
must_start_clean=self.opcode != OP_CONT,
212+
) and is_utf8_fragment(
213+
data_end,
214+
must_end_clean=self.fin,
215+
)
216+
if is_text:
217+
data_repr = repr((data_start + data_end).decode(errors="replace"))
218+
219+
else:
220+
# Cast to bytes because self.data could be a memoryview.
221+
data = bytes(self.data)
222+
is_text = is_utf8_fragment(
223+
data,
224+
must_start_clean=self.opcode != OP_CONT,
225+
must_end_clean=self.fin,
226+
)
227+
if is_text:
228+
data_repr = repr(data.decode(errors="replace"))
229+
230+
# When the payload is text (except perhaps for boundaries), we decoded
231+
# enough in ``data_repr``. Now, do the same when the payload is binary.
232+
233+
if not is_text:
166234
binary = self.data
167235
if len(binary) > self.MAX_LOG_SIZE // 3:
168236
cut = (self.MAX_LOG_SIZE // 3 - 1) // 3 # by default cut = 8
237+
# Encode two dummy bytes to force eliding and adding an ellipsis.
169238
binary = b"".join([binary[: 2 * cut], b"\x00\x00", binary[-cut:]])
170-
data = " ".join(f"{byte:02x}" for byte in binary)
171-
elif self.opcode is OP_CLOSE:
172-
data = str(Close.parse(self.data))
173-
elif self.data:
174-
# We don't know if a Continuation frame contains text or binary.
175-
# Ping and Pong frames could contain UTF-8.
176-
# Attempt to decode as UTF-8 and display it as text; fallback to
177-
# binary. If self.data is a memoryview, it has no decode() method,
178-
# which raises AttributeError.
179-
try:
180-
data = repr(bytes(self.data).decode())
181-
coding = "text"
182-
except (UnicodeDecodeError, AttributeError):
183-
binary = self.data
184-
if len(binary) > self.MAX_LOG_SIZE // 3:
185-
cut = (self.MAX_LOG_SIZE // 3 - 1) // 3 # by default cut = 8
186-
binary = b"".join([binary[: 2 * cut], b"\x00\x00", binary[-cut:]])
187-
data = " ".join(f"{byte:02x}" for byte in binary)
188-
coding = "binary"
189-
else:
190-
data = "''"
239+
data_repr = " ".join(f"{byte:02x}" for byte in binary)
191240

192-
if len(data) > self.MAX_LOG_SIZE:
193-
cut = self.MAX_LOG_SIZE // 3 - 1 # by default cut = 24
194-
data = data[: 2 * cut] + "..." + data[-cut:]
241+
# Elide the middle of the representation to fit the maximum log size.
195242

196-
metadata = ", ".join(filter(None, [coding, length, non_final]))
243+
if len(data_repr) > self.MAX_LOG_SIZE:
244+
cut = self.MAX_LOG_SIZE // 3 - 1 # by default cut = 24
245+
data_repr = data_repr[: 2 * cut] + "..." + data_repr[-cut:]
197246

198-
return f"{self.opcode.name} {data} [{metadata}]"
247+
return data_repr, is_text
199248

200249
@classmethod
201250
def parse(

tests/test_frames.py

Lines changed: 31 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,7 @@ def test_extensions(self):
174174
class Rot13:
175175
@staticmethod
176176
def encode(frame):
177-
assert frame.opcode == OP_TEXT
177+
assert frame.opcode is OP_TEXT
178178
text = frame.data.decode()
179179
data = codecs.encode(text, "rot13").encode()
180180
return dataclasses.replace(frame, data=data)
@@ -229,25 +229,31 @@ def test_cont_final_binary_from_memoryview(self):
229229
"CONT fc fd fe ff [binary, 4 bytes]",
230230
)
231231

232+
def test_cont_text_fragmented(self):
233+
self.assertEqual(
234+
str(Frame(OP_CONT, b"\xa9caf\xc3", fin=False)),
235+
"CONT '�caf�' [text, 5 bytes, continued]",
236+
)
237+
232238
def test_cont_text_truncated(self):
233239
self.assertEqual(
234-
str(Frame(OP_CONT, b"caf\xc3\xa9 " * 16, fin=False)),
240+
str(Frame(OP_CONT, b"caf\xc3\xa9 " * 64, fin=False)),
235241
"CONT 'café café café café café café café café café ca..."
236-
"fé café café café café ' [text, 96 bytes, continued]",
242+
"fé café café café café ' [text, 384 bytes, continued]",
237243
)
238244

239245
def test_cont_binary_truncated(self):
240246
self.assertEqual(
241-
str(Frame(OP_CONT, bytes(range(256)), fin=False)),
247+
str(Frame(OP_CONT, bytes(range(256)) * 2, fin=False)),
242248
"CONT 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f ..."
243-
" f8 f9 fa fb fc fd fe ff [binary, 256 bytes, continued]",
249+
" f8 f9 fa fb fc fd fe ff [binary, 512 bytes, continued]",
244250
)
245251

246252
def test_cont_binary_truncated_from_memoryview(self):
247253
self.assertEqual(
248-
str(Frame(OP_CONT, memoryview(bytes(range(256))), fin=False)),
254+
str(Frame(OP_CONT, memoryview(bytes(range(256)) * 2), fin=False)),
249255
"CONT 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f ..."
250-
" f8 f9 fa fb fc fd fe ff [binary, 256 bytes, continued]",
256+
" f8 f9 fa fb fc fd fe ff [binary, 512 bytes, continued]",
251257
)
252258

253259
def test_text(self):
@@ -262,11 +268,17 @@ def test_text_non_final(self):
262268
"TEXT 'café' [5 bytes, continued]",
263269
)
264270

271+
def test_text_fragmented(self):
272+
self.assertEqual(
273+
str(Frame(OP_TEXT, b"caf\xc3", fin=False)),
274+
"TEXT 'caf�' [4 bytes, continued]",
275+
)
276+
265277
def test_text_truncated(self):
266278
self.assertEqual(
267-
str(Frame(OP_TEXT, b"caf\xc3\xa9 " * 16)),
279+
str(Frame(OP_TEXT, b"caf\xc3\xa9 " * 64)),
268280
"TEXT 'café café café café café café café café café ca..."
269-
"fé café café café café ' [96 bytes]",
281+
"fé café café café café ' [384 bytes]",
270282
)
271283

272284
def test_text_with_newline(self):
@@ -301,16 +313,16 @@ def test_binary_non_final_from_memoryview(self):
301313

302314
def test_binary_truncated(self):
303315
self.assertEqual(
304-
str(Frame(OP_BINARY, bytes(range(256)))),
316+
str(Frame(OP_BINARY, bytes(range(256)) * 2)),
305317
"BINARY 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f ..."
306-
" f8 f9 fa fb fc fd fe ff [256 bytes]",
318+
" f8 f9 fa fb fc fd fe ff [512 bytes]",
307319
)
308320

309321
def test_binary_truncated_from_memoryview(self):
310322
self.assertEqual(
311-
str(Frame(OP_BINARY, memoryview(bytes(range(256))))),
323+
str(Frame(OP_BINARY, memoryview(bytes(range(256)) * 2))),
312324
"BINARY 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f ..."
313-
" f8 f9 fa fb fc fd fe ff [256 bytes]",
325+
" f8 f9 fa fb fc fd fe ff [512 bytes]",
314326
)
315327

316328
def test_close(self):
@@ -325,6 +337,12 @@ def test_close_reason(self):
325337
"CLOSE 1001 (going away) Bye! [6 bytes]",
326338
)
327339

340+
def test_close_invalid(self):
341+
self.assertEqual(
342+
str(Frame(OP_CLOSE, b"\x00\x01\x02\x03")),
343+
"CLOSE 00 01 02 03 [binary, 4 bytes]",
344+
)
345+
328346
def test_ping(self):
329347
self.assertEqual(
330348
str(Frame(OP_PING, b"")),

0 commit comments

Comments
 (0)