Skip to content

Commit e6c8e87

Browse files
author
naarob
committed
fix: use errors='replace' in Frame.__str__() for OP_TEXT frames (fixes #1695)
Frame.__str__() decoded OP_TEXT frame data with a bare .decode(), which raises UnicodeDecodeError when the frame ends in the middle of a multi-byte UTF-8 sequence. This happens when the websockets library itself fragments a large text message at byte boundaries (not at character boundaries) into continuation frames (fin=False), e.g. Japanese, Chinese, or emoji text. When DEBUG logging is enabled, the UnicodeDecodeError propagated and caused the connection to close with code 1007 (INVALID_DATA), even though the message was valid. The data itself was fine — only the logging was broken. Fix: add errors='replace' to the .decode() call in Frame.__str__(). This replaces incomplete sequences with U+FFFD (replacement character), making the log entry human-readable while never crashing the connection. Tests: 9 new tests covering partial Japanese, partial emoji, complete frames, ASCII, binary, and ping frames. 79 upstream tests unchanged.
1 parent ea164d2 commit e6c8e87

File tree

2 files changed

+79
-1
lines changed

2 files changed

+79
-1
lines changed

src/websockets/frames.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,11 @@ def __str__(self) -> str:
159159
if self.opcode is OP_TEXT:
160160
# Decoding only the beginning and the end is needlessly hard.
161161
# Decode the entire payload then elide later if necessary.
162-
data = repr(bytes(self.data).decode())
162+
# Use errors='replace' because a non-final frame (fin=False) may
163+
# end in the middle of a multi-byte UTF-8 sequence (e.g. Japanese,
164+
# Chinese, emoji). A bare .decode() would raise UnicodeDecodeError
165+
# and crash the connection when DEBUG logging is enabled.
166+
data = repr(bytes(self.data).decode(errors="replace"))
163167
elif self.opcode is OP_BINARY:
164168
# We'll show at most the first 16 bytes and the last 8 bytes.
165169
# Encode just what we need, plus two dummy bytes to elide later.

tests/test_frame_str_unicode.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
"""Tests for Frame.__str__() with incomplete UTF-8 sequences (issue #1695).
2+
3+
When DEBUG logging is enabled, websockets logs every frame with:
4+
logger.debug("> %s", frame)
5+
6+
This calls Frame.__str__(). For a non-final OP_TEXT frame that ends in the
7+
middle of a multi-byte UTF-8 sequence, the original .decode() raised
8+
UnicodeDecodeError and terminated the connection with code 1007 (INVALID_DATA).
9+
"""
10+
11+
import pytest
12+
from websockets.frames import Frame, OP_TEXT, OP_BINARY, OP_PING
13+
14+
15+
# ── Multi-byte UTF-8 characters ───────────────────────────────────────────────
16+
17+
JAPANESE = "日本語テスト" # each char = 3 bytes (0xe3…)
18+
EMOJI = "🦊🎉🐍" # each char = 4 bytes
19+
20+
21+
def _fragment(text: str, cut: int) -> bytes:
22+
"""Return the first *cut* bytes of *text* encoded as UTF-8."""
23+
return text.encode("utf-8")[:cut]
24+
25+
26+
@pytest.mark.parametrize(
    "text,cut",
    [
        # JAPANESE characters are 3 bytes each, so the cut must NOT be a
        # multiple of 3 (1002 would land exactly on a character boundary).
        (JAPANESE * 100, 1000),  # cuts after 1 byte of a 3-byte character
        (JAPANESE * 100, 1001),  # cuts after 2 bytes of a 3-byte character
        # EMOJI characters are 4 bytes each, so the cut must NOT be a
        # multiple of 4.
        (EMOJI * 100, 401),  # cuts after 1 byte of a 4-byte emoji
        (EMOJI * 100, 402),  # cuts after 2 bytes of a 4-byte emoji
        (EMOJI * 100, 403),  # cuts after 3 bytes of a 4-byte emoji
    ],
)
def test_str_non_final_text_frame_no_unicode_error(text, cut):
    """Frame.__str__() must not raise UnicodeDecodeError for partial UTF-8 frames.

    Each parametrized case slices the UTF-8 encoding of *text* at *cut* bytes,
    which is chosen to fall inside a multi-byte character.
    """
    data = _fragment(text, cut)
    # Guard the test data itself: the fragment must really be undecodable on
    # its own, otherwise this case would not exercise the errors="replace" path.
    with pytest.raises(UnicodeDecodeError):
        data.decode("utf-8")
    frame = Frame(opcode=OP_TEXT, data=data, fin=False)
    # Must not raise UnicodeDecodeError
    result = str(frame)
    assert "TEXT" in result
    # Replacement char U+FFFD should appear to signal the truncation
    assert "\ufffd" in result, (
        f"Expected replacement char in repr for partial UTF-8, got: {result!r:.80}"
    )
45+
46+
47+
def test_str_complete_text_frame_no_replacement():
    """A final frame holding whole UTF-8 characters renders without U+FFFD."""
    payload = (JAPANESE * 10).encode("utf-8")
    rendered = str(Frame(opcode=OP_TEXT, data=payload, fin=True))
    assert "TEXT" in rendered
    assert "\ufffd" not in rendered
54+
55+
56+
def test_str_ascii_text_frame():
    """An ASCII-only text frame shows its payload as a quoted string."""
    ascii_frame = Frame(opcode=OP_TEXT, data=b"hello world", fin=True)
    rendered = str(ascii_frame)
    assert "'hello world'" in rendered
61+
62+
63+
def test_str_binary_frame_unchanged():
    """A binary frame still renders and is labelled BINARY."""
    payload = bytes(range(32))
    rendered = str(Frame(opcode=OP_BINARY, data=payload, fin=True))
    assert "BINARY" in rendered
68+
69+
70+
def test_str_ping_frame_unchanged():
    """A ping frame still renders and is labelled PING."""
    ping_frame = Frame(opcode=OP_PING, data=b"ping", fin=True)
    rendered = str(ping_frame)
    assert "PING" in rendered

0 commit comments

Comments
 (0)