Skip to content

Commit 41ae21b

Browse files
authored
Fix decode returning utf-16 code units (#9)
* Fix decode returning utf-16 code units
* Add test
1 parent 35209bb commit 41ae21b

2 files changed

Lines changed: 20 additions & 24 deletions

File tree

src/mdurl/_decode.py

Lines changed: 15 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -55,12 +55,11 @@ def repl_func_with_cache(match: re.Match, cache: Sequence[str]) -> str:
5555
b2 = int(seq[i + 4 : i + 6], 16)
5656

5757
if (b2 & 0xC0) == 0x80:
58-
chr_ = ((b1 << 6) & 0x7C0) | (b2 & 0x3F)
59-
60-
if chr_ < 0x80:
61-
result += "\ufffd\ufffd"
62-
else:
63-
result += chr(chr_)
58+
all_bytes = bytes((b1, b2))
59+
try:
60+
result += all_bytes.decode()
61+
except UnicodeDecodeError:
62+
result += "\ufffd" * 2
6463

6564
i += 3
6665
i += 3 # emulate JS for loop statement3
@@ -72,12 +71,11 @@ def repl_func_with_cache(match: re.Match, cache: Sequence[str]) -> str:
7271
b3 = int(seq[i + 7 : i + 9], 16)
7372

7473
if (b2 & 0xC0) == 0x80 and (b3 & 0xC0) == 0x80:
75-
chr_ = ((b1 << 12) & 0xF000) | ((b2 << 6) & 0xFC0) | (b3 & 0x3F)
76-
77-
if chr_ < 0x800 or (chr_ >= 0xD800 and chr_ <= 0xDFFF):
78-
result += "\ufffd\ufffd\ufffd"
79-
else:
80-
result += chr(chr_)
74+
all_bytes = bytes((b1, b2, b3))
75+
try:
76+
result += all_bytes.decode()
77+
except UnicodeDecodeError:
78+
result += "\ufffd" * 3
8179

8280
i += 6
8381
i += 3 # emulate JS for loop statement3
@@ -90,18 +88,11 @@ def repl_func_with_cache(match: re.Match, cache: Sequence[str]) -> str:
9088
b4 = int(seq[i + 10 : i + 12], 16)
9189

9290
if (b2 & 0xC0) == 0x80 and (b3 & 0xC0) == 0x80 and (b4 & 0xC0) == 0x80:
93-
chr_ = (
94-
((b1 << 18) & 0x1C0000)
95-
| ((b2 << 12) & 0x3F000)
96-
| ((b3 << 6) & 0xFC0)
97-
| (b4 & 0x3F)
98-
)
99-
100-
if chr_ < 0x10000 or chr_ > 0x10FFFF:
101-
result += "\ufffd\ufffd\ufffd\ufffd"
102-
else:
103-
chr_ -= 0x10000
104-
result += chr(0xD800 + (chr_ >> 10)) + chr(0xDC00 + (chr_ & 0x3FF))
91+
all_bytes = bytes((b1, b2, b3, b4))
92+
try:
93+
result += all_bytes.decode()
94+
except UnicodeDecodeError:
95+
result += "\ufffd" * 4
10596

10697
i += 9
10798
i += 3 # emulate JS for loop statement3

tests/test_decode.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,5 @@
1+
from mdurl import decode
2+
3+
4+
def test_decode_multi_byte():
5+
assert decode("https://host.invalid/%F0%9F%91%A9") == "https://host.invalid/👩"

0 commit comments

Comments
 (0)