From bcc953382321294449fa55fe034a643f46834bb3 Mon Sep 17 00:00:00 2001 From: Xymb Date: Wed, 25 Jun 2025 02:45:44 +0200 Subject: [PATCH] Replace utf8 with modified utf8. --- nbtlib/tag.py | 84 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 82 insertions(+), 2 deletions(-) diff --git a/nbtlib/tag.py b/nbtlib/tag.py index 09ab56d..e165ee8 100644 --- a/nbtlib/tag.py +++ b/nbtlib/tag.py @@ -217,12 +217,12 @@ def write_numeric(fmt, value, fileobj, byteorder="big"): def read_string(fileobj, byteorder="big"): """Read a string from a file-like object.""" length = read_numeric(USHORT, fileobj, byteorder) - return fileobj.read(length).decode("utf-8", "replace") + return _modified_utf8_decode(fileobj.read(length)) def write_string(value, fileobj, byteorder="big"): """Write a string to a file-like object.""" - data = value.encode("utf-8") + data = _modified_utf8_encode(value) write_numeric(USHORT, len(data), fileobj, byteorder) fileobj.write(data) @@ -1252,3 +1252,83 @@ class LongArray(Array): item_type = get_format(np.dtype, "i8") array_prefix = "L" wrapper = Long + + +def _modified_utf8_encode(string: str) -> bytes: + """Encode *string* with the Modified UTF-8 flavour used by Java NBT.""" + out = bytearray() + for ch in string: + cp = ord(ch) + + # U+0001 … U+007F → identical single-byte (except U+0000). + if 0x0001 <= cp <= 0x007F: + out.append(cp) + + # U+0000 or U+0080 … U+07FF → two-byte sequence. + elif cp == 0 or cp <= 0x07FF: + out.append(0xC0 | (cp >> 6)) + out.append(0x80 | (cp & 0x3F)) + + # U+0800 … U+FFFF → three-byte sequence. + elif cp <= 0xFFFF: + out.append(0xE0 | (cp >> 12)) + out.append(0x80 | ((cp >> 6) & 0x3F)) + out.append(0x80 | (cp & 0x3F)) + + # Supplementary plane → encode surrogate pair separately. + else: + cp -= 0x10000 + for surrogate in ( + 0xD800 | (cp >> 10), + 0xDC00 | (cp & 0x3FF), + ): + out.append(0xE0 | (surrogate >> 12)) + out.append(0x80 | ((surrogate >> 6) & 0x3F)) + out.append(0x80 | (surrogate & 0x3F)) + return bytes(out) + + +def _modified_utf8_decode(data: bytes) -> str: + """Decode Modified UTF-8 *data* back to a python str.""" + i, n = 0, len(data) + code_units = [] + + while i < n: + a = data[i] + + # 1-byte + if (a & 0x80) == 0: + code_units.append(a) + i += 1 + + # 2-bytes + elif (a & 0xE0) == 0xC0: + b = data[i + 1] + code_units.append(((a & 0x1F) << 6) | (b & 0x3F)) + i += 2 + + # 3-bytes + else: # (a & 0xF0) == 0xE0 + b, c = data[i + 1], data[i + 2] + code_units.append( + ((a & 0x0F) << 12) | ((b & 0x3F) << 6) | (c & 0x3F) + ) + i += 3 + + # Turn UTF-16 code units (with possible surrogates) into python str + chars, j = [], 0 + while j < len(code_units): + cu = code_units[j] + # surrogate pair ? + if 0xD800 <= cu <= 0xDBFF and j + 1 < len(code_units): + cu2 = code_units[j + 1] + if 0xDC00 <= cu2 <= 0xDFFF: + chars.append( + chr(((cu - 0xD800) << 10) + (cu2 - 0xDC00) + 0x10000) + ) + j += 2 + continue + chars.append(chr(cu)) + j += 1 + + return "".join(chars)