From 1cd84bad150b2d2bb77cc8d7d3e192db94fa9dd6 Mon Sep 17 00:00:00 2001 From: Koda Reef Date: Sun, 22 Mar 2026 22:40:56 +0000 Subject: [PATCH] Fix UTF8FirstLetterNumBytes to handle malformed UTF-8 correctly UTF8FirstLetterNumBytes returns the byte count from OneCharLen without validating that the expected continuation bytes actually follow the leader byte. Malformed UTF-8 (e.g., bare leader bytes without continuations) causes Utf8Len to undercount characters by 2-4x, bypassing string length validation constraints. For example, 20 bytes of 0xC0 (invalid 2-byte leaders) produces Utf8Len=10 instead of 20, allowing a max_len=10 constraint to accept 20 bytes of data. Fix: - Clamp consumed bytes to remaining buffer length - Validate continuation bytes have the 10xxxxxx pattern - Return 1 for any invalid byte (count as single character) Valid UTF-8 counting is unchanged. --- validate/validate.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/validate/validate.h b/validate/validate.h index d6cf6c9d9..01917d288 100644 --- a/validate/validate.h +++ b/validate/validate.h @@ -156,10 +156,20 @@ inline int OneCharLen(const char* src) { return "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4"[(*src & 0xFF) >> 4]; } -inline int UTF8FirstLetterNumBytes(const char *utf8_str, int str_len) { +inline int UTF8FirstLetterNumBytes(const char *utf8_str, ptrdiff_t str_len) { if (str_len == 0) return 0; - return OneCharLen(utf8_str); + int char_len = OneCharLen(utf8_str); + // Clamp to remaining bytes: a truncated multi-byte sequence + // counts as a single (invalid) character. + if (char_len > str_len) + return 1; + // Validate continuation bytes (must have 10xxxxxx pattern). + for (int i = 1; i < char_len; i++) { + if ((static_cast<unsigned char>(utf8_str[i]) & 0xC0) != 0x80) + return 1; // Invalid continuation: count leader as single char. + } + return char_len; } inline size_t Utf8Len(const string& narrow_string) {