11#ifndef HD_INC_CORE_STRING_UNICODE_UTF8_H
22#define HD_INC_CORE_STRING_UNICODE_UTF8_H
3+ #include " ../../slice.h"
4+ #include " ../../traits/is_same.h"
5+ #include " ../../traits/remove_cv.h"
36
47namespace hud ::unicode
58{
6- [[nodiscard]] static constexpr bool is_valid_utf8_portable (const char8 *string, usize byte_count) noexcept
9+ /* *
10+ * Validates whether a given byte sequence is well-formed UTF-8 according to the Unicode specification.
11+ *
12+ * This function checks each sequence of bytes (ASCII, 2-, 3-, or 4-byte sequences) and ensures
13+ * the following rules are respected:
14+ * - ASCII bytes (< 0x80) are accepted directly.
15+ * - Multi-byte sequences must follow the correct pattern (10xxxxxx after a valid leading byte).
16+ * - Overlong encodings are rejected.
17+ * - Disallowed values (such as surrogates [U+D800, U+DFFF]) are rejected.
18+ * - Code points above U+10FFFF are rejected.
19+ *
20+ * An optimization is applied to quickly skip blocks of 16 consecutive ASCII bytes in a single operation.
21+ *
22+ * @tparam char_t Expected character type (must be `char8` or equivalent).
23+ * @param string UTF-8 byte sequence to validate.
24+ * @return true if the input is valid UTF-8, false otherwise.
25+ */
26+ template <typename char_t >
27+ requires (hud::is_same_v<hud::remove_cv_t <char_t >, char8>)
28+ [[nodiscard]] static constexpr bool is_valid_utf8_portable (const hud::slice<char_t > string) noexcept
729 {
830 usize pos = 0 ;
931 u32 code_point = 0 ;
32+ usize byte_count = string.byte_count ();
33+ const char8 *str = string.data ();
34+
1035 while (pos < byte_count) {
1136 // Optimization step:
1237 // If the next 16 bytes are guaranteed to be ASCII (all < 128),
1338 // we can skip them all at once instead of checking byte by byte.
1439 usize next_pos = pos + 16 ;
15- if (next_pos <= byte_count) { // Make sure we don't read past the buffer
16- u64 v1 = hud::memory::unaligned_load64 (string + pos); // load first 8 bytes
17- u64 v2 = hud::memory::unaligned_load64 (string + pos + sizeof (u64 )); // load next 8 bytes
40+ if (next_pos <= byte_count) { // Make sure we don't read past the buffer
41+ u64 v1 = hud::memory::unaligned_load64 (str + pos); // load first 8 bytes
42+ u64 v2 = hud::memory::unaligned_load64 (str + pos + sizeof (u64 )); // load next 8 bytes
1843 // Bitwise OR combines both 8-byte blocks so we only need a single mask test below.
1944 // If any byte in v1 or v2 has its high bit set (>= 0x80, non-ASCII),
2045 // the result will also have that bit set. This lets us quickly check
@@ -27,15 +52,15 @@ namespace hud::unicode
2752 }
2853
2954 // Now process byte by byte
30- unsigned char byte = string [pos];
55+ unsigned char byte = str [pos];
3156
3257 // Consume consecutive ASCII bytes.
3358 // This inner loop skips multiple ASCII chars in a row efficiently.
3459 while ((byte & 0x80 ) == 0 ) {
3560 if (++pos == byte_count) {
3661 return true ;
3762 }
38- byte = string [pos];
63+ byte = str [pos];
3964 }
4065
4166 // Case: 2-byte sequence -> 110xxxxx 10xxxxxx
@@ -49,11 +74,11 @@ namespace hud::unicode
4974 return false ;
5075 }
5176 // Ensure 1st continuous byte is 10xxxxxx
52- if ((string [pos + 1 ] & 0b11000000 ) != 0b10000000 ) {
77+ if ((str [pos + 1 ] & 0b11000000 ) != 0b10000000 ) {
5378 return false ;
5479 }
5580 // Read the code point
56- code_point = (byte & 0b00011111 ) << 6 | (string [pos + 1 ] & 0b00111111 );
81+ code_point = (byte & 0b00011111 ) << 6 | (str [pos + 1 ] & 0b00111111 );
5782 // Ensure code point is [0x80, 0x7FF] aka [U+0080, U+07FF]
5883 if ((code_point < 0x80 ) || (0x7ff < code_point)) {
5984 return false ;
@@ -70,15 +95,15 @@ namespace hud::unicode
7095 return false ;
7196 }
7297 // Ensure 1st continuous byte is 10xxxxxx
73- if ((string [pos + 1 ] & 0b11000000 ) != 0b10000000 ) {
98+ if ((str [pos + 1 ] & 0b11000000 ) != 0b10000000 ) {
7499 return false ;
75100 }
76101 // Ensure 2nd continuous byte is 10xxxxxx
77- if ((string [pos + 2 ] & 0b11000000 ) != 0b10000000 ) {
102+ if ((str [pos + 2 ] & 0b11000000 ) != 0b10000000 ) {
78103 return false ;
79104 }
80105 // Read the code point
81- code_point = (byte & 0b00001111 ) << 12 | (string [pos + 1 ] & 0b00111111 ) << 6 | (string [pos + 2 ] & 0b00111111 );
106+ code_point = (byte & 0b00001111 ) << 12 | (str [pos + 1 ] & 0b00111111 ) << 6 | (str [pos + 2 ] & 0b00111111 );
82107 // Check code point valid value
83108 // - must not be overlong encoding (< 0x800 is invalid)
84109 // - must be [0x0800, 0xFFFF] aka [U+0800, U+FFFF]
@@ -127,6 +152,30 @@ namespace hud::unicode
127152 }
128153 return true ;
129154 }
155+
156+ /* *
157+ * Validates whether a given byte sequence is well-formed UTF-8 according to the Unicode specification.
158+ *
159+ * This function checks each sequence of bytes (ASCII, 2-, 3-, or 4-byte sequences) and ensures
160+ * the following rules are respected:
161+ * - ASCII bytes (< 0x80) are accepted directly.
162+ * - Multi-byte sequences must follow the correct pattern (10xxxxxx after a valid leading byte).
163+ * - Overlong encodings are rejected.
164+ * - Disallowed values (such as surrogates [U+D800, U+DFFF]) are rejected.
165+ * - Code points above U+10FFFF are rejected.
166+ *
167+ * An optimization is applied to quickly skip blocks of 16 consecutive ASCII bytes in a single operation.
168+ *
169+ * @tparam char_t Expected character type (must be `char8` or equivalent).
170+ * @param string UTF-8 byte sequence to validate.
171+ * @return true if the input is valid UTF-8, false otherwise.
172+ */
173+ template <typename char_t >
174+ requires (hud::is_same_v<hud::remove_cv_t <char_t >, char8>)
175+ [[nodiscard]] static constexpr bool is_valid_utf8 (const hud::slice<char_t > string) noexcept
176+ {
177+ return is_valid_utf8_portable (string);
178+ }
130179} // namespace hud::unicode
131180
132181#endif // HD_INC_CORE_STRING_UNICODE_UTF8_H
0 commit comments