@@ -33,69 +33,78 @@ constexpr uint32_t kUtf8MaxCodePoint = 0x10FFFF;
// Inclusive bounds of the UTF-16 surrogate range (U+D800..U+DFFF). Code points
// in this range are rejected by the UTF-8 decoder below.
constexpr uint32_t kUtf8MinSurrogate = 0xD800;
constexpr uint32_t kUtf8MaxSurrogate = 0xDFFF;

36- bool DecodeUtf8CodePoint (std::string_view source, uint32_t & code_point) {
37- const auto size = source.size ();
36+ std::optional<uint32_t > DecodeUtf8CodePoint (std::string_view source) {
3837 if (source.empty ()) {
39- return false ;
38+ return std:: nullopt ;
4039 }
4140
4241 auto byte0 = static_cast <uint8_t >(source[0 ]);
42+
43+ // 1-byte sequence (ASCII): 0xxxxxxx
4344 if (byte0 < 0x80 ) {
44- code_point = byte0;
45- return true ;
45+ return byte0;
4646 }
4747
48+ const auto size = source.size ();
49+
50+ // 2-byte sequence: 110xxxxx 10xxxxxx
4851 if ((byte0 & 0xE0 ) == 0xC0 ) {
49- if (source. size () < 2 ) {
50- return false ;
52+ if (size < 2 ) {
53+ return std:: nullopt ;
5154 }
5255 auto byte1 = static_cast <uint8_t >(source[1 ]);
5356 if ((byte1 & 0xC0 ) != 0x80 ) {
54- return false ;
57+ return std:: nullopt ;
5558 }
56- code_point = ((byte0 & 0x1F ) << 6 ) | (byte1 & 0x3F );
59+ uint32_t code_point = ((byte0 & 0x1F ) << 6 ) | (byte1 & 0x3F );
60+ // Check for overlong encoding
5761 if (code_point < 0x80 ) {
58- return false ;
62+ return std:: nullopt ;
5963 }
60- return true ;
64+ return code_point ;
6165 }
6266
67+ // 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
6368 if ((byte0 & 0xF0 ) == 0xE0 ) {
64- if (source. size () < 3 ) {
65- return false ;
69+ if (size < 3 ) {
70+ return std:: nullopt ;
6671 }
6772 auto byte1 = static_cast <uint8_t >(source[1 ]);
6873 auto byte2 = static_cast <uint8_t >(source[2 ]);
6974 if ((byte1 & 0xC0 ) != 0x80 || (byte2 & 0xC0 ) != 0x80 ) {
70- return false ;
75+ return std:: nullopt ;
7176 }
72- code_point = ((byte0 & 0x0F ) << 12 ) | ((byte1 & 0x3F ) << 6 ) | (byte2 & 0x3F );
77+ uint32_t code_point = ((byte0 & 0x0F ) << 12 ) | ((byte1 & 0x3F ) << 6 ) | (byte2 & 0x3F );
78+ // Check for overlong encoding and surrogate pairs
7379 if (code_point < 0x800 ||
7480 (code_point >= kUtf8MinSurrogate && code_point <= kUtf8MaxSurrogate )) {
75- return false ;
81+ return std:: nullopt ;
7682 }
77- return true ;
83+ return code_point ;
7884 }
7985
86+ // 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
8087 if ((byte0 & 0xF8 ) == 0xF0 ) {
81- if (source. size () < 4 ) {
82- return false ;
88+ if (size < 4 ) {
89+ return std:: nullopt ;
8390 }
8491 auto byte1 = static_cast <uint8_t >(source[1 ]);
8592 auto byte2 = static_cast <uint8_t >(source[2 ]);
8693 auto byte3 = static_cast <uint8_t >(source[3 ]);
8794 if ((byte1 & 0xC0 ) != 0x80 || (byte2 & 0xC0 ) != 0x80 || (byte3 & 0xC0 ) != 0x80 ) {
88- return false ;
95+ return std:: nullopt ;
8996 }
90- code_point = ((byte0 & 0x07 ) << 18 ) | ((byte1 & 0x3F ) << 12 ) | ((byte2 & 0x3F ) << 6 ) |
91- (byte3 & 0x3F );
97+ uint32_t code_point = ((byte0 & 0x07 ) << 18 ) | ((byte1 & 0x3F ) << 12 ) |
98+ ((byte2 & 0x3F ) << 6 ) | (byte3 & 0x3F );
99+ // Check for overlong encoding and valid Unicode range
92100 if (code_point < 0x10000 || code_point > kUtf8MaxCodePoint ) {
93- return false ;
101+ return std:: nullopt ;
94102 }
95- return true ;
103+ return code_point ;
96104 }
97105
98- return false ;
106+ // Invalid UTF-8 start byte
107+ return std::nullopt ;
99108}
100109
101110void AppendUtf8CodePoint (uint32_t code_point, std::string& target) {
@@ -117,9 +126,7 @@ void AppendUtf8CodePoint(uint32_t code_point, std::string& target) {
117126}
118127
119128template <TypeId type_id>
120- Literal TruncateLiteralImpl (const Literal& literal, int32_t width) {
121- std::unreachable ();
122- }
129+ Literal TruncateLiteralImpl (const Literal& literal, int32_t width) = delete;
123130
124131template <>
125132Literal TruncateLiteralImpl<TypeId::kInt >(const Literal& literal, int32_t width) {
@@ -160,19 +167,14 @@ Literal TruncateLiteralImpl<TypeId::kBinary>(const Literal& literal, int32_t wid
160167}
161168
162169template <TypeId type_id>
163- Result<Literal> TruncateLiteralMaxImpl (const Literal& literal, int32_t width) {
164- std::unreachable ();
165- }
170+ Result<Literal> TruncateLiteralMaxImpl (const Literal& literal, int32_t width) = delete;
166171
167172template <>
168173Result<Literal> TruncateLiteralMaxImpl<TypeId::kString >(const Literal& literal,
169174 int32_t width) {
170175 const auto & str = std::get<std::string>(literal.value ());
171176 ICEBERG_ASSIGN_OR_RAISE (std::string truncated,
172177 TruncateUtils::TruncateUTF8Max (str, width));
173- if (truncated == str) {
174- return literal;
175- }
176178 return Literal::String (std::move (truncated));
177179}
178180
@@ -212,12 +214,12 @@ Result<std::string> TruncateUtils::TruncateUTF8Max(const std::string& source, si
212214 --cp_start;
213215 } while (cp_start > 0 && (static_cast <uint8_t >(truncated[cp_start]) & 0xC0 ) == 0x80 );
214216
215- uint32_t code_point = 0 ;
216- if (!DecodeUtf8CodePoint (
217- std::string_view (truncated.data () + cp_start, last_cp_start - cp_start),
218- code_point)) {
217+ auto code_point_opt = DecodeUtf8CodePoint (
218+ std::string_view (truncated.data () + cp_start, last_cp_start - cp_start));
219+ if (!code_point_opt.has_value ()) {
219220 return InvalidArgument (" Invalid UTF-8 in string literal" );
220221 }
222+ uint32_t code_point = code_point_opt.value ();
221223
222224 // Try to increment the code point
223225 if (code_point < kUtf8MaxCodePoint ) {
0 commit comments