Skip to content

Commit 1f896c5

Browse files
committed
fix the comment issues
1 parent 95c6054 commit 1f896c5

1 file changed

Lines changed: 40 additions & 38 deletions

File tree

src/iceberg/util/truncate_util.cc

Lines changed: 40 additions & 38 deletions
Original file line number | Diff line number | Diff line change
@@ -33,69 +33,78 @@ constexpr uint32_t kUtf8MaxCodePoint = 0x10FFFF;
3333
constexpr uint32_t kUtf8MinSurrogate = 0xD800;
3434
constexpr uint32_t kUtf8MaxSurrogate = 0xDFFF;
3535

36-
bool DecodeUtf8CodePoint(std::string_view source, uint32_t& code_point) {
37-
const auto size = source.size();
36+
std::optional<uint32_t> DecodeUtf8CodePoint(std::string_view source) {
3837
if (source.empty()) {
39-
return false;
38+
return std::nullopt;
4039
}
4140

4241
auto byte0 = static_cast<uint8_t>(source[0]);
42+
43+
// 1-byte sequence (ASCII): 0xxxxxxx
4344
if (byte0 < 0x80) {
44-
code_point = byte0;
45-
return true;
45+
return byte0;
4646
}
4747

48+
const auto size = source.size();
49+
50+
// 2-byte sequence: 110xxxxx 10xxxxxx
4851
if ((byte0 & 0xE0) == 0xC0) {
49-
if (source.size() < 2) {
50-
return false;
52+
if (size < 2) {
53+
return std::nullopt;
5154
}
5255
auto byte1 = static_cast<uint8_t>(source[1]);
5356
if ((byte1 & 0xC0) != 0x80) {
54-
return false;
57+
return std::nullopt;
5558
}
56-
code_point = ((byte0 & 0x1F) << 6) | (byte1 & 0x3F);
59+
uint32_t code_point = ((byte0 & 0x1F) << 6) | (byte1 & 0x3F);
60+
// Check for overlong encoding
5761
if (code_point < 0x80) {
58-
return false;
62+
return std::nullopt;
5963
}
60-
return true;
64+
return code_point;
6165
}
6266

67+
// 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
6368
if ((byte0 & 0xF0) == 0xE0) {
64-
if (source.size() < 3) {
65-
return false;
69+
if (size < 3) {
70+
return std::nullopt;
6671
}
6772
auto byte1 = static_cast<uint8_t>(source[1]);
6873
auto byte2 = static_cast<uint8_t>(source[2]);
6974
if ((byte1 & 0xC0) != 0x80 || (byte2 & 0xC0) != 0x80) {
70-
return false;
75+
return std::nullopt;
7176
}
72-
code_point = ((byte0 & 0x0F) << 12) | ((byte1 & 0x3F) << 6) | (byte2 & 0x3F);
77+
uint32_t code_point = ((byte0 & 0x0F) << 12) | ((byte1 & 0x3F) << 6) | (byte2 & 0x3F);
78+
// Check for overlong encoding and surrogate code points (U+D800..U+DFFF)
7379
if (code_point < 0x800 ||
7480
(code_point >= kUtf8MinSurrogate && code_point <= kUtf8MaxSurrogate)) {
75-
return false;
81+
return std::nullopt;
7682
}
77-
return true;
83+
return code_point;
7884
}
7985

86+
// 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
8087
if ((byte0 & 0xF8) == 0xF0) {
81-
if (source.size() < 4) {
82-
return false;
88+
if (size < 4) {
89+
return std::nullopt;
8390
}
8491
auto byte1 = static_cast<uint8_t>(source[1]);
8592
auto byte2 = static_cast<uint8_t>(source[2]);
8693
auto byte3 = static_cast<uint8_t>(source[3]);
8794
if ((byte1 & 0xC0) != 0x80 || (byte2 & 0xC0) != 0x80 || (byte3 & 0xC0) != 0x80) {
88-
return false;
95+
return std::nullopt;
8996
}
90-
code_point = ((byte0 & 0x07) << 18) | ((byte1 & 0x3F) << 12) | ((byte2 & 0x3F) << 6) |
91-
(byte3 & 0x3F);
97+
uint32_t code_point = ((byte0 & 0x07) << 18) | ((byte1 & 0x3F) << 12) |
98+
((byte2 & 0x3F) << 6) | (byte3 & 0x3F);
99+
// Check for overlong encoding and valid Unicode range
92100
if (code_point < 0x10000 || code_point > kUtf8MaxCodePoint) {
93-
return false;
101+
return std::nullopt;
94102
}
95-
return true;
103+
return code_point;
96104
}
97105

98-
return false;
106+
// Invalid UTF-8 start byte
107+
return std::nullopt;
99108
}
100109

101110
void AppendUtf8CodePoint(uint32_t code_point, std::string& target) {
@@ -117,9 +126,7 @@ void AppendUtf8CodePoint(uint32_t code_point, std::string& target) {
117126
}
118127

119128
template <TypeId type_id>
120-
Literal TruncateLiteralImpl(const Literal& literal, int32_t width) {
121-
std::unreachable();
122-
}
129+
Literal TruncateLiteralImpl(const Literal& literal, int32_t width) = delete;
123130

124131
template <>
125132
Literal TruncateLiteralImpl<TypeId::kInt>(const Literal& literal, int32_t width) {
@@ -160,19 +167,14 @@ Literal TruncateLiteralImpl<TypeId::kBinary>(const Literal& literal, int32_t wid
160167
}
161168

162169
template <TypeId type_id>
163-
Result<Literal> TruncateLiteralMaxImpl(const Literal& literal, int32_t width) {
164-
std::unreachable();
165-
}
170+
Result<Literal> TruncateLiteralMaxImpl(const Literal& literal, int32_t width) = delete;
166171

167172
template <>
168173
Result<Literal> TruncateLiteralMaxImpl<TypeId::kString>(const Literal& literal,
169174
int32_t width) {
170175
const auto& str = std::get<std::string>(literal.value());
171176
ICEBERG_ASSIGN_OR_RAISE(std::string truncated,
172177
TruncateUtils::TruncateUTF8Max(str, width));
173-
if (truncated == str) {
174-
return literal;
175-
}
176178
return Literal::String(std::move(truncated));
177179
}
178180

@@ -212,12 +214,12 @@ Result<std::string> TruncateUtils::TruncateUTF8Max(const std::string& source, si
212214
--cp_start;
213215
} while (cp_start > 0 && (static_cast<uint8_t>(truncated[cp_start]) & 0xC0) == 0x80);
214216

215-
uint32_t code_point = 0;
216-
if (!DecodeUtf8CodePoint(
217-
std::string_view(truncated.data() + cp_start, last_cp_start - cp_start),
218-
code_point)) {
217+
auto code_point_opt = DecodeUtf8CodePoint(
218+
std::string_view(truncated.data() + cp_start, last_cp_start - cp_start));
219+
if (!code_point_opt.has_value()) {
219220
return InvalidArgument("Invalid UTF-8 in string literal");
220221
}
222+
uint32_t code_point = code_point_opt.value();
221223

222224
// Try to increment the code point
223225
if (code_point < kUtf8MaxCodePoint) {

0 commit comments

Comments
 (0)