Skip to content

Commit 95c6054

Browse files
committed
feat: implement truncate max for literals
1 parent cd93b99 commit 95c6054

File tree

3 files changed

+352
-0
lines changed

3 files changed

+352
-0
lines changed

src/iceberg/test/truncate_util_test.cc

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include <gtest/gtest.h>
2323

2424
#include "iceberg/expression/literal.h"
25+
#include "iceberg/test/matchers.h"
2526

2627
namespace iceberg {
2728

@@ -50,4 +51,141 @@ TEST(TruncateUtilTest, TruncateLiteral) {
5051
Literal::Binary(std::vector<uint8_t>(expected.begin(), expected.end())));
5152
}
5253

54+
// Upper-bound truncation of binary literals: the kept prefix is incremented at its
// last byte that is not 0xFF, and any 0xFF bytes after that position are dropped.
TEST(TruncateUtilTest, TruncateBinaryMax) {
  using Bytes = std::vector<uint8_t>;

  const Bytes plain{1, 1, 2};
  const Bytes with_max_byte{1, 1, 0xFF, 2};
  const Bytes leading_max_bytes{0xFF, 0xFF, 0xFF, 2};
  const Bytes trailing_zero{1, 1, 0};
  const Bytes incremented_prefix{1, 2};

  struct Case {
    Bytes input;
    int32_t width;
  };

  // Every case below must produce the bound {1, 2}.
  const std::vector<Case> cases = {
      {plain, 2},          // {1, 1, 2} cut to 2 bytes
      {with_max_byte, 2},  // {1, 1, 0xFF, 2} cut to 2 bytes
      {with_max_byte, 3},  // {1, 1, 0xFF, 2} cut to 3 bytes: the kept 0xFF is dropped
      {trailing_zero, 2},  // {1, 1, 0} cut to 2 bytes
  };
  for (const auto& [input, width] : cases) {
    ICEBERG_UNWRAP_OR_FAIL(
        auto bound, TruncateUtils::TruncateLiteralMax(Literal::Binary(input), width));
    EXPECT_EQ(bound, Literal::Binary(incremented_prefix));
  }

  // A width that is at least the input size leaves the value untouched.
  ICEBERG_UNWRAP_OR_FAIL(
      auto untouched,
      TruncateUtils::TruncateLiteralMax(Literal::Binary(leading_max_bytes), 5));
  EXPECT_EQ(untouched, Literal::Binary(leading_max_bytes));

  // No bound exists when every kept byte is 0xFF.
  EXPECT_THAT(TruncateUtils::TruncateLiteralMax(Literal::Binary(leading_max_bytes), 2),
              IsError(ErrorKind::kInvalidArgument));
}
90+
91+
// Upper-bound truncation of string literals, counted in code points: the last kept
// code point is incremented; U+10FFFF cannot be incremented and is dropped instead.
TEST(TruncateUtilTest, TruncateStringMax) {
  // "イロハニホヘト"
  const std::string katakana =
      "\xE3\x82\xA4\xE3\x83\xAD\xE3\x83\x8F\xE3\x83\x8B\xE3\x83\x9B\xE3\x83\x98\xE3\x83"
      "\x88";
  // "щщаεはчωいにπάほхεろへσκζ"
  const std::string mixed_scripts =
      "\xD1\x89\xD1\x89\xD0\xB0\xCE\xB5\xE3\x81\xAF\xD1\x87\xCF\x89\xE3\x81\x84\xE3\x81"
      "\xAB\xCF\x80\xCE\xAC\xE3\x81\xBB\xD1\x85\xCE\xB5\xE3\x82\x8D\xE3\x81\xB8\xCF\x83"
      "\xCE\xBA\xCE\xB6";
  // 😂😂😂
  const std::string emoji = "\xF0\x9F\x98\x82\xF0\x9F\x98\x82\xF0\x9F\x98\x82";

  struct Case {
    std::string input;
    int32_t width;
    std::string expected;
  };

  const std::vector<Case> cases = {
      // "イヮ"
      {katakana, 2, "\xE3\x82\xA4\xE3\x83\xAE"},
      // "イロバ"
      {katakana, 3, "\xE3\x82\xA4\xE3\x83\xAD\xE3\x83\x90"},
      // No truncation needed when length >= input size
      {katakana, 7, katakana},
      {katakana, 8, katakana},
      // "щщаεはчϊ"
      {mixed_scripts, 7, "\xD1\x89\xD1\x89\xD0\xB0\xCE\xB5\xE3\x81\xAF\xD1\x87\xCF\x8A"},
      // "aनि\uFFFF\uFFFF" -> "aनी"
      {"a\xE0\xA4\xA8\xE0\xA4\xBF\xEF\xBF\xBF\xEF\xBF\xBF", 3,
       "a\xE0\xA4\xA8\xE0\xA5\x80"},
      // "\uFFFF\uFFFF" -> U+10000, the first code point that needs 4 bytes
      {"\xEF\xBF\xBF\xEF\xBF\xBF", 1, "\xF0\x90\x80\x80"},
      // U+103FF U+103FF -> U+10400
      {"\xF0\x90\x8F\xBF\xF0\x90\x8F\xBF", 1, "\xF0\x90\x90\x80"},
      // 😂😃
      {emoji, 2, "\xF0\x9F\x98\x82\xF0\x9F\x98\x83"},
      // 😃
      {emoji, 1, "\xF0\x9F\x98\x83"},
      // Overflow: "a" U+10FFFF "c" drops the U+10FFFF and bumps "a" to "b".
      // The literal is split so that "c" is not parsed as part of the \xBF escape.
      {"a\xF4\x8F\xBF\xBF"
       "c",
       2, "b"},
      // Surrogate range is skipped: "a" U+D7FF "b" -> "a" U+E000.
      // The literal is split so that "b" is not parsed as part of the \xBF escape.
      {"a\xED\x9F\xBF"
       "b",
       2, "a\xEE\x80\x80"},
  };
  for (const auto& [input, width, expected] : cases) {
    ICEBERG_UNWRAP_OR_FAIL(
        auto bound, TruncateUtils::TruncateLiteralMax(Literal::String(input), width));
    EXPECT_EQ(bound, Literal::String(expected));
  }

  // U+10FFFF U+10FFFF: nothing in the kept prefix can be incremented
  const std::string all_max_code_points = "\xF4\x8F\xBF\xBF\xF4\x8F\xBF\xBF";
  EXPECT_THAT(
      TruncateUtils::TruncateLiteralMax(Literal::String(all_max_code_points), 1),
      IsError(ErrorKind::kInvalidArgument));
}
190+
53191
} // namespace iceberg

src/iceberg/util/truncate_util.cc

Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,93 @@
2929
namespace iceberg {
3030

3131
namespace {
32+
/// Largest code point that UTF-8 may encode.
constexpr uint32_t kUtf8MaxCodePoint = 0x10FFFF;
/// Inclusive bounds of the surrogate range, which is not valid in UTF-8.
constexpr uint32_t kUtf8MinSurrogate = 0xD800;
constexpr uint32_t kUtf8MaxSurrogate = 0xDFFF;

/// \brief Decode the UTF-8 sequence that starts at the first byte of `source`.
///
/// Bytes after the decoded sequence are ignored. Rejected inputs are: an empty
/// view, a lead byte that is a continuation byte or otherwise invalid, a sequence
/// cut short by the end of the view, a non-continuation byte where one is required,
/// an overlong encoding, a surrogate, and a value above U+10FFFF.
///
/// \param source Bytes to decode; only the leading sequence is examined.
/// \param code_point Receives the decoded value on success; left untouched on failure.
/// \return true if a well-formed sequence was decoded, false otherwise.
bool DecodeUtf8CodePoint(std::string_view source, uint32_t& code_point) {
  if (source.empty()) {
    return false;
  }

  const auto lead = static_cast<uint8_t>(source[0]);

  // The lead byte fixes the sequence length, contributes the high payload bits, and
  // determines the smallest value that genuinely needs that many bytes (anything
  // below it is an overlong encoding).
  size_t length = 0;
  uint32_t decoded = 0;
  uint32_t min_code_point = 0;
  if (lead < 0x80) {
    length = 1;
    decoded = lead;
  } else if ((lead & 0xE0) == 0xC0) {
    length = 2;
    decoded = lead & 0x1F;
    min_code_point = 0x80;
  } else if ((lead & 0xF0) == 0xE0) {
    length = 3;
    decoded = lead & 0x0F;
    min_code_point = 0x800;
  } else if ((lead & 0xF8) == 0xF0) {
    length = 4;
    decoded = lead & 0x07;
    min_code_point = 0x10000;
  } else {
    // Stray continuation byte (10xxxxxx) or a lead byte of the form 11111xxx.
    return false;
  }

  if (source.size() < length) {
    return false;
  }

  // Each continuation byte must look like 10xxxxxx and carries six payload bits.
  for (size_t i = 1; i < length; ++i) {
    const auto byte = static_cast<uint8_t>(source[i]);
    if ((byte & 0xC0) != 0x80) {
      return false;
    }
    decoded = (decoded << 6) | (byte & 0x3F);
  }

  const bool is_surrogate =
      decoded >= kUtf8MinSurrogate && decoded <= kUtf8MaxSurrogate;
  if (decoded < min_code_point || decoded > kUtf8MaxCodePoint || is_surrogate) {
    return false;
  }

  // Publish only after full validation so a failed decode never clobbers the output.
  code_point = decoded;
  return true;
}
100+
101+
/// \brief Append the UTF-8 encoding of `code_point` to `target`.
///
/// The encoded length is chosen from the value alone: one byte up to 0x7F, two up
/// to 0x7FF, three up to 0xFFFF and four otherwise. The value is not validated.
///
/// \param code_point The value to encode.
/// \param target The string that receives the encoded bytes.
void AppendUtf8CodePoint(uint32_t code_point, std::string& target) {
  const auto put = [&target](uint32_t byte) {
    target.push_back(static_cast<char>(byte));
  };
  // Continuation bytes are 10xxxxxx, carrying six bits taken at the given shift.
  const auto put_continuation = [&](uint32_t shift) {
    put(0x80 | ((code_point >> shift) & 0x3F));
  };

  if (code_point <= 0x7F) {
    put(code_point);
    return;
  }
  if (code_point <= 0x7FF) {
    put(0xC0 | (code_point >> 6));
    put_continuation(0);
    return;
  }
  if (code_point <= 0xFFFF) {
    put(0xE0 | (code_point >> 12));
    put_continuation(6);
    put_continuation(0);
    return;
  }
  put(0xF0 | (code_point >> 18));
  put_continuation(12);
  put_continuation(6);
  put_continuation(0);
}
118+
32119
template <TypeId type_id>
33120
Literal TruncateLiteralImpl(const Literal& literal, int32_t width) {
34121
std::unreachable();
@@ -72,8 +159,85 @@ Literal TruncateLiteralImpl<TypeId::kBinary>(const Literal& literal, int32_t wid
72159
return Literal::Binary(std::vector<uint8_t>(data.begin(), data.begin() + width));
73160
}
74161

162+
template <TypeId type_id>
163+
Result<Literal> TruncateLiteralMaxImpl(const Literal& literal, int32_t width) {
164+
std::unreachable();
165+
}
166+
167+
template <>
168+
Result<Literal> TruncateLiteralMaxImpl<TypeId::kString>(const Literal& literal,
169+
int32_t width) {
170+
const auto& str = std::get<std::string>(literal.value());
171+
ICEBERG_ASSIGN_OR_RAISE(std::string truncated,
172+
TruncateUtils::TruncateUTF8Max(str, width));
173+
if (truncated == str) {
174+
return literal;
175+
}
176+
return Literal::String(std::move(truncated));
177+
}
178+
179+
template <>
180+
Result<Literal> TruncateLiteralMaxImpl<TypeId::kBinary>(const Literal& literal,
181+
int32_t width) {
182+
const auto& data = std::get<std::vector<uint8_t>>(literal.value());
183+
if (static_cast<int32_t>(data.size()) <= width) {
184+
return literal;
185+
}
186+
187+
std::vector<uint8_t> truncated(data.begin(), data.begin() + width);
188+
for (auto it = truncated.rbegin(); it != truncated.rend(); ++it) {
189+
if (*it < 0xFF) {
190+
++(*it);
191+
truncated.resize(truncated.size() - std::distance(truncated.rbegin(), it));
192+
return Literal::Binary(std::move(truncated));
193+
}
194+
}
195+
return InvalidArgument("Cannot truncate upper bound for binary: all bytes are 0xFF");
196+
}
197+
75198
} // namespace
76199

200+
// Builds an upper bound from the TruncateUTF8 prefix of `source`: walking backwards,
// the first code point below U+10FFFF is incremented (skipping the surrogate range)
// and everything after it is discarded. If the prefix equals `source`, it is returned
// as is.
Result<std::string> TruncateUtils::TruncateUTF8Max(const std::string& source, size_t L) {
  std::string prefix = TruncateUTF8(source, L);
  if (prefix == source) {
    return prefix;
  }

  const auto is_continuation = [&prefix](size_t index) {
    return (static_cast<uint8_t>(prefix[index]) & 0xC0) == 0x80;
  };

  // [begin, end) delimits the code point under inspection, starting with the last one.
  for (size_t end = prefix.size(); end > 0;) {
    size_t begin = end - 1;
    while (begin > 0 && is_continuation(begin)) {
      --begin;
    }

    uint32_t current = 0;
    if (!DecodeUtf8CodePoint(std::string_view(prefix).substr(begin, end - begin),
                             current)) {
      return InvalidArgument("Invalid UTF-8 in string literal");
    }

    if (current < kUtf8MaxCodePoint) {
      // current < max, so the successor is at most max; hopping over the surrogate
      // range lands on 0xE000, which is also within range.
      uint32_t successor = current + 1;
      if (successor >= kUtf8MinSurrogate && successor <= kUtf8MaxSurrogate) {
        successor = kUtf8MaxSurrogate + 1;
      }
      prefix.resize(begin);
      AppendUtf8CodePoint(successor, prefix);
      return prefix;
    }

    // U+10FFFF cannot be incremented: move on to the code point before it.
    end = begin;
  }

  return InvalidArgument(
      "Cannot truncate upper bound for string: all code points are 0x10FFFF");
}
240+
77241
// Subtracts from `decimal` its remainder modulo `width`, with the remainder shifted
// into non-negative form by adding `width` and reducing again.
Decimal TruncateUtils::TruncateDecimal(const Decimal& decimal, int32_t width) {
  const auto non_negative_remainder = ((decimal % width) + width) % width;
  return decimal - non_negative_remainder;
}
@@ -104,4 +268,27 @@ Result<Literal> TruncateUtils::TruncateLiteral(const Literal& literal, int32_t w
104268
}
105269
}
106270

271+
#define DISPATCH_TRUNCATE_LITERAL_MAX(TYPE_ID) \
272+
case TYPE_ID: \
273+
return TruncateLiteralMaxImpl<TYPE_ID>(literal, width);
274+
275+
Result<Literal> TruncateUtils::TruncateLiteralMax(const Literal& literal, int32_t width) {
276+
if (literal.IsNull()) [[unlikely]] {
277+
// Return null as is
278+
return literal;
279+
}
280+
281+
if (literal.IsAboveMax() || literal.IsBelowMin()) [[unlikely]] {
282+
return NotSupported("Cannot truncate {}", literal.ToString());
283+
}
284+
285+
switch (literal.type()->type_id()) {
286+
DISPATCH_TRUNCATE_LITERAL_MAX(TypeId::kString);
287+
DISPATCH_TRUNCATE_LITERAL_MAX(TypeId::kBinary);
288+
default:
289+
return NotSupported("Truncate max is not supported for type: {}",
290+
literal.type()->ToString());
291+
}
292+
}
293+
107294
} // namespace iceberg

src/iceberg/util/truncate_util.h

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,20 @@ class ICEBERG_EXPORT TruncateUtils {
6161
return source;
6262
}
6363

64+
/// \brief Truncate a UTF-8 string to a specified number of code points for
65+
/// use as an upper-bound value.
66+
///
67+
/// When truncation is required, the returned value is the smallest UTF-8
68+
/// string greater than the truncated prefix. When no truncation is needed
69+
/// for the given width, the original string may be returned unchanged.
70+
///
/// The bound is built by incrementing the last code point of the truncated
/// prefix. A code point equal to U+10FFFF cannot be incremented, so it is
/// dropped and the code point before it is incremented instead; an increment
/// that would land in the surrogate range U+D800..U+DFFF continues at U+E000.
/// The returned string may therefore hold fewer than L code points.
///
71+
/// \param source The input string to truncate.
72+
/// \param L The maximum number of code points allowed in the output string.
73+
/// \return A Result containing the original string (if no truncation is
74+
/// needed), or the smallest string greater than the truncated prefix, or an
75+
/// error if no such value exists or the input is invalid UTF-8.
76+
static Result<std::string> TruncateUTF8Max(const std::string& source, size_t L);
77+
6478
/// \brief Truncate an integer v, either int32_t or int64_t, to v - (v % W).
6579
///
6680
/// The remainder, v % W, must be positive. For languages where % can produce negative
@@ -86,6 +100,19 @@ class ICEBERG_EXPORT TruncateUtils {
86100
/// - [Truncate Transform
87101
/// Details](https://iceberg.apache.org/spec/#truncate-transform-details)
88102
static Result<Literal> TruncateLiteral(const Literal& literal, int32_t width);
103+
104+
/// \brief Truncate a Literal to a specified width for use as an upper-bound value.
105+
///
106+
/// When truncation is required, the returned value is the smallest Literal greater than
107+
/// the truncated prefix. When no truncation is needed for the given width, the original
108+
/// Literal may be returned unchanged.
109+
///
/// Only string and binary literals are supported. A null literal is returned as is.
/// Literals flagged as above-max or below-min, and literals of any other type, produce
/// a NotSupported error.
///
110+
/// \param value The input Literal maximum value to truncate.
111+
/// \param width The width to truncate to.
112+
/// \return A Result containing either the original Literal (if no truncation is needed)
113+
/// or the smallest Literal greater than the truncated prefix, or an error if no such
114+
/// value exists or cannot be represented.
115+
static Result<Literal> TruncateLiteralMax(const Literal& value, int32_t width);
89116
};
90117

91118
} // namespace iceberg

0 commit comments

Comments
 (0)