Skip to content

Commit c75115b

Browse files
committed
feat: implement literal expressions with binary serialization support
1 parent 622a934 commit c75115b

File tree

10 files changed

+601
-65
lines changed

10 files changed

+601
-65
lines changed

src/iceberg/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,10 +55,11 @@ set(ICEBERG_SOURCES
5555
manifest_reader_internal.cc
5656
manifest_writer.cc
5757
arrow_c_data_guard_internal.cc
58+
util/conversions.cc
5859
util/decimal.cc
60+
util/gzip_internal.cc
5961
util/murmurhash3_internal.cc
6062
util/timepoint.cc
61-
util/gzip_internal.cc
6263
util/uuid.cc)
6364

6465
set(ICEBERG_STATIC_BUILD_INTERFACE_LIBS)

src/iceberg/expression/literal.cc

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include <concepts>
2424

2525
#include "iceberg/exception.h"
26+
#include "iceberg/util/conversions.h"
2627

2728
namespace iceberg {
2829

@@ -151,11 +152,11 @@ Literal Literal::Binary(std::vector<uint8_t> value) {
151152

152153
/// Builds a Literal of the requested primitive type from serialized bytes.
///
/// All of the work is delegated to Conversions::FromBytes; whatever value or
/// error it yields is returned to the caller unchanged.
///
/// \param data Serialized bytes to decode.
/// \param type Primitive type the bytes should be decoded as.
/// \return The result of Conversions::FromBytes(type, data).
Result<Literal> Literal::Deserialize(std::span<const uint8_t> data,
                                     std::shared_ptr<PrimitiveType> type) {
  // Argument order differs from this function: FromBytes takes the type first.
  return Conversions::FromBytes(type, data);
}
156157

157158
/// Encodes this literal as bytes.
///
/// The encoding itself is performed by Conversions::ToBytes; its result
/// (bytes or error) is returned as-is.
Result<std::vector<uint8_t>> Literal::Serialize() const {
  const Literal& self = *this;
  return Conversions::ToBytes(self);
}
160161

161162
// Getters

src/iceberg/expression/literal.h

Lines changed: 66 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,11 +144,76 @@ class ICEBERG_EXPORT Literal : public util::Formattable {
144144
private:
145145
Literal(Value value, std::shared_ptr<PrimitiveType> type);
146146

147+
friend class Conversions;
147148
friend class LiteralCaster;
148149

149-
private:
150150
Value value_;
151151
std::shared_ptr<PrimitiveType> type_;
152152
};
153153

154+
template <TypeId type_id>
155+
struct LiteralTraits {
156+
using ValueType = void;
157+
};
158+
159+
template <>
160+
struct LiteralTraits<TypeId::kBoolean> {
161+
using ValueType = bool;
162+
};
163+
164+
template <>
165+
struct LiteralTraits<TypeId::kInt> {
166+
using ValueType = int32_t;
167+
};
168+
169+
template <>
170+
struct LiteralTraits<TypeId::kDate> {
171+
using ValueType = int32_t;
172+
};
173+
174+
template <>
175+
struct LiteralTraits<TypeId::kLong> {
176+
using ValueType = int64_t;
177+
};
178+
179+
template <>
180+
struct LiteralTraits<TypeId::kTime> {
181+
using ValueType = int64_t;
182+
};
183+
184+
template <>
185+
struct LiteralTraits<TypeId::kTimestamp> {
186+
using ValueType = int64_t;
187+
};
188+
189+
template <>
190+
struct LiteralTraits<TypeId::kTimestampTz> {
191+
using ValueType = int64_t;
192+
};
193+
194+
template <>
195+
struct LiteralTraits<TypeId::kFloat> {
196+
using ValueType = float;
197+
};
198+
199+
template <>
200+
struct LiteralTraits<TypeId::kDouble> {
201+
using ValueType = double;
202+
};
203+
204+
template <>
205+
struct LiteralTraits<TypeId::kString> {
206+
using ValueType = std::string;
207+
};
208+
209+
template <>
210+
struct LiteralTraits<TypeId::kBinary> {
211+
using ValueType = std::vector<uint8_t>;
212+
};
213+
214+
template <>
215+
struct LiteralTraits<TypeId::kFixed> {
216+
using ValueType = std::vector<uint8_t>;
217+
};
218+
154219
} // namespace iceberg

src/iceberg/test/literal_test.cc

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -383,4 +383,118 @@ TEST(LiteralTest, DoubleZeroComparison) {
383383
EXPECT_EQ(neg_zero <=> pos_zero, std::partial_ordering::less);
384384
}
385385

386+
// Type promotion tests
387+
TEST(LiteralSerializationTest, TypePromotion) {
388+
// 4-byte int data can be deserialized as long
389+
std::vector<uint8_t> int_data = {32, 0, 0, 0};
390+
auto long_result = Literal::Deserialize(int_data, int64());
391+
ASSERT_TRUE(long_result.has_value());
392+
EXPECT_EQ(long_result->type()->type_id(), TypeId::kLong);
393+
EXPECT_EQ(long_result->ToString(), "32");
394+
395+
auto long_bytes = long_result->Serialize();
396+
ASSERT_TRUE(long_bytes.has_value());
397+
EXPECT_EQ(long_bytes->size(), 8);
398+
399+
// 4-byte float data can be deserialized as double
400+
std::vector<uint8_t> float_data = {0, 0, 128, 63};
401+
auto double_result = Literal::Deserialize(float_data, float64());
402+
ASSERT_TRUE(double_result.has_value());
403+
EXPECT_EQ(double_result->type()->type_id(), TypeId::kDouble);
404+
EXPECT_EQ(double_result->ToString(), "1.000000");
405+
406+
auto double_bytes = double_result->Serialize();
407+
ASSERT_TRUE(double_bytes.has_value());
408+
EXPECT_EQ(double_bytes->size(), 8);
409+
}
410+
411+
struct LiteralRoundTripParam {
412+
std::string test_name;
413+
std::vector<uint8_t> input_bytes;
414+
Literal expected_literal;
415+
std::shared_ptr<PrimitiveType> type;
416+
};
417+
418+
class LiteralSerializationParamTest
419+
: public ::testing::TestWithParam<LiteralRoundTripParam> {};
420+
421+
// Round trip for one case: bytes -> Literal -> bytes -> Literal.
// Both decoded literals must match the expected literal's type id and string
// form, and the re-encoded bytes must equal the input bytes exactly.
TEST_P(LiteralSerializationParamTest, RoundTrip) {
  const LiteralRoundTripParam& tc = GetParam();
  const auto expected_type_id = tc.expected_literal.type()->type_id();
  const auto expected_text = tc.expected_literal.ToString();

  // Step 1: decode the input bytes.
  Result<Literal> decoded = Literal::Deserialize(tc.input_bytes, tc.type);
  ASSERT_TRUE(decoded.has_value())
      << "Deserialization failed: " << decoded.error().message;
  EXPECT_EQ(decoded->type()->type_id(), expected_type_id);
  EXPECT_EQ(decoded->ToString(), expected_text);

  // Step 2: encode the decoded literal; it must reproduce the input bytes.
  Result<std::vector<uint8_t>> encoded = decoded->Serialize();
  ASSERT_TRUE(encoded.has_value())
      << "Serialization failed: " << encoded.error().message;
  EXPECT_EQ(*encoded, tc.input_bytes);

  // Step 3: decode the freshly encoded bytes and expect the same literal again.
  Result<Literal> redecoded = Literal::Deserialize(*encoded, tc.type);
  ASSERT_TRUE(redecoded.has_value())
      << "Final deserialization failed: " << redecoded.error().message;
  EXPECT_EQ(redecoded->type()->type_id(), expected_type_id);
  EXPECT_EQ(redecoded->ToString(), expected_text);
}
446+
447+
INSTANTIATE_TEST_SUITE_P(
448+
BinarySerializationTests, LiteralSerializationParamTest,
449+
::testing::Values(
450+
// Basic types
451+
LiteralRoundTripParam{"BooleanTrue", {1}, Literal::Boolean(true), boolean()},
452+
LiteralRoundTripParam{"BooleanFalse", {0}, Literal::Boolean(false), boolean()},
453+
LiteralRoundTripParam{"Int", {32, 0, 0, 0}, Literal::Int(32), int32()},
454+
LiteralRoundTripParam{
455+
"Long", {32, 0, 0, 0, 0, 0, 0, 0}, Literal::Long(32), int64()},
456+
LiteralRoundTripParam{"Float", {0, 0, 128, 63}, Literal::Float(1.0f), float32()},
457+
LiteralRoundTripParam{
458+
"Double", {0, 0, 0, 0, 0, 0, 240, 63}, Literal::Double(1.0), float64()},
459+
LiteralRoundTripParam{"String",
460+
{105, 99, 101, 98, 101, 114, 103},
461+
Literal::String("iceberg"),
462+
string()},
463+
LiteralRoundTripParam{"BinaryData",
464+
{0x01, 0x02, 0x03, 0xFF},
465+
Literal::Binary({0x01, 0x02, 0x03, 0xFF}),
466+
binary()},
467+
// Edge cases that fit the round-trip pattern
468+
LiteralRoundTripParam{
469+
"NegativeInt", {224, 255, 255, 255}, Literal::Int(-32), int32()},
470+
LiteralRoundTripParam{"NegativeLong",
471+
{224, 255, 255, 255, 255, 255, 255, 255},
472+
Literal::Long(-32),
473+
int64()},
474+
// IEEE 754 representation for NaN and Infinity (in little-endian)
475+
LiteralRoundTripParam{"FloatInfinity",
476+
{0, 0, 128, 127},
477+
Literal::Float(std::numeric_limits<float>::infinity()),
478+
float32()},
479+
LiteralRoundTripParam{"FloatNaN",
480+
{0, 0, 192, 127},
481+
Literal::Float(std::numeric_limits<float>::quiet_NaN()),
482+
float32()}
483+
// TODO(Li Feiyang): Add tests for Date, Time, Timestamp, TimestampTz
484+
),
485+
486+
[](const testing::TestParamInfo<LiteralSerializationParamTest::ParamType>& info) {
487+
return info.param.test_name;
488+
});
489+
490+
// Pins the current asymmetry for empty strings: serializing "" succeeds and
// yields zero bytes, but deserializing zero bytes as a string is rejected with
// kInvalidArgument.
TEST(LiteralSerializationEdgeCaseTest, EmptyStringSerialization) {
  // Serialize side: success, empty output.
  const auto serialized = Literal::String("").Serialize();
  ASSERT_TRUE(serialized.has_value());
  EXPECT_TRUE(serialized->empty());

  // Deserialize side: the empty buffer is an invalid argument.
  const auto round_tripped = Literal::Deserialize(*serialized, string());
  EXPECT_THAT(round_tripped, IsError(ErrorKind::kInvalidArgument));
}
499+
386500
} // namespace iceberg

src/iceberg/test/manifest_list_reader_test.cc

Lines changed: 33 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323

2424
#include "iceberg/arrow/arrow_fs_file_io_internal.h"
2525
#include "iceberg/avro/avro_register.h"
26+
#include "iceberg/expression/literal.h"
2627
#include "iceberg/manifest_list.h"
2728
#include "iceberg/manifest_reader.h"
2829
#include "temp_file_test_base.h"
@@ -76,43 +77,38 @@ class ManifestListReaderV1Test : public ManifestListReaderTestBase {
7677
std::vector<int64_t> file_size = {6185, 6113};
7778
std::vector<int64_t> snapshot_id = {7532614258660258098, 7532614258660258098};
7879

79-
std::vector<std::vector<std::uint8_t>> lower_bounds = {
80-
{0x32, 0x30, 0x32, 0x32, 0x2D, 0x30, 0x32, 0x2D, 0x32, 0x32},
81-
{0x32, 0x30, 0x32, 0x32, 0x2D, 0x32, 0x2D, 0x32, 0x32}};
82-
83-
std::vector<std::vector<std::uint8_t>> upper_bounds = {
84-
{0x32, 0x30, 0x32, 0x32, 0x2D, 0x32, 0x2D, 0x32, 0x33},
85-
{0x32, 0x30, 0x32, 0x32, 0x2D, 0x32, 0x2D, 0x32, 0x33}};
86-
87-
return {{.manifest_path = paths[0],
88-
.manifest_length = file_size[0],
89-
.partition_spec_id = 0,
90-
.added_snapshot_id = snapshot_id[0],
91-
.added_files_count = 4,
92-
.existing_files_count = 0,
93-
.deleted_files_count = 0,
94-
.added_rows_count = 6,
95-
.existing_rows_count = 0,
96-
.deleted_rows_count = 0,
97-
.partitions = {{.contains_null = false,
98-
.contains_nan = false,
99-
.lower_bound = lower_bounds[0],
100-
.upper_bound = upper_bounds[0]}}},
101-
102-
{.manifest_path = paths[1],
103-
.manifest_length = file_size[1],
104-
.partition_spec_id = 0,
105-
.added_snapshot_id = snapshot_id[1],
106-
.added_files_count = 0,
107-
.existing_files_count = 0,
108-
.deleted_files_count = 2,
109-
.added_rows_count = 0,
110-
.existing_rows_count = 0,
111-
.deleted_rows_count = 6,
112-
.partitions = {{.contains_null = false,
113-
.contains_nan = false,
114-
.lower_bound = lower_bounds[1],
115-
.upper_bound = upper_bounds[1]}}}};
80+
return {
81+
{.manifest_path = paths[0],
82+
.manifest_length = file_size[0],
83+
.partition_spec_id = 0,
84+
.added_snapshot_id = snapshot_id[0],
85+
.added_files_count = 4,
86+
.existing_files_count = 0,
87+
.deleted_files_count = 0,
88+
.added_rows_count = 6,
89+
.existing_rows_count = 0,
90+
.deleted_rows_count = 0,
91+
.partitions = {{.contains_null = false,
92+
.contains_nan = false,
93+
.lower_bound = Literal::String("2022-02-22").Serialize().value(),
94+
.upper_bound =
95+
Literal::String("2022-2-23").Serialize().value()}}},
96+
97+
{.manifest_path = paths[1],
98+
.manifest_length = file_size[1],
99+
.partition_spec_id = 0,
100+
.added_snapshot_id = snapshot_id[1],
101+
.added_files_count = 0,
102+
.existing_files_count = 0,
103+
.deleted_files_count = 2,
104+
.added_rows_count = 0,
105+
.existing_rows_count = 0,
106+
.deleted_rows_count = 6,
107+
.partitions = {
108+
{.contains_null = false,
109+
.contains_nan = false,
110+
.lower_bound = Literal::String("2022-2-22").Serialize().value(),
111+
.upper_bound = Literal::String("2022-2-23").Serialize().value()}}}};
116112
}
117113

118114
std::vector<ManifestFile> PrepareComplexTypeTestData() {

src/iceberg/test/manifest_reader_test.cc

Lines changed: 33 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -94,24 +94,33 @@ class ManifestReaderV1Test : public ManifestReaderTestBase {
9494
"order_ts_hour=2021-01-26-00/"
9595
"00000-2-d5ae78b7-4449-45ec-adb7-c0e9c0bdb714-0-00004.parquet"};
9696
std::vector<int64_t> partitions = {447696, 473976, 465192, 447672};
97+
98+
// TODO(Li Feiyang): The Decimal type and its serialization logic are not yet fully
99+
// implemented to support variable-length encoding as required by the Iceberg
100+
// specification. Using Literal::Binary as a temporary substitute to represent the raw
101+
// bytes for the decimal values.
97102
std::vector<std::map<int32_t, std::vector<uint8_t>>> bounds = {
98-
{{1, {0xd2, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
99-
{2, {'.', 0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
100-
{3, {0x12, 0xe2}},
101-
{4, {0xc0, 'y', 0xe7, 0x98, 0xd6, 0xb9, 0x05, 0x00}}},
102-
{{1, {0xd2, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
103-
{2, {'.', 0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
104-
{3, {0x12, 0xe3}},
105-
{4, {0xc0, 0x19, '#', '=', 0xe2, 0x0f, 0x06, 0x00}}},
106-
{{1, {'{', 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
107-
{2, {0xc8, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
108-
{3, {0x0e, '"'}},
109-
{4, {0xc0, 0xd9, '7', 0x93, 0x1f, 0xf3, 0x05, 0x00}}},
110-
{{1, {'{', 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
111-
{2, {0xc8, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
112-
{3, {0x0e, '!'}},
113-
{4, {0xc0, 0x19, 0x10, '{', 0xc2, 0xb9, 0x05, 0x00}}},
103+
{{1, Literal::Long(1234).Serialize().value()},
104+
{2, Literal::Long(5678).Serialize().value()},
105+
{3, Literal::Binary({0x12, 0xe2}).Serialize().value()},
106+
107+
{4, Literal::Timestamp(1611706223000000LL).Serialize().value()}},
108+
{{1, Literal::Long(1234).Serialize().value()},
109+
{2, Literal::Long(5678).Serialize().value()},
110+
{3, Literal::Binary({0x12, 0xe3}).Serialize().value()},
111+
112+
{4, Literal::Timestamp(1706314223000000LL).Serialize().value()}},
113+
{{1, Literal::Long(123).Serialize().value()},
114+
{2, Literal::Long(456).Serialize().value()},
115+
{3, Literal::Binary({0x0e, 0x22}).Serialize().value()},
116+
117+
{4, Literal::Timestamp(1674691823000000LL).Serialize().value()}},
118+
{{1, Literal::Long(123).Serialize().value()},
119+
{2, Literal::Long(456).Serialize().value()},
120+
{3, Literal::Binary({0x0e, 0x21}).Serialize().value()},
121+
{4, Literal::Timestamp(1611619823000000LL).Serialize().value()}},
114122
};
123+
115124
for (int i = 0; i < 4; ++i) {
116125
ManifestEntry entry;
117126
entry.status = ManifestStatus::kAdded;
@@ -159,16 +168,16 @@ class ManifestReaderV2Test : public ManifestReaderTestBase {
159168
std::vector<int64_t> record_counts = {4};
160169

161170
std::vector<std::map<int32_t, std::vector<uint8_t>>> lower_bounds = {
162-
{{1, {0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
163-
{2, {'r', 'e', 'c', 'o', 'r', 'd', '_', 'f', 'o', 'u', 'r'}},
164-
{3, {'d', 'a', 't', 'a', '_', 'c', 'o', 'n', 't', 'e', 'n', 't', '_', '1'}},
165-
{4, {0xcd, 0xcc, 0xcc, 0xcc, 0xcc, 0xdc, 0x5e, 0x40}}}};
171+
{{1, Literal::Long(1).Serialize().value()},
172+
{2, Literal::String("record_four").Serialize().value()},
173+
{3, Literal::String("data_content_1").Serialize().value()},
174+
{4, Literal::Double(123.45).Serialize().value()}}};
166175

167176
std::vector<std::map<int32_t, std::vector<uint8_t>>> upper_bounds = {
168-
{{1, {0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
169-
{2, {'r', 'e', 'c', 'o', 'r', 'd', '_', 't', 'w', 'o'}},
170-
{3, {'d', 'a', 't', 'a', '_', 'c', 'o', 'n', 't', 'e', 'n', 't', '_', '4'}},
171-
{4, {0x14, 0xae, 0x47, 0xe1, 0x7a, 0x8c, 0x7c, 0x40}}}};
177+
{{1, Literal::Long(4).Serialize().value()},
178+
{2, Literal::String("record_two").Serialize().value()},
179+
{3, Literal::String("data_content_4").Serialize().value()},
180+
{4, Literal::Double(456.78).Serialize().value()}}};
172181

173182
DataFile data_file{.file_path = test_dir_prefix + paths[0],
174183
.file_format = FileFormatType::kParquet,

0 commit comments

Comments
 (0)