Skip to content

Commit c7a8ad6

Browse files
committed
v1
1 parent f2d0abd commit c7a8ad6

File tree

1 file changed

+320
-2
lines changed

1 file changed

+320
-2
lines changed

src/iceberg/expression/literal.cc

Lines changed: 320 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,78 @@
1919

2020
#include "iceberg/expression/literal.h"
2121

22+
#include <bit>
2223
#include <cmath>
2324
#include <concepts>
25+
#include <cstring>
2426

2527
#include "iceberg/exception.h"
2628

2729
namespace iceberg {
2830

31+
namespace {
32+
/// \brief Write a value in little-endian format to the buffer.
33+
template <typename T>
34+
void WriteLittleEndian(std::vector<uint8_t>& buffer, T value) {
35+
static_assert(std::is_trivially_copyable_v<T>, "Type must be trivially copyable");
36+
37+
if constexpr (std::endian::native == std::endian::little) {
38+
const auto* bytes = reinterpret_cast<const uint8_t*>(&value);
39+
buffer.insert(buffer.end(), bytes, bytes + sizeof(T));
40+
} else {
41+
if constexpr (sizeof(T) > 1) {
42+
T le_value = std::byteswap(value);
43+
const auto* bytes = reinterpret_cast<const uint8_t*>(&le_value);
44+
buffer.insert(buffer.end(), bytes, bytes + sizeof(T));
45+
} else {
46+
// For single byte types, no byteswap needed
47+
buffer.push_back(static_cast<uint8_t>(value));
48+
}
49+
}
50+
}
51+
52+
/// \brief Read a value in little-endian format from the data.
53+
template <typename T>
54+
T ReadLittleEndian(std::span<const uint8_t> data) {
55+
static_assert(std::is_trivially_copyable_v<T>, "Type must be trivially copyable");
56+
57+
if (data.size() < sizeof(T)) {
58+
throw IcebergError("Insufficient data to read type");
59+
}
60+
61+
T value;
62+
std::memcpy(&value, data.data(), sizeof(T));
63+
64+
if constexpr (std::endian::native != std::endian::little && sizeof(T) > 1) {
65+
value = std::byteswap(value);
66+
}
67+
return value;
68+
}
69+
70+
/// \brief Append the 16 bytes of `uuid` to `buffer` in their stored order.
///
/// No byte swapping happens here: the array is written out exactly as it is held,
/// which is the big-endian layout the Iceberg spec asks for.
///
/// \param uuid the 16 UUID bytes
/// \param buffer destination vector; the bytes are appended at its end
void WriteUuidBigEndian(const std::array<uint8_t, 16>& uuid,
                        std::vector<uint8_t>& buffer) {
  const uint8_t* const first = uuid.data();
  const uint8_t* const last = first + uuid.size();
  buffer.insert(buffer.end(), first, last);
}
75+
76+
/// \brief Read UUID in big-endian format (as per Iceberg spec).
77+
std::array<uint8_t, 16> ReadUuidBigEndian(std::span<const uint8_t> data) {
78+
if (data.size() < 16) {
79+
throw IcebergError("Insufficient data to read UUID");
80+
}
81+
82+
std::array<uint8_t, 16> result;
83+
std::copy(data.begin(), data.begin() + 16, result.begin());
84+
return result;
85+
}
86+
87+
/// \brief Validate data size for fixed-size types.
88+
bool ValidateDataSize(std::span<const uint8_t> data, size_t expected_size,
89+
const std::string& type_name) {
90+
return data.size() == expected_size;
91+
}
92+
} // anonymous namespace
93+
2994
/// \brief LiteralCaster handles type casting operations for Literal.
3095
/// This is an internal implementation class.
3196
class LiteralCaster {
@@ -151,11 +216,264 @@ Literal Literal::Binary(std::vector<uint8_t> value) {
151216

152217
/// \brief Decode a literal of primitive type `type` from its serialized bytes.
///
/// Encoding handled per type:
///  - boolean: 1 byte, 0x00 is false, anything else is true
///  - int, date, float: exactly 4 bytes, little-endian
///  - long, time, timestamp, timestamptz, double: exactly 8 bytes, little-endian
///  - string, binary: the raw bytes, any length
///  - uuid: exactly 16 bytes
///  - decimal: 1 to 16 bytes of big-endian two's-complement, sign-extended to 16
///  - fixed: the raw bytes
///
/// \param data serialized bytes; an empty span yields a null literal
/// \param type primitive type to decode as; must not be null
/// \return the decoded literal, InvalidArgument for a null type or a wrong payload
///         size, or NotImplemented for a type id that is not handled here
Result<Literal> Literal::Deserialize(std::span<const uint8_t> data,
                                     std::shared_ptr<PrimitiveType> type) {
  if (!type) {
    return InvalidArgument("Type cannot be null");
  }

  // Empty data represents null value.
  // NOTE(review): this check runs before the type switch, so a zero-length string
  // or binary payload also comes back as Null rather than as an empty value.
  if (data.empty()) {
    return Null(type);
  }

  switch (type->type_id()) {
    case TypeId::kBoolean: {
      if (!ValidateDataSize(data, 1, "boolean")) {
        return InvalidArgument(
            "Invalid data size for boolean type, expected 1 byte, got {}", data.size());
      }
      // 0x00 for false, non-zero byte for true
      return Literal::Boolean(data[0] != 0x00);
    }

    case TypeId::kInt: {
      if (!ValidateDataSize(data, 4, "int")) {
        return InvalidArgument("Invalid data size for int type, expected 4 bytes, got {}",
                               data.size());
      }
      return Literal::Int(ReadLittleEndian<int32_t>(data));
    }

    case TypeId::kDate: {
      if (!ValidateDataSize(data, 4, "date")) {
        return InvalidArgument(
            "Invalid data size for date type, expected 4 bytes, got {}", data.size());
      }
      return Literal::Date(ReadLittleEndian<int32_t>(data));
    }

    case TypeId::kLong: {
      if (!ValidateDataSize(data, 8, "long")) {
        return InvalidArgument(
            "Invalid data size for long type, expected 8 bytes, got {}", data.size());
      }
      return Literal::Long(ReadLittleEndian<int64_t>(data));
    }

    case TypeId::kTime: {
      if (!ValidateDataSize(data, 8, "time")) {
        return InvalidArgument(
            "Invalid data size for time type, expected 8 bytes, got {}", data.size());
      }
      return Literal::Time(ReadLittleEndian<int64_t>(data));
    }

    case TypeId::kTimestamp: {
      if (!ValidateDataSize(data, 8, "timestamp")) {
        return InvalidArgument(
            "Invalid data size for timestamp type, expected 8 bytes, got {}",
            data.size());
      }
      return Literal::Timestamp(ReadLittleEndian<int64_t>(data));
    }

    case TypeId::kTimestampTz: {
      if (!ValidateDataSize(data, 8, "timestamptz")) {
        return InvalidArgument(
            "Invalid data size for timestamptz type, expected 8 bytes, got {}",
            data.size());
      }
      return Literal::TimestampTz(ReadLittleEndian<int64_t>(data));
    }

    case TypeId::kFloat: {
      if (!ValidateDataSize(data, 4, "float")) {
        return InvalidArgument(
            "Invalid data size for float type, expected 4 bytes, got {}", data.size());
      }
      return Literal::Float(ReadLittleEndian<float>(data));
    }

    case TypeId::kDouble: {
      if (!ValidateDataSize(data, 8, "double")) {
        return InvalidArgument(
            "Invalid data size for double type, expected 8 bytes, got {}", data.size());
      }
      return Literal::Double(ReadLittleEndian<double>(data));
    }

    case TypeId::kString: {
      // UTF-8 bytes (without length) - any size is valid
      return Literal::String(
          std::string(reinterpret_cast<const char*>(data.data()), data.size()));
    }

    case TypeId::kBinary: {
      // Binary value (without length) - any size is valid
      return Literal::Binary(std::vector<uint8_t>(data.begin(), data.end()));
    }

    case TypeId::kUuid: {
      if (!ValidateDataSize(data, 16, "uuid")) {
        return InvalidArgument(
            "Invalid data size for uuid type, expected 16 bytes, got {}", data.size());
      }
      auto uuid_bytes = ReadUuidBigEndian(data);
      return Literal(Literal::Value{uuid_bytes}, type);
    }

    case TypeId::kDecimal: {
      // Decimal values can have variable length, but we store them in
      // std::array<uint8_t, 16>
      if (data.size() > 16) {
        return InvalidArgument(
            "Decimal data too large, maximum 16 bytes supported, got {}", data.size());
      }

      // The payload is big-endian two's-complement, so widening it to 16 bytes must
      // replicate the sign bit: pad with 0xFF for negative values and 0x00 otherwise.
      // Zero padding alone would turn e.g. 0xFF (-1) into +255. `data` is non-empty
      // here because empty input was handled above.
      const uint8_t sign_fill = (data.front() & 0x80) != 0 ? 0xFF : 0x00;
      std::array<uint8_t, 16> decimal_bytes;
      decimal_bytes.fill(sign_fill);
      // Copy data to the end of the array (big-endian format for decimals)
      std::copy(data.begin(), data.end(), decimal_bytes.end() - data.size());
      return Literal(Literal::Value{decimal_bytes}, type);
    }

    case TypeId::kFixed: {
      // Fixed(L) - Binary value, length should match the fixed type length.
      // NOTE(review): the length is not checked against the type's declared length
      // here - confirm whether callers guarantee it.
      // For now, we'll store in std::vector<uint8_t> or std::array<uint8_t, 16>
      // depending on size
      if (data.size() == 16) {
        std::array<uint8_t, 16> fixed_bytes;
        std::copy(data.begin(), data.end(), fixed_bytes.begin());
        return Literal(Literal::Value{fixed_bytes}, type);
      } else {
        return Literal(Literal::Value{std::vector<uint8_t>(data.begin(), data.end())},
                       type);
      }
    }

    default:
      return NotImplemented("Deserialization for type {} is not supported",
                            type->ToString());
  }
}
156357

157358
/// \brief Encode this literal into its serialized byte form.
///
/// Encoding produced per type:
///  - boolean: one byte, 0x01 for true and 0x00 for false
///  - int, date: the int32 value as 4 little-endian bytes
///  - long, time, timestamp, timestamptz: the int64 value as 8 little-endian bytes
///  - float / double: 4 / 8 little-endian bytes
///  - string, binary: the raw bytes with no length prefix
///  - uuid, decimal: the stored 16-byte array, written as-is
///  - fixed: the stored bytes (16-byte array or byte vector)
///
/// \return the encoded bytes (empty for a null literal), InvalidArgument for the
///         AboveMax / BelowMin sentinels or a Fixed literal holding an unexpected
///         value alternative, or NotImplemented for a type id not handled here
Result<std::vector<uint8_t>> Literal::Serialize() const {
  // The sentinel literals have no byte representation.
  if (IsAboveMax()) {
    return InvalidArgument("Cannot serialize AboveMax literal");
  }
  if (IsBelowMin()) {
    return InvalidArgument("Cannot serialize BelowMin literal");
  }

  std::vector<uint8_t> bytes;

  // A null literal is encoded as zero bytes.
  if (IsNull()) {
    return bytes;
  }

  switch (type_->type_id()) {
    case TypeId::kBoolean:
      // 0x00 for false, non-zero byte for true
      bytes.push_back(std::get<bool>(value_) ? 0x01 : 0x00);
      break;

    // int, and date (days from 1970-01-01): 4-byte little-endian
    case TypeId::kInt:
    case TypeId::kDate:
      WriteLittleEndian(bytes, std::get<int32_t>(value_));
      break;

    // long, time (microseconds from midnight), timestamp and timestamptz
    // (microseconds from 1970-01-01 00:00:00.000000): 8-byte little-endian
    case TypeId::kLong:
    case TypeId::kTime:
    case TypeId::kTimestamp:
    case TypeId::kTimestampTz:
      WriteLittleEndian(bytes, std::get<int64_t>(value_));
      break;

    case TypeId::kFloat:
      // 4-byte little-endian
      WriteLittleEndian(bytes, std::get<float>(value_));
      break;

    case TypeId::kDouble:
      // 8-byte little-endian
      WriteLittleEndian(bytes, std::get<double>(value_));
      break;

    case TypeId::kString: {
      // UTF-8 bytes (without length)
      const auto& text = std::get<std::string>(value_);
      bytes.assign(text.begin(), text.end());
      break;
    }

    case TypeId::kBinary:
      // Binary value (without length)
      bytes = std::get<std::vector<uint8_t>>(value_);
      break;

    case TypeId::kUuid:
      // 16-byte big-endian value
      WriteUuidBigEndian(std::get<std::array<uint8_t, 16>>(value_), bytes);
      break;

    case TypeId::kDecimal: {
      // The decimal is held in a std::array<uint8_t, 16>; all 16 bytes are written,
      // i.e. it is treated like fixed(16).
      const auto& unscaled = std::get<std::array<uint8_t, 16>>(value_);
      bytes.assign(unscaled.begin(), unscaled.end());
      break;
    }

    case TypeId::kFixed: {
      // Fixed(L) may be held either as a 16-byte array or as a byte vector.
      if (const auto* as_array = std::get_if<std::array<uint8_t, 16>>(&value_)) {
        bytes.assign(as_array->begin(), as_array->end());
      } else if (const auto* as_vector = std::get_if<std::vector<uint8_t>>(&value_)) {
        bytes = *as_vector;
      } else {
        return InvalidArgument("Invalid value type for Fixed literal");
      }
      break;
    }

    default:
      return NotImplemented("Serialization for type {} is not supported",
                            type_->ToString());
  }

  return bytes;
}
160478

161479
// Getters

0 commit comments

Comments
 (0)