Skip to content

Commit c38b392

Browse files
authored
feat(puffin): add format constants, utilities, and JSON serde (#603)
1 parent a5fc25f commit c38b392

15 files changed

+645
-27
lines changed

src/iceberg/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,8 @@ set(ICEBERG_SOURCES
6767
partition_spec.cc
6868
partition_summary.cc
6969
puffin/file_metadata.cc
70+
puffin/puffin_format.cc
71+
puffin/json_serde.cc
7072
row/arrow_array_wrapper.cc
7173
row/manifest_wrapper.cc
7274
row/partition_values.cc

src/iceberg/deletes/roaring_position_bitmap.cc

Lines changed: 4 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -49,28 +49,6 @@ int64_t ToPosition(int32_t key, uint32_t pos32) {
4949
return (int64_t{key} << 32) | int64_t{pos32};
5050
}
5151

52-
void WriteLE64(char* buf, int64_t value) {
53-
auto le = ToLittleEndian(static_cast<uint64_t>(value));
54-
std::memcpy(buf, &le, sizeof(le));
55-
}
56-
57-
void WriteLE32(char* buf, int32_t value) {
58-
auto le = ToLittleEndian(static_cast<uint32_t>(value));
59-
std::memcpy(buf, &le, sizeof(le));
60-
}
61-
62-
int64_t ReadLE64(const char* buf) {
63-
uint64_t v;
64-
std::memcpy(&v, buf, sizeof(v));
65-
return static_cast<int64_t>(FromLittleEndian(v));
66-
}
67-
68-
int32_t ReadLE32(const char* buf) {
69-
uint32_t v;
70-
std::memcpy(&v, buf, sizeof(v));
71-
return static_cast<int32_t>(FromLittleEndian(v));
72-
}
73-
7452
Status ValidatePosition(int64_t pos) {
7553
if (pos < 0 || pos > RoaringPositionBitmap::kMaxPosition) {
7654
return InvalidArgument("Bitmap supports positions that are >= 0 and <= {}: {}",
@@ -205,12 +183,12 @@ Result<std::string> RoaringPositionBitmap::Serialize() const {
205183
char* buf = result.data();
206184

207185
// Write bitmap count (array length including empties)
208-
WriteLE64(buf, static_cast<int64_t>(impl_->bitmaps.size()));
186+
WriteLittleEndian(static_cast<int64_t>(impl_->bitmaps.size()), buf);
209187
buf += kBitmapCountSizeBytes;
210188

211189
// Write each bitmap with its key
212190
for (int32_t key = 0; std::cmp_less(key, impl_->bitmaps.size()); ++key) {
213-
WriteLE32(buf, key);
191+
WriteLittleEndian(key, buf);
214192
buf += kBitmapKeySizeBytes;
215193
size_t written = impl_->bitmaps[key].write(buf, /*portable=*/true);
216194
buf += written;
@@ -226,7 +204,7 @@ Result<RoaringPositionBitmap> RoaringPositionBitmap::Deserialize(std::string_vie
226204
ICEBERG_PRECHECK(remaining >= kBitmapCountSizeBytes,
227205
"Buffer too small for bitmap count: {} bytes", remaining);
228206

229-
int64_t bitmap_count = ReadLE64(buf);
207+
auto bitmap_count = ReadLittleEndian<int64_t>(buf);
230208
buf += kBitmapCountSizeBytes;
231209
remaining -= kBitmapCountSizeBytes;
232210

@@ -242,7 +220,7 @@ Result<RoaringPositionBitmap> RoaringPositionBitmap::Deserialize(std::string_vie
242220
ICEBERG_PRECHECK(remaining >= kBitmapKeySizeBytes,
243221
"Buffer too small for bitmap key: {} bytes", remaining);
244222

245-
int32_t key = ReadLE32(buf);
223+
auto key = ReadLittleEndian<int32_t>(buf);
246224
buf += kBitmapKeySizeBytes;
247225
remaining -= kBitmapKeySizeBytes;
248226

src/iceberg/meson.build

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,8 @@ iceberg_sources = files(
8989
'partition_spec.cc',
9090
'partition_summary.cc',
9191
'puffin/file_metadata.cc',
92+
'puffin/json_serde.cc',
93+
'puffin/puffin_format.cc',
9294
'row/arrow_array_wrapper.cc',
9395
'row/manifest_wrapper.cc',
9496
'row/partition_values.cc',

src/iceberg/puffin/json_serde.cc

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#include <nlohmann/json.hpp>
21+
22+
#include "iceberg/puffin/file_metadata.h"
23+
#include "iceberg/puffin/json_serde_internal.h"
24+
#include "iceberg/util/json_util_internal.h"
25+
#include "iceberg/util/macros.h"
26+
27+
namespace iceberg::puffin {
28+
29+
namespace {

// JSON object keys read and written by the serializers in this file.

// Key used only on the file-level object.
constexpr std::string_view kBlobs{"blobs"};

// Key used on both the file-level object and each blob object.
constexpr std::string_view kProperties{"properties"};

// Keys used on each blob object.
constexpr std::string_view kType{"type"};
constexpr std::string_view kFields{"fields"};
constexpr std::string_view kSnapshotId{"snapshot-id"};
constexpr std::string_view kSequenceNumber{"sequence-number"};
constexpr std::string_view kOffset{"offset"};
constexpr std::string_view kLength{"length"};
constexpr std::string_view kCompressionCodec{"compression-codec"};

}  // namespace
40+
41+
// Builds the JSON object describing a single blob.
//
// "type", "fields", "snapshot-id", "sequence-number", "offset" and "length"
// are always assigned. "compression-codec" and "properties" are handed to
// SetOptionalStringField / SetContainerField, which decide how they are
// emitted.
nlohmann::json ToJson(const BlobMetadata& blob_metadata) {
  nlohmann::json result;

  // Unconditionally written members.
  result[kType] = blob_metadata.type;
  result[kFields] = blob_metadata.input_fields;
  result[kSnapshotId] = blob_metadata.snapshot_id;
  result[kSequenceNumber] = blob_metadata.sequence_number;
  result[kOffset] = blob_metadata.offset;
  result[kLength] = blob_metadata.length;

  // Members written through the shared JSON helpers.
  SetOptionalStringField(result, kCompressionCodec, blob_metadata.compression_codec);
  SetContainerField(result, kProperties, blob_metadata.properties);

  return result;
}
55+
56+
// Parses one blob object into a BlobMetadata.
//
// Members are read in a fixed order (type, fields, snapshot-id,
// sequence-number, offset, length, compression-codec, properties); each read
// goes through ICEBERG_ASSIGN_OR_RAISE, so a failing read ends the function
// with that error.
Result<BlobMetadata> BlobMetadataFromJson(const nlohmann::json& json) {
  BlobMetadata blob;

  // Members read with GetJsonValue.
  ICEBERG_ASSIGN_OR_RAISE(blob.type, GetJsonValue<std::string>(json, kType));
  ICEBERG_ASSIGN_OR_RAISE(blob.input_fields,
                          GetJsonValue<std::vector<int32_t>>(json, kFields));
  ICEBERG_ASSIGN_OR_RAISE(blob.snapshot_id, GetJsonValue<int64_t>(json, kSnapshotId));
  ICEBERG_ASSIGN_OR_RAISE(blob.sequence_number,
                          GetJsonValue<int64_t>(json, kSequenceNumber));
  ICEBERG_ASSIGN_OR_RAISE(blob.offset, GetJsonValue<int64_t>(json, kOffset));
  ICEBERG_ASSIGN_OR_RAISE(blob.length, GetJsonValue<int64_t>(json, kLength));

  // Members read with the defaulting / map helpers.
  ICEBERG_ASSIGN_OR_RAISE(blob.compression_codec,
                          GetJsonValueOrDefault<std::string>(json, kCompressionCodec));
  ICEBERG_ASSIGN_OR_RAISE(blob.properties, FromJsonMap<std::string>(json, kProperties));

  return blob;
}
75+
76+
// Builds the JSON object for a whole Puffin file footer: a "blobs" array with
// one entry per blob (in container order), plus the file-level properties
// written through SetContainerField.
nlohmann::json ToJson(const FileMetadata& file_metadata) {
  // Start from an explicit array so that zero blobs still yields an array.
  auto blob_array = nlohmann::json::array();
  for (const auto& entry : file_metadata.blobs) {
    blob_array.push_back(ToJson(entry));
  }

  nlohmann::json root;
  root[kBlobs] = std::move(blob_array);
  SetContainerField(root, kProperties, file_metadata.properties);
  return root;
}
89+
90+
// Parses a Puffin footer JSON object into a FileMetadata.
//
// Fails if "blobs" cannot be read, if it is not a JSON array, if any element
// fails BlobMetadataFromJson, or if the properties map cannot be read.
Result<FileMetadata> FileMetadataFromJson(const nlohmann::json& json) {
  ICEBERG_ASSIGN_OR_RAISE(auto blob_list, GetJsonValue<nlohmann::json>(json, kBlobs));
  if (!blob_list.is_array()) {
    return JsonParseError("Cannot parse blobs from non-array: {}",
                          SafeDumpJson(blob_list));
  }

  FileMetadata result;

  // Convert the elements one by one, stopping at the first failure.
  for (const auto& element : blob_list) {
    ICEBERG_ASSIGN_OR_RAISE(auto parsed, BlobMetadataFromJson(element));
    result.blobs.push_back(std::move(parsed));
  }

  ICEBERG_ASSIGN_OR_RAISE(result.properties, FromJsonMap<std::string>(json, kProperties));

  return result;
}
109+
110+
std::string ToJsonString(const FileMetadata& file_metadata, bool pretty) {
111+
auto json = ToJson(file_metadata);
112+
return pretty ? json.dump(2) : json.dump();
113+
}
114+
115+
// Parses JSON text into a FileMetadata.
//
// An empty input is rejected up front. A nlohmann parse_error raised while
// parsing the text is converted into a JsonParseError carrying the exception
// message; other failures come from FileMetadataFromJson.
Result<FileMetadata> FileMetadataFromJsonString(std::string_view json_string) {
  if (json_string.size() == 0) {
    return JsonParseError("Cannot parse empty JSON string");
  }

  try {
    const auto document = nlohmann::json::parse(json_string);
    return FileMetadataFromJson(document);
  } catch (const nlohmann::json::parse_error& parse_failure) {
    return JsonParseError("Failed to parse JSON: {}", parse_failure.what());
  }
}
126+
127+
} // namespace iceberg::puffin
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#pragma once
21+
22+
/// \file iceberg/puffin/json_serde_internal.h
23+
/// JSON serialization/deserialization for Puffin file metadata.
24+
25+
#include <string>
26+
#include <string_view>
27+
28+
#include <nlohmann/json_fwd.hpp>
29+
30+
#include "iceberg/iceberg_export.h"
31+
#include "iceberg/puffin/type_fwd.h"
32+
#include "iceberg/result.h"
33+
34+
namespace iceberg::puffin {
35+
36+
/// \brief Serialize a BlobMetadata to JSON.
37+
ICEBERG_EXPORT nlohmann::json ToJson(const BlobMetadata& blob_metadata);
38+
39+
/// \brief Deserialize a BlobMetadata from JSON.
40+
ICEBERG_EXPORT Result<BlobMetadata> BlobMetadataFromJson(const nlohmann::json& json);
41+
42+
/// \brief Serialize a FileMetadata to JSON.
43+
ICEBERG_EXPORT nlohmann::json ToJson(const FileMetadata& file_metadata);
44+
45+
/// \brief Deserialize a FileMetadata from JSON.
46+
ICEBERG_EXPORT Result<FileMetadata> FileMetadataFromJson(const nlohmann::json& json);
47+
48+
/// \brief Serialize a FileMetadata to a JSON string.
49+
ICEBERG_EXPORT std::string ToJsonString(const FileMetadata& file_metadata,
50+
bool pretty = false);
51+
52+
/// \brief Deserialize a FileMetadata from a JSON string.
53+
ICEBERG_EXPORT Result<FileMetadata> FileMetadataFromJsonString(
54+
std::string_view json_string);
55+
56+
} // namespace iceberg::puffin

src/iceberg/puffin/meson.build

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,4 +15,7 @@
1515
# specific language governing permissions and limitations
1616
# under the License.
1717

18-
install_headers(['file_metadata.h'], subdir: 'iceberg/puffin')
18+
install_headers(
19+
['file_metadata.h', 'puffin_format.h', 'type_fwd.h'],
20+
subdir: 'iceberg/puffin',
21+
)
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#include "iceberg/puffin/puffin_format.h"
21+
22+
#include <cstddef>
23+
#include <utility>
24+
#include <vector>
25+
26+
namespace iceberg::puffin {
27+
28+
namespace {
29+
30+
// Returns (byte_index, bit_index) for a given flag within the 4-byte flags field.
31+
constexpr std::pair<int, int> GetFlagPosition(PuffinFlag flag) {
32+
switch (flag) {
33+
case PuffinFlag::kFooterPayloadCompressed:
34+
return {0, 0};
35+
}
36+
std::unreachable();
37+
}
38+
39+
// TODO(zhaoxuan1994): Move compression logic to a unified codec interface.
40+
Result<std::vector<std::byte>> Compress(PuffinCompressionCodec codec,
41+
std::span<const std::byte> input) {
42+
switch (codec) {
43+
case PuffinCompressionCodec::kNone:
44+
return std::vector<std::byte>(input.begin(), input.end());
45+
case PuffinCompressionCodec::kLz4:
46+
return NotSupported("LZ4 compression is not yet supported");
47+
case PuffinCompressionCodec::kZstd:
48+
return NotSupported("Zstd compression is not yet supported");
49+
}
50+
std::unreachable();
51+
}
52+
53+
// Decompresses `input` that was encoded with `codec`.
//   - kNone: returns a copy of `input`.
//   - kLz4 / kZstd: returns a NotSupported error.
//
// Marked [[maybe_unused]] because this file-local helper has no caller in
// this translation unit yet; without the attribute it triggers
// -Wunused-function (an error under -Werror).
[[maybe_unused]] Result<std::vector<std::byte>> Decompress(
    PuffinCompressionCodec codec, std::span<const std::byte> input) {
  switch (codec) {
    case PuffinCompressionCodec::kNone:
      return std::vector<std::byte>(input.begin(), input.end());
    case PuffinCompressionCodec::kLz4:
      return NotSupported("LZ4 decompression is not yet supported");
    case PuffinCompressionCodec::kZstd:
      return NotSupported("Zstd decompression is not yet supported");
  }
  // Every enumerator handled above returns from inside the switch.
  std::unreachable();
}
65+
66+
} // namespace
67+
68+
// Reports whether the bit assigned to `flag` is set in the 4-byte `flags`
// field.
bool IsFlagSet(std::span<const uint8_t, 4> flags, PuffinFlag flag) {
  const auto position = GetFlagPosition(flag);  // {byte index, bit index}
  const int mask = 1 << position.second;
  return (flags[position.first] & mask) != 0;
}
72+
73+
// Sets the bit assigned to `flag` in the 4-byte `flags` field, leaving all
// other bits as they were.
void SetFlag(std::span<uint8_t, 4> flags, PuffinFlag flag) {
  const auto position = GetFlagPosition(flag);  // {byte index, bit index}
  uint8_t& target = flags[position.first];
  target |= (1 << position.second);
}
77+
78+
} // namespace iceberg::puffin

0 commit comments

Comments
 (0)