Skip to content

Commit 743c318

Browse files
authored
feat(puffin): add basic data structures and constants of puffin (#588)
Add the foundational types for Puffin file format support:
- Blob, BlobMetadata, FileMetadata structs
- PuffinCompressionCodec enum with codec name conversion
- StandardBlobTypes and StandardPuffinProperties constants
- ToString functions for all types
1 parent f79f885 commit 743c318

File tree

6 files changed

+278
-0
lines changed

6 files changed

+278
-0
lines changed

src/iceberg/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ set(ICEBERG_SOURCES
6262
partition_field.cc
6363
partition_spec.cc
6464
partition_summary.cc
65+
puffin/file_metadata.cc
6566
row/arrow_array_wrapper.cc
6667
row/manifest_wrapper.cc
6768
row/partition_values.cc
@@ -167,6 +168,7 @@ add_subdirectory(catalog)
167168
add_subdirectory(data)
168169
add_subdirectory(expression)
169170
add_subdirectory(manifest)
171+
add_subdirectory(puffin)
170172
add_subdirectory(row)
171173
add_subdirectory(update)
172174
add_subdirectory(util)

src/iceberg/meson.build

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ iceberg_sources = files(
8080
'partition_field.cc',
8181
'partition_spec.cc',
8282
'partition_summary.cc',
83+
'puffin/file_metadata.cc',
8384
'row/arrow_array_wrapper.cc',
8485
'row/manifest_wrapper.cc',
8586
'row/partition_values.cc',
@@ -222,6 +223,7 @@ install_headers(
222223
subdir('catalog')
223224
subdir('expression')
224225
subdir('manifest')
226+
subdir('puffin')
225227
subdir('row')
226228
subdir('update')
227229
subdir('util')

src/iceberg/puffin/CMakeLists.txt

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
iceberg_install_all_headers(iceberg/puffin)

src/iceberg/puffin/file_metadata.cc

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#include "iceberg/puffin/file_metadata.h"
21+
22+
#include <format>
23+
#include <utility>
24+
25+
#include "iceberg/util/formatter_internal.h"
26+
27+
namespace iceberg::puffin {
28+
29+
namespace {
// Textual codec names used when converting PuffinCompressionCodec values to
// and from strings.
constexpr std::string_view kLz4CodecName{"lz4"};
constexpr std::string_view kZstdCodecName{"zstd"};
}  // namespace
33+
34+
/// \brief Map a compression codec to its textual name.
///
/// \param codec The codec to name.
/// \return "lz4" or "zstd" for the matching codec, or an empty view for kNone.
std::string_view CodecName(PuffinCompressionCodec codec) {
  using enum PuffinCompressionCodec;
  switch (codec) {
    case kNone:
      return "";
    case kLz4:
      return kLz4CodecName;
    case kZstd:
      return kZstdCodecName;
  }
  // Every enumerator returns above, so this point is only reached when `codec`
  // holds a value outside the enum; std::unreachable() makes that undefined
  // behavior.
  std::unreachable();
}
45+
46+
/// \brief Parse a textual codec name into a PuffinCompressionCodec.
///
/// Matching is exact: an empty name maps to kNone, "lz4" to kLz4 and "zstd"
/// to kZstd.
///
/// \param codec_name The name to look up.
/// \return The matching codec, or an InvalidArgument error for any other name.
Result<PuffinCompressionCodec> PuffinCompressionCodecFromName(
    std::string_view codec_name) {
  using enum PuffinCompressionCodec;

  struct NamedCodec {
    std::string_view name;
    PuffinCompressionCodec codec;
  };
  // The empty name comes first so that "no codec" resolves to kNone.
  static constexpr NamedCodec kKnownCodecs[] = {
      {"", kNone},
      {kLz4CodecName, kLz4},
      {kZstdCodecName, kZstd},
  };

  for (const NamedCodec& entry : kKnownCodecs) {
    if (codec_name == entry.name) {
      return entry.codec;
    }
  }
  return InvalidArgument("Unknown codec name: {}", codec_name);
}
59+
60+
/// \brief Render a codec as an owning string holding the text CodecName()
/// returns for it.
std::string ToString(PuffinCompressionCodec codec) {
  const std::string_view name = CodecName(codec);
  return std::string{name};
}
63+
64+
/// \brief Build a single-line, human-readable description of a Blob.
///
/// The payload itself is not printed, only its size in bytes. The
/// requestedCompression and properties parts are emitted only when the
/// corresponding field is set / non-empty.
std::string ToString(const Blob& blob) {
  // Fields that are always present.
  std::string out = std::format(
      "Blob[type='{}',inputFields={},snapshotId={},sequenceNumber={},dataSize={}",
      blob.type, blob.input_fields, blob.snapshot_id, blob.sequence_number,
      blob.data.size());

  // Optional trailing fields.
  if (blob.requested_compression) {
    std::format_to(std::back_inserter(out), ",requestedCompression={}",
                   ToString(*blob.requested_compression));
  }
  if (!blob.properties.empty()) {
    std::format_to(std::back_inserter(out), ",properties={}", blob.properties);
  }

  out += ']';
  return out;
}
81+
82+
/// \brief Build a single-line, human-readable description of a BlobMetadata.
///
/// The compressionCodec and properties parts are emitted only when the
/// corresponding field is non-empty.
std::string ToString(const BlobMetadata& blob_metadata) {
  // Fields that are always present.
  std::string out = std::format(
      "BlobMetadata[type='{}',inputFields={},snapshotId={},sequenceNumber={},"
      "offset={},length={}",
      blob_metadata.type, blob_metadata.input_fields, blob_metadata.snapshot_id,
      blob_metadata.sequence_number, blob_metadata.offset, blob_metadata.length);

  // Optional trailing fields.
  if (!blob_metadata.compression_codec.empty()) {
    std::format_to(std::back_inserter(out), ",compressionCodec='{}'",
                   blob_metadata.compression_codec);
  }
  if (!blob_metadata.properties.empty()) {
    std::format_to(std::back_inserter(out), ",properties={}", blob_metadata.properties);
  }

  out += ']';
  return out;
}
100+
101+
/// \brief Build a single-line, human-readable description of a FileMetadata.
///
/// Blobs are rendered with ToString(const BlobMetadata&) and joined by commas;
/// the properties part is emitted only when the map is non-empty.
std::string ToString(const FileMetadata& file_metadata) {
  std::string out = "FileMetadata[blobs=[";

  bool first = true;
  for (const BlobMetadata& blob : file_metadata.blobs) {
    if (!first) {
      out += ',';
    }
    first = false;
    out += ToString(blob);
  }
  out += ']';

  if (!file_metadata.properties.empty()) {
    std::format_to(std::back_inserter(out), ",properties={}", file_metadata.properties);
  }

  out += ']';
  return out;
}
117+
118+
} // namespace iceberg::puffin

src/iceberg/puffin/file_metadata.h

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#pragma once
21+
22+
/// \file iceberg/puffin/file_metadata.h
23+
/// Data structures for Puffin files.
24+
25+
#include <cstdint>
26+
#include <optional>
27+
#include <string>
28+
#include <string_view>
29+
#include <unordered_map>
30+
#include <vector>
31+
32+
#include "iceberg/iceberg_export.h"
33+
#include "iceberg/result.h"
34+
35+
namespace iceberg::puffin {
36+
37+
/// \brief Compression codecs supported by Puffin files.
enum class PuffinCompressionCodec {
  /// No compression; CodecName() yields an empty name for this value.
  kNone = 0,
  /// Codec named "lz4".
  kLz4 = 1,
  /// Codec named "zstd".
  kZstd = 2,
};
43+
44+
/// \brief Return the textual name of a compression codec.
///
/// \param codec The codec to name.
/// \return "lz4" or "zstd" for the matching codec, or an empty view for kNone.
ICEBERG_EXPORT std::string_view CodecName(PuffinCompressionCodec codec);
45+
46+
/// \brief Parse a textual codec name into a PuffinCompressionCodec.
///
/// An empty name yields kNone; "lz4" and "zstd" yield the matching codec.
/// The comparison is exact.
///
/// \param codec_name The name to look up.
/// \return The matching codec, or an InvalidArgument error for an unknown name.
ICEBERG_EXPORT Result<PuffinCompressionCodec> PuffinCompressionCodecFromName(
47+
std::string_view codec_name);
48+
49+
/// \brief Return the codec's name (see CodecName) as an owning string.
ICEBERG_EXPORT std::string ToString(PuffinCompressionCodec codec);
50+
51+
/// \brief Names of the standard blob types defined by the Iceberg specification.
struct StandardBlobTypes {
  /// Serialized "compact" Theta sketch, as produced by the Apache DataSketches
  /// library.
  static constexpr std::string_view kApacheDatasketchesThetaV1{
      "apache-datasketches-theta-v1"};

  /// Deletion vector serialized according to the Iceberg spec.
  static constexpr std::string_view kDeletionVectorV1{"deletion-vector-v1"};
};
61+
62+
/// \brief Keys of the standard file-level properties for Puffin files.
struct StandardPuffinProperties {
  /// Human-readable identification of the application that wrote the file,
  /// including its version.
  static constexpr std::string_view kCreatedBy{"created-by"};
};
68+
69+
/// \brief A blob in a Puffin file.
70+
struct ICEBERG_EXPORT Blob {
71+
/// See StandardBlobTypes for known types.
72+
std::string type;
73+
/// Ordered list of field IDs the blob was computed from.
74+
std::vector<int32_t> input_fields;
75+
/// ID of the Iceberg table's snapshot the blob was computed from.
76+
int64_t snapshot_id;
77+
/// Sequence number of the Iceberg table's snapshot the blob was computed from.
78+
int64_t sequence_number;
79+
std::vector<uint8_t> data;
80+
/// If not set, the writer's default codec will be used.
81+
std::optional<PuffinCompressionCodec> requested_compression;
82+
std::unordered_map<std::string, std::string> properties;
83+
84+
friend bool operator==(const Blob& lhs, const Blob& rhs) = default;
85+
};
86+
87+
/// \brief Return a single-line description of a blob, of the form
/// "Blob[type='...',inputFields=...,snapshotId=...,sequenceNumber=...,dataSize=N]".
///
/// Only the size of the data is printed, not its content. The
/// requestedCompression and properties parts appear only when set / non-empty.
ICEBERG_EXPORT std::string ToString(const Blob& blob);
88+
89+
/// \brief Metadata about a blob stored in a Puffin file footer.
90+
struct ICEBERG_EXPORT BlobMetadata {
91+
/// See StandardBlobTypes for known types.
92+
std::string type;
93+
/// Ordered list of field IDs the blob was computed from.
94+
std::vector<int32_t> input_fields;
95+
/// ID of the Iceberg table's snapshot the blob was computed from.
96+
int64_t snapshot_id;
97+
/// Sequence number of the Iceberg table's snapshot the blob was computed from.
98+
int64_t sequence_number;
99+
int64_t offset;
100+
int64_t length;
101+
/// Codec name (e.g. "lz4", "zstd"), or empty if not compressed.
102+
std::string compression_codec;
103+
std::unordered_map<std::string, std::string> properties;
104+
105+
friend bool operator==(const BlobMetadata& lhs, const BlobMetadata& rhs) = default;
106+
};
107+
108+
/// \brief Return a single-line description of blob metadata, of the form
/// "BlobMetadata[type='...',inputFields=...,snapshotId=...,sequenceNumber=...,offset=...,length=...]".
///
/// The compressionCodec and properties parts appear only when non-empty.
ICEBERG_EXPORT std::string ToString(const BlobMetadata& blob_metadata);
109+
110+
/// \brief Metadata about a Puffin file.
111+
struct ICEBERG_EXPORT FileMetadata {
112+
std::vector<BlobMetadata> blobs;
113+
std::unordered_map<std::string, std::string> properties;
114+
115+
friend bool operator==(const FileMetadata& lhs, const FileMetadata& rhs) = default;
116+
};
117+
118+
/// \brief Return a single-line description of file metadata, of the form
/// "FileMetadata[blobs=[...]]", with blobs comma-separated.
///
/// The properties part appears only when the map is non-empty.
ICEBERG_EXPORT std::string ToString(const FileMetadata& file_metadata);
119+
120+
} // namespace iceberg::puffin

src/iceberg/puffin/meson.build

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
install_headers(['file_metadata.h'], subdir: 'iceberg/puffin')

0 commit comments

Comments
 (0)