Skip to content

Commit 8a7ed69

Browse files
author
xuan.zhao
committed
feat(puffin): add basic data structures and constants
Add the foundational types for Puffin file format support:
- Blob, BlobMetadata, FileMetadata structs
- PuffinCompressionCodec enum with codec name conversion
- StandardBlobTypes and StandardPuffinProperties constants
- ToString functions for all types
- 24 unit tests covering all public APIs
1 parent 8bf089f commit 8a7ed69

16 files changed

+860
-0
lines changed

src/iceberg/CMakeLists.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,10 @@ set(ICEBERG_SOURCES
6262
partition_field.cc
6363
partition_spec.cc
6464
partition_summary.cc
65+
puffin/blob.cc
66+
puffin/blob_metadata.cc
67+
puffin/file_metadata.cc
68+
puffin/puffin_compression_codec.cc
6569
row/arrow_array_wrapper.cc
6670
row/manifest_wrapper.cc
6771
row/partition_values.cc
@@ -166,6 +170,7 @@ add_subdirectory(catalog)
166170
add_subdirectory(data)
167171
add_subdirectory(expression)
168172
add_subdirectory(manifest)
173+
add_subdirectory(puffin)
169174
add_subdirectory(row)
170175
add_subdirectory(update)
171176
add_subdirectory(util)

src/iceberg/meson.build

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,10 @@ iceberg_sources = files(
8080
'partition_field.cc',
8181
'partition_spec.cc',
8282
'partition_summary.cc',
83+
'puffin/blob.cc',
84+
'puffin/blob_metadata.cc',
85+
'puffin/file_metadata.cc',
86+
'puffin/puffin_compression_codec.cc',
8387
'row/arrow_array_wrapper.cc',
8488
'row/manifest_wrapper.cc',
8589
'row/partition_values.cc',
@@ -221,6 +225,7 @@ install_headers(
221225
subdir('catalog')
222226
subdir('expression')
223227
subdir('manifest')
228+
subdir('puffin')
224229
subdir('row')
225230
subdir('update')
226231
subdir('util')

src/iceberg/puffin/CMakeLists.txt

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
iceberg_install_all_headers(iceberg/puffin)

src/iceberg/puffin/blob.cc

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#include "iceberg/puffin/blob.h"
21+
22+
#include <format>
23+
24+
#include "iceberg/util/formatter_internal.h"
25+
26+
namespace iceberg::puffin {
27+
28+
/// \brief Render a Blob as a single-line, human-readable string.
///
/// The payload itself is never printed; only its size in bytes is reported.
/// The requested compression codec and the properties map are appended only
/// when they are set / non-empty.
std::string ToString(const Blob& blob) {
  std::string text = "Blob[";
  // One inserter is enough: it only refers to `text`, so it can be reused
  // for every formatting call below.
  auto sink = std::back_inserter(text);

  std::format_to(sink, "type='{}',inputFields={},", blob.type, blob.input_fields);
  std::format_to(sink, "snapshotId={},sequenceNumber={},", blob.snapshot_id,
                 blob.sequence_number);
  std::format_to(sink, "dataSize={}", blob.data.size());

  if (const auto& codec = blob.requested_compression; codec.has_value()) {
    std::format_to(sink, ",requestedCompression={}",
                   iceberg::puffin::ToString(*codec));
  }
  if (const auto& props = blob.properties; !props.empty()) {
    std::format_to(sink, ",properties={}", props);
  }

  text += "]";
  return text;
}
45+
46+
} // namespace iceberg::puffin

src/iceberg/puffin/blob.h

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#pragma once
21+
22+
/// \file iceberg/puffin/blob.h
23+
/// Blob data structure for Puffin files.
24+
25+
#include <cstdint>
26+
#include <optional>
27+
#include <string>
28+
#include <unordered_map>
29+
#include <vector>
30+
31+
#include "iceberg/iceberg_export.h"
32+
#include "iceberg/puffin/puffin_compression_codec.h"
33+
34+
namespace iceberg::puffin {
35+
36+
/// \brief A blob to be written to a Puffin file.
37+
///
38+
/// This represents the uncompressed blob data along with its metadata.
39+
/// The actual compression is handled during writing.
40+
struct ICEBERG_EXPORT Blob {
41+
/// Type of the blob. See StandardBlobTypes for known types.
42+
std::string type;
43+
/// List of field IDs the blob was computed for.
44+
/// The order of items is used to compute sketches stored in the blob.
45+
std::vector<int32_t> input_fields;
46+
/// ID of the Iceberg table's snapshot the blob was computed from.
47+
int64_t snapshot_id;
48+
/// Sequence number of the Iceberg table's snapshot the blob was computed from.
49+
int64_t sequence_number;
50+
/// The uncompressed blob data.
51+
std::vector<uint8_t> data;
52+
/// Requested compression codec. If not set, the writer's default will be used.
53+
std::optional<PuffinCompressionCodec> requested_compression;
54+
/// Additional properties of the blob.
55+
std::unordered_map<std::string, std::string> properties;
56+
57+
/// \brief Compare two Blobs for equality.
58+
friend bool operator==(const Blob& lhs, const Blob& rhs) = default;
59+
};
60+
61+
/// \brief Returns a string representation of a Blob.
62+
ICEBERG_EXPORT std::string ToString(const Blob& blob);
63+
64+
} // namespace iceberg::puffin

src/iceberg/puffin/blob_metadata.cc

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#include "iceberg/puffin/blob_metadata.h"
21+
22+
#include <format>
23+
24+
#include "iceberg/util/formatter_internal.h"
25+
26+
namespace iceberg::puffin {
27+
28+
/// \brief Render a BlobMetadata as a single-line, human-readable string.
///
/// The compression codec and the properties map are appended only when they
/// are set / non-empty.
std::string ToString(const BlobMetadata& blob_metadata) {
  std::string text = "BlobMetadata[";
  // One inserter is enough: it only refers to `text`, so it can be reused
  // for every formatting call below.
  auto sink = std::back_inserter(text);

  std::format_to(sink, "type='{}',inputFields={},", blob_metadata.type,
                 blob_metadata.input_fields);
  std::format_to(sink, "snapshotId={},sequenceNumber={},", blob_metadata.snapshot_id,
                 blob_metadata.sequence_number);
  std::format_to(sink, "offset={},length={}", blob_metadata.offset,
                 blob_metadata.length);

  if (const auto& codec = blob_metadata.compression_codec; codec.has_value()) {
    std::format_to(sink, ",compressionCodec='{}'", *codec);
  }
  if (const auto& props = blob_metadata.properties; !props.empty()) {
    std::format_to(sink, ",properties={}", props);
  }

  text += "]";
  return text;
}
46+
47+
} // namespace iceberg::puffin

src/iceberg/puffin/blob_metadata.h

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#pragma once
21+
22+
/// \file iceberg/puffin/blob_metadata.h
23+
/// Blob metadata structure for Puffin files.
24+
25+
#include <cstdint>
26+
#include <optional>
27+
#include <string>
28+
#include <unordered_map>
29+
#include <vector>
30+
31+
#include "iceberg/iceberg_export.h"
32+
33+
namespace iceberg::puffin {
34+
35+
/// \brief Metadata about a blob stored in a Puffin file.
36+
///
37+
/// This represents the metadata stored in the Puffin file footer,
38+
/// including the blob's location within the file.
39+
struct ICEBERG_EXPORT BlobMetadata {
40+
/// Type of the blob. See StandardBlobTypes for known types.
41+
std::string type;
42+
/// List of field IDs the blob was computed for.
43+
std::vector<int32_t> input_fields;
44+
/// ID of the Iceberg table's snapshot the blob was computed from.
45+
int64_t snapshot_id;
46+
/// Sequence number of the Iceberg table's snapshot the blob was computed from.
47+
int64_t sequence_number;
48+
/// Offset in the file where the blob data starts.
49+
int64_t offset;
50+
/// Length of the blob data in the file (after compression, if compressed).
51+
int64_t length;
52+
/// Compression codec name, or std::nullopt if uncompressed.
53+
std::optional<std::string> compression_codec;
54+
/// Additional properties of the blob.
55+
std::unordered_map<std::string, std::string> properties;
56+
57+
/// \brief Compare two BlobMetadatas for equality.
58+
friend bool operator==(const BlobMetadata& lhs, const BlobMetadata& rhs) = default;
59+
};
60+
61+
/// \brief Returns a string representation of a BlobMetadata.
62+
ICEBERG_EXPORT std::string ToString(const BlobMetadata& blob_metadata);
63+
64+
} // namespace iceberg::puffin

src/iceberg/puffin/file_metadata.cc

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#include "iceberg/puffin/file_metadata.h"
21+
22+
#include <format>
23+
24+
#include "iceberg/util/formatter_internal.h"
25+
26+
namespace iceberg::puffin {
27+
28+
/// \brief Render a FileMetadata as a single-line, human-readable string.
///
/// Each blob entry is rendered with its own ToString and the entries are
/// joined with commas. The file-level properties map is appended only when it
/// is non-empty.
std::string ToString(const FileMetadata& file_metadata) {
  std::string text = "FileMetadata[";
  // One inserter is enough: it only refers to `text`, so it can be reused
  // for every formatting call below.
  auto sink = std::back_inserter(text);

  text += "blobs=[";
  bool needs_separator = false;  // false only before the first entry
  for (const auto& blob : file_metadata.blobs) {
    if (needs_separator) {
      text += ",";
    }
    std::format_to(sink, "{}", ToString(blob));
    needs_separator = true;
  }
  text += "]";

  if (const auto& props = file_metadata.properties; !props.empty()) {
    std::format_to(sink, ",properties={}", props);
  }

  text += "]";
  return text;
}
44+
45+
} // namespace iceberg::puffin

src/iceberg/puffin/file_metadata.h

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#pragma once
21+
22+
/// \file iceberg/puffin/file_metadata.h
23+
/// File metadata structure for Puffin files.
24+
25+
#include <string>
26+
#include <unordered_map>
27+
#include <vector>
28+
29+
#include "iceberg/iceberg_export.h"
30+
#include "iceberg/puffin/blob_metadata.h"
31+
32+
namespace iceberg::puffin {
33+
34+
/// \brief Metadata about a Puffin file.
35+
///
36+
/// This represents the metadata stored in the Puffin file footer,
37+
/// including information about all blobs in the file.
38+
struct ICEBERG_EXPORT FileMetadata {
39+
/// List of blob metadata for all blobs in the file.
40+
std::vector<BlobMetadata> blobs;
41+
/// File-level properties.
42+
std::unordered_map<std::string, std::string> properties;
43+
44+
/// \brief Compare two FileMetadatas for equality.
45+
friend bool operator==(const FileMetadata& lhs, const FileMetadata& rhs) = default;
46+
};
47+
48+
/// \brief Returns a string representation of a FileMetadata.
49+
ICEBERG_EXPORT std::string ToString(const FileMetadata& file_metadata);
50+
51+
} // namespace iceberg::puffin

0 commit comments

Comments
 (0)