|
| 1 | +/* |
| 2 | + * Licensed to the Apache Software Foundation (ASF) under one |
| 3 | + * or more contributor license agreements. See the NOTICE file |
| 4 | + * distributed with this work for additional information |
| 5 | + * regarding copyright ownership. The ASF licenses this file |
| 6 | + * to you under the Apache License, Version 2.0 (the |
| 7 | + * "License"); you may not use this file except in compliance |
| 8 | + * with the License. You may obtain a copy of the License at |
| 9 | + * |
| 10 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 11 | + * |
| 12 | + * Unless required by applicable law or agreed to in writing, |
| 13 | + * software distributed under the License is distributed on an |
| 14 | + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 15 | + * KIND, either express or implied. See the License for the |
| 16 | + * specific language governing permissions and limitations |
| 17 | + * under the License. |
| 18 | + */ |
| 19 | + |
| 20 | +#include "iceberg/puffin/puffin_reader.h" |
| 21 | + |
| 22 | +#include <algorithm> |
| 23 | +#include <array> |
| 24 | +#include <cstring> |
| 25 | +#include <string_view> |
| 26 | + |
| 27 | +#include "iceberg/puffin/json_serde_internal.h" |
| 28 | +#include "iceberg/puffin/puffin_format.h" |
| 29 | +#include "iceberg/util/endian.h" |
| 30 | +#include "iceberg/util/macros.h" |
| 31 | + |
| 32 | +namespace iceberg::puffin { |
| 33 | + |
| 34 | +namespace { |
| 35 | + |
| 36 | +// Validate magic bytes at the given offset. |
| 37 | +Status CheckMagic(std::span<const std::byte> data, int64_t offset) { |
| 38 | + if (offset < 0 || |
| 39 | + offset + PuffinFormat::kMagicLength > static_cast<int64_t>(data.size())) { |
| 40 | + return Invalid("Invalid file: cannot read magic at offset {}", offset); |
| 41 | + } |
| 42 | + auto* begin = reinterpret_cast<const uint8_t*>(data.data() + offset); |
| 43 | + if (!std::equal(PuffinFormat::kMagicV1.begin(), PuffinFormat::kMagicV1.end(), begin)) { |
| 44 | + return Invalid("Invalid file: expected magic at offset {}", offset); |
| 45 | + } |
| 46 | + return {}; |
| 47 | +} |
| 48 | + |
| 49 | +} // namespace |
| 50 | + |
| 51 | +PuffinReader::PuffinReader(std::span<const std::byte> data) : data_(data) {} |
| 52 | + |
| 53 | +Result<FileMetadata> PuffinReader::ReadFileMetadata() { |
| 54 | + auto file_size = static_cast<int64_t>(data_.size()); |
| 55 | + |
| 56 | + if (file_size < PuffinFormat::kFooterStructLength) { |
| 57 | + return Invalid("Invalid file: file length {} is less than minimal footer size {}", |
| 58 | + file_size, PuffinFormat::kFooterStructLength); |
| 59 | + } |
| 60 | + |
| 61 | + // Read footer struct from end of file |
| 62 | + auto footer_struct_offset = file_size - PuffinFormat::kFooterStructLength; |
| 63 | + |
| 64 | + // Validate footer end magic |
| 65 | + ICEBERG_RETURN_UNEXPECTED( |
| 66 | + CheckMagic(data_, footer_struct_offset + PuffinFormat::kFooterStructMagicOffset)); |
| 67 | + |
| 68 | + // Read payload size from footer struct |
| 69 | + auto payload_size = ReadLittleEndian<int32_t>( |
| 70 | + data_.data() + footer_struct_offset + PuffinFormat::kFooterStructPayloadSizeOffset); |
| 71 | + |
| 72 | + if (payload_size < 0) { |
| 73 | + return Invalid("Invalid file: negative payload size {}", payload_size); |
| 74 | + } |
| 75 | + |
| 76 | + // Calculate total footer size and validate |
| 77 | + int64_t footer_size = PuffinFormat::kFooterStartMagicLength + |
| 78 | + static_cast<int64_t>(payload_size) + |
| 79 | + PuffinFormat::kFooterStructLength; |
| 80 | + auto footer_offset = file_size - footer_size; |
| 81 | + if (footer_offset < 0) { |
| 82 | + return Invalid("Invalid file: footer size {} exceeds file size {}", footer_size, |
| 83 | + file_size); |
| 84 | + } |
| 85 | + |
| 86 | + // Validate footer start magic |
| 87 | + ICEBERG_RETURN_UNEXPECTED(CheckMagic(data_, footer_offset)); |
| 88 | + |
| 89 | + // Check flags for footer compression |
| 90 | + std::array<uint8_t, 4> flags{}; |
| 91 | + std::memcpy( |
| 92 | + flags.data(), |
| 93 | + data_.data() + footer_struct_offset + PuffinFormat::kFooterStructFlagsOffset, 4); |
| 94 | + |
| 95 | + PuffinCompressionCodec footer_compression = PuffinCompressionCodec::kNone; |
| 96 | + if (IsFlagSet(flags, PuffinFlag::kFooterPayloadCompressed)) { |
| 97 | + footer_compression = PuffinFormat::kDefaultFooterCompressionCodec; |
| 98 | + } |
| 99 | + |
| 100 | + // Extract footer payload |
| 101 | + auto payload_offset = footer_offset + PuffinFormat::kFooterStartMagicLength; |
| 102 | + std::span<const std::byte> payload_span(data_.data() + payload_offset, payload_size); |
| 103 | + ICEBERG_ASSIGN_OR_RAISE(auto payload_bytes, |
| 104 | + Decompress(footer_compression, payload_span)); |
| 105 | + |
| 106 | + // Parse JSON |
| 107 | + std::string_view json_str(reinterpret_cast<const char*>(payload_bytes.data()), |
| 108 | + payload_bytes.size()); |
| 109 | + ICEBERG_ASSIGN_OR_RAISE(auto file_metadata, FileMetadataFromJsonString(json_str)); |
| 110 | + |
| 111 | + // Validate header magic |
| 112 | + ICEBERG_RETURN_UNEXPECTED(CheckMagic(data_, 0)); |
| 113 | + |
| 114 | + return file_metadata; |
| 115 | +} |
| 116 | + |
| 117 | +Result<std::pair<BlobMetadata, std::vector<std::byte>>> PuffinReader::ReadBlob( |
| 118 | + const BlobMetadata& blob_metadata) { |
| 119 | + auto file_size = static_cast<int64_t>(data_.size()); |
| 120 | + |
| 121 | + if (blob_metadata.offset < 0 || blob_metadata.length < 0 || |
| 122 | + blob_metadata.offset > file_size || |
| 123 | + blob_metadata.length > file_size - blob_metadata.offset) { |
| 124 | + return Invalid("Invalid blob: offset {} + length {} exceeds file size {}", |
| 125 | + blob_metadata.offset, blob_metadata.length, file_size); |
| 126 | + } |
| 127 | + |
| 128 | + std::span<const std::byte> raw_data(data_.data() + blob_metadata.offset, |
| 129 | + blob_metadata.length); |
| 130 | + |
| 131 | + // Determine compression codec |
| 132 | + ICEBERG_ASSIGN_OR_RAISE( |
| 133 | + auto codec, PuffinCompressionCodecFromName(blob_metadata.compression_codec)); |
| 134 | + ICEBERG_ASSIGN_OR_RAISE(auto decompressed, Decompress(codec, raw_data)); |
| 135 | + |
| 136 | + return std::pair{blob_metadata, std::move(decompressed)}; |
| 137 | +} |
| 138 | + |
| 139 | +Result<std::vector<std::pair<BlobMetadata, std::vector<std::byte>>>> |
| 140 | +PuffinReader::ReadAll(const std::vector<BlobMetadata>& blobs) { |
| 141 | + std::vector<std::pair<BlobMetadata, std::vector<std::byte>>> results; |
| 142 | + results.reserve(blobs.size()); |
| 143 | + for (const auto& blob : blobs) { |
| 144 | + ICEBERG_ASSIGN_OR_RAISE(auto blob_pair, ReadBlob(blob)); |
| 145 | + results.push_back(std::move(blob_pair)); |
| 146 | + } |
| 147 | + return results; |
| 148 | +} |
| 149 | + |
| 150 | +} // namespace iceberg::puffin |
0 commit comments