|
| 1 | +/* |
| 2 | + * Licensed to the Apache Software Foundation (ASF) under one |
| 3 | + * or more contributor license agreements. See the NOTICE file |
| 4 | + * distributed with this work for additional information |
| 5 | + * regarding copyright ownership. The ASF licenses this file |
| 6 | + * to you under the Apache License, Version 2.0 (the |
| 7 | + * "License"); you may not use this file except in compliance |
| 8 | + * with the License. You may obtain a copy of the License at |
| 9 | + * |
| 10 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 11 | + * |
| 12 | + * Unless required by applicable law or agreed to in writing, |
| 13 | + * software distributed under the License is distributed on an |
| 14 | + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 15 | + * KIND, either express or implied. See the License for the |
| 16 | + * specific language governing permissions and limitations |
| 17 | + * under the License. |
| 18 | + */ |
| 19 | + |
| 20 | +#include "iceberg/puffin/puffin_reader.h" |
| 21 | + |
| 22 | +#include <algorithm> |
| 23 | +#include <array> |
| 24 | +#include <cstdint> |
| 25 | +#include <cstring> |
| 26 | +#include <string_view> |
| 27 | + |
| 28 | +#include "iceberg/file_io.h" |
| 29 | +#include "iceberg/puffin/json_serde_internal.h" |
| 30 | +#include "iceberg/puffin/puffin_format.h" |
| 31 | +#include "iceberg/util/endian.h" |
| 32 | +#include "iceberg/util/macros.h" |
| 33 | + |
| 34 | +namespace iceberg::puffin { |
| 35 | + |
| 36 | +namespace { |
| 37 | + |
| 38 | +// Validate magic bytes in a buffer at the given offset. |
| 39 | +Status CheckMagic(std::span<const std::byte> data, int64_t offset) { |
| 40 | + if (offset < 0 || |
| 41 | + offset + PuffinFormat::kMagicLength > static_cast<int64_t>(data.size())) { |
| 42 | + return Invalid("Invalid file: cannot read magic at offset {}", offset); |
| 43 | + } |
| 44 | + auto* begin = reinterpret_cast<const uint8_t*>(data.data() + offset); |
| 45 | + if (!std::equal(PuffinFormat::kMagicV1.begin(), PuffinFormat::kMagicV1.end(), begin)) { |
| 46 | + return Invalid( |
| 47 | + "Invalid file: expected magic at offset {}, got [{:#04x}, {:#04x}, " |
| 48 | + "{:#04x}, {:#04x}]", |
| 49 | + offset, begin[0], begin[1], begin[2], begin[3]); |
| 50 | + } |
| 51 | + return {}; |
| 52 | +} |
| 53 | + |
| 54 | +// Validate that no unknown flag bits are set. |
| 55 | +Status CheckUnknownFlags(std::span<const uint8_t, 4> flags) { |
| 56 | + constexpr uint8_t kKnownBitsMask = 0x01; |
| 57 | + if ((flags[0] & ~kKnownBitsMask) != 0 || flags[1] != 0 || flags[2] != 0 || |
| 58 | + flags[3] != 0) { |
| 59 | + return Invalid( |
| 60 | + "Invalid file: unknown footer flags set [{:#04x}, {:#04x}, {:#04x}, {:#04x}]", |
| 61 | + flags[0], flags[1], flags[2], flags[3]); |
| 62 | + } |
| 63 | + return {}; |
| 64 | +} |
| 65 | + |
| 66 | +} // namespace |
| 67 | + |
| 68 | +PuffinReader::PuffinReader(std::span<const std::byte> data) |
| 69 | + : data_(data), file_size_(static_cast<int64_t>(data.size())) {} |
| 70 | + |
| 71 | +PuffinReader::PuffinReader(std::unique_ptr<InputFile> input_file) |
| 72 | + : input_file_(std::move(input_file)) {} |
| 73 | + |
| 74 | +PuffinReader::~PuffinReader() = default; |
| 75 | + |
| 76 | +Result<std::vector<std::byte>> PuffinReader::ReadBytes(int64_t offset, int64_t length) { |
| 77 | + if (IsFileMode()) { |
| 78 | + if (!stream_) { |
| 79 | + ICEBERG_ASSIGN_OR_RAISE(stream_, input_file_->Open()); |
| 80 | + } |
| 81 | + std::vector<std::byte> buf(length); |
| 82 | + ICEBERG_RETURN_UNEXPECTED(stream_->ReadFully(offset, buf)); |
| 83 | + return buf; |
| 84 | + } |
| 85 | + // Memory mode |
| 86 | + if (offset < 0 || length < 0 || offset > file_size_ || length > file_size_ - offset) { |
| 87 | + return Invalid("Read out of bounds: offset {} + length {} exceeds file size {}", |
| 88 | + offset, length, file_size_); |
| 89 | + } |
| 90 | + return std::vector<std::byte>(data_.data() + offset, data_.data() + offset + length); |
| 91 | +} |
| 92 | + |
| 93 | +Result<FileMetadata> PuffinReader::ReadFileMetadata() { |
| 94 | + // Get file size |
| 95 | + if (IsFileMode()) { |
| 96 | + ICEBERG_ASSIGN_OR_RAISE(file_size_, input_file_->Size()); |
| 97 | + } |
| 98 | + |
| 99 | + if (file_size_ < PuffinFormat::kFooterStructLength) { |
| 100 | + return Invalid("Invalid file: file length {} is less than minimal footer size {}", |
| 101 | + file_size_, PuffinFormat::kFooterStructLength); |
| 102 | + } |
| 103 | + |
| 104 | + // Validate header magic |
| 105 | + ICEBERG_ASSIGN_OR_RAISE(auto header_bytes, ReadBytes(0, PuffinFormat::kMagicLength)); |
| 106 | + ICEBERG_RETURN_UNEXPECTED(CheckMagic(header_bytes, 0)); |
| 107 | + |
| 108 | + // Read footer struct from end of file |
| 109 | + auto footer_struct_offset = file_size_ - PuffinFormat::kFooterStructLength; |
| 110 | + ICEBERG_ASSIGN_OR_RAISE( |
| 111 | + auto footer_struct, |
| 112 | + ReadBytes(footer_struct_offset, PuffinFormat::kFooterStructLength)); |
| 113 | + |
| 114 | + // Validate footer end magic |
| 115 | + ICEBERG_RETURN_UNEXPECTED( |
| 116 | + CheckMagic(footer_struct, PuffinFormat::kFooterStructMagicOffset)); |
| 117 | + |
| 118 | + // Read payload size |
| 119 | + auto payload_size = ReadLittleEndian<int32_t>( |
| 120 | + footer_struct.data() + PuffinFormat::kFooterStructPayloadSizeOffset); |
| 121 | + |
| 122 | + if (payload_size < 0) { |
| 123 | + return Invalid("Invalid file: negative payload size {}", payload_size); |
| 124 | + } |
| 125 | + |
| 126 | + // Calculate total footer size and validate |
| 127 | + int64_t footer_size = PuffinFormat::kFooterStartMagicLength + |
| 128 | + static_cast<int64_t>(payload_size) + |
| 129 | + PuffinFormat::kFooterStructLength; |
| 130 | + auto footer_offset = file_size_ - footer_size; |
| 131 | + if (footer_offset < 0) { |
| 132 | + return Invalid("Invalid file: footer size {} exceeds file size {}", footer_size, |
| 133 | + file_size_); |
| 134 | + } |
| 135 | + |
| 136 | + // Validate footer start magic |
| 137 | + ICEBERG_ASSIGN_OR_RAISE(auto footer_start_magic, |
| 138 | + ReadBytes(footer_offset, PuffinFormat::kMagicLength)); |
| 139 | + ICEBERG_RETURN_UNEXPECTED(CheckMagic(footer_start_magic, 0)); |
| 140 | + |
| 141 | + // Check flags |
| 142 | + std::array<uint8_t, 4> flags{}; |
| 143 | + std::memcpy(flags.data(), footer_struct.data() + PuffinFormat::kFooterStructFlagsOffset, |
| 144 | + 4); |
| 145 | + ICEBERG_RETURN_UNEXPECTED(CheckUnknownFlags(flags)); |
| 146 | + |
| 147 | + PuffinCompressionCodec footer_compression = PuffinCompressionCodec::kNone; |
| 148 | + if (IsFlagSet(flags, PuffinFlag::kFooterPayloadCompressed)) { |
| 149 | + footer_compression = PuffinFormat::kDefaultFooterCompressionCodec; |
| 150 | + } |
| 151 | + |
| 152 | + // Read and decompress footer payload |
| 153 | + auto payload_offset = footer_offset + PuffinFormat::kFooterStartMagicLength; |
| 154 | + ICEBERG_ASSIGN_OR_RAISE(auto payload_bytes, ReadBytes(payload_offset, payload_size)); |
| 155 | + ICEBERG_ASSIGN_OR_RAISE(auto decompressed, |
| 156 | + Decompress(footer_compression, payload_bytes)); |
| 157 | + |
| 158 | + // Parse JSON |
| 159 | + std::string_view json_str(reinterpret_cast<const char*>(decompressed.data()), |
| 160 | + decompressed.size()); |
| 161 | + return FileMetadataFromJsonString(json_str); |
| 162 | +} |
| 163 | + |
| 164 | +Result<std::pair<BlobMetadata, std::vector<std::byte>>> PuffinReader::ReadBlob( |
| 165 | + const BlobMetadata& blob_metadata) { |
| 166 | + if (blob_metadata.offset < 0 || blob_metadata.length < 0 || |
| 167 | + blob_metadata.offset > file_size_ || |
| 168 | + blob_metadata.length > file_size_ - blob_metadata.offset) { |
| 169 | + return Invalid("Invalid blob: offset {} + length {} exceeds file size {}", |
| 170 | + blob_metadata.offset, blob_metadata.length, file_size_); |
| 171 | + } |
| 172 | + |
| 173 | + ICEBERG_ASSIGN_OR_RAISE(auto raw_data, |
| 174 | + ReadBytes(blob_metadata.offset, blob_metadata.length)); |
| 175 | + |
| 176 | + // Determine compression codec |
| 177 | + ICEBERG_ASSIGN_OR_RAISE( |
| 178 | + auto codec, PuffinCompressionCodecFromName(blob_metadata.compression_codec)); |
| 179 | + ICEBERG_ASSIGN_OR_RAISE(auto decompressed, Decompress(codec, raw_data)); |
| 180 | + |
| 181 | + return std::pair{blob_metadata, std::move(decompressed)}; |
| 182 | +} |
| 183 | + |
| 184 | +Result<std::vector<std::pair<BlobMetadata, std::vector<std::byte>>>> |
| 185 | +PuffinReader::ReadAll(const std::vector<BlobMetadata>& blobs) { |
| 186 | + std::vector<std::pair<BlobMetadata, std::vector<std::byte>>> results; |
| 187 | + results.reserve(blobs.size()); |
| 188 | + for (const auto& blob : blobs) { |
| 189 | + ICEBERG_ASSIGN_OR_RAISE(auto blob_pair, ReadBlob(blob)); |
| 190 | + results.push_back(std::move(blob_pair)); |
| 191 | + } |
| 192 | + return results; |
| 193 | +} |
| 194 | + |
| 195 | +} // namespace iceberg::puffin |
0 commit comments