|
19 | 19 |
|
20 | 20 | #include "iceberg/data/position_delete_writer.h" |
21 | 21 |
|
| 22 | +#include <map> |
| 23 | +#include <set> |
| 24 | +#include <vector> |
| 25 | + |
| 26 | +#include <nanoarrow/nanoarrow.h> |
| 27 | + |
| 28 | +#include "iceberg/arrow/nanoarrow_status_internal.h" |
| 29 | +#include "iceberg/file_writer.h" |
| 30 | +#include "iceberg/manifest/manifest_entry.h" |
| 31 | +#include "iceberg/metadata_columns.h" |
| 32 | +#include "iceberg/schema.h" |
| 33 | +#include "iceberg/schema_internal.h" |
| 34 | +#include "iceberg/util/macros.h" |
| 35 | + |
22 | 36 | namespace iceberg { |
23 | 37 |
|
24 | 38 | class PositionDeleteWriter::Impl { |
25 | 39 | public: |
| 40 | + static Result<std::unique_ptr<Impl>> Make(PositionDeleteWriterOptions options) { |
| 41 | + // Build the position delete schema with file_path and pos columns |
| 42 | + std::vector<SchemaField> fields; |
| 43 | + fields.push_back(MetadataColumns::kDeleteFilePath); |
| 44 | + fields.push_back(MetadataColumns::kDeleteFilePos); |
| 45 | + |
| 46 | + auto delete_schema = std::make_shared<Schema>(std::move(fields)); |
| 47 | + |
| 48 | + WriterOptions writer_options{ |
| 49 | + .path = options.path, |
| 50 | + .schema = delete_schema, |
| 51 | + .io = options.io, |
| 52 | + .properties = WriterProperties::FromMap(options.properties), |
| 53 | + }; |
| 54 | + |
| 55 | + ICEBERG_ASSIGN_OR_RAISE(auto writer, |
| 56 | + WriterFactoryRegistry::Open(options.format, writer_options)); |
| 57 | + |
| 58 | + return std::unique_ptr<Impl>( |
| 59 | + new Impl(std::move(options), std::move(delete_schema), std::move(writer))); |
| 60 | + } |
| 61 | + |
| 62 | + Status Write(ArrowArray* data) { |
| 63 | + ICEBERG_DCHECK(writer_, "Writer not initialized"); |
| 64 | + return writer_->Write(data); |
| 65 | + } |
| 66 | + |
| 67 | + Status WriteDelete(std::string_view file_path, int64_t pos) { |
| 68 | + ICEBERG_DCHECK(writer_, "Writer not initialized"); |
| 69 | + buffered_paths_.emplace_back(file_path); |
| 70 | + buffered_positions_.push_back(pos); |
| 71 | + referenced_paths_.emplace(file_path); |
| 72 | + |
| 73 | + if (static_cast<int64_t>(buffered_paths_.size()) >= kFlushThreshold) { |
| 74 | + return FlushBuffer(); |
| 75 | + } |
| 76 | + return {}; |
| 77 | + } |
| 78 | + |
| 79 | + Result<int64_t> Length() const { |
| 80 | + ICEBERG_DCHECK(writer_, "Writer not initialized"); |
| 81 | + return writer_->length(); |
| 82 | + } |
| 83 | + |
| 84 | + Status Close() { |
| 85 | + ICEBERG_DCHECK(writer_, "Writer not initialized"); |
| 86 | + if (closed_) { |
| 87 | + return {}; |
| 88 | + } |
| 89 | + if (!buffered_paths_.empty()) { |
| 90 | + ICEBERG_RETURN_UNEXPECTED(FlushBuffer()); |
| 91 | + } |
| 92 | + ICEBERG_RETURN_UNEXPECTED(writer_->Close()); |
| 93 | + closed_ = true; |
| 94 | + return {}; |
| 95 | + } |
| 96 | + |
| 97 | + Result<FileWriter::WriteResult> Metadata() { |
| 98 | + ICEBERG_CHECK(closed_, "Cannot get metadata before closing the writer"); |
| 99 | + |
| 100 | + ICEBERG_ASSIGN_OR_RAISE(auto metrics, writer_->metrics()); |
| 101 | + ICEBERG_ASSIGN_OR_RAISE(auto length, writer_->length()); |
| 102 | + auto split_offsets = writer_->split_offsets(); |
| 103 | + |
| 104 | + // Serialize literal bounds to binary format |
| 105 | + std::map<int32_t, std::vector<uint8_t>> lower_bounds_map; |
| 106 | + for (const auto& [col_id, literal] : metrics.lower_bounds) { |
| 107 | + ICEBERG_ASSIGN_OR_RAISE(auto serialized, literal.Serialize()); |
| 108 | + lower_bounds_map[col_id] = std::move(serialized); |
| 109 | + } |
| 110 | + std::map<int32_t, std::vector<uint8_t>> upper_bounds_map; |
| 111 | + for (const auto& [col_id, literal] : metrics.upper_bounds) { |
| 112 | + ICEBERG_ASSIGN_OR_RAISE(auto serialized, literal.Serialize()); |
| 113 | + upper_bounds_map[col_id] = std::move(serialized); |
| 114 | + } |
| 115 | + |
| 116 | + // Set referenced_data_file if all deletes reference the same data file |
| 117 | + std::optional<std::string> referenced_data_file; |
| 118 | + if (referenced_paths_.size() == 1) { |
| 119 | + referenced_data_file = *referenced_paths_.begin(); |
| 120 | + } |
| 121 | + |
| 122 | + auto data_file = std::make_shared<DataFile>(DataFile{ |
| 123 | + .content = DataFile::Content::kPositionDeletes, |
| 124 | + .file_path = options_.path, |
| 125 | + .file_format = options_.format, |
| 126 | + .partition = options_.partition, |
| 127 | + .record_count = metrics.row_count.value_or(-1), |
| 128 | + .file_size_in_bytes = length, |
| 129 | + .column_sizes = {metrics.column_sizes.begin(), metrics.column_sizes.end()}, |
| 130 | + .value_counts = {metrics.value_counts.begin(), metrics.value_counts.end()}, |
| 131 | + .null_value_counts = {metrics.null_value_counts.begin(), |
| 132 | + metrics.null_value_counts.end()}, |
| 133 | + .nan_value_counts = {metrics.nan_value_counts.begin(), |
| 134 | + metrics.nan_value_counts.end()}, |
| 135 | + .lower_bounds = std::move(lower_bounds_map), |
| 136 | + .upper_bounds = std::move(upper_bounds_map), |
| 137 | + .split_offsets = std::move(split_offsets), |
| 138 | + .sort_order_id = std::nullopt, |
| 139 | + .referenced_data_file = std::move(referenced_data_file), |
| 140 | + }); |
| 141 | + |
| 142 | + FileWriter::WriteResult result; |
| 143 | + result.data_files.push_back(std::move(data_file)); |
| 144 | + return result; |
| 145 | + } |
| 146 | + |
| 147 | + private: |
| 148 | + static constexpr int64_t kFlushThreshold = 1000; |
| 149 | + |
| 150 | + Impl(PositionDeleteWriterOptions options, std::shared_ptr<Schema> delete_schema, |
| 151 | + std::unique_ptr<Writer> writer) |
| 152 | + : options_(std::move(options)), |
| 153 | + delete_schema_(std::move(delete_schema)), |
| 154 | + writer_(std::move(writer)) {} |
| 155 | + |
| 156 | + Status FlushBuffer() { |
| 157 | + ArrowSchema arrow_schema; |
| 158 | + ICEBERG_RETURN_UNEXPECTED(ToArrowSchema(*delete_schema_, &arrow_schema)); |
| 159 | + |
| 160 | + ArrowArray array; |
| 161 | + ArrowError error; |
| 162 | + ICEBERG_NANOARROW_RETURN_UNEXPECTED_WITH_ERROR( |
| 163 | + ArrowArrayInitFromSchema(&array, &arrow_schema, &error), error); |
| 164 | + ICEBERG_NANOARROW_RETURN_UNEXPECTED(ArrowArrayStartAppending(&array)); |
| 165 | + |
| 166 | + for (size_t i = 0; i < buffered_paths_.size(); ++i) { |
| 167 | + ArrowStringView path_view(buffered_paths_[i].data(), |
| 168 | + static_cast<int64_t>(buffered_paths_[i].size())); |
| 169 | + ICEBERG_NANOARROW_RETURN_UNEXPECTED( |
| 170 | + ArrowArrayAppendString(array.children[0], path_view)); |
| 171 | + ICEBERG_NANOARROW_RETURN_UNEXPECTED( |
| 172 | + ArrowArrayAppendInt(array.children[1], buffered_positions_[i])); |
| 173 | + ICEBERG_NANOARROW_RETURN_UNEXPECTED(ArrowArrayFinishElement(&array)); |
| 174 | + } |
| 175 | + |
| 176 | + ICEBERG_NANOARROW_RETURN_UNEXPECTED_WITH_ERROR( |
| 177 | + ArrowArrayFinishBuildingDefault(&array, &error), error); |
| 178 | + |
| 179 | + ICEBERG_RETURN_UNEXPECTED(writer_->Write(&array)); |
| 180 | + |
| 181 | + buffered_paths_.clear(); |
| 182 | + buffered_positions_.clear(); |
| 183 | + arrow_schema.release(&arrow_schema); |
| 184 | + return {}; |
| 185 | + } |
| 186 | + |
| 187 | + PositionDeleteWriterOptions options_; |
| 188 | + std::shared_ptr<Schema> delete_schema_; |
| 189 | + std::unique_ptr<Writer> writer_; |
| 190 | + bool closed_ = false; |
| 191 | + std::vector<std::string> buffered_paths_; |
| 192 | + std::vector<int64_t> buffered_positions_; |
| 193 | + std::set<std::string> referenced_paths_; |
26 | 194 | }; |
27 | 195 |
|
| 196 | +PositionDeleteWriter::PositionDeleteWriter(std::unique_ptr<Impl> impl) |
| 197 | + : impl_(std::move(impl)) {} |
| 198 | + |
28 | 199 | PositionDeleteWriter::~PositionDeleteWriter() = default; |
29 | 200 |
|
30 | | -Status PositionDeleteWriter::Write(ArrowArray* data) { return NotImplemented(""); } |
| 201 | +Result<std::unique_ptr<PositionDeleteWriter>> PositionDeleteWriter::Make( |
| 202 | + const PositionDeleteWriterOptions& options) { |
| 203 | + ICEBERG_ASSIGN_OR_RAISE(auto impl, Impl::Make(options)); |
| 204 | + return std::unique_ptr<PositionDeleteWriter>(new PositionDeleteWriter(std::move(impl))); |
| 205 | +} |
| 206 | + |
| 207 | +Status PositionDeleteWriter::Write(ArrowArray* data) { return impl_->Write(data); } |
31 | 208 |
|
32 | 209 | Status PositionDeleteWriter::WriteDelete(std::string_view file_path, int64_t pos) { |
33 | | - return NotImplemented(""); |
| 210 | + return impl_->WriteDelete(file_path, pos); |
34 | 211 | } |
35 | 212 |
|
36 | | -Result<int64_t> PositionDeleteWriter::Length() const { return NotImplemented(""); } |
| 213 | +Result<int64_t> PositionDeleteWriter::Length() const { return impl_->Length(); } |
37 | 214 |
|
38 | | -Status PositionDeleteWriter::Close() { return NotImplemented(""); } |
| 215 | +Status PositionDeleteWriter::Close() { return impl_->Close(); } |
39 | 216 |
|
40 | 217 | Result<FileWriter::WriteResult> PositionDeleteWriter::Metadata() { |
41 | | - return NotImplemented(""); |
| 218 | + return impl_->Metadata(); |
42 | 219 | } |
43 | 220 |
|
44 | 221 | } // namespace iceberg |
0 commit comments