1919
2020#include " iceberg/data/delete_loader.h"
2121
22+ #include < cstring>
23+ #include < span>
2224#include < string>
2325#include < vector>
2426
27+ #include < nanoarrow/nanoarrow.h>
28+
29+ #include " iceberg/arrow/nanoarrow_status_internal.h"
2530#include " iceberg/arrow_c_data_guard_internal.h"
2631#include " iceberg/deletes/position_delete_index.h"
32+ #include " iceberg/deletes/position_delete_range_consumer.h"
2733#include " iceberg/file_reader.h"
2834#include " iceberg/manifest/manifest_entry.h"
2935#include " iceberg/metadata_columns.h"
36+ #include " iceberg/result.h"
3037#include " iceberg/row/arrow_array_wrapper.h"
3138#include " iceberg/schema.h"
3239#include " iceberg/util/macros.h"
@@ -57,6 +64,25 @@ Result<std::unique_ptr<Reader>> OpenDeleteFile(const DataFile& file,
5764 return ReaderFactoryRegistry::Open (file.file_format , options);
5865}
5966
67+ // / Raw `int64` values buffer (offset-adjusted). Skips the validity bitmap:
68+ // / `kDeleteFilePos` is required by the V2 spec.
69+ const int64_t * Int64ValuesBuffer (const ArrowArrayView* view) {
70+ return view->buffer_views [1 ].data .as_int64 + view->offset ;
71+ }
72+
73+ // / String-equals at `row_idx` via nanoarrow's unsafe direct-buffer access.
74+ // / Skips the validity bitmap: `kDeleteFilePath` is required by the V2 spec.
75+ bool StringEquals (const ArrowArrayView* view, int64_t row_idx, std::string_view target) {
76+ ArrowStringView sv = ArrowArrayViewGetStringUnsafe (view, row_idx);
77+ if (static_cast <size_t >(sv.size_bytes ) != target.size ()) {
78+ return false ;
79+ }
80+ if (target.empty ()) {
81+ return true ;
82+ }
83+ return sv.data != nullptr && std::memcmp (sv.data , target.data (), target.size ()) == 0 ;
84+ }
85+
6086} // namespace
6187
6288DeleteLoader::DeleteLoader (std::shared_ptr<FileIO> io) : io_(std::move(io)) {}
@@ -71,30 +97,70 @@ Status DeleteLoader::LoadPositionDelete(const DataFile& file, PositionDeleteInde
7197 ICEBERG_ASSIGN_OR_RAISE (auto arrow_schema, reader->Schema ());
7298 internal::ArrowSchemaGuard schema_guard (&arrow_schema);
7399
100+ // Reused across batches; reads child buffers directly to avoid the
101+ // per-row `Scalar` dispatch in `ArrowArrayStructLike`.
102+ ArrowArrayView array_view;
103+ internal::ArrowArrayViewGuard view_guard (&array_view);
104+ ArrowError error;
105+ ICEBERG_NANOARROW_RETURN_UNEXPECTED_WITH_ERROR (
106+ ArrowArrayViewInitFromSchema (&array_view, &arrow_schema, &error), error);
107+
108+ // Fast path when the writer's `referenced_data_file` hint matches our
109+ // target: skip the path column, hand `pos_data` straight to
110+ // `ForEachPositionDelete`. Trusts the hint -- spec-compliant writers
111+ // only set it when all rows share one data file.
112+ const bool use_referenced_data_file_fast_path =
113+ file.referenced_data_file .has_value () &&
114+ file.referenced_data_file .value () == data_file_path;
115+
116+ // Filter-path staging buffer; reused across batches via `clear()`.
117+ std::vector<int64_t > positions;
118+
74119 while (true ) {
75120 ICEBERG_ASSIGN_OR_RAISE (auto batch_opt, reader->Next ());
76121 if (!batch_opt.has_value ()) break ;
77122
78123 auto & batch = batch_opt.value ();
79124 internal::ArrowArrayGuard batch_guard (&batch);
80125
81- ICEBERG_ASSIGN_OR_RAISE (
82- auto row, ArrowArrayStructLike::Make (arrow_schema, batch, /* row_index= */ 0 ) );
126+ ICEBERG_NANOARROW_RETURN_UNEXPECTED_WITH_ERROR (
127+ ArrowArrayViewSetArray (&array_view, & batch, &error), error );
83128
84- for (int64_t i = 0 ; i < batch.length ; ++i) {
85- if (i > 0 ) {
86- ICEBERG_RETURN_UNEXPECTED (row->Reset (i));
87- }
88- // Field 0: file_path
89- ICEBERG_ASSIGN_OR_RAISE (auto path_scalar, row->GetField (0 ));
90- auto path = std::get<std::string_view>(path_scalar);
91-
92- if (path == data_file_path) {
93- // Field 1: pos
94- ICEBERG_ASSIGN_OR_RAISE (auto pos_scalar, row->GetField (1 ));
95- index.Delete (std::get<int64_t >(pos_scalar));
129+ const int64_t length = batch.length ;
130+ if (length <= 0 ) {
131+ continue ;
132+ }
133+
134+ // Child indices must match `PosDeleteSchema()`: 0 = file_path, 1 = pos.
135+ const ArrowArrayView* path_view = array_view.children [0 ];
136+ const ArrowArrayView* pos_view = array_view.children [1 ];
137+
138+ // V2 spec marks pos and file_path as required (NOT NULL). The direct
139+ // buffer access below skips the validity bitmap, so a non-compliant
140+ // batch would silently corrupt the index. Fail fast instead.
141+ if (ArrowArrayViewComputeNullCount (pos_view) != 0 ||
142+ ArrowArrayViewComputeNullCount (path_view) != 0 ) {
143+ return InvalidArrowData (
144+ " position delete file has null values in required pos/file_path columns" );
145+ }
146+
147+ const int64_t * pos_data = Int64ValuesBuffer (pos_view);
148+
149+ if (use_referenced_data_file_fast_path) {
150+ ForEachPositionDelete (std::span<const int64_t >(pos_data, length), index);
151+ continue ;
152+ }
153+
154+ positions.clear ();
155+ if (positions.capacity () < static_cast <size_t >(length)) {
156+ positions.reserve (static_cast <size_t >(length));
157+ }
158+ for (int64_t i = 0 ; i < length; ++i) {
159+ if (StringEquals (path_view, i, data_file_path)) {
160+ positions.push_back (pos_data[i]);
96161 }
97162 }
163+ ForEachPositionDelete (positions, index);
98164 }
99165
100166 return reader->Close ();
0 commit comments