Skip to content

Commit 133742d

Browse files
wgtmac and emkornfield authored
feat: add roaring-based position bitmap (#595)
Co-authored-by: emkornfield <emkornfield@gmail.com>
1 parent 9adb90e commit 133742d

13 files changed

+987
-0
lines changed

src/iceberg/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ set(ICEBERG_SOURCES
2525
data/position_delete_writer.cc
2626
data/writer.cc
2727
delete_file_index.cc
28+
deletes/roaring_position_bitmap.cc
2829
expression/aggregate.cc
2930
expression/binder.cc
3031
expression/evaluator.cc
@@ -166,6 +167,7 @@ iceberg_install_all_headers(iceberg)
166167

167168
add_subdirectory(catalog)
168169
add_subdirectory(data)
170+
add_subdirectory(deletes)
169171
add_subdirectory(expression)
170172
add_subdirectory(manifest)
171173
add_subdirectory(puffin)

src/iceberg/deletes/CMakeLists.txt

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
# Presumably installs this directory's public headers under iceberg/deletes;
# the helper is defined elsewhere in the build — confirm its exact behavior there.
iceberg_install_all_headers(iceberg/deletes)

src/iceberg/deletes/meson.build

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
# Install this directory's public header into the 'iceberg/deletes' include subdirectory.
install_headers(['roaring_position_bitmap.h'], subdir: 'iceberg/deletes')
src/iceberg/deletes/roaring_position_bitmap.cc

Lines changed: 270 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,270 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#include "iceberg/deletes/roaring_position_bitmap.h"
21+
22+
#include <cstring>
23+
#include <exception>
24+
#include <limits>
25+
#include <utility>
26+
#include <vector>
27+
28+
#include <roaring/roaring.hh>
29+
30+
#include "iceberg/util/endian.h"
31+
#include "iceberg/util/macros.h"
32+
33+
namespace iceberg {
34+
35+
namespace {
36+
37+
// Width in bytes of the bitmap-count field that leads the serialized form
// (written and read as a 64-bit little-endian integer).
constexpr size_t kBitmapCountSizeBytes = 8;
38+
// Width in bytes of the key that precedes each serialized bitmap
// (written and read as a 32-bit little-endian integer).
constexpr size_t kBitmapKeySizeBytes = 4;
39+
40+
// Extracts high 32 bits from a 64-bit position (the key).
41+
int32_t Key(int64_t pos) {
  // Shift the low word away; what remains is the high word, i.e. the bitmap key.
  const int64_t high_word = pos >> 32;
  return static_cast<int32_t>(high_word);
}
42+
43+
// Extracts low 32 bits from a 64-bit position.
44+
uint32_t Pos32Bits(int64_t pos) {
  // Keep only the low word of the position.
  constexpr int64_t kLowWordMask = 0xFFFFFFFFLL;
  return static_cast<uint32_t>(pos & kLowWordMask);
}
45+
46+
// Combines key (high 32 bits) and pos32 (low 32 bits) into a 64-bit
47+
// position. The low 32 bits are zero-extended to avoid sign extension.
48+
int64_t ToPosition(int32_t key, uint32_t pos32) {
  // The key becomes the high word of the result.
  const int64_t high_word = static_cast<int64_t>(key) << 32;
  // Widening an unsigned 32-bit value zero-extends, so the high word is untouched.
  const int64_t low_word = static_cast<int64_t>(pos32);
  return high_word | low_word;
}
51+
52+
void WriteLE64(char* buf, int64_t value) {
53+
auto le = ToLittleEndian(static_cast<uint64_t>(value));
54+
std::memcpy(buf, &le, sizeof(le));
55+
}
56+
57+
void WriteLE32(char* buf, int32_t value) {
58+
auto le = ToLittleEndian(static_cast<uint32_t>(value));
59+
std::memcpy(buf, &le, sizeof(le));
60+
}
61+
62+
int64_t ReadLE64(const char* buf) {
63+
uint64_t v;
64+
std::memcpy(&v, buf, sizeof(v));
65+
return static_cast<int64_t>(FromLittleEndian(v));
66+
}
67+
68+
int32_t ReadLE32(const char* buf) {
69+
uint32_t v;
70+
std::memcpy(&v, buf, sizeof(v));
71+
return static_cast<int32_t>(FromLittleEndian(v));
72+
}
73+
74+
// Returns success when `pos` lies in [0, RoaringPositionBitmap::kMaxPosition],
// otherwise an InvalidArgument error naming the limit and the rejected value.
Status ValidatePosition(int64_t pos) {
  const bool in_range = pos >= 0 && pos <= RoaringPositionBitmap::kMaxPosition;
  if (in_range) {
    return {};
  }
  return InvalidArgument("Bitmap supports positions that are >= 0 and <= {}: {}",
                         RoaringPositionBitmap::kMaxPosition, pos);
}
81+
82+
} // namespace
83+
84+
// Private state of RoaringPositionBitmap.
struct RoaringPositionBitmap::Impl {
  // Indexed by key (the high 32 bits of a position); each entry stores the
  // low 32 bits of the positions that share that key.
  std::vector<roaring::Roaring> bitmaps;

  // Grows `bitmaps` with empty entries until it holds at least
  // `required_length` elements. Never shrinks the vector.
  void AllocateBitmapsIfNeeded(int32_t required_length) {
    const bool already_large_enough = !std::cmp_less(bitmaps.size(), required_length);
    if (already_large_enough) {
      return;
    }
    bitmaps.resize(static_cast<size_t>(required_length));
  }
};
93+
94+
// Creates an empty bitmap: a fresh Impl whose per-key bitmap vector is empty.
RoaringPositionBitmap::RoaringPositionBitmap() : impl_(std::make_unique<Impl>()) {}
95+
96+
// Defaulted in this translation unit, after Impl has been fully defined above.
RoaringPositionBitmap::~RoaringPositionBitmap() = default;
97+
98+
// Deep-copies `other`. A source whose impl_ is null is copied as an empty bitmap.
RoaringPositionBitmap::RoaringPositionBitmap(const RoaringPositionBitmap& other)
    : impl_(other.impl_ == nullptr ? std::make_unique<Impl>()
                                   : std::make_unique<Impl>(*other.impl_)) {}
101+
102+
// Replaces this bitmap's state with a deep copy of `other`. Self-assignment is
// a no-op; a source whose impl_ is null yields an empty bitmap.
RoaringPositionBitmap& RoaringPositionBitmap::operator=(
    const RoaringPositionBitmap& other) {
  if (this != &other) {
    // Build the copy first so the current state is only released once the
    // replacement exists.
    auto cloned = other.impl_ == nullptr ? std::make_unique<Impl>()
                                         : std::make_unique<Impl>(*other.impl_);
    impl_ = std::move(cloned);
  }
  return *this;
}
111+
112+
// Defaulted move: takes over the source's impl_ pointer.
// NOTE(review): the moved-from object presumably ends up with a null impl_, which
// most member functions dereference without checking — confirm callers only
// assign to or destroy moved-from bitmaps.
RoaringPositionBitmap::RoaringPositionBitmap(RoaringPositionBitmap&&) noexcept = default;
113+
114+
// Defaulted move assignment: takes over the source's impl_ pointer.
RoaringPositionBitmap& RoaringPositionBitmap::operator=(RoaringPositionBitmap&&) noexcept =
    default;
116+
117+
RoaringPositionBitmap::RoaringPositionBitmap(std::unique_ptr<Impl> impl)
118+
: impl_(std::move(impl)) {}
119+
120+
// Marks `pos` as set. Returns an error (and changes nothing) when `pos` fails
// ValidatePosition.
Status RoaringPositionBitmap::Add(int64_t pos) {
  ICEBERG_RETURN_UNEXPECTED(ValidatePosition(pos));

  // The high 32 bits select the per-key bitmap; make sure that slot exists.
  const int32_t bitmap_index = Key(pos);
  impl_->AllocateBitmapsIfNeeded(bitmap_index + 1);

  // The low 32 bits are the value stored inside the selected bitmap.
  auto& target = impl_->bitmaps[bitmap_index];
  target.add(Pos32Bits(pos));
  return {};
}
128+
129+
// Marks every position in the half-open range [pos_start, pos_end) as set.
//
// An empty range (pos_start >= pos_end) succeeds without validating anything.
// Otherwise both ends of the range are validated up front, so an out-of-range
// request returns an error without modifying the bitmap. Each affected key is
// then filled with one bulk range insertion instead of one insertion per
// position.
Status RoaringPositionBitmap::AddRange(int64_t pos_start, int64_t pos_end) {
  if (pos_start >= pos_end) {
    return {};
  }

  // pos_end > pos_start here, so pos_end - 1 cannot underflow.
  const int64_t pos_last = pos_end - 1;
  ICEBERG_RETURN_UNEXPECTED(ValidatePosition(pos_start));
  ICEBERG_RETURN_UNEXPECTED(ValidatePosition(pos_last));

  const int32_t first_key = Key(pos_start);
  const int32_t last_key = Key(pos_last);
  impl_->AllocateBitmapsIfNeeded(last_key + 1);

  // Number of distinct low-word values under one key (exclusive upper bound).
  constexpr uint64_t kKeySpan = uint64_t{1} << 32;

  // 64-bit loop counter so the increment after the final key cannot overflow.
  for (int64_t key = first_key; key <= last_key; ++key) {
    // Only the first and last keys are partially covered; keys in between
    // receive their whole 32-bit range.
    const uint64_t low_inclusive =
        key == first_key ? uint64_t{Pos32Bits(pos_start)} : uint64_t{0};
    const uint64_t high_exclusive =
        key == last_key ? uint64_t{Pos32Bits(pos_last)} + 1 : kKeySpan;
    impl_->bitmaps[static_cast<size_t>(key)].addRange(low_inclusive, high_exclusive);
  }
  return {};
}
135+
136+
// Reports whether `pos` is set. Returns an error when `pos` fails
// ValidatePosition.
Result<bool> RoaringPositionBitmap::Contains(int64_t pos) const {
  ICEBERG_RETURN_UNEXPECTED(ValidatePosition(pos));

  const int32_t bitmap_index = Key(pos);
  // A key beyond the allocated bitmaps has never been populated.
  if (!std::cmp_less(bitmap_index, impl_->bitmaps.size())) {
    return false;
  }
  return impl_->bitmaps[bitmap_index].contains(Pos32Bits(pos));
}
142+
143+
bool RoaringPositionBitmap::IsEmpty() const { return Cardinality() == 0; }
144+
145+
size_t RoaringPositionBitmap::Cardinality() const {
146+
size_t total = 0;
147+
for (const auto& bitmap : impl_->bitmaps) {
148+
total += bitmap.cardinality();
149+
}
150+
return total;
151+
}
152+
153+
// In-place union: afterwards this bitmap also contains every position set in
// `other`. `other` is not modified.
void RoaringPositionBitmap::Or(const RoaringPositionBitmap& other) {
  const auto& incoming = other.impl_->bitmaps;

  // Ensure there is a destination slot for every key present in `other`.
  impl_->AllocateBitmapsIfNeeded(static_cast<int32_t>(incoming.size()));

  // Walk both vectors in step; keys beyond other's size are left untouched.
  auto destination = impl_->bitmaps.begin();
  for (const auto& source_bitmap : incoming) {
    *destination |= source_bitmap;
    ++destination;
  }
}
159+
160+
bool RoaringPositionBitmap::Optimize() {
161+
bool changed = false;
162+
for (auto& bitmap : impl_->bitmaps) {
163+
changed |= bitmap.runOptimize();
164+
}
165+
return changed;
166+
}
167+
168+
void RoaringPositionBitmap::ForEach(const std::function<void(int64_t)>& fn) const {
169+
for (size_t key = 0; key < impl_->bitmaps.size(); ++key) {
170+
for (uint32_t pos32 : impl_->bitmaps[key]) {
171+
fn(ToPosition(static_cast<int32_t>(key), pos32));
172+
}
173+
}
174+
}
175+
176+
size_t RoaringPositionBitmap::SerializedSizeInBytes() const {
177+
size_t size = kBitmapCountSizeBytes;
178+
for (const auto& bitmap : impl_->bitmaps) {
179+
size += kBitmapKeySizeBytes + bitmap.getSizeInBytes(/*portable=*/true);
180+
}
181+
return size;
182+
}
183+
184+
// Serializes using the portable format (little-endian).
185+
// See https://iceberg.apache.org/puffin-spec/#deletion-vector-v1-blob-type
186+
Result<std::string> RoaringPositionBitmap::Serialize() const {
  const auto& per_key = impl_->bitmaps;

  // Allocate the exact output size up front; every write below lands in it.
  std::string out(SerializedSizeInBytes(), '\0');
  char* cursor = out.data();

  // Header: number of bitmaps, counting empty ones.
  WriteLE64(cursor, static_cast<int64_t>(per_key.size()));
  cursor += kBitmapCountSizeBytes;

  // Body: for each key in ascending order, the key followed by the bitmap in
  // the portable format.
  int32_t key = 0;
  for (const auto& bitmap : per_key) {
    WriteLE32(cursor, key);
    cursor += kBitmapKeySizeBytes;

    const size_t bytes_written = bitmap.write(cursor, /*portable=*/true);
    cursor += bytes_written;
    ++key;
  }

  return out;
}
205+
206+
// Parses the layout produced by Serialize(): a 64-bit little-endian bitmap
// count followed by that many (32-bit little-endian key, portable roaring
// bitmap) entries with strictly ascending keys. Returns an InvalidArgument
// error when the buffer is truncated, the count or a key is out of range, keys
// are not ascending, or a bitmap payload cannot be decoded.
//
// NOTE(review): keys that are skipped in the input are materialized as empty
// bitmaps, so memory use grows with the largest key in the input rather than
// with the input size — worth bounding if `bytes` can come from an untrusted
// source.
Result<RoaringPositionBitmap> RoaringPositionBitmap::Deserialize(std::string_view bytes) {
  const char* cursor = bytes.data();
  size_t remaining = bytes.size();

  // Header: bitmap count.
  ICEBERG_PRECHECK(remaining >= kBitmapCountSizeBytes,
                   "Buffer too small for bitmap count: {} bytes", remaining);
  const int64_t bitmap_count = ReadLE64(cursor);
  cursor += kBitmapCountSizeBytes;
  remaining -= kBitmapCountSizeBytes;

  ICEBERG_PRECHECK(
      bitmap_count >= 0 && bitmap_count <= std::numeric_limits<int32_t>::max(),
      "Invalid bitmap count: {}", bitmap_count);

  auto impl = std::make_unique<Impl>();
  // Key of the most recently stored bitmap; -1 means none has been stored yet.
  int32_t last_key = -1;

  for (auto pending = static_cast<int32_t>(bitmap_count); pending > 0; --pending) {
    // Entry header: the key.
    ICEBERG_PRECHECK(remaining >= kBitmapKeySizeBytes,
                     "Buffer too small for bitmap key: {} bytes", remaining);
    const int32_t key = ReadLE32(cursor);
    cursor += kBitmapKeySizeBytes;
    remaining -= kBitmapKeySizeBytes;

    ICEBERG_PRECHECK(key >= 0, "Invalid unsigned key: {}", key);
    ICEBERG_PRECHECK(key < std::numeric_limits<int32_t>::max(), "Key is too large: {}",
                     key);
    ICEBERG_PRECHECK(key > last_key,
                     "Keys must be sorted in ascending order, got key {} after {}", key,
                     last_key);

    // Insert empty bitmaps for any keys skipped since the previous entry so
    // that the vector index always equals the key.
    for (; last_key < key - 1; ++last_key) {
      impl->bitmaps.emplace_back();
    }

    // Entry payload: decode with the bounds-checked reader. It may throw on
    // corrupted data, which is reported as an error result instead.
    roaring::Roaring bitmap;
    try {
      bitmap = roaring::Roaring::readSafe(cursor, remaining);
    } catch (const std::exception& e) {
      return InvalidArgument("Failed to deserialize bitmap at key {}: {}", key, e.what());
    }

    // The reader does not report how many bytes it consumed, so advance by the
    // portable size of the bitmap that was decoded.
    const size_t bitmap_size = bitmap.getSizeInBytes(/*portable=*/true);
    ICEBERG_PRECHECK(
        bitmap_size <= remaining,
        "Buffer too small for bitmap key {}: {} bytes needed, {} bytes available", key,
        bitmap_size, remaining);
    cursor += bitmap_size;
    remaining -= bitmap_size;

    impl->bitmaps.emplace_back(std::move(bitmap));
    last_key = key;
  }

  return RoaringPositionBitmap(std::move(impl));
}
269+
270+
} // namespace iceberg

0 commit comments

Comments
 (0)