Skip to content

Commit bf6c652

Browse files
committed
feat: add roaring-based position bitmap
1 parent 4db7c67 commit bf6c652

File tree

8 files changed

+858
-0
lines changed

8 files changed

+858
-0
lines changed

src/iceberg/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ set(ICEBERG_SOURCES
2525
data/position_delete_writer.cc
2626
data/writer.cc
2727
delete_file_index.cc
28+
deletes/roaring_position_bitmap.cc
2829
expression/aggregate.cc
2930
expression/binder.cc
3031
expression/evaluator.cc
@@ -165,6 +166,7 @@ iceberg_install_all_headers(iceberg)
165166

166167
add_subdirectory(catalog)
167168
add_subdirectory(data)
169+
add_subdirectory(deletes)
168170
add_subdirectory(expression)
169171
add_subdirectory(manifest)
170172
add_subdirectory(row)

src/iceberg/deletes/CMakeLists.txt

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
# NOTE(review): iceberg_install_all_headers is a project-defined helper that is not
# visible here; presumably it installs this directory's public headers under
# include/iceberg/deletes -- confirm against its definition.
iceberg_install_all_headers(iceberg/deletes)

src/iceberg/deletes/meson.build

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
# Install the public header of this directory into <includedir>/iceberg/deletes.
install_headers(['roaring_position_bitmap.h'], subdir: 'iceberg/deletes')
src/iceberg/deletes/roaring_position_bitmap.cc

Lines changed: 254 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,254 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#include "iceberg/deletes/roaring_position_bitmap.h"
21+
22+
#include <cassert>
#include <cstdint>
#include <exception>
#include <limits>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
29+
30+
#include "roaring/roaring.hh"
31+
32+
namespace iceberg {
33+
34+
namespace {
35+
36+
// Width in bytes of the little-endian bitmap count that starts the serialized form.
constexpr int64_t kBitmapCountSizeBytes = 8;
37+
// Width in bytes of the little-endian key written in front of each serialized bitmap.
constexpr int64_t kBitmapKeySizeBytes = 4;
38+
39+
// Returns the key of a 64-bit position, i.e. its upper 32 bits.
int32_t Key(int64_t pos) {
  const int64_t high_bits = pos >> 32;
  return static_cast<int32_t>(high_bits);
}
41+
42+
// Returns the lower 32 bits of a 64-bit position.
uint32_t Pos32Bits(int64_t pos) {
  const int64_t low_bits = pos & 0xFFFFFFFFLL;
  return static_cast<uint32_t>(low_bits);
}
44+
45+
// Rebuilds a 64-bit position from its key (upper half) and pos32 (lower half).
// pos32 is unsigned, so widening it zero-extends and never disturbs the key bits.
int64_t ToPosition(int32_t key, uint32_t pos32) {
  const int64_t high = static_cast<int64_t>(key) << 32;
  const int64_t low = static_cast<int64_t>(pos32);
  return high | low;
}
50+
51+
// Stores `value` into buf[0..7] in little-endian order (least significant byte first).
// The caller must provide at least 8 writable bytes.
void WriteLE64(char* buf, int64_t value) {
  uint64_t bits = static_cast<uint64_t>(value);
  for (char* out = buf; out != buf + 8; ++out) {
    *out = static_cast<char>(bits & 0xFF);
    bits >>= 8;
  }
}
57+
58+
// Stores `value` into buf[0..3] in little-endian order (least significant byte first).
// The caller must provide at least 4 writable bytes.
void WriteLE32(char* buf, int32_t value) {
  uint32_t bits = static_cast<uint32_t>(value);
  for (char* out = buf; out != buf + 4; ++out) {
    *out = static_cast<char>(bits & 0xFF);
    bits >>= 8;
  }
}
64+
65+
// Decodes the little-endian 64-bit integer stored in buf[0..7].
// The caller must guarantee that 8 bytes are readable.
int64_t ReadLE64(const char* buf) {
  const auto* bytes = reinterpret_cast<const uint8_t*>(buf);
  uint64_t acc = 0;
  // Walk from the most significant byte down so each step is a shift-and-append.
  for (int idx = 7; idx >= 0; --idx) {
    acc = (acc << 8) | static_cast<uint64_t>(bytes[idx]);
  }
  return static_cast<int64_t>(acc);
}
73+
74+
// Decodes the little-endian 32-bit integer stored in buf[0..3].
// The caller must guarantee that 4 bytes are readable.
int32_t ReadLE32(const char* buf) {
  const auto* bytes = reinterpret_cast<const uint8_t*>(buf);
  uint32_t acc = 0;
  // Walk from the most significant byte down so each step is a shift-and-append.
  for (int idx = 3; idx >= 0; --idx) {
    acc = (acc << 8) | static_cast<uint32_t>(bytes[idx]);
  }
  return static_cast<int32_t>(acc);
}
82+
83+
} // namespace
84+
85+
// Private state of RoaringPositionBitmap.
struct RoaringPositionBitmap::Impl {
  // One 32-bit Roaring bitmap per key: bitmaps[k] stores the low 32 bits of every
  // position whose high 32 bits equal k.
  std::vector<roaring::Roaring> bitmaps;

  // Grows `bitmaps` with empty bitmaps until it has at least `required_length`
  // entries. Never shrinks.
  void AllocateBitmapsIfNeeded(int32_t required_length) {
    const auto current_length = static_cast<int32_t>(bitmaps.size());
    if (current_length >= required_length) {
      return;
    }
    bitmaps.resize(static_cast<size_t>(required_length));
  }
};
94+
95+
RoaringPositionBitmap::RoaringPositionBitmap() : impl_(std::make_unique<Impl>()) {}
96+
97+
// Defaulted in this translation unit, after Impl has been defined above, so that
// the Impl owned through impl_ is destroyed where its type is complete.
RoaringPositionBitmap::~RoaringPositionBitmap() = default;
98+
99+
// Move construction is defaulted and declared noexcept.
RoaringPositionBitmap::RoaringPositionBitmap(RoaringPositionBitmap&&) noexcept = default;
100+
101+
// Move assignment is defaulted and declared noexcept.
RoaringPositionBitmap& RoaringPositionBitmap::operator=(
    RoaringPositionBitmap&&) noexcept = default;
103+
104+
RoaringPositionBitmap::RoaringPositionBitmap(std::unique_ptr<Impl> impl)
105+
: impl_(std::move(impl)) {}
106+
107+
void RoaringPositionBitmap::Add(int64_t pos) {
108+
assert(pos >= 0 && pos <= kMaxPosition);
109+
int32_t key = Key(pos);
110+
uint32_t pos32 = Pos32Bits(pos);
111+
impl_->AllocateBitmapsIfNeeded(key + 1);
112+
impl_->bitmaps[key].add(pos32);
113+
}
114+
115+
void RoaringPositionBitmap::AddRange(int64_t pos_start, int64_t pos_end) {
116+
for (int64_t pos = pos_start; pos < pos_end; ++pos) {
117+
Add(pos);
118+
}
119+
}
120+
121+
bool RoaringPositionBitmap::Contains(int64_t pos) const {
122+
assert(pos >= 0 && pos <= kMaxPosition);
123+
int32_t key = Key(pos);
124+
uint32_t pos32 = Pos32Bits(pos);
125+
return key < static_cast<int32_t>(impl_->bitmaps.size()) &&
126+
impl_->bitmaps[key].contains(pos32);
127+
}
128+
129+
bool RoaringPositionBitmap::IsEmpty() const { return Cardinality() == 0; }
130+
131+
int64_t RoaringPositionBitmap::Cardinality() const {
132+
int64_t total = 0;
133+
for (const auto& bitmap : impl_->bitmaps) {
134+
total += static_cast<int64_t>(bitmap.cardinality());
135+
}
136+
return total;
137+
}
138+
139+
// Unions `other` into this bitmap in place; `other` is left unchanged.
void RoaringPositionBitmap::Or(const RoaringPositionBitmap& other) {
  const auto& theirs = other.impl_->bitmaps;
  // Grow first so that every key present in `other` has a slot on this side.
  impl_->AllocateBitmapsIfNeeded(static_cast<int32_t>(theirs.size()));
  auto& ours = impl_->bitmaps;
  size_t idx = 0;
  for (const auto& their_bitmap : theirs) {
    ours[idx] |= their_bitmap;
    ++idx;
  }
}
145+
146+
bool RoaringPositionBitmap::RunLengthEncode() {
147+
bool changed = false;
148+
for (auto& bitmap : impl_->bitmaps) {
149+
changed |= bitmap.runOptimize();
150+
}
151+
return changed;
152+
}
153+
154+
void RoaringPositionBitmap::ForEach(const std::function<void(int64_t)>& fn) const {
155+
for (size_t key = 0; key < impl_->bitmaps.size(); ++key) {
156+
for (uint32_t pos32 : impl_->bitmaps[key]) {
157+
fn(ToPosition(static_cast<int32_t>(key), pos32));
158+
}
159+
}
160+
}
161+
162+
int64_t RoaringPositionBitmap::SerializedSizeInBytes() const {
163+
int64_t size = kBitmapCountSizeBytes;
164+
for (const auto& bitmap : impl_->bitmaps) {
165+
size += kBitmapKeySizeBytes +
166+
static_cast<int64_t>(bitmap.getSizeInBytes(/*portable=*/true));
167+
}
168+
return size;
169+
}
170+
171+
// Serializes the bitmap as:
//   [8-byte little-endian bitmap count]
//   then, for each key in ascending order:
//   [4-byte little-endian key][portable Roaring serialization]
// Empty bitmaps are written as well, so the count equals the internal vector length.
Result<std::string> RoaringPositionBitmap::Serialize() const {
  const auto& buckets = impl_->bitmaps;
  std::string out(static_cast<size_t>(SerializedSizeInBytes()), '\0');
  char* cursor = out.data();

  // Header: number of bitmaps that follow.
  WriteLE64(cursor, static_cast<int64_t>(buckets.size()));
  cursor += kBitmapCountSizeBytes;

  // Body: each bitmap, prefixed with its key (which is its index in the vector).
  int32_t key = 0;
  for (const auto& bucket : buckets) {
    WriteLE32(cursor, key);
    cursor += kBitmapKeySizeBytes;
    const size_t payload_size = bucket.write(cursor, /*portable=*/true);
    cursor += payload_size;
    ++key;
  }

  return out;
}
190+
191+
// Reconstructs a bitmap from the layout produced by Serialize():
//   [8-byte little-endian bitmap count]
//   then, per bitmap: [4-byte little-endian key][portable Roaring serialization]
//
// Keys must be non-negative and strictly ascending. Keys that are skipped in the
// input are materialized as empty bitmaps so that a bitmap's index equals its key.
// Returns InvalidArgument for truncated or malformed input. Bytes that follow the
// last bitmap are not inspected.
Result<RoaringPositionBitmap> RoaringPositionBitmap::Deserialize(std::string_view bytes) {
  const char* buf = bytes.data();
  size_t remaining = bytes.size();

  if (remaining < static_cast<size_t>(kBitmapCountSizeBytes)) {
    return InvalidArgument("Buffer too small for bitmap count");
  }

  const int64_t bitmap_count = ReadLE64(buf);
  buf += kBitmapCountSizeBytes;
  remaining -= static_cast<size_t>(kBitmapCountSizeBytes);

  if (bitmap_count < 0 || bitmap_count > std::numeric_limits<int32_t>::max()) {
    return InvalidArgument("Invalid bitmap count: {}", bitmap_count);
  }

  auto impl = std::make_unique<Impl>();
  int32_t last_key = -1;

  for (int64_t index = 0; index < bitmap_count; ++index) {
    if (remaining < static_cast<size_t>(kBitmapKeySizeBytes)) {
      return InvalidArgument("Buffer too small for bitmap key");
    }

    const int32_t key = ReadLE32(buf);
    buf += kBitmapKeySizeBytes;
    remaining -= static_cast<size_t>(kBitmapKeySizeBytes);

    // Validate key (matches Java's readKey)
    if (key < 0) {
      return InvalidArgument("Invalid unsigned key: {}", key);
    }
    if (key > std::numeric_limits<int32_t>::max() - 1) {
      return InvalidArgument("Key is too large: {}", key);
    }
    if (key <= last_key) {
      return InvalidArgument("Keys must be sorted in ascending order");
    }

    // Parse the bitmap before allocating any gap bitmaps so that corrupt input is
    // rejected first. readSafe bounds its reads by `remaining` and signals failure
    // by throwing; convert that into an error result so that no exception escapes
    // a Result-returning function.
    roaring::Roaring bitmap;
    try {
      bitmap = roaring::Roaring::readSafe(buf, remaining);
    } catch (const std::exception& e) {
      return InvalidArgument("Failed to deserialize bitmap at key {}: {}", key,
                             e.what());
    }

    const size_t bitmap_size = bitmap.getSizeInBytes(/*portable=*/true);
    if (bitmap_size > remaining) {
      return InvalidArgument("Buffer too small for bitmap data at key {}", key);
    }
    buf += bitmap_size;
    remaining -= bitmap_size;

    // Fill gaps with empty bitmaps so that bitmaps[k] always corresponds to key k.
    // NOTE(review): one empty bitmap is allocated per skipped key, so input with a
    // very large key forces a correspondingly large allocation.
    while (last_key < key - 1) {
      impl->bitmaps.emplace_back();
      ++last_key;
    }

    impl->bitmaps.push_back(std::move(bitmap));
    last_key = key;
  }

  return RoaringPositionBitmap(std::move(impl));
}
253+
254+
} // namespace iceberg

0 commit comments

Comments
 (0)