Skip to content
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 22 additions & 2 deletions src/iceberg/deletes/roaring_position_bitmap.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <cstring>
#include <exception>
#include <limits>
#include <stdexcept>
#include <utility>
#include <vector>

Expand Down Expand Up @@ -128,8 +129,27 @@ void RoaringPositionBitmap::Add(int64_t pos) {
}

void RoaringPositionBitmap::AddRange(int64_t pos_start, int64_t pos_end) {
for (int64_t pos = pos_start; pos < pos_end; ++pos) {
Add(pos);
if (pos_start > pos_end) {
throw std::invalid_argument("AddRange requires pos_start <= pos_end, got [" +
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's return Status instead of throwing exception if you really think this is an invalid case. IMHO, we need to be consistent with line 123 and line 140 to return on invalid inputs.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, it is an invalid input, however for consistency with other methods in the c++ implementation, then both versions are valid.

Before my changes in the PR to the Java base, it also would not throw any exceptions. since it used the same for loop definition. To keep the existing behaviour, I have removed the exception throwing.

std::to_string(pos_start) + ", " +
std::to_string(pos_end) + ")");
}
pos_start = std::max(pos_start, int64_t{0});
pos_end = std::min(pos_end, kMaxPosition + 1);
if (pos_start >= pos_end) {
return;
}

int32_t start_key = Key(pos_start);
int32_t end_key = Key(pos_end - 1);
impl_->AllocateBitmapsIfNeeded(end_key + 1);

for (int32_t key = start_key; key <= end_key; ++key) {
uint64_t low_start = (key == start_key) ? Pos32Bits(pos_start) : uint64_t{0};
uint64_t low_end = (key == end_key)
? static_cast<uint64_t>(Pos32Bits(pos_end - 1)) + 1
: (uint64_t{1} << 32);
impl_->bitmaps[key].addRange(low_start, low_end);
}
}

Expand Down
6 changes: 5 additions & 1 deletion src/iceberg/deletes/roaring_position_bitmap.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,11 @@ class ICEBERG_EXPORT RoaringPositionBitmap {
void Add(int64_t pos);

/// \brief Sets a range of positions [pos_start, pos_end).
/// \note Invalid positions are silently ignored
/// \param pos_start the start of the range (inclusive), clamped to 0
/// \param pos_end the end of the range (exclusive), clamped to kMaxPosition + 1
/// \throws std::invalid_argument if pos_start > pos_end
/// \note If pos_start == pos_end, this method does nothing.
/// Positions outside [0, kMaxPosition] are silently ignored.
void AddRange(int64_t pos_start, int64_t pos_end);

/// \brief Checks if a position is set in the bitmap.
Expand Down
91 changes: 90 additions & 1 deletion src/iceberg/test/roaring_position_bitmap_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include <fstream>
#include <random>
#include <set>
#include <stdexcept>
#include <string>
#include <vector>

Expand Down Expand Up @@ -128,9 +129,93 @@ INSTANTIATE_TEST_SUITE_P(
.start = (int64_t{1} << 32) - 5,
.end = (int64_t{1} << 32) + 5,
.absent_positions = {0, (int64_t{1} << 32) + 5},
},
AddRangeParams{
.name = "single_position",
.start = 42,
.end = 43,
.absent_positions = {41, 43},
}),
[](const ::testing::TestParamInfo<AddRangeParams>& info) { return info.param.name; });

TEST(RoaringPositionBitmapTest, TestAddRangeLargeContiguous) {
RoaringPositionBitmap bitmap;
bitmap.AddRange(500, 200500);

ASSERT_EQ(bitmap.Cardinality(), 200000u);
ASSERT_TRUE(bitmap.Contains(500));
ASSERT_TRUE(bitmap.Contains(200499));
ASSERT_FALSE(bitmap.Contains(499));
ASSERT_FALSE(bitmap.Contains(200500));
}

TEST(RoaringPositionBitmapTest, TestAddRangeSpanningThreeKeys) {
RoaringPositionBitmap bitmap;

int64_t start = (int64_t{0} << 32) | int64_t{0xFFFFFFF0};
int64_t end = (int64_t{2} << 32) | int64_t{0x10};
bitmap.AddRange(start, end);

ASSERT_EQ(bitmap.Cardinality(), static_cast<size_t>(end - start));
ASSERT_TRUE(bitmap.Contains(start));
ASSERT_TRUE(bitmap.Contains(end - 1));
ASSERT_FALSE(bitmap.Contains(start - 1));
ASSERT_FALSE(bitmap.Contains(end));
ASSERT_TRUE(bitmap.Contains(int64_t{1} << 32));
// Verify a sample near the end of the middle key is also present
ASSERT_TRUE(bitmap.Contains((int64_t{1} << 32) | int64_t{0xFFFFFFF0}));
}

TEST(RoaringPositionBitmapTest, TestAddRangeClampNegativeStart) {
RoaringPositionBitmap bitmap;
bitmap.AddRange(-1, 10);
ASSERT_EQ(bitmap.Cardinality(), 10u);
ASSERT_TRUE(bitmap.Contains(0));
ASSERT_TRUE(bitmap.Contains(9));
ASSERT_FALSE(bitmap.Contains(-1));
}

TEST(RoaringPositionBitmapTest, TestAddRangeClampBeyondMaxPosition) {
RoaringPositionBitmap bitmap;
// Range entirely beyond kMaxPosition: after clamping both endpoints the range
// becomes empty, so no allocation or insertion happens.
bitmap.AddRange(RoaringPositionBitmap::kMaxPosition + 1,
RoaringPositionBitmap::kMaxPosition + 10);
ASSERT_TRUE(bitmap.IsEmpty());
}

struct AddRangeNoOpParams {
const char* name;
int64_t start;
int64_t end;
};

class RoaringPositionBitmapAddRangeNoOpTest
: public ::testing::TestWithParam<AddRangeNoOpParams> {};

TEST_P(RoaringPositionBitmapAddRangeNoOpTest, IsNoOp) {
const auto& param = GetParam();
RoaringPositionBitmap bitmap;
bitmap.AddRange(param.start, param.end);
ASSERT_TRUE(bitmap.IsEmpty());
ASSERT_EQ(bitmap.Cardinality(), 0u);
}

INSTANTIATE_TEST_SUITE_P(
AddRangeNoOpScenarios, RoaringPositionBitmapAddRangeNoOpTest,
::testing::Values(
AddRangeNoOpParams{.name = "equal", .start = 100, .end = 100},
AddRangeNoOpParams{.name = "zero_length_at_zero", .start = 0, .end = 0},
AddRangeNoOpParams{.name = "negative_both", .start = -10, .end = -5}),
[](const ::testing::TestParamInfo<AddRangeNoOpParams>& info) {
return info.param.name;
});

TEST(RoaringPositionBitmapTest, TestAddRangeReversedThrows) {
RoaringPositionBitmap bitmap;
ASSERT_THROW(bitmap.AddRange(100, 50), std::invalid_argument);
}

struct OrParams {
const char* name;
std::set<int64_t> lhs_input;
Expand Down Expand Up @@ -398,7 +483,11 @@ TEST(RoaringPositionBitmapTest, TestIsEmpty) {

TEST(RoaringPositionBitmapTest, TestOptimize) {
RoaringPositionBitmap bitmap;
bitmap.AddRange(0, 10000);
// Use Add() instead of AddRange() because addRange() creates run-length
// encoded containers directly, leaving nothing for Optimize() to compress.
for (int64_t i = 0; i < 10000; ++i) {
bitmap.Add(i);
}
size_t size_before = bitmap.SerializedSizeInBytes();

bool changed = bitmap.Optimize();
Expand Down
Loading