Skip to content

Commit 5fc0fb9

Browse files
committed
refine test for filter
1 parent ef039e5 commit 5fc0fb9

File tree

1 file changed

+78
-41
lines changed

1 file changed

+78
-41
lines changed

src/iceberg/test/table_scan_test.cc

Lines changed: 78 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -142,8 +142,10 @@ class TableScanTest : public testing::TestWithParam<int> {
142142

143143
std::shared_ptr<DataFile> MakeDataFile(const std::string& path,
144144
const PartitionValues& partition,
145-
int32_t spec_id, int64_t record_count = 1) {
146-
return std::make_shared<DataFile>(DataFile{
145+
int32_t spec_id, int64_t record_count = 1,
146+
std::optional<int32_t> lower_id = std::nullopt,
147+
std::optional<int32_t> upper_id = std::nullopt) {
148+
auto file = std::make_shared<DataFile>(DataFile{
147149
.file_path = path,
148150
.file_format = FileFormatType::kParquet,
149151
.partition = partition,
@@ -152,6 +154,14 @@ class TableScanTest : public testing::TestWithParam<int> {
152154
.sort_order_id = 0,
153155
.partition_spec_id = spec_id,
154156
});
157+
// Set lower/upper bounds for field_id=1 ("id" column) if provided
158+
if (lower_id.has_value()) {
159+
file->lower_bounds[1] = Literal::Int(lower_id.value()).Serialize().value();
160+
}
161+
if (upper_id.has_value()) {
162+
file->upper_bounds[1] = Literal::Int(upper_id.value()).Serialize().value();
163+
}
164+
return file;
155165
}
156166

157167
ManifestEntry MakeEntry(ManifestStatus status, int64_t snapshot_id,
@@ -486,37 +496,35 @@ TEST_P(TableScanTest, PlanFilesWithMultipleManifests) {
486496
TEST_P(TableScanTest, PlanFilesWithFilter) {
487497
int version = GetParam();
488498

489-
// Test that filters are properly passed to the ManifestGroup
490-
auto filter = Expressions::Equal("id", Literal::Int(42));
499+
constexpr int64_t kSnapshotId = 1000L;
491500
const auto part_value = PartitionValues({Literal::Int(0)});
492501

493-
// Create data manifest with files
502+
// Create two data files with non-overlapping id ranges:
503+
// - data1.parquet: id range [1, 50]
504+
// - data2.parquet: id range [51, 100]
494505
std::vector<ManifestEntry> data_entries{
495-
MakeEntry(ManifestStatus::kAdded, /*snapshot_id=*/1000L, /*sequence_number=*/1,
506+
MakeEntry(ManifestStatus::kAdded, kSnapshotId, /*sequence_number=*/1,
496507
MakeDataFile("/path/to/data1.parquet", part_value,
497-
partitioned_spec_->spec_id())),
498-
MakeEntry(ManifestStatus::kAdded, /*snapshot_id=*/1000L, /*sequence_number=*/1,
508+
partitioned_spec_->spec_id(), /*record_count=*/1,
509+
/*lower_id=*/1, /*upper_id=*/50)),
510+
MakeEntry(ManifestStatus::kAdded, kSnapshotId, /*sequence_number=*/1,
499511
MakeDataFile("/path/to/data2.parquet", part_value,
500-
partitioned_spec_->spec_id()))};
501-
auto data_manifest = WriteDataManifest(version, /*snapshot_id=*/1000L,
502-
std::move(data_entries), partitioned_spec_);
503-
504-
// Write manifest list
505-
std::string manifest_list_path = WriteManifestList(
506-
version, /*snapshot_id=*/1000L, /*sequence_number=*/1, {data_manifest});
512+
partitioned_spec_->spec_id(), /*record_count=*/1,
513+
/*lower_id=*/51, /*upper_id=*/100))};
514+
auto data_manifest =
515+
WriteDataManifest(version, kSnapshotId, std::move(data_entries), partitioned_spec_);
516+
std::string manifest_list_path =
517+
WriteManifestList(version, kSnapshotId, /*sequence_number=*/1, {data_manifest});
507518

508-
// Create a snapshot that references this manifest list
509519
auto timestamp_ms = TimePointMsFromUnixMs(1609459200000L);
510-
auto snapshot_with_manifest =
511-
std::make_shared<Snapshot>(Snapshot{.snapshot_id = 1000L,
512-
.parent_snapshot_id = std::nullopt,
513-
.sequence_number = 1L,
514-
.timestamp_ms = timestamp_ms,
515-
.manifest_list = manifest_list_path,
516-
.summary = {},
517-
.schema_id = schema_->schema_id()});
518-
519-
auto metadata_with_manifest = std::make_shared<TableMetadata>(TableMetadata{
520+
auto snapshot = std::make_shared<Snapshot>(Snapshot{.snapshot_id = kSnapshotId,
521+
.parent_snapshot_id = std::nullopt,
522+
.sequence_number = 1L,
523+
.timestamp_ms = timestamp_ms,
524+
.manifest_list = manifest_list_path,
525+
.schema_id = schema_->schema_id()});
526+
527+
auto metadata = std::make_shared<TableMetadata>(TableMetadata{
520528
.format_version = 2,
521529
.table_uuid = "test-table-uuid",
522530
.location = "/tmp/table",
@@ -528,25 +536,54 @@ TEST_P(TableScanTest, PlanFilesWithFilter) {
528536
.partition_specs = {partitioned_spec_, unpartitioned_spec_},
529537
.default_spec_id = partitioned_spec_->spec_id(),
530538
.last_partition_id = 1000,
531-
.current_snapshot_id = 1000L,
532-
.snapshots = {snapshot_with_manifest},
539+
.current_snapshot_id = kSnapshotId,
540+
.snapshots = {snapshot},
533541
.snapshot_log = {SnapshotLogEntry{.timestamp_ms = timestamp_ms,
534-
.snapshot_id = 1000L}},
542+
.snapshot_id = kSnapshotId}},
535543
.default_sort_order_id = 0,
536-
.refs = {
537-
{"main", std::make_shared<SnapshotRef>(SnapshotRef{
538-
.snapshot_id = 1000L, .retention = SnapshotRef::Branch{}})}}});
544+
.refs = {{"main",
545+
std::make_shared<SnapshotRef>(SnapshotRef{
546+
.snapshot_id = kSnapshotId, .retention = SnapshotRef::Branch{}})}}});
547+
548+
// Test 1: Filter matches only data1.parquet (id=25 is in range [1, 50])
549+
{
550+
ICEBERG_UNWRAP_OR_FAIL(auto builder, TableScanBuilder::Make(metadata, file_io_));
551+
builder->Filter(Expressions::Equal("id", Literal::Int(25)));
552+
ICEBERG_UNWRAP_OR_FAIL(auto scan, builder->Build());
553+
ICEBERG_UNWRAP_OR_FAIL(auto tasks, scan->PlanFiles());
554+
ASSERT_EQ(tasks.size(), 1);
555+
EXPECT_EQ(tasks[0]->data_file()->file_path, "/path/to/data1.parquet");
556+
}
539557

540-
ICEBERG_UNWRAP_OR_FAIL(auto builder,
541-
TableScanBuilder::Make(metadata_with_manifest, file_io_));
542-
builder->Filter(filter);
543-
ICEBERG_UNWRAP_OR_FAIL(auto scan, builder->Build());
544-
EXPECT_EQ(scan->filter(), filter);
558+
// Test 2: Filter matches only data2.parquet (id=75 is in range [51, 100])
559+
{
560+
ICEBERG_UNWRAP_OR_FAIL(auto builder, TableScanBuilder::Make(metadata, file_io_));
561+
builder->Filter(Expressions::Equal("id", Literal::Int(75)));
562+
ICEBERG_UNWRAP_OR_FAIL(auto scan, builder->Build());
563+
ICEBERG_UNWRAP_OR_FAIL(auto tasks, scan->PlanFiles());
564+
ASSERT_EQ(tasks.size(), 1);
565+
EXPECT_EQ(tasks[0]->data_file()->file_path, "/path/to/data2.parquet");
566+
}
545567

546-
ICEBERG_UNWRAP_OR_FAIL(auto tasks, scan->PlanFiles());
547-
ASSERT_EQ(tasks.size(), 2);
548-
EXPECT_THAT(GetPaths(tasks), testing::UnorderedElementsAre("/path/to/data1.parquet",
549-
"/path/to/data2.parquet"));
568+
// Test 3: Filter matches both files (id > 0 covers both ranges)
569+
{
570+
ICEBERG_UNWRAP_OR_FAIL(auto builder, TableScanBuilder::Make(metadata, file_io_));
571+
builder->Filter(Expressions::GreaterThan("id", Literal::Int(0)));
572+
ICEBERG_UNWRAP_OR_FAIL(auto scan, builder->Build());
573+
ICEBERG_UNWRAP_OR_FAIL(auto tasks, scan->PlanFiles());
574+
ASSERT_EQ(tasks.size(), 2);
575+
EXPECT_THAT(GetPaths(tasks), testing::UnorderedElementsAre("/path/to/data1.parquet",
576+
"/path/to/data2.parquet"));
577+
}
578+
579+
// Test 4: Filter matches no files (id=200 is outside both ranges)
580+
{
581+
ICEBERG_UNWRAP_OR_FAIL(auto builder, TableScanBuilder::Make(metadata, file_io_));
582+
builder->Filter(Expressions::Equal("id", Literal::Int(200)));
583+
ICEBERG_UNWRAP_OR_FAIL(auto scan, builder->Build());
584+
ICEBERG_UNWRAP_OR_FAIL(auto tasks, scan->PlanFiles());
585+
EXPECT_TRUE(tasks.empty());
586+
}
550587
}
551588

552589
TEST_P(TableScanTest, PlanFilesWithDeleteFiles) {

0 commit comments

Comments
 (0)