Skip to content

Commit d9c7a81

Browse files
committed
[refactor](be) Clean up new parquet page skip filter
### What problem does this PR solve? Issue Number: close #xxx Related PR: #64214 Problem Summary: The new parquet data page filter setup was implemented as an inline closure inside RecordReader creation, which made the page ordinal tracking, profile updates and page skip plan lookup hard to read. This refactors the filter into a small DataPageSkipFilter helper, centralizes page skip plan lookup/filter installation, and documents the important page-index invariants around data-page ordinals, non-repeated leaves, and double-skip accounting. The remaining touched reader files are formatting-only changes from the project clang-format script. ### Release note None ### Check List (For Author) - Test: Unit Test - Pending: NewParquetReaderTest.* - Behavior changed: No - Does this need documentation: No
1 parent fbdcdf7 commit d9c7a81

4 files changed

Lines changed: 70 additions & 30 deletions

File tree

be/src/format/new_parquet/parquet_statistics.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1116,6 +1116,8 @@ bool build_page_skip_plan_for_leaf(
11161116
int64_t row_group_rows, ParquetPageSkipPlan* page_skip_plan) {
11171117
DORIS_CHECK(page_skip_plan != nullptr);
11181118
*page_skip_plan = ParquetPageSkipPlan {};
1119+
// OffsetIndex first_row_index is row-based only for non-repeated leaves. LIST/MAP/repeated
1120+
// leaves need repetition-level-aware range mapping and are intentionally left out for now.
11191121
if (column_schema.kind != ParquetColumnSchemaKind::PRIMITIVE ||
11201122
column_schema.descriptor == nullptr || column_schema.leaf_column_id < 0 ||
11211123
column_schema.descriptor->max_repetition_level() != 0) {

be/src/format/new_parquet/reader/column_reader.cpp

Lines changed: 61 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include <cstddef>
2626
#include <cstdint>
2727
#include <exception>
28+
#include <map>
2829
#include <memory>
2930
#include <string>
3031
#include <utility>
@@ -47,6 +48,63 @@
4748
namespace doris::parquet {
4849
namespace {
4950

51+
class DataPageSkipFilter {
52+
public:
53+
DataPageSkipFilter(const ParquetPageSkipPlan* page_skip_plan,
54+
ParquetPageSkipProfile page_skip_profile)
55+
: _page_skip_plan(page_skip_plan), _page_skip_profile(page_skip_profile) {
56+
DORIS_CHECK(_page_skip_plan != nullptr);
57+
}
58+
59+
bool operator()(const ::parquet::DataPageStats&) {
60+
// Arrow invokes this callback once for each DATA_PAGE/DATA_PAGE_V2 and never for
61+
// dictionary pages, so this ordinal matches Parquet OffsetIndex page locations.
62+
const size_t page_idx = _next_data_page_idx++;
63+
const bool skip = _page_skip_plan->should_skip_page(page_idx);
64+
if (!skip) {
65+
return false;
66+
}
67+
update_skip_profile(page_idx);
68+
return true;
69+
}
70+
71+
private:
72+
void update_skip_profile(size_t page_idx) const {
73+
if (_page_skip_profile.skipped_pages != nullptr) {
74+
COUNTER_UPDATE(_page_skip_profile.skipped_pages, 1);
75+
}
76+
if (_page_skip_profile.skipped_bytes != nullptr) {
77+
COUNTER_UPDATE(_page_skip_profile.skipped_bytes,
78+
_page_skip_plan->skipped_page_compressed_size(page_idx));
79+
}
80+
}
81+
82+
const ParquetPageSkipPlan* _page_skip_plan = nullptr;
83+
ParquetPageSkipProfile _page_skip_profile;
84+
size_t _next_data_page_idx = 0;
85+
};
86+
87+
const ParquetPageSkipPlan* find_page_skip_plan(
88+
const std::map<int, ParquetPageSkipPlan>* page_skip_plans, int leaf_column_id) {
89+
if (page_skip_plans == nullptr) {
90+
return nullptr;
91+
}
92+
const auto plan_it = page_skip_plans->find(leaf_column_id);
93+
return plan_it == page_skip_plans->end() ? nullptr : &plan_it->second;
94+
}
95+
96+
void install_data_page_filter(std::unique_ptr<::parquet::PageReader>& page_reader,
97+
const std::map<int, ParquetPageSkipPlan>* page_skip_plans,
98+
int leaf_column_id, ParquetPageSkipProfile page_skip_profile) {
99+
DORIS_CHECK(page_reader != nullptr);
100+
const ParquetPageSkipPlan* page_skip_plan =
101+
find_page_skip_plan(page_skip_plans, leaf_column_id);
102+
if (page_skip_plan == nullptr) {
103+
return;
104+
}
105+
page_reader->set_data_page_filter(DataPageSkipFilter(page_skip_plan, page_skip_profile));
106+
}
107+
50108
bool supports_nested_scalar_record_reader(const ParquetColumnSchema& column_schema) {
51109
if (supports_record_reader(column_schema.type_descriptor)) {
52110
return true;
@@ -156,13 +214,7 @@ Status ParquetColumnReaderFactory::create_scalar_reader(
156214
if (reader == nullptr) {
157215
return Status::InvalidArgument("reader is null");
158216
}
159-
const ParquetPageSkipPlan* page_skip_plan = nullptr;
160-
if (_page_skip_plans != nullptr) {
161-
auto plan_it = _page_skip_plans->find(column_schema.leaf_column_id);
162-
if (plan_it != _page_skip_plans->end()) {
163-
page_skip_plan = &plan_it->second;
164-
}
165-
}
217+
const auto* page_skip_plan = find_page_skip_plan(_page_skip_plans, column_schema.leaf_column_id);
166218
*reader = std::make_unique<ScalarColumnReader>(column_schema, std::move(record_reader),
167219
page_skip_plan, _column_reader_profile);
168220
return Status::OK();
@@ -246,29 +298,8 @@ Status ParquetColumnReaderFactory::get_record_reader(
246298
if (_record_readers[leaf_column_id] == nullptr) {
247299
try {
248300
auto page_reader = _row_group->GetColumnPageReader(leaf_column_id);
249-
if (_page_skip_plans != nullptr) {
250-
auto plan_it = _page_skip_plans->find(leaf_column_id);
251-
if (plan_it != _page_skip_plans->end()) {
252-
const ParquetPageSkipPlan* page_skip_plan = &plan_it->second;
253-
page_reader->set_data_page_filter(
254-
[page_skip_plan, page_skip_profile = _page_skip_profile,
255-
page_idx = size_t {0}](const ::parquet::DataPageStats&) mutable {
256-
const bool skip = page_skip_plan->should_skip_page(page_idx);
257-
if (skip) {
258-
if (page_skip_profile.skipped_pages != nullptr) {
259-
COUNTER_UPDATE(page_skip_profile.skipped_pages, 1);
260-
}
261-
if (page_skip_profile.skipped_bytes != nullptr) {
262-
COUNTER_UPDATE(page_skip_profile.skipped_bytes,
263-
page_skip_plan->skipped_page_compressed_size(
264-
page_idx));
265-
}
266-
}
267-
++page_idx;
268-
return skip;
269-
});
270-
}
271-
}
301+
install_data_page_filter(page_reader, _page_skip_plans, leaf_column_id,
302+
_page_skip_profile);
272303
const auto level_info = ::parquet::internal::LevelInfo::ComputeLevelInfo(descriptor);
273304
_record_readers[leaf_column_id] = ::parquet::internal::RecordReader::Make(
274305
descriptor, level_info, ::arrow::default_memory_pool(),

be/src/format/new_parquet/reader/scalar_column_reader.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,9 @@ int64_t ScalarColumnReader::page_filtered_rows_to_skip(int64_t rows) const {
113113
const int64_t start = std::max(range.start, _row_group_rows_read);
114114
const int64_t end = std::min(range_end, skip_end);
115115
if (start < end) {
116+
// Scheduler gap skips are derived from page-index selected_ranges. A page-filtered
117+
// range can only overlap such a gap when the whole data page is outside every selected
118+
// range, so partial overlap would mean the planner and scheduler are out of sync.
116119
DORIS_CHECK(start == range.start);
117120
DORIS_CHECK(end == range_end);
118121
filtered_rows += end - start;

be/src/format/new_parquet/selection_vector.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,12 @@ struct RowRange {
3333

3434
struct ParquetPageSkipPlan {
3535
int leaf_column_id = -1;
36+
// Page ordinal is the data-page ordinal in the column chunk. It intentionally excludes
37+
// dictionary pages, matching Arrow PageReader::set_data_page_filter().
3638
std::vector<uint8_t> skipped_pages;
3739
std::vector<int64_t> skipped_page_compressed_sizes;
40+
// Row ranges covered by skipped data pages. ScalarColumnReader uses these ranges to avoid
41+
// calling RecordReader::SkipRecords() again for pages already skipped by Arrow.
3842
std::vector<RowRange> skipped_ranges;
3943

4044
bool empty() const { return skipped_ranges.empty(); }

0 commit comments

Comments
 (0)