Skip to content

Commit 912d191

Browse files
committed
feat: add projection and filtering to manifest reader
- Consolidate the `ManifestReader` implementation into `manifest_reader.cc` and remove `manifest_reader_internal.cc`.
- Implement a fluent API for column selection, partition filtering, and row filtering.
- Support lazy initialization of the underlying Avro reader.
- Add support for various kinds of filtering on manifest entries.
1 parent dba8f92 commit 912d191

10 files changed

Lines changed: 1019 additions & 677 deletions

src/iceberg/CMakeLists.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,6 @@ set(ICEBERG_SOURCES
4242
manifest/manifest_entry.cc
4343
manifest/manifest_list.cc
4444
manifest/manifest_reader.cc
45-
manifest/manifest_reader_internal.cc
4645
manifest/manifest_writer.cc
4746
manifest/v1_metadata.cc
4847
manifest/v2_metadata.cc

src/iceberg/manifest/manifest_list.cc

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,22 +19,24 @@
1919

2020
#include "iceberg/manifest/manifest_list.h"
2121

22-
#include "iceberg/schema.h"
22+
#include <memory>
23+
24+
#include "iceberg/type.h"
2325

2426
namespace iceberg {
2527

26-
const StructType& PartitionFieldSummary::Type() {
27-
static const StructType kInstance{{
28+
const std::shared_ptr<StructType>& PartitionFieldSummary::Type() {
29+
static const auto kInstance = std::make_shared<StructType>(std::vector<SchemaField>{
2830
PartitionFieldSummary::kContainsNull,
2931
PartitionFieldSummary::kContainsNaN,
3032
PartitionFieldSummary::kLowerBound,
3133
PartitionFieldSummary::kUpperBound,
32-
}};
34+
});
3335
return kInstance;
3436
}
3537

36-
const std::shared_ptr<Schema>& ManifestFile::Type() {
37-
static const auto kInstance = std::make_shared<Schema>(std::vector<SchemaField>{
38+
const std::shared_ptr<StructType>& ManifestFile::Type() {
39+
static const auto kInstance = std::make_shared<StructType>(std::vector<SchemaField>{
3840
kManifestPath,
3941
kManifestLength,
4042
kPartitionSpecId,

src/iceberg/manifest/manifest_list.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ struct ICEBERG_EXPORT PartitionFieldSummary {
6969

7070
bool operator==(const PartitionFieldSummary& other) const = default;
7171

72-
static const StructType& Type();
72+
static const std::shared_ptr<StructType>& Type();
7373
};
7474

7575
/// \brief The type of files tracked by the manifest, either data or delete files; 0 for
@@ -197,7 +197,7 @@ struct ICEBERG_EXPORT ManifestFile {
197197

198198
bool operator==(const ManifestFile& other) const = default;
199199

200-
static const std::shared_ptr<Schema>& Type();
200+
static const std::shared_ptr<StructType>& Type();
201201
};
202202

203203
/// Snapshots are embedded in table metadata, but the list of manifests for a snapshot are

src/iceberg/manifest/manifest_reader.cc

Lines changed: 850 additions & 32 deletions
Large diffs are not rendered by default.

src/iceberg/manifest/manifest_reader.h

Lines changed: 70 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
/// Data reader interface for manifest files.
2424

2525
#include <memory>
26+
#include <string>
2627
#include <vector>
2728

2829
#include "iceberg/iceberg_export.h"
@@ -32,34 +33,96 @@
3233

3334
namespace iceberg {
3435

36+
class Expression;
37+
class PartitionSet;
38+
3539
/// \brief Read manifest entries from a manifest file.
40+
///
41+
/// This class provides a fluent builder-style API to configure how manifest entries
42+
/// are read, including column selection, partition filtering, and row filtering.
43+
/// The underlying Avro reader is lazily created when Entries() or LiveEntries() is
44+
/// called.
3645
class ICEBERG_EXPORT ManifestReader {
3746
public:
47+
/// \brief Special value to select all columns from manifest files.
48+
inline static const std::vector<std::string> kAllColumns{"*"};
49+
3850
virtual ~ManifestReader() = default;
3951

4052
/// \brief Read all manifest entries in the manifest file.
41-
virtual Result<std::vector<ManifestEntry>> Entries() const = 0;
53+
///
54+
/// This method returns all entries including deleted entries. Filtering is applied
55+
/// based on the configuration set via Select(), FilterPartitions(), and FilterRows().
56+
virtual Result<std::vector<ManifestEntry>> Entries() = 0;
4257

43-
/// \brief Get the metadata of the manifest file.
44-
virtual Result<std::unordered_map<std::string, std::string>> Metadata() const = 0;
58+
/// \brief Read only live (non-deleted) manifest entries.
59+
///
60+
/// This method returns only entries with status ADDED or EXISTING.
61+
/// Filtering is applied based on the configuration.
62+
virtual Result<std::vector<ManifestEntry>> LiveEntries() = 0;
63+
64+
/// \brief Select specific columns to read from the manifest entries.
65+
///
66+
/// When filtering with FilterRows(), stats columns (value_counts, null_value_counts,
67+
/// nan_value_counts, lower_bounds, upper_bounds, record_count) are automatically
68+
/// included for filtering evaluation even if not specified in the select list.
69+
///
70+
/// \param columns List of column names to read.
71+
/// \return Reference to this reader for method chaining.
72+
virtual ManifestReader& Select(const std::vector<std::string>& columns) = 0;
73+
74+
/// \brief Filter manifest entries by partition expression.
75+
///
76+
/// \param expr Expression to filter partitions.
77+
/// \return Reference to this reader for method chaining.
78+
virtual ManifestReader& FilterPartitions(std::shared_ptr<Expression> expr) = 0;
79+
80+
/// \brief Filter manifest entries to a specific set of partitions.
81+
///
82+
/// \param partition_set Set of partitions to include.
83+
/// \return Reference to this reader for method chaining.
84+
virtual ManifestReader& FilterPartitions(
85+
std::shared_ptr<PartitionSet> partition_set) = 0;
86+
87+
/// \brief Filter manifest entries by row-level expression.
88+
///
89+
/// This filter uses file-level statistics (lower_bounds, upper_bounds, null_counts,
90+
/// etc.) to skip files that cannot contain matching rows. Stats columns are
91+
/// automatically projected for evaluation.
92+
///
93+
/// \param expr Expression to filter rows.
94+
/// \return Reference to this reader for method chaining.
95+
virtual ManifestReader& FilterRows(std::shared_ptr<Expression> expr) = 0;
96+
97+
/// \brief Set case sensitivity for column name matching.
98+
///
99+
/// \param case_sensitive Whether column name matching is case-sensitive.
100+
/// \return Reference to this reader for method chaining.
101+
virtual ManifestReader& CaseSensitive(bool case_sensitive) = 0;
45102

46103
/// \brief Creates a reader for a manifest file.
47104
/// \param manifest A ManifestFile object containing metadata about the manifest.
48105
/// \param file_io File IO implementation to use.
49-
/// \param partition_type Schema for the partition.
106+
/// \param schema Schema used to bind the partition type.
107+
/// \param spec Partition spec used for this manifest file.
50108
/// \return A Result containing the reader or an error.
51109
static Result<std::unique_ptr<ManifestReader>> Make(
52110
const ManifestFile& manifest, std::shared_ptr<FileIO> file_io,
53-
std::shared_ptr<StructType> partition_type);
111+
std::shared_ptr<Schema> schema, std::shared_ptr<PartitionSpec> spec);
54112

55113
/// \brief Creates a reader for a manifest file.
56114
/// \param manifest_location Path to the manifest file.
57115
/// \param file_io File IO implementation to use.
58-
/// \param partition_type Schema for the partition.
116+
/// \param schema Schema used to bind the partition type.
117+
/// \param spec Partition spec used for this manifest file.
59118
/// \return A Result containing the reader or an error.
60119
static Result<std::unique_ptr<ManifestReader>> Make(
61120
std::string_view manifest_location, std::shared_ptr<FileIO> file_io,
62-
std::shared_ptr<StructType> partition_type);
121+
std::shared_ptr<Schema> schema, std::shared_ptr<PartitionSpec> spec);
122+
123+
/// \brief Add stats columns to the column list if needed.
124+
static std::vector<std::string> WithStatsColumns(
125+
const std::vector<std::string>& columns);
63126
};
64127

65128
/// \brief Read manifest files from a manifest list file.

0 commit comments

Comments
 (0)