|
23 | 23 | /// Data reader interface for manifest files. |
24 | 24 |
|
25 | 25 | #include <memory> |
| 26 | +#include <string> |
26 | 27 | #include <vector> |
27 | 28 |
|
28 | 29 | #include "iceberg/iceberg_export.h" |
|
32 | 33 |
|
33 | 34 | namespace iceberg { |
34 | 35 |
|
| 36 | +class Expression; |
| 37 | +class PartitionSet; |
| 38 | + |
35 | 39 | /// \brief Read manifest entries from a manifest file. |
| 40 | +/// |
| 41 | +/// This class provides a fluent builder-style API to configure how manifest entries |
| 42 | +/// are read, including column selection, partition filtering, and row filtering. |
| 43 | +/// The underlying Avro reader is lazily created when Entries() or LiveEntries() is |
| 44 | +/// called. |
36 | 45 | class ICEBERG_EXPORT ManifestReader { |
37 | 46 | public: |
| 47 | + /// \brief Special column list (`{"*"}`) used to select all columns from manifest files. |
| 48 | + inline static const std::vector<std::string> kAllColumns{"*"}; |
| 49 | + |
38 | 50 | virtual ~ManifestReader() = default; |
39 | 51 |
|
40 | 52 | /// \brief Read all manifest entries in the manifest file. |
41 | | - virtual Result<std::vector<ManifestEntry>> Entries() const = 0; |
| 53 | + /// |
| 54 | + /// This method returns all entries including deleted entries. Filtering is applied |
| 55 | + /// based on the configuration set via Select(), FilterPartitions(), and FilterRows(). |
| 56 | + virtual Result<std::vector<ManifestEntry>> Entries() = 0; |
42 | 57 |
|
43 | | - /// \brief Get the metadata of the manifest file. |
44 | | - virtual Result<std::unordered_map<std::string, std::string>> Metadata() const = 0; |
| 58 | + /// \brief Read only live (non-deleted) manifest entries. |
| 59 | + /// |
| 60 | + /// This method returns only entries with status ADDED or EXISTING. |
| 61 | + /// Filtering is applied based on the configuration. |
| 62 | + virtual Result<std::vector<ManifestEntry>> LiveEntries() = 0; |
| 63 | + |
| 64 | + /// \brief Select specific columns to read from the manifest entries. |
| 65 | + /// |
| 66 | + /// When filtering with FilterRows(), stats columns (value_counts, null_value_counts, |
| 67 | + /// nan_value_counts, lower_bounds, upper_bounds, record_count) are automatically |
| 68 | + /// included for filtering evaluation even if not specified in the select list. |
| 69 | + /// |
| 70 | + /// \param columns List of column names to read. |
| 71 | + /// \return Reference to this reader for method chaining. |
| 72 | + virtual ManifestReader& Select(const std::vector<std::string>& columns) = 0; |
| 73 | + |
| 74 | + /// \brief Filter manifest entries by partition expression. |
| 75 | + /// |
| 76 | + /// \param expr Expression to filter partitions. |
| 77 | + /// \return Reference to this reader for method chaining. |
| 78 | + virtual ManifestReader& FilterPartitions(std::shared_ptr<Expression> expr) = 0; |
| 79 | + |
| 80 | + /// \brief Filter manifest entries to a specific set of partitions. |
| 81 | + /// |
| 82 | + /// \param partition_set Set of partitions to include. |
| 83 | + /// \return Reference to this reader for method chaining. |
| 84 | + virtual ManifestReader& FilterPartitions( |
| 85 | + std::shared_ptr<PartitionSet> partition_set) = 0; |
| 86 | + |
| 87 | + /// \brief Filter manifest entries by row-level expression. |
| 88 | + /// |
| 89 | + /// This filter uses file-level statistics (lower_bounds, upper_bounds, null_value_counts, |
| 90 | + /// etc.) to skip files that cannot contain matching rows. Stats columns are |
| 91 | + /// automatically projected for evaluation. |
| 92 | + /// |
| 93 | + /// \param expr Expression to filter rows. |
| 94 | + /// \return Reference to this reader for method chaining. |
| 95 | + virtual ManifestReader& FilterRows(std::shared_ptr<Expression> expr) = 0; |
| 96 | + |
| 97 | + /// \brief Set case sensitivity for column name matching. |
| 98 | + /// |
| 99 | + /// \param case_sensitive Whether column name matching is case-sensitive. |
| 100 | + /// \return Reference to this reader for method chaining. |
| 101 | + virtual ManifestReader& CaseSensitive(bool case_sensitive) = 0; |
45 | 102 |
|
46 | 103 | /// \brief Creates a reader for a manifest file. |
47 | 104 | /// \param manifest A ManifestFile object containing metadata about the manifest. |
48 | 105 | /// \param file_io File IO implementation to use. |
49 | | - /// \param partition_type Schema for the partition. |
| 106 | + /// \param schema Schema used to bind the partition type. |
| 107 | + /// \param spec Partition spec used for this manifest file. |
50 | 108 | /// \return A Result containing the reader or an error. |
51 | 109 | static Result<std::unique_ptr<ManifestReader>> Make( |
52 | 110 | const ManifestFile& manifest, std::shared_ptr<FileIO> file_io, |
53 | | - std::shared_ptr<StructType> partition_type); |
| 111 | + std::shared_ptr<Schema> schema, std::shared_ptr<PartitionSpec> spec); |
54 | 112 |
|
55 | 113 | /// \brief Creates a reader for a manifest file. |
56 | 114 | /// \param manifest_location Path to the manifest file. |
57 | 115 | /// \param file_io File IO implementation to use. |
58 | | - /// \param partition_type Schema for the partition. |
| 116 | + /// \param schema Schema used to bind the partition type. |
| 117 | + /// \param spec Partition spec used for this manifest file. |
59 | 118 | /// \return A Result containing the reader or an error. |
60 | 119 | static Result<std::unique_ptr<ManifestReader>> Make( |
61 | 120 | std::string_view manifest_location, std::shared_ptr<FileIO> file_io, |
62 | | - std::shared_ptr<StructType> partition_type); |
| 121 | + std::shared_ptr<Schema> schema, std::shared_ptr<PartitionSpec> spec); |
| 122 | + |
| 123 | + /// \brief Return the given column list with stats columns added if needed. |
| 124 | + static std::vector<std::string> WithStatsColumns( |
| 125 | + const std::vector<std::string>& columns); |
63 | 126 | }; |
64 | 127 |
|
65 | 128 | /// \brief Read manifest files from a manifest list file. |
|
0 commit comments