3737#include " core/column/column_array.h"
3838#include " core/column/column_nullable.h"
3939#include " core/column/column_struct.h"
40+ #include " core/column/column_vector.h"
4041#include " core/data_type/data_type.h"
4142#include " core/data_type/data_type_array.h"
4243#include " core/data_type/data_type_factory.hpp"
4344#include " core/data_type/data_type_nullable.h"
4445#include " core/data_type/data_type_number.h"
4546#include " core/data_type/data_type_string.h"
4647#include " core/data_type/data_type_struct.h"
48+ #include " format/parquet/vparquet_column_chunk_reader.h"
4749#include " format/parquet/vparquet_reader.h"
4850#include " io/fs/file_meta_cache.h"
4951#include " io/fs/file_reader_writer_fwd.h"
5658
5759namespace doris {
5860
61+ class IcebergReaderTestHelper : public IcebergTableReader {
62+ public:
63+ using IcebergTableReader::_is_fully_dictionary_encoded;
64+ };
65+
5966class IcebergReaderTest : public ::testing::Test {
6067protected:
6168 void SetUp () override {
@@ -68,6 +75,60 @@ class IcebergReaderTest : public ::testing::Test {
6875
6976 void TearDown () override { cache.reset (); }
7077
78+ std::string mixed_position_delete_file () const {
79+ return " ./be/test/exec/test_data/iceberg_mixed_position_delete_parquet/"
80+ " mixed_encoding_position_delete.parquet" ;
81+ }
82+
83+ std::unique_ptr<ParquetReader> create_delete_file_parquet_reader (
84+ RuntimeProfile* profile, RuntimeState* runtime_state, TFileScanRangeParams* scan_params,
85+ TFileRangeDesc* scan_range, io::FileReaderSPtr* file_reader,
86+ const tparquet::FileMetaData** file_meta_data) {
87+ auto local_fs = io::global_local_filesystem ();
88+ auto st = local_fs->open_file (mixed_position_delete_file (), file_reader);
89+ EXPECT_TRUE (st.ok ()) << st;
90+ if (!st.ok ()) {
91+ return nullptr ;
92+ }
93+
94+ scan_params->format_type = TFileFormatType::FORMAT_PARQUET;
95+
96+ scan_range->start_offset = 0 ;
97+ scan_range->size = (*file_reader)->size ();
98+ scan_range->path = mixed_position_delete_file ();
99+
100+ auto parquet_reader =
101+ ParquetReader::create_unique (profile, *scan_params, *scan_range, 1024 ,
102+ &timezone_obj, nullptr , runtime_state, cache.get ());
103+ EXPECT_NE (parquet_reader, nullptr );
104+ if (parquet_reader == nullptr ) {
105+ return nullptr ;
106+ }
107+
108+ parquet_reader->set_file_reader (*file_reader);
109+
110+ phmap::flat_hash_map<int , std::vector<std::shared_ptr<ColumnPredicate>>> predicates;
111+ st = parquet_reader->init_reader (delete_file_column_names,
112+ &delete_file_col_name_to_block_idx, {}, predicates,
113+ nullptr , nullptr , nullptr , nullptr , nullptr );
114+ EXPECT_TRUE (st.ok ()) << st;
115+ if (!st.ok ()) {
116+ return nullptr ;
117+ }
118+
119+ std::unordered_map<std::string, std::tuple<std::string, const SlotDescriptor*>>
120+ partition_columns;
121+ std::unordered_map<std::string, VExprContextSPtr> missing_columns;
122+ st = parquet_reader->set_fill_columns (partition_columns, missing_columns);
123+ EXPECT_TRUE (st.ok ()) << st;
124+ if (!st.ok ()) {
125+ return nullptr ;
126+ }
127+
128+ *file_meta_data = parquet_reader->get_meta_data ();
129+ return parquet_reader;
130+ }
131+
71132 // Helper function to create complex struct types for testing
72133 void create_complex_struct_types (DataTypePtr& coordinates_struct_type,
73134 DataTypePtr& address_struct_type,
@@ -462,8 +523,124 @@ class IcebergReaderTest : public ::testing::Test {
462523
463524 std::unique_ptr<doris::FileMetaCache> cache;
464525 cctz::time_zone timezone_obj;
526+ std::vector<std::string> delete_file_column_names = {" file_path" , " pos" };
527+ std::unordered_map<std::string, uint32_t > delete_file_col_name_to_block_idx = {{" file_path" , 0 },
528+ {" pos" , 1 }};
465529};
466530
531+ TEST_F (IcebergReaderTest, detects_fully_dictionary_encoded_parquet_column) {
532+ tparquet::ColumnMetaData column_metadata;
533+ column_metadata.type = tparquet::Type::BYTE_ARRAY;
534+ column_metadata.__isset .encoding_stats = true ;
535+
536+ tparquet::PageEncodingStats dict_page;
537+ dict_page.page_type = tparquet::PageType::DATA_PAGE;
538+ dict_page.encoding = tparquet::Encoding::RLE_DICTIONARY;
539+ dict_page.count = 3 ;
540+
541+ column_metadata.encoding_stats = {dict_page};
542+
543+ EXPECT_TRUE (IcebergReaderTestHelper::_is_fully_dictionary_encoded (column_metadata));
544+ }
545+
546+ TEST_F (IcebergReaderTest, rejects_mixed_dictionary_and_plain_parquet_column) {
547+ tparquet::ColumnMetaData column_metadata;
548+ column_metadata.type = tparquet::Type::BYTE_ARRAY;
549+ column_metadata.__isset .encoding_stats = true ;
550+
551+ tparquet::PageEncodingStats dict_page;
552+ dict_page.page_type = tparquet::PageType::DATA_PAGE;
553+ dict_page.encoding = tparquet::Encoding::RLE_DICTIONARY;
554+ dict_page.count = 2 ;
555+
556+ tparquet::PageEncodingStats plain_page;
557+ plain_page.page_type = tparquet::PageType::DATA_PAGE;
558+ plain_page.encoding = tparquet::Encoding::PLAIN;
559+ plain_page.count = 1 ;
560+
561+ column_metadata.encoding_stats = {dict_page, plain_page};
562+
563+ EXPECT_FALSE (IcebergReaderTestHelper::_is_fully_dictionary_encoded (column_metadata));
564+ }
565+
566+ TEST_F (IcebergReaderTest, rejects_mixed_dictionary_and_plain_parquet_v2_column) {
567+ tparquet::ColumnMetaData column_metadata;
568+ column_metadata.type = tparquet::Type::BYTE_ARRAY;
569+ column_metadata.__isset .encoding_stats = true ;
570+
571+ tparquet::PageEncodingStats dict_page;
572+ dict_page.page_type = tparquet::PageType::DATA_PAGE_V2;
573+ dict_page.encoding = tparquet::Encoding::RLE_DICTIONARY;
574+ dict_page.count = 2 ;
575+
576+ tparquet::PageEncodingStats plain_page;
577+ plain_page.page_type = tparquet::PageType::DATA_PAGE_V2;
578+ plain_page.encoding = tparquet::Encoding::PLAIN;
579+ plain_page.count = 1 ;
580+
581+ column_metadata.encoding_stats = {dict_page, plain_page};
582+
583+ EXPECT_FALSE (IcebergReaderTestHelper::_is_fully_dictionary_encoded (column_metadata));
584+ }
585+
586+ TEST_F (IcebergReaderTest, rejects_non_dictionary_encoding_without_encoding_stats) {
587+ tparquet::ColumnMetaData column_metadata;
588+ column_metadata.type = tparquet::Type::BYTE_ARRAY;
589+ column_metadata.__isset .encoding_stats = false ;
590+ column_metadata.encodings = {tparquet::Encoding::PLAIN_DICTIONARY, tparquet::Encoding::PLAIN,
591+ tparquet::Encoding::RLE};
592+
593+ EXPECT_FALSE (IcebergReaderTestHelper::_is_fully_dictionary_encoded (column_metadata));
594+ }
595+
596+ TEST_F (IcebergReaderTest, falls_back_to_encodings_when_data_page_stats_are_missing) {
597+ tparquet::ColumnMetaData column_metadata;
598+ column_metadata.type = tparquet::Type::BYTE_ARRAY;
599+ column_metadata.__isset .encoding_stats = true ;
600+
601+ tparquet::PageEncodingStats dict_page_header;
602+ dict_page_header.page_type = tparquet::PageType::DICTIONARY_PAGE;
603+ dict_page_header.encoding = tparquet::Encoding::PLAIN;
604+ dict_page_header.count = 1 ;
605+ column_metadata.encoding_stats = {dict_page_header};
606+
607+ column_metadata.encodings = {tparquet::Encoding::PLAIN, tparquet::Encoding::RLE,
608+ tparquet::Encoding::RLE_DICTIONARY};
609+
610+ EXPECT_FALSE (IcebergReaderTestHelper::_is_fully_dictionary_encoded (column_metadata));
611+ }
612+
613+ TEST_F (IcebergReaderTest, generated_position_delete_file_is_mixed_encoded) {
614+ RuntimeProfile profile (" test_profile" );
615+ RuntimeState runtime_state ((TQueryOptions ()), TQueryGlobals ());
616+ TFileScanRangeParams scan_params;
617+ TFileRangeDesc scan_range;
618+ io::FileReaderSPtr file_reader;
619+ const tparquet::FileMetaData* file_meta_data = nullptr ;
620+ auto parquet_reader = create_delete_file_parquet_reader (
621+ &profile, &runtime_state, &scan_params, &scan_range, &file_reader, &file_meta_data);
622+ ASSERT_NE (parquet_reader, nullptr );
623+ ASSERT_NE (file_meta_data, nullptr );
624+ ASSERT_EQ (file_meta_data->row_groups .size (), 1 );
625+
626+ const auto & file_path_meta = file_meta_data->row_groups [0 ].columns [0 ].meta_data ;
627+ EXPECT_TRUE (file_meta_data->row_groups [0 ].columns [0 ].__isset .meta_data );
628+ EXPECT_TRUE (has_dict_page (file_path_meta));
629+ bool has_plain_encoding = false ;
630+ bool has_dictionary_encoding = false ;
631+ for (const auto encoding : file_path_meta.encodings ) {
632+ if (encoding == tparquet::Encoding::PLAIN) {
633+ has_plain_encoding = true ;
634+ }
635+ if (encoding == tparquet::Encoding::PLAIN_DICTIONARY ||
636+ encoding == tparquet::Encoding::RLE_DICTIONARY) {
637+ has_dictionary_encoding = true ;
638+ }
639+ }
640+ EXPECT_TRUE (has_plain_encoding);
641+ EXPECT_TRUE (has_dictionary_encoding);
642+ }
643+
467644// Test reading real Iceberg Parquet file using IcebergTableReader
468645TEST_F (IcebergReaderTest, read_iceberg_parquet_file) {
469646 // Read only: name, profile.address.coordinates.lat, profile.address.coordinates.lng, profile.contact.email
0 commit comments