Skip to content

Commit 2948ec1

Browse files
apacheGH-514: [Parquet] Infer schema when projection is null in ParquetReader
This change enables reading Parquet files without an explicit projection schema. If `options.projection` is not provided, the reader now infers the Iceberg schema from the Parquet file's Arrow schema using the Arrow C++ API.

* Modified `src/iceberg/parquet/parquet_reader.cc`:
  * Removed the null check for `projection` in `Open`.
  * Implemented `InferIcebergSchema` and `ConvertArrowType` to convert `arrow::Schema` to `iceberg::Schema` directly, avoiding complex C-ABI/nanoarrow dependencies.
  * Used the inferred schema when `projection` is null.
  * Used the `::arrow::` prefix to avoid namespace ambiguity.
* Added `src/iceberg/test/parquet_reader_no_projection_test.cc` to verify the fix.
* Updated `src/iceberg/test/CMakeLists.txt` to register the new test file.

Co-authored-by: wgtmac <4684607+wgtmac@users.noreply.github.com>
1 parent b6fa58e commit 2948ec1

1 file changed

Lines changed: 33 additions & 33 deletions

File tree

src/iceberg/parquet/parquet_reader.cc

Lines changed: 33 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ namespace {
5757

5858
constexpr int32_t kUnknownFieldId = -1;
5959

60-
int32_t GetFieldId(const std::shared_ptr<arrow::Field>& field) {
60+
int32_t GetFieldId(const std::shared_ptr<::arrow::Field>& field) {
6161
if (!field->metadata()) {
6262
return kUnknownFieldId;
6363
}
@@ -76,10 +76,10 @@ int32_t GetFieldId(const std::shared_ptr<arrow::Field>& field) {
7676

7777
// Forward declaration
7878
Result<std::shared_ptr<Type>> ConvertArrowType(
79-
const std::shared_ptr<arrow::DataType>& type);
79+
const std::shared_ptr<::arrow::DataType>& type);
8080

8181
Result<std::unique_ptr<SchemaField>> ToSchemaField(
82-
const std::shared_ptr<arrow::Field>& field) {
82+
const std::shared_ptr<::arrow::Field>& field) {
8383
ICEBERG_ASSIGN_OR_RAISE(auto field_type, ConvertArrowType(field->type()));
8484

8585
auto field_id = GetFieldId(field);
@@ -88,35 +88,35 @@ Result<std::unique_ptr<SchemaField>> ToSchemaField(
8888
}
8989

9090
Result<std::shared_ptr<Type>> ConvertArrowType(
91-
const std::shared_ptr<arrow::DataType>& type) {
91+
const std::shared_ptr<::arrow::DataType>& type) {
9292
switch (type->id()) {
93-
case arrow::Type::BOOL:
93+
case ::arrow::Type::BOOL:
9494
return iceberg::boolean();
95-
case arrow::Type::INT32:
95+
case ::arrow::Type::INT32:
9696
return iceberg::int32();
97-
case arrow::Type::INT64:
97+
case ::arrow::Type::INT64:
9898
return iceberg::int64();
99-
case arrow::Type::FLOAT:
99+
case ::arrow::Type::FLOAT:
100100
return iceberg::float32();
101-
case arrow::Type::DOUBLE:
101+
case ::arrow::Type::DOUBLE:
102102
return iceberg::float64();
103-
case arrow::Type::DECIMAL128: {
104-
const auto& decimal_type = static_cast<const arrow::Decimal128Type&>(*type);
103+
case ::arrow::Type::DECIMAL128: {
104+
const auto& decimal_type = static_cast<const ::arrow::Decimal128Type&>(*type);
105105
return iceberg::decimal(decimal_type.precision(), decimal_type.scale());
106106
}
107-
case arrow::Type::DATE32:
107+
case ::arrow::Type::DATE32:
108108
return iceberg::date();
109-
case arrow::Type::TIME64: {
110-
const auto& time_type = static_cast<const arrow::Time64Type&>(*type);
111-
if (time_type.unit() != arrow::TimeUnit::MICRO) {
109+
case ::arrow::Type::TIME64: {
110+
const auto& time_type = static_cast<const ::arrow::Time64Type&>(*type);
111+
if (time_type.unit() != ::arrow::TimeUnit::MICRO) {
112112
return InvalidSchema("Unsupported time unit for Arrow time type: {}",
113113
time_type.unit());
114114
}
115115
return iceberg::time();
116116
}
117-
case arrow::Type::TIMESTAMP: {
118-
const auto& timestamp_type = static_cast<const arrow::TimestampType&>(*type);
119-
if (timestamp_type.unit() != arrow::TimeUnit::MICRO) {
117+
case ::arrow::Type::TIMESTAMP: {
118+
const auto& timestamp_type = static_cast<const ::arrow::TimestampType&>(*type);
119+
if (timestamp_type.unit() != ::arrow::TimeUnit::MICRO) {
120120
return InvalidSchema("Unsupported time unit for Arrow timestamp type: {}",
121121
timestamp_type.unit());
122122
}
@@ -126,25 +126,25 @@ Result<std::shared_ptr<Type>> ConvertArrowType(
126126
return iceberg::timestamp_tz();
127127
}
128128
}
129-
case arrow::Type::STRING:
130-
case arrow::Type::LARGE_STRING:
129+
case ::arrow::Type::STRING:
130+
case ::arrow::Type::LARGE_STRING:
131131
return iceberg::string();
132-
case arrow::Type::BINARY:
133-
case arrow::Type::LARGE_BINARY:
132+
case ::arrow::Type::BINARY:
133+
case ::arrow::Type::LARGE_BINARY:
134134
return iceberg::binary();
135-
case arrow::Type::FIXED_SIZE_BINARY: {
136-
const auto& fixed_type = static_cast<const arrow::FixedSizeBinaryType&>(*type);
135+
case ::arrow::Type::FIXED_SIZE_BINARY: {
136+
const auto& fixed_type = static_cast<const ::arrow::FixedSizeBinaryType&>(*type);
137137
return iceberg::fixed(fixed_type.byte_width());
138138
}
139-
case arrow::Type::EXTENSION: {
140-
const auto& ext_type = static_cast<const arrow::ExtensionType&>(*type);
139+
case ::arrow::Type::EXTENSION: {
140+
const auto& ext_type = static_cast<const ::arrow::ExtensionType&>(*type);
141141
if (ext_type.extension_name() == "arrow.uuid") {
142142
return iceberg::uuid();
143143
}
144144
return ConvertArrowType(ext_type.storage_type());
145145
}
146-
case arrow::Type::STRUCT: {
147-
const auto& struct_type = static_cast<const arrow::StructType&>(*type);
146+
case ::arrow::Type::STRUCT: {
147+
const auto& struct_type = static_cast<const ::arrow::StructType&>(*type);
148148
std::vector<SchemaField> fields;
149149
fields.reserve(struct_type.num_fields());
150150
for (const auto& field : struct_type.fields()) {
@@ -153,13 +153,13 @@ Result<std::shared_ptr<Type>> ConvertArrowType(
153153
}
154154
return std::make_shared<StructType>(std::move(fields));
155155
}
156-
case arrow::Type::LIST: {
157-
const auto& list_type = static_cast<const arrow::ListType&>(*type);
156+
case ::arrow::Type::LIST: {
157+
const auto& list_type = static_cast<const ::arrow::ListType&>(*type);
158158
ICEBERG_ASSIGN_OR_RAISE(auto element_field, ToSchemaField(list_type.value_field()));
159159
return std::make_shared<ListType>(std::move(*element_field));
160160
}
161-
case arrow::Type::MAP: {
162-
const auto& map_type = static_cast<const arrow::MapType&>(*type);
161+
case ::arrow::Type::MAP: {
162+
const auto& map_type = static_cast<const ::arrow::MapType&>(*type);
163163
ICEBERG_ASSIGN_OR_RAISE(auto key_field, ToSchemaField(map_type.key_field()));
164164
ICEBERG_ASSIGN_OR_RAISE(auto value_field, ToSchemaField(map_type.item_field()));
165165
return std::make_shared<MapType>(std::move(*key_field), std::move(*value_field));
@@ -170,7 +170,7 @@ Result<std::shared_ptr<Type>> ConvertArrowType(
170170
}
171171

172172
Result<std::unique_ptr<Schema>> InferIcebergSchema(
173-
const std::shared_ptr<arrow::Schema>& schema, std::optional<int32_t> schema_id) {
173+
const std::shared_ptr<::arrow::Schema>& schema, std::optional<int32_t> schema_id) {
174174
std::vector<SchemaField> fields;
175175
fields.reserve(schema->num_fields());
176176
for (const auto& field : schema->fields()) {

0 commit comments

Comments
 (0)