|
19 | 19 |
|
20 | 20 | #include "iceberg/parquet/parquet_reader.h" |
21 | 21 |
|
| 22 | +#include <charconv> |
| 23 | +#include <cstring> |
22 | 24 | #include <numeric> |
| 25 | +#include <optional> |
| 26 | +#include <string> |
23 | 27 |
|
24 | 28 | #include <arrow/c/bridge.h> |
25 | 29 | #include <arrow/memory_pool.h> |
|
35 | 39 | #include "iceberg/arrow/arrow_fs_file_io_internal.h" |
36 | 40 | #include "iceberg/arrow/arrow_status_internal.h" |
37 | 41 | #include "iceberg/arrow/metadata_column_util_internal.h" |
38 | | -#include "iceberg/arrow_c_data_guard_internal.h" |
| 42 | +#include "iceberg/constants.h" |
39 | 43 | #include "iceberg/parquet/parquet_data_util_internal.h" |
40 | 44 | #include "iceberg/parquet/parquet_register.h" |
41 | 45 | #include "iceberg/parquet/parquet_schema_util_internal.h" |
42 | 46 | #include "iceberg/result.h" |
| 47 | +#include "iceberg/schema.h" |
43 | 48 | #include "iceberg/schema_internal.h" |
44 | 49 | #include "iceberg/schema_util.h" |
| 50 | +#include "iceberg/type.h" |
45 | 51 | #include "iceberg/util/checked_cast.h" |
46 | 52 | #include "iceberg/util/macros.h" |
47 | 53 |
|
48 | 54 | namespace iceberg::parquet { |
49 | 55 |
|
50 | 56 | namespace { |
51 | 57 |
|
| 58 | +constexpr int32_t kUnknownFieldId = -1; |
| 59 | + |
| 60 | +int32_t GetFieldId(const std::shared_ptr<arrow::Field>& field) { |
| 61 | + if (!field->metadata()) { |
| 62 | + return kUnknownFieldId; |
| 63 | + } |
| 64 | + |
| 65 | + int idx = field->metadata()->FindKey(kParquetFieldIdKey); |
| 66 | + if (idx == -1) { |
| 67 | + return kUnknownFieldId; |
| 68 | + } |
| 69 | + |
| 70 | + std::string value = field->metadata()->value(idx); |
| 71 | + int32_t field_id = kUnknownFieldId; |
| 72 | + std::from_chars(value.data(), value.data() + value.size(), field_id); |
| 73 | + |
| 74 | + return field_id; |
| 75 | +} |
| 76 | + |
| 77 | +// Forward declaration |
| 78 | +Result<std::shared_ptr<Type>> ConvertArrowType( |
| 79 | + const std::shared_ptr<arrow::DataType>& type); |
| 80 | + |
| 81 | +Result<std::unique_ptr<SchemaField>> ToSchemaField( |
| 82 | + const std::shared_ptr<arrow::Field>& field) { |
| 83 | + ICEBERG_ASSIGN_OR_RAISE(auto field_type, ConvertArrowType(field->type())); |
| 84 | + |
| 85 | + auto field_id = GetFieldId(field); |
| 86 | + return std::make_unique<SchemaField>(field_id, field->name(), std::move(field_type), |
| 87 | + field->nullable()); |
| 88 | +} |
| 89 | + |
| 90 | +Result<std::shared_ptr<Type>> ConvertArrowType( |
| 91 | + const std::shared_ptr<arrow::DataType>& type) { |
| 92 | + switch (type->id()) { |
| 93 | + case arrow::Type::BOOL: |
| 94 | + return iceberg::boolean(); |
| 95 | + case arrow::Type::INT32: |
| 96 | + return iceberg::int32(); |
| 97 | + case arrow::Type::INT64: |
| 98 | + return iceberg::int64(); |
| 99 | + case arrow::Type::FLOAT: |
| 100 | + return iceberg::float32(); |
| 101 | + case arrow::Type::DOUBLE: |
| 102 | + return iceberg::float64(); |
| 103 | + case arrow::Type::DECIMAL128: { |
| 104 | + const auto& decimal_type = static_cast<const arrow::Decimal128Type&>(*type); |
| 105 | + return iceberg::decimal(decimal_type.precision(), decimal_type.scale()); |
| 106 | + } |
| 107 | + case arrow::Type::DATE32: |
| 108 | + return iceberg::date(); |
| 109 | + case arrow::Type::TIME64: { |
| 110 | + const auto& time_type = static_cast<const arrow::Time64Type&>(*type); |
| 111 | + if (time_type.unit() != arrow::TimeUnit::MICRO) { |
| 112 | + return InvalidSchema("Unsupported time unit for Arrow time type: {}", |
| 113 | + time_type.unit()); |
| 114 | + } |
| 115 | + return iceberg::time(); |
| 116 | + } |
| 117 | + case arrow::Type::TIMESTAMP: { |
| 118 | + const auto& timestamp_type = static_cast<const arrow::TimestampType&>(*type); |
| 119 | + if (timestamp_type.unit() != arrow::TimeUnit::MICRO) { |
| 120 | + return InvalidSchema("Unsupported time unit for Arrow timestamp type: {}", |
| 121 | + timestamp_type.unit()); |
| 122 | + } |
| 123 | + if (timestamp_type.timezone().empty()) { |
| 124 | + return iceberg::timestamp(); |
| 125 | + } else { |
| 126 | + return iceberg::timestamp_tz(); |
| 127 | + } |
| 128 | + } |
| 129 | + case arrow::Type::STRING: |
| 130 | + case arrow::Type::LARGE_STRING: |
| 131 | + return iceberg::string(); |
| 132 | + case arrow::Type::BINARY: |
| 133 | + case arrow::Type::LARGE_BINARY: |
| 134 | + return iceberg::binary(); |
| 135 | + case arrow::Type::FIXED_SIZE_BINARY: { |
| 136 | + const auto& fixed_type = static_cast<const arrow::FixedSizeBinaryType&>(*type); |
| 137 | + return iceberg::fixed(fixed_type.byte_width()); |
| 138 | + } |
| 139 | + case arrow::Type::EXTENSION: { |
| 140 | + const auto& ext_type = static_cast<const arrow::ExtensionType&>(*type); |
| 141 | + if (ext_type.extension_name() == "arrow.uuid") { |
| 142 | + return iceberg::uuid(); |
| 143 | + } |
| 144 | + return ConvertArrowType(ext_type.storage_type()); |
| 145 | + } |
| 146 | + case arrow::Type::STRUCT: { |
| 147 | + const auto& struct_type = static_cast<const arrow::StructType&>(*type); |
| 148 | + std::vector<SchemaField> fields; |
| 149 | + fields.reserve(struct_type.num_fields()); |
| 150 | + for (const auto& field : struct_type.fields()) { |
| 151 | + ICEBERG_ASSIGN_OR_RAISE(auto schema_field, ToSchemaField(field)); |
| 152 | + fields.emplace_back(std::move(*schema_field)); |
| 153 | + } |
| 154 | + return std::make_shared<StructType>(std::move(fields)); |
| 155 | + } |
| 156 | + case arrow::Type::LIST: { |
| 157 | + const auto& list_type = static_cast<const arrow::ListType&>(*type); |
| 158 | + ICEBERG_ASSIGN_OR_RAISE(auto element_field, ToSchemaField(list_type.value_field())); |
| 159 | + return std::make_shared<ListType>(std::move(*element_field)); |
| 160 | + } |
| 161 | + case arrow::Type::MAP: { |
| 162 | + const auto& map_type = static_cast<const arrow::MapType&>(*type); |
| 163 | + ICEBERG_ASSIGN_OR_RAISE(auto key_field, ToSchemaField(map_type.key_field())); |
| 164 | + ICEBERG_ASSIGN_OR_RAISE(auto value_field, ToSchemaField(map_type.item_field())); |
| 165 | + return std::make_shared<MapType>(std::move(*key_field), std::move(*value_field)); |
| 166 | + } |
| 167 | + default: |
| 168 | + return InvalidSchema("Unsupported Arrow type: {}", type->ToString()); |
| 169 | + } |
| 170 | +} |
| 171 | + |
| 172 | +Result<std::unique_ptr<Schema>> InferIcebergSchema( |
| 173 | + const std::shared_ptr<arrow::Schema>& schema, std::optional<int32_t> schema_id) { |
| 174 | + std::vector<SchemaField> fields; |
| 175 | + fields.reserve(schema->num_fields()); |
| 176 | + for (const auto& field : schema->fields()) { |
| 177 | + ICEBERG_ASSIGN_OR_RAISE(auto schema_field, ToSchemaField(field)); |
| 178 | + fields.emplace_back(std::move(*schema_field)); |
| 179 | + } |
| 180 | + auto id = schema_id.value_or(Schema::kInitialSchemaId); |
| 181 | + return std::make_unique<Schema>(std::move(fields), id); |
| 182 | +} |
| 183 | + |
52 | 184 | Result<std::shared_ptr<::arrow::io::RandomAccessFile>> OpenInputStream( |
53 | 185 | const ReaderOptions& options) { |
54 | 186 | ::arrow::fs::FileInfo file_info(options.path, ::arrow::fs::FileType::File); |
@@ -135,10 +267,8 @@ class ParquetReader::Impl { |
135 | 267 | } else { |
136 | 268 | std::shared_ptr<::arrow::Schema> arrow_schema; |
137 | 269 | ICEBERG_ARROW_RETURN_NOT_OK(reader_->GetSchema(&arrow_schema)); |
138 | | - ArrowSchema c_arrow_schema; |
139 | | - ICEBERG_ARROW_RETURN_NOT_OK(::arrow::ExportSchema(*arrow_schema, &c_arrow_schema)); |
140 | | - internal::ArrowSchemaGuard guard(&c_arrow_schema); |
141 | | - ICEBERG_ASSIGN_OR_RAISE(auto schema, FromArrowSchema(c_arrow_schema, std::nullopt)); |
| 270 | + ICEBERG_ASSIGN_OR_RAISE(auto schema, |
| 271 | + InferIcebergSchema(arrow_schema, std::nullopt)); |
142 | 272 | read_schema_ = std::move(schema); |
143 | 273 | } |
144 | 274 |
|
|
0 commit comments