Skip to content

Commit 67655bb

Browse files
committed
1
1 parent 74fced5 commit 67655bb

2 files changed

Lines changed: 165 additions & 144 deletions

File tree

src/iceberg/util/type_util.cc

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919

2020
#include "iceberg/util/type_util.h"
2121

22+
#include <stack>
23+
2224
#include "iceberg/result.h"
2325
#include "iceberg/util/checked_cast.h"
2426
#include "iceberg/util/string_util.h"
@@ -142,4 +144,153 @@ void NameToIdVisitor::Finish() {
142144
}
143145
}
144146

147+
/// \brief Record the position path of the field currently being visited.
///
/// Fails with InvalidSchema when no field id has been assigned yet, or when the
/// current field id already has a recorded path.
Status PositionPathVisitor::Visit(const PrimitiveType& type) {
  if (current_field_id_ == kUnassignedFieldId) {
    return InvalidSchema("Current field id is not assigned, type: {}", type.ToString());
  }

  // try_emplace leaves an existing entry untouched, so on a duplicate id the
  // previously recorded path is still available for the error message.
  const auto [entry, inserted] =
      position_path_.try_emplace(current_field_id_, current_path_);
  if (inserted) {
    return {};
  }

  return InvalidSchema("Duplicate field id found: {}, prev path: {}, curr path: {}",
                       current_field_id_, entry->second, current_path_);
}
160+
161+
Status PositionPathVisitor::Visit(const StructType& type) {
162+
for (size_t i = 0; i < type.fields().size(); ++i) {
163+
const auto& field = type.fields()[i];
164+
current_field_id_ = field.field_id();
165+
current_path_.push_back(i);
166+
ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*field.type(), this));
167+
current_path_.pop_back();
168+
}
169+
return {};
170+
}
171+
172+
// Non-struct types are not supported yet, but it is not an error.
173+
Status PositionPathVisitor::Visit(const ListType& type) { return {}; }
174+
175+
Status PositionPathVisitor::Visit(const MapType& type) { return {}; }
176+
177+
std::unordered_map<int32_t, std::vector<size_t>> PositionPathVisitor::Finish() {
178+
return std::move(position_path_);
179+
}
180+
181+
PruneColumnVisitor::PruneColumnVisitor(const std::unordered_set<int32_t>& selected_ids,
182+
bool select_full_types)
183+
: selected_ids_(selected_ids), select_full_types_(select_full_types) {}
184+
185+
/// \brief Dispatch pruning to the overload matching the concrete nested type.
///
/// \return The pruned type from the struct/list/map overload, or nullptr for any
///         other type id.
Result<std::shared_ptr<Type>> PruneColumnVisitor::Visit(
    const std::shared_ptr<Type>& type) const {
  const auto id = type->type_id();
  if (id == TypeId::kStruct) {
    return Visit(internal::checked_pointer_cast<StructType>(type));
  }
  if (id == TypeId::kList) {
    return Visit(internal::checked_pointer_cast<ListType>(type));
  }
  if (id == TypeId::kMap) {
    return Visit(internal::checked_pointer_cast<MapType>(type));
  }
  // Not a nested type: nothing to descend into.
  return nullptr;
}
198+
199+
/// \brief Prune a single field.
///
/// A selected field keeps its type untouched when it is primitive or when full types
/// were requested. In every other case (selected but nested without full types, or
/// not selected at all) the field's type is pruned recursively.
Result<std::shared_ptr<Type>> PruneColumnVisitor::Visit(const SchemaField& field) const {
  const bool is_selected = selected_ids_.contains(field.field_id());
  if (is_selected && (select_full_types_ || field.type()->is_primitive())) {
    return field.type();
  }
  return Visit(field.type());
}
206+
207+
/// \brief Build a copy of `field` that carries `type` instead of its original type.
///
/// The field id, name, optionality and doc are taken from `field`.
SchemaField PruneColumnVisitor::MakeField(const SchemaField& field,
                                          std::shared_ptr<Type> type) {
  std::string name_copy(field.name());
  std::string doc_copy(field.doc());
  return {field.field_id(), std::move(name_copy), std::move(type), field.optional(),
          std::move(doc_copy)};
}
212+
213+
/// \brief Prune a struct to the fields that survive pruning.
///
/// \return nullptr when no field survives; the original `type` when every field
///         survives with its original type; otherwise a new StructType holding only
///         the surviving fields.
Result<std::shared_ptr<Type>> PruneColumnVisitor::Visit(
    const std::shared_ptr<StructType>& type) const {
  std::vector<SchemaField> kept_fields;
  bool all_types_unchanged = true;

  for (const auto& field : type->fields()) {
    ICEBERG_ASSIGN_OR_RAISE(auto pruned_type, Visit(field));
    if (!pruned_type) {
      continue;  // Field was pruned away entirely.
    }
    if (pruned_type != field.type()) {
      all_types_unchanged = false;
    }
    kept_fields.emplace_back(MakeField(field, std::move(pruned_type)));
  }

  if (kept_fields.empty()) {
    return nullptr;
  }
  // Reuse the existing struct when nothing was dropped or rewritten.
  if (all_types_unchanged && kept_fields.size() == type->fields().size()) {
    return type;
  }
  return std::make_shared<StructType>(std::move(kept_fields));
}
232+
233+
/// \brief Prune a list by pruning its element field (the list's first field).
///
/// \return nullptr when the element is pruned away; the original `type` when the
///         element type is unchanged; otherwise a new ListType around the pruned
///         element.
Result<std::shared_ptr<Type>> PruneColumnVisitor::Visit(
    const std::shared_ptr<ListType>& type) const {
  const auto& element = type->fields()[0];
  ICEBERG_ASSIGN_OR_RAISE(auto pruned_element, Visit(element));

  if (!pruned_element) {
    return nullptr;
  }
  if (pruned_element == element.type()) {
    return type;
  }
  return std::make_shared<ListType>(MakeField(element, std::move(pruned_element)));
}
244+
245+
/// \brief Prune a map by pruning its key field (first field) and value field (second).
///
/// \return nullptr when both key and value are pruned away; the original `type` when
///         the value type is unchanged and the key is either unchanged or pruned away;
///         an InvalidArgument error when only the value is pruned away; otherwise a
///         new MapType. A pruned-away key keeps the original key field in the new map.
Result<std::shared_ptr<Type>> PruneColumnVisitor::Visit(
    const std::shared_ptr<MapType>& type) const {
  const auto& key_field = type->fields()[0];
  const auto& value_field = type->fields()[1];
  ICEBERG_ASSIGN_OR_RAISE(auto key_type, Visit(key_field));
  ICEBERG_ASSIGN_OR_RAISE(auto value_type, Visit(value_field));

  const bool key_dropped = (key_type == nullptr);
  const bool value_dropped = (value_type == nullptr);
  if (key_dropped && value_dropped) {
    return nullptr;
  }

  const bool value_unchanged = (value_type == value_field.type());
  const bool key_unchanged = (key_type == key_field.type()) || key_dropped;
  if (value_unchanged && key_unchanged) {
    return type;
  }

  if (value_dropped) {
    return InvalidArgument("Cannot project Map without value field");
  }

  SchemaField projected_key =
      key_dropped ? key_field : MakeField(key_field, std::move(key_type));
  return std::make_shared<MapType>(std::move(projected_key),
                                   MakeField(value_field, std::move(value_type)));
}
264+
265+
std::unordered_map<int32_t, int32_t> indexParents(const StructType& root_struct) {
266+
std::unordered_map<int32_t, int32_t> id_to_parent;
267+
std::stack<int32_t> parent_id_stack;
268+
269+
// Recursive function to visit and build parent relationships
270+
std::function<void(const Type&)> visit = [&](const Type& type) -> void {
271+
switch (type.type_id()) {
272+
case TypeId::kStruct:
273+
case TypeId::kList:
274+
case TypeId::kMap: {
275+
const auto& nested_type = static_cast<const NestedType&>(type);
276+
for (const auto& field : nested_type.fields()) {
277+
if (!parent_id_stack.empty()) {
278+
id_to_parent[field.field_id()] = parent_id_stack.top();
279+
}
280+
parent_id_stack.push(field.field_id());
281+
visit(*field.type());
282+
parent_id_stack.pop();
283+
}
284+
break;
285+
}
286+
287+
default:
288+
break;
289+
}
290+
};
291+
292+
visit(root_struct);
293+
return id_to_parent;
294+
}
295+
145296
} // namespace iceberg

src/iceberg/util/type_util.h

Lines changed: 14 additions & 144 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,6 @@
2121

2222
#include <functional>
2323
#include <memory>
24-
#include <span>
25-
#include <stack>
2624
#include <string>
2725
#include <string_view>
2826
#include <unordered_map>
@@ -31,11 +29,8 @@
3129

3230
#include "iceberg/result.h"
3331
#include "iceberg/schema_field.h"
34-
#include "iceberg/type.h"
35-
#include "iceberg/util/checked_cast.h"
36-
#include "iceberg/util/formatter_internal.h"
32+
#include "iceberg/type_fwd.h"
3733
#include "iceberg/util/string_util.h"
38-
#include "iceberg/util/visit_type.h"
3934

4035
/// \file iceberg/util/type_util.h
4136
/// Utility functions and visitors for Iceberg types.
@@ -86,38 +81,11 @@ class NameToIdVisitor {
8681
/// \brief Visitor for building a map from field ID to position path.
8782
class PositionPathVisitor {
8883
public:
89-
Status Visit(const PrimitiveType& type) {
90-
if (current_field_id_ == kUnassignedFieldId) {
91-
return InvalidSchema("Current field id is not assigned, type: {}", type.ToString());
92-
}
93-
94-
if (auto ret = position_path_.try_emplace(current_field_id_, current_path_);
95-
!ret.second) {
96-
return InvalidSchema("Duplicate field id found: {}, prev path: {}, curr path: {}",
97-
current_field_id_, ret.first->second, current_path_);
98-
}
99-
100-
return {};
101-
}
102-
103-
Status Visit(const StructType& type) {
104-
for (size_t i = 0; i < type.fields().size(); ++i) {
105-
const auto& field = type.fields()[i];
106-
current_field_id_ = field.field_id();
107-
current_path_.push_back(i);
108-
ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*field.type(), this));
109-
current_path_.pop_back();
110-
}
111-
return {};
112-
}
113-
114-
// Non-struct types are not supported yet, but it is not an error.
115-
Status Visit(const ListType& type) { return {}; }
116-
Status Visit(const MapType& type) { return {}; }
117-
118-
std::unordered_map<int32_t, std::vector<size_t>> Finish() {
119-
return std::move(position_path_);
120-
}
84+
Status Visit(const PrimitiveType& type);
85+
Status Visit(const StructType& type);
86+
Status Visit(const ListType& type);
87+
Status Visit(const MapType& type);
88+
std::unordered_map<int32_t, std::vector<size_t>> Finish();
12189

12290
private:
12391
constexpr static int32_t kUnassignedFieldId = -1;
@@ -137,83 +105,14 @@ class PositionPathVisitor {
137105
class PruneColumnVisitor {
138106
public:
139107
PruneColumnVisitor(const std::unordered_set<int32_t>& selected_ids,
140-
bool select_full_types)
141-
: selected_ids_(selected_ids), select_full_types_(select_full_types) {}
142-
143-
Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<Type>& type) const {
144-
switch (type->type_id()) {
145-
case TypeId::kStruct:
146-
return Visit(internal::checked_pointer_cast<StructType>(type));
147-
case TypeId::kList:
148-
return Visit(internal::checked_pointer_cast<ListType>(type));
149-
case TypeId::kMap:
150-
return Visit(internal::checked_pointer_cast<MapType>(type));
151-
default:
152-
return nullptr;
153-
}
154-
}
155-
156-
Result<std::shared_ptr<Type>> Visit(const SchemaField& field) const {
157-
if (selected_ids_.contains(field.field_id())) {
158-
return (select_full_types_ || field.type()->is_primitive()) ? field.type()
159-
: Visit(field.type());
160-
}
161-
return Visit(field.type());
162-
}
108+
bool select_full_types);
163109

164-
static SchemaField MakeField(const SchemaField& field, std::shared_ptr<Type> type) {
165-
return {field.field_id(), std::string(field.name()), std::move(type),
166-
field.optional(), std::string(field.doc())};
167-
}
168-
169-
Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<StructType>& type) const {
170-
bool same_types = true;
171-
std::vector<SchemaField> selected_fields;
172-
for (const auto& field : type->fields()) {
173-
ICEBERG_ASSIGN_OR_RAISE(auto child_type, Visit(field));
174-
if (child_type) {
175-
same_types = same_types && (child_type == field.type());
176-
selected_fields.emplace_back(MakeField(field, std::move(child_type)));
177-
}
178-
}
179-
180-
if (selected_fields.empty()) {
181-
return nullptr;
182-
} else if (same_types && selected_fields.size() == type->fields().size()) {
183-
return type;
184-
}
185-
return std::make_shared<StructType>(std::move(selected_fields));
186-
}
187-
188-
Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<ListType>& type) const {
189-
const auto& elem_field = type->fields()[0];
190-
ICEBERG_ASSIGN_OR_RAISE(auto elem_type, Visit(elem_field));
191-
if (elem_type == nullptr) {
192-
return nullptr;
193-
} else if (elem_type == elem_field.type()) {
194-
return type;
195-
}
196-
return std::make_shared<ListType>(MakeField(elem_field, std::move(elem_type)));
197-
}
198-
199-
Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<MapType>& type) const {
200-
const auto& key_field = type->fields()[0];
201-
const auto& value_field = type->fields()[1];
202-
ICEBERG_ASSIGN_OR_RAISE(auto key_type, Visit(key_field));
203-
ICEBERG_ASSIGN_OR_RAISE(auto value_type, Visit(value_field));
204-
205-
if (key_type == nullptr && value_type == nullptr) {
206-
return nullptr;
207-
} else if (value_type == value_field.type() &&
208-
(key_type == key_field.type() || key_type == nullptr)) {
209-
return type;
210-
} else if (value_type == nullptr) {
211-
return InvalidArgument("Cannot project Map without value field");
212-
}
213-
return std::make_shared<MapType>(
214-
(key_type == nullptr ? key_field : MakeField(key_field, std::move(key_type))),
215-
MakeField(value_field, std::move(value_type)));
216-
}
110+
Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<Type>& type) const;
111+
Result<std::shared_ptr<Type>> Visit(const SchemaField& field) const;
112+
static SchemaField MakeField(const SchemaField& field, std::shared_ptr<Type> type);
113+
Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<StructType>& type) const;
114+
Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<ListType>& type) const;
115+
Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<MapType>& type) const;
217116

218117
private:
219118
const std::unordered_set<int32_t>& selected_ids_;
@@ -229,35 +128,6 @@ class PruneColumnVisitor {
229128
/// - All field IDs must be unique across the entire schema hierarchy
230129
/// If the struct is part of a Schema, these invariants are enforced by
231130
/// StructType::InitFieldById which checks for duplicate field IDs.
232-
static std::unordered_map<int32_t, int32_t> indexParents(const StructType& root_struct) {
233-
std::unordered_map<int32_t, int32_t> id_to_parent;
234-
std::stack<int32_t> parent_id_stack;
235-
236-
// Recursive function to visit and build parent relationships
237-
std::function<void(const Type&)> visit = [&](const Type& type) -> void {
238-
switch (type.type_id()) {
239-
case TypeId::kStruct:
240-
case TypeId::kList:
241-
case TypeId::kMap: {
242-
const auto& nested_type = static_cast<const NestedType&>(type);
243-
for (const auto& field : nested_type.fields()) {
244-
if (!parent_id_stack.empty()) {
245-
id_to_parent[field.field_id()] = parent_id_stack.top();
246-
}
247-
parent_id_stack.push(field.field_id());
248-
visit(*field.type());
249-
parent_id_stack.pop();
250-
}
251-
break;
252-
}
253-
254-
default:
255-
break;
256-
}
257-
};
258-
259-
visit(root_struct);
260-
return id_to_parent;
261-
}
131+
std::unordered_map<int32_t, int32_t> indexParents(const StructType& root_struct);
262132

263133
} // namespace iceberg

0 commit comments

Comments
 (0)