Skip to content

Commit 610cf80

Browse files
committed
Make DataFileSet preserve insertion order for v3 row ID assignment
Change DataFileSet from std::unordered_set to a custom class that preserves insertion order, similar to Java's DataFileSet which uses LinkedHashSet. This is important for row ID assignment in v3 manifests, where row IDs are assigned based on the order files are written. The implementation uses both a vector (for insertion order) and an unordered_set (for O(1) duplicate detection) to maintain the same API while preserving order.
1 parent 7cfc0e9 commit 610cf80

File tree

1 file changed

+83
-2
lines changed

1 file changed

+83
-2
lines changed

src/iceberg/util/content_file_util.h

Lines changed: 83 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,14 @@
2222
/// \file iceberg/util/content_file_util.h
2323
/// Utility functions for content files (data files and delete files).
2424

25+
#include <algorithm>
#include <cstddef>
#include <functional>
#include <memory>
#include <optional>
#include <span>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
3133

3234
#include "iceberg/iceberg_export.h"
3335
#include "iceberg/manifest/manifest_entry.h"
@@ -61,8 +63,87 @@ struct ICEBERG_EXPORT DataFilePtrEqual {
6163
};
6264

6365
/// \brief A set of DataFile pointers, deduplicated by file path.
64-
using DataFileSet =
65-
std::unordered_set<std::shared_ptr<DataFile>, DataFilePtrHash, DataFilePtrEqual>;
66+
///
67+
/// This preserves insertion order, which is important for row ID assignment in v3
68+
/// manifests. Similar to Java's DataFileSet which uses LinkedHashSet to maintain
69+
/// insertion order.
70+
class ICEBERG_EXPORT DataFileSet {
71+
public:
72+
using value_type = std::shared_ptr<DataFile>;
73+
using iterator = typename std::vector<value_type>::iterator;
74+
using const_iterator = typename std::vector<value_type>::const_iterator;
75+
76+
DataFileSet() = default;
77+
78+
/// \brief Insert a data file into the set.
79+
/// \param file The data file to insert
80+
/// \return A pair with an iterator to the inserted element (or the existing one) and
81+
/// a bool indicating whether insertion took place
82+
std::pair<iterator, bool> insert(const value_type& file) {
83+
if (!file) {
84+
return {elements_.end(), false};
85+
}
86+
// Check if file already exists using the hash set for O(1) lookup
87+
auto [hash_iter, hash_inserted] = hash_set_.insert(file);
88+
if (!hash_inserted) {
89+
// File already exists, find it in the vector using the element from hash_set_
90+
const auto& existing_file = *hash_iter;
91+
auto vec_iter = std::ranges::find_if(elements_, [&existing_file](const auto& elem) {
92+
return DataFilePtrEqual{}(elem, existing_file);
93+
});
94+
return {vec_iter, false};
95+
}
96+
elements_.push_back(*hash_iter);
97+
return {std::prev(elements_.end()), true};
98+
}
99+
100+
/// \brief Insert a data file into the set (move version).
101+
std::pair<iterator, bool> insert(value_type&& file) {
102+
if (!file) {
103+
return {elements_.end(), false};
104+
}
105+
// Check if file already exists
106+
auto [hash_iter, hash_inserted] = hash_set_.insert(file);
107+
if (!hash_inserted) {
108+
// File already exists, find it in the vector using the element from hash_set_
109+
const auto& existing_file = *hash_iter;
110+
auto vec_iter = std::ranges::find_if(elements_, [&existing_file](const auto& elem) {
111+
return DataFilePtrEqual{}(elem, existing_file);
112+
});
113+
return {vec_iter, false};
114+
}
115+
elements_.push_back(*hash_iter);
116+
return {std::prev(elements_.end()), true};
117+
}
118+
119+
/// \brief Get the number of elements in the set.
120+
size_t size() const { return elements_.size(); }
121+
122+
/// \brief Check if the set is empty.
123+
bool empty() const { return elements_.empty(); }
124+
125+
/// \brief Clear all elements from the set.
126+
void clear() {
127+
elements_.clear();
128+
hash_set_.clear();
129+
}
130+
131+
/// \brief Get iterator to the beginning.
132+
iterator begin() { return elements_.begin(); }
133+
const_iterator begin() const { return elements_.begin(); }
134+
const_iterator cbegin() const { return elements_.cbegin(); }
135+
136+
/// \brief Get iterator to the end.
137+
iterator end() { return elements_.end(); }
138+
const_iterator end() const { return elements_.end(); }
139+
const_iterator cend() const { return elements_.cend(); }
140+
141+
private:
142+
// Vector to preserve insertion order
143+
std::vector<value_type> elements_;
144+
// Hash set for O(1) duplicate detection
145+
std::unordered_set<value_type, DataFilePtrHash, DataFilePtrEqual> hash_set_;
146+
};
66147

67148
/// \brief Utility functions for content files.
68149
struct ICEBERG_EXPORT ContentFileUtil {

0 commit comments

Comments
 (0)