|
22 | 22 | /// \file iceberg/util/content_file_util.h |
23 | 23 | /// Utility functions for content files (data files and delete files). |
24 | 24 |
|
#include <algorithm>
#include <cstddef>
#include <functional>
#include <iterator>
#include <memory>
#include <optional>
#include <span>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>
31 | 33 |
|
32 | 34 | #include "iceberg/iceberg_export.h" |
33 | 35 | #include "iceberg/manifest/manifest_entry.h" |
@@ -61,8 +63,87 @@ struct ICEBERG_EXPORT DataFilePtrEqual { |
61 | 63 | }; |
62 | 64 |
|
63 | 65 | /// \brief A set of DataFile pointers, deduplicated by file path. |
64 | | -using DataFileSet = |
65 | | - std::unordered_set<std::shared_ptr<DataFile>, DataFilePtrHash, DataFilePtrEqual>; |
| 66 | +/// |
| 67 | +/// This preserves insertion order, which is important for row ID assignment in v3 |
| 68 | +/// manifests. Similar to Java's DataFileSet which uses LinkedHashSet to maintain |
| 69 | +/// insertion order. |
| 70 | +class ICEBERG_EXPORT DataFileSet { |
| 71 | + public: |
| 72 | + using value_type = std::shared_ptr<DataFile>; |
| 73 | + using iterator = typename std::vector<value_type>::iterator; |
| 74 | + using const_iterator = typename std::vector<value_type>::const_iterator; |
| 75 | + |
| 76 | + DataFileSet() = default; |
| 77 | + |
| 78 | + /// \brief Insert a data file into the set. |
| 79 | + /// \param file The data file to insert |
| 80 | + /// \return A pair with an iterator to the inserted element (or the existing one) and |
| 81 | + /// a bool indicating whether insertion took place |
| 82 | + std::pair<iterator, bool> insert(const value_type& file) { |
| 83 | + if (!file) { |
| 84 | + return {elements_.end(), false}; |
| 85 | + } |
| 86 | + // Check if file already exists using the hash set for O(1) lookup |
| 87 | + auto [hash_iter, hash_inserted] = hash_set_.insert(file); |
| 88 | + if (!hash_inserted) { |
| 89 | + // File already exists, find it in the vector using the element from hash_set_ |
| 90 | + const auto& existing_file = *hash_iter; |
| 91 | + auto vec_iter = std::ranges::find_if(elements_, [&existing_file](const auto& elem) { |
| 92 | + return DataFilePtrEqual{}(elem, existing_file); |
| 93 | + }); |
| 94 | + return {vec_iter, false}; |
| 95 | + } |
| 96 | + elements_.push_back(*hash_iter); |
| 97 | + return {std::prev(elements_.end()), true}; |
| 98 | + } |
| 99 | + |
| 100 | + /// \brief Insert a data file into the set (move version). |
| 101 | + std::pair<iterator, bool> insert(value_type&& file) { |
| 102 | + if (!file) { |
| 103 | + return {elements_.end(), false}; |
| 104 | + } |
| 105 | + // Check if file already exists |
| 106 | + auto [hash_iter, hash_inserted] = hash_set_.insert(file); |
| 107 | + if (!hash_inserted) { |
| 108 | + // File already exists, find it in the vector using the element from hash_set_ |
| 109 | + const auto& existing_file = *hash_iter; |
| 110 | + auto vec_iter = std::ranges::find_if(elements_, [&existing_file](const auto& elem) { |
| 111 | + return DataFilePtrEqual{}(elem, existing_file); |
| 112 | + }); |
| 113 | + return {vec_iter, false}; |
| 114 | + } |
| 115 | + elements_.push_back(*hash_iter); |
| 116 | + return {std::prev(elements_.end()), true}; |
| 117 | + } |
| 118 | + |
| 119 | + /// \brief Get the number of elements in the set. |
| 120 | + size_t size() const { return elements_.size(); } |
| 121 | + |
| 122 | + /// \brief Check if the set is empty. |
| 123 | + bool empty() const { return elements_.empty(); } |
| 124 | + |
| 125 | + /// \brief Clear all elements from the set. |
| 126 | + void clear() { |
| 127 | + elements_.clear(); |
| 128 | + hash_set_.clear(); |
| 129 | + } |
| 130 | + |
| 131 | + /// \brief Get iterator to the beginning. |
| 132 | + iterator begin() { return elements_.begin(); } |
| 133 | + const_iterator begin() const { return elements_.begin(); } |
| 134 | + const_iterator cbegin() const { return elements_.cbegin(); } |
| 135 | + |
| 136 | + /// \brief Get iterator to the end. |
| 137 | + iterator end() { return elements_.end(); } |
| 138 | + const_iterator end() const { return elements_.end(); } |
| 139 | + const_iterator cend() const { return elements_.cend(); } |
| 140 | + |
| 141 | + private: |
| 142 | + // Vector to preserve insertion order |
| 143 | + std::vector<value_type> elements_; |
| 144 | + // Hash set for O(1) duplicate detection |
| 145 | + std::unordered_set<value_type, DataFilePtrHash, DataFilePtrEqual> hash_set_; |
| 146 | +}; |
66 | 147 |
|
67 | 148 | /// \brief Utility functions for content files. |
68 | 149 | struct ICEBERG_EXPORT ContentFileUtil { |
|
0 commit comments