Skip to content

Commit c8d4292

Browse files
committed
feat: implement ValidateRedundantPartitions for PartitionSpec
- Add ValidateRedundantPartitions method to detect redundant partition fields - Use deduplication key (source_id + transform.DedupName()) to identify conflicts - Integrate validation into PartitionSpec::Validate method
1 parent 61a7de5 commit c8d4292

6 files changed

Lines changed: 419 additions & 5 deletions

File tree

src/iceberg/partition_spec.cc

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,11 @@
2323
#include <cstddef>
2424
#include <cstdint>
2525
#include <format>
26+
#include <map>
2627
#include <memory>
2728
#include <ranges>
2829
#include <unordered_map>
30+
#include <utility>
2931

3032
#include "iceberg/result.h"
3133
#include "iceberg/schema.h"
@@ -132,6 +134,7 @@ bool PartitionSpec::Equals(const PartitionSpec& other) const {
132134

133135
Status PartitionSpec::Validate(const Schema& schema, bool allow_missing_fields) const {
134136
ICEBERG_RETURN_UNEXPECTED(ValidatePartitionName(schema, *this));
137+
ICEBERG_RETURN_UNEXPECTED(ValidateRedundantPartitions(*this));
135138

136139
std::unordered_map<int32_t, int32_t> parents = IndexParents(schema);
137140
for (const auto& partition_field : fields_) {
@@ -261,4 +264,25 @@ bool PartitionSpec::HasSequentialFieldIds(const PartitionSpec& spec) {
261264
return true;
262265
}
263266

267+
Status PartitionSpec::ValidateRedundantPartitions(const PartitionSpec& spec) {
268+
// Use a map to track deduplication keys (source_id + transform dedup name)
269+
std::map<std::pair<int32_t, std::string>, const PartitionField*> dedup_fields;
270+
271+
for (const auto& field : spec.fields()) {
272+
// Create dedup key: (source_id, transform_dedup_name)
273+
auto dedup_key = std::make_pair(field.source_id(), field.transform()->DedupName());
274+
275+
// Check if this dedup key already exists
276+
auto existing_field_iter = dedup_fields.find(dedup_key);
277+
ICEBERG_PRECHECK(existing_field_iter == dedup_fields.end(),
278+
"Cannot add redundant partition: {} conflicts with {}",
279+
field.ToString(), existing_field_iter->second->ToString());
280+
281+
// Add this field to the dedup map
282+
dedup_fields[dedup_key] = &field;
283+
}
284+
285+
return {};
286+
}
287+
264288
} // namespace iceberg

src/iceberg/partition_spec.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,9 @@ class ICEBERG_EXPORT PartitionSpec : public util::Formattable {
135135
/// \brief Compare two partition specs for equality.
136136
bool Equals(const PartitionSpec& other) const;
137137

138+
/// \brief Validates that there are no redundant partition fields in the spec.
139+
static Status ValidateRedundantPartitions(const PartitionSpec& spec);
140+
138141
using SourceIdToFieldsMap = std::unordered_map<int32_t, std::vector<PartitionFieldRef>>;
139142
static Result<SourceIdToFieldsMap> InitSourceIdToFieldsMap(const PartitionSpec&);
140143

src/iceberg/test/partition_spec_test.cc

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -303,4 +303,127 @@ TEST(PartitionSpecTest, PartitionFieldInStructInMap) {
303303
EXPECT_THAT(result_value, HasErrorMessage("Invalid partition field parent"));
304304
}
305305

306+
// Test ValidateRedundantPartitions
307+
TEST(PartitionSpecTest, ValidateRedundantPartitionsExactDuplicates) {
308+
// Create a schema with different field types
309+
auto ts_field = SchemaField::MakeRequired(1, "ts", timestamp());
310+
auto id_field = SchemaField::MakeRequired(2, "id", int64());
311+
Schema schema({ts_field, id_field}, Schema::kInitialSchemaId);
312+
313+
// Test: exact duplicate transforms on same field (same dedup name)
314+
{
315+
PartitionField field1(1, 1000, "ts_day1", Transform::Day());
316+
PartitionField field2(1, 1001, "ts_day2", Transform::Day());
317+
318+
auto result = PartitionSpec::Make(schema, 1, {field1, field2}, false);
319+
EXPECT_THAT(result, IsError(ErrorKind::kInvalidArgument));
320+
EXPECT_THAT(result, HasErrorMessage("Cannot add redundant partition"));
321+
EXPECT_THAT(result, HasErrorMessage("conflicts with"));
322+
}
323+
324+
// Test: same bucket size on same field (redundant)
325+
{
326+
PartitionField bucket1(2, 1000, "id_bucket1", Transform::Bucket(16));
327+
PartitionField bucket2(2, 1001, "id_bucket2", Transform::Bucket(16));
328+
329+
auto result = PartitionSpec::Make(schema, 1, {bucket1, bucket2}, false);
330+
EXPECT_THAT(result, IsError(ErrorKind::kInvalidArgument));
331+
EXPECT_THAT(result, HasErrorMessage("Cannot add redundant partition"));
332+
}
333+
334+
// Test: same truncate width on same field (redundant)
335+
{
336+
auto name_field = SchemaField::MakeRequired(3, "name", string());
337+
Schema schema_with_string({name_field}, Schema::kInitialSchemaId);
338+
339+
PartitionField truncate1(3, 1000, "name_trunc1", Transform::Truncate(4));
340+
PartitionField truncate2(3, 1001, "name_trunc2", Transform::Truncate(4));
341+
342+
auto result =
343+
PartitionSpec::Make(schema_with_string, 1, {truncate1, truncate2}, false);
344+
EXPECT_THAT(result, IsError(ErrorKind::kInvalidArgument));
345+
EXPECT_THAT(result, HasErrorMessage("Cannot add redundant partition"));
346+
}
347+
}
348+
349+
TEST(PartitionSpecTest, ValidateRedundantPartitionsAllowedCases) {
350+
// Create a schema with different field types
351+
auto ts_field = SchemaField::MakeRequired(1, "ts", timestamp());
352+
auto id_field = SchemaField::MakeRequired(2, "id", int64());
353+
auto name_field = SchemaField::MakeRequired(3, "name", string());
354+
Schema schema({ts_field, id_field, name_field}, Schema::kInitialSchemaId);
355+
356+
// Test: different bucket sizes on same field (allowed - different dedup names)
357+
{
358+
PartitionField bucket16(2, 1000, "id_bucket16", Transform::Bucket(16));
359+
PartitionField bucket32(2, 1001, "id_bucket32", Transform::Bucket(32));
360+
361+
auto result = PartitionSpec::Make(schema, 1, {bucket16, bucket32}, false);
362+
EXPECT_THAT(result, IsOk());
363+
}
364+
365+
// Test: different truncate widths on same field (allowed - different dedup names)
366+
{
367+
PartitionField truncate4(3, 1000, "name_trunc4", Transform::Truncate(4));
368+
PartitionField truncate8(3, 1001, "name_trunc8", Transform::Truncate(8));
369+
370+
auto result = PartitionSpec::Make(schema, 1, {truncate4, truncate8}, false);
371+
EXPECT_THAT(result, IsOk());
372+
}
373+
374+
// Test: same transforms on different fields (allowed)
375+
{
376+
PartitionField ts_day(1, 1000, "ts_day", Transform::Day());
377+
PartitionField id_bucket(2, 1001, "id_bucket", Transform::Bucket(16));
378+
379+
auto result = PartitionSpec::Make(schema, 1, {ts_day, id_bucket}, false);
380+
EXPECT_THAT(result, IsOk());
381+
}
382+
383+
// Test: different transforms on same field (allowed if dedup names differ)
384+
{
385+
PartitionField ts_day(1, 1000, "ts_day", Transform::Day());
386+
PartitionField ts_month(1, 1001, "ts_month", Transform::Month());
387+
388+
// This should be allowed since Day and Month have different dedup names
389+
// The Java logic only checks for exact dedup name matches
390+
auto result = PartitionSpec::Make(schema, 1, {ts_day, ts_month}, false);
391+
EXPECT_THAT(result, IsOk());
392+
}
393+
394+
// Test: single partition field (no redundancy possible)
395+
{
396+
PartitionField single_field(1, 1000, "ts_year", Transform::Year());
397+
398+
auto result = PartitionSpec::Make(schema, 1, {single_field}, false);
399+
EXPECT_THAT(result, IsOk());
400+
}
401+
}
402+
403+
TEST(PartitionSpecTest, ValidateRedundantPartitionsIdentityTransforms) {
404+
// Create a schema with different field types
405+
auto id_field = SchemaField::MakeRequired(1, "id", int64());
406+
auto name_field = SchemaField::MakeRequired(2, "name", string());
407+
Schema schema({id_field, name_field}, Schema::kInitialSchemaId);
408+
409+
// Test: multiple identity transforms on same field (redundant)
410+
{
411+
PartitionField identity1(1, 1000, "id1", Transform::Identity());
412+
PartitionField identity2(1, 1001, "id2", Transform::Identity());
413+
414+
auto result = PartitionSpec::Make(schema, 1, {identity1, identity2}, false);
415+
EXPECT_THAT(result, IsError(ErrorKind::kInvalidArgument));
416+
EXPECT_THAT(result, HasErrorMessage("Cannot add redundant partition"));
417+
}
418+
419+
// Test: identity transforms on different fields (allowed)
420+
{
421+
PartitionField id_identity(1, 1000, "id", Transform::Identity());
422+
PartitionField name_identity(2, 1001, "name", Transform::Identity());
423+
424+
auto result = PartitionSpec::Make(schema, 1, {id_identity, name_identity}, false);
425+
EXPECT_THAT(result, IsOk());
426+
}
427+
}
428+
306429
} // namespace iceberg

0 commit comments

Comments
 (0)