Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions exprs.go
Original file line number Diff line number Diff line change
Expand Up @@ -450,6 +450,8 @@ func createBoundRef(field NestedField, acc accessor) BoundReference {
return &boundRef[Decimal]{field: field, acc: acc}
case UUIDType:
return &boundRef[uuid.UUID]{field: field, acc: acc}
case GeographyType, GeometryType:
return &boundRef[[]byte]{field: field, acc: acc}
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

}
panic("unhandled bound reference type: " + field.Type.String())
}
Expand Down
2 changes: 2 additions & 0 deletions schema.go
Original file line number Diff line number Diff line change
Expand Up @@ -567,6 +567,8 @@ type SchemaVisitorPerPrimitiveType[T any] interface {
VisitBinary() T
VisitUUID() T
VisitUnknown() T
VisitGeometry(GeometryType) T
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These methods get added to the interface but visitField (~671–720) doesn't dispatch them. GeometryType/GeographyType are PrimitiveTypes, so they fall through to visitor.Primitive(...), which both visitors panic from. SchemaToArrowSchema on any geo column will panic at runtime today. I'd add the two cases in the dispatcher and a regression test that runs iceberg.Visit(geoSchema, convertToArrow{}).

VisitGeography(GeographyType) T
}

// Visit accepts a visitor and performs a post-order traversal of the given schema.
Expand Down
174 changes: 174 additions & 0 deletions schema_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1035,3 +1035,177 @@ func TestSanitizeColumnNamesEmptyFieldName(t *testing.T) {
assert.ErrorIs(t, err, iceberg.ErrInvalidSchema)
assert.ErrorContains(t, err, "field name cannot be empty")
}

func TestSchemaWithGeometryGeographyTypes(t *testing.T) {
geom, err := iceberg.GeometryTypeOf("srid:4326")
require.NoError(t, err)
geog, err := iceberg.GeographyTypeOf("srid:4269", iceberg.EdgeAlgorithmKarney)
require.NoError(t, err)

schema := iceberg.NewSchema(1,
iceberg.NestedField{ID: 1, Name: "id", Type: iceberg.PrimitiveTypes.Int64, Required: true},
iceberg.NestedField{ID: 2, Name: "simple_point", Type: iceberg.GeometryType{}, Required: false},
iceberg.NestedField{ID: 3, Name: "location", Type: geom, Required: false},
iceberg.NestedField{ID: 4, Name: "service_area", Type: geog, Required: false},
)

data, err := json.Marshal(schema)
require.NoError(t, err)

assert.JSONEq(t, `{
"type": "struct",
"schema-id": 1,
"identifier-field-ids": [],
"fields": [
{"id": 1, "name": "id", "type": "long", "required": true},
{"id": 2, "name": "simple_point", "type": "geometry", "required": false},
{"id": 3, "name": "location", "type": "geometry(srid:4326)", "required": false},
{"id": 4, "name": "service_area", "type": "geography(srid:4269, karney)", "required": false}
]
}`, string(data))

var unmarshaledSchema iceberg.Schema
require.NoError(t, json.Unmarshal(data, &unmarshaledSchema))
assert.True(t, schema.Equals(&unmarshaledSchema))
}

func TestNestedFieldToStringGeographyGeometry(t *testing.T) {
geom, err := iceberg.GeometryTypeOf("srid:3857")
require.NoError(t, err)
geog, err := iceberg.GeographyTypeOf("srid:4269", iceberg.EdgeAlgorithmKarney)
require.NoError(t, err)

tests := []struct {
field iceberg.NestedField
expected string
}{
{
iceberg.NestedField{ID: 1, Name: "point", Type: iceberg.GeometryType{}, Required: false},
"1: point: optional geometry",
},
{
iceberg.NestedField{ID: 2, Name: "location", Type: geom, Required: true},
"2: location: required geometry(srid:3857)",
},
{
iceberg.NestedField{ID: 3, Name: "area", Type: iceberg.GeographyType{}, Required: false},
"3: area: optional geography",
},
{
iceberg.NestedField{ID: 4, Name: "region", Type: geog, Required: false},
"4: region: optional geography(srid:4269, karney)",
},
}

for _, tt := range tests {
t.Run(tt.field.Name, func(t *testing.T) {
assert.Equal(t, tt.expected, tt.field.String())
})
}
}

func TestSchemaWithGeometryInNestedStructures(t *testing.T) {
geom, err := iceberg.GeometryTypeOf("srid:4326")
require.NoError(t, err)
geog, err := iceberg.GeographyTypeOf("srid:4269", iceberg.EdgeAlgorithmSpherical)
require.NoError(t, err)

schema := iceberg.NewSchema(1,
iceberg.NestedField{
ID: 1,
Name: "locations",
Type: &iceberg.ListType{
ElementID: 2,
Element: geom,
ElementRequired: true,
},
Required: true,
},
iceberg.NestedField{
ID: 3,
Name: "region_data",
Type: &iceberg.MapType{
KeyID: 4,
KeyType: iceberg.PrimitiveTypes.String,
ValueID: 5,
ValueType: geog,
ValueRequired: false,
},
Required: false,
},
iceberg.NestedField{
ID: 6,
Name: "place",
Type: &iceberg.StructType{
FieldList: []iceberg.NestedField{
{ID: 7, Name: "name", Type: iceberg.PrimitiveTypes.String, Required: true},
{ID: 8, Name: "coords", Type: iceberg.GeometryType{}, Required: false},
},
},
Required: false,
},
)

data, err := json.Marshal(schema)
require.NoError(t, err)

var unmarshaledSchema iceberg.Schema
require.NoError(t, json.Unmarshal(data, &unmarshaledSchema))
assert.True(t, schema.Equals(&unmarshaledSchema))

assert.Equal(t, "1: locations: required list<geometry(srid:4326)>", schema.Field(0).String())
assert.Equal(t, "3: region_data: optional map<string, geography(srid:4269, spherical)>", schema.Field(1).String())
}

func TestPruneColumnsWithGeometry(t *testing.T) {
geom, err := iceberg.GeometryTypeOf("srid:4326")
require.NoError(t, err)

schema := iceberg.NewSchema(1,
iceberg.NestedField{ID: 1, Name: "id", Type: iceberg.PrimitiveTypes.Int32, Required: true},
iceberg.NestedField{ID: 2, Name: "name", Type: iceberg.PrimitiveTypes.String, Required: false},
iceberg.NestedField{ID: 3, Name: "location", Type: geom, Required: false},
)

pruned, err := iceberg.PruneColumns(schema, map[int]iceberg.Void{1: {}, 3: {}}, false)
require.NoError(t, err)

expected := iceberg.NewSchema(1,
iceberg.NestedField{ID: 1, Name: "id", Type: iceberg.PrimitiveTypes.Int32, Required: true},
iceberg.NestedField{ID: 3, Name: "location", Type: geom, Required: false},
)

assert.True(t, pruned.Equals(expected))
}

func TestSchemaIndexByIDWithGeography(t *testing.T) {
geog, err := iceberg.GeographyTypeOf("srid:4269", iceberg.EdgeAlgorithmKarney)
require.NoError(t, err)

schema := iceberg.NewSchema(1,
iceberg.NestedField{ID: 1, Name: "id", Type: iceberg.PrimitiveTypes.Int32, Required: true},
iceberg.NestedField{ID: 2, Name: "area", Type: geog, Required: false},
)

index, err := iceberg.IndexByID(schema)
require.NoError(t, err)

assert.Len(t, index, 2)
assert.Equal(t, geog, index[2].Type)
assert.Equal(t, "area", index[2].Name)
}

func TestSchemaFindColumnNameWithGeometryGeography(t *testing.T) {
schema := iceberg.NewSchema(1,
iceberg.NestedField{ID: 1, Name: "point", Type: iceberg.GeometryType{}, Required: false},
iceberg.NestedField{ID: 2, Name: "region", Type: iceberg.GeographyType{}, Required: false},
)

name, ok := schema.FindColumnName(1)
assert.True(t, ok)
assert.Equal(t, "point", name)

name, ok = schema.FindColumnName(2)
assert.True(t, ok)
assert.Equal(t, "region", name)
}
18 changes: 18 additions & 0 deletions table/arrow_utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -630,6 +630,24 @@ func (c convertToArrow) VisitUnknown() arrow.Field {
}
}

func (c convertToArrow) VisitGeometry(iceberg.GeometryType) arrow.Field {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pass-through binary is fine for now. Worth a one-liner about the GeoArrow migration plan (extension metadata over a binary buffer, no shape change for downstream readers) so we don't trap ourselves once #991 lands. Also note this method is currently unreachable due to the dispatch issue on schema.go.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see why we don't just use https://github.com/geoarrow/geoarrow-go/blob/main/wkb.go#L15 off the bat right now instead of using binary/large binary as the intermediate.

https://github.com/geoarrow/geoarrow-go/blob/main/wkb.go#L84 can be used for the large type case

// Passthrough binary for now, adding geoarrow-go support later
if c.useLargeTypes {
return arrow.Field{Type: arrow.BinaryTypes.LargeBinary}
}

return arrow.Field{Type: arrow.BinaryTypes.Binary}
}

func (c convertToArrow) VisitGeography(iceberg.GeographyType) arrow.Field {
// Passthrough binary for now, adding geoarrow-go support later
if c.useLargeTypes {
return arrow.Field{Type: arrow.BinaryTypes.LargeBinary}
}

return arrow.Field{Type: arrow.BinaryTypes.Binary}
Comment on lines +643 to +648
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same point as above. why not just use https://github.com/geoarrow/geoarrow-go/blob/main/wkb.go#L20 right now instead of the passthrough?

}

var _ iceberg.SchemaVisitorPerPrimitiveType[arrow.Field] = convertToArrow{}

// SchemaToArrowSchema converts an Iceberg schema to an Arrow schema. If the metadata parameter
Expand Down
82 changes: 82 additions & 0 deletions table/metadata_builder_internal_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1410,6 +1410,8 @@ func TestUnsupportedTypes(t *testing.T) {
TestTypes := []iceberg.Type{
iceberg.TimestampNsType{},
iceberg.TimestampTzNsType{},
iceberg.GeometryType{},
iceberg.GeographyType{},
}
for _, typ := range TestTypes {
for unsupportedVersion := 1; unsupportedVersion < minFormatVersionForType(typ); unsupportedVersion++ {
Expand Down Expand Up @@ -1838,6 +1840,86 @@ func TestUnknownTypeValidation(t *testing.T) {
})
}

func TestGeometryGeographyNullOnlyDefaults(t *testing.T) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The v2 with non-null initial default subtest asserts "is not supported until v3". That path actually fires both the v2-unsupported error and the must-default-to-null error, and ErrorContains happens to match the first. If geo ever becomes allowed in v2, this silently changes meaning. Worth splitting: one v2 subtest asserting the type-unsupported message, one v3 subtest with non-null default asserting must-default-to-null.

testTypes := []struct {
name string
typ iceberg.Type
}{
{"geometry", iceberg.GeometryType{}},
{"geography", iceberg.GeographyType{}},
}

for _, tt := range testTypes {
t.Run(tt.name+" with non-null initial default", func(t *testing.T) {
defaultValue := "POINT(0 0)"
sc := iceberg.NewSchema(0,
iceberg.NestedField{
Type: tt.typ,
ID: 1,
Name: "location",
Required: false,
InitialDefault: &defaultValue,
},
)

err := checkSchemaCompatibility(sc, 3)
require.Error(t, err)
require.ErrorContains(t, err, "columns must default to null")
require.ErrorIs(t, err, iceberg.ErrInvalidSchema)
})

t.Run(tt.name+" with non-null write default", func(t *testing.T) {
defaultValue := "POINT(0 0)"
sc := iceberg.NewSchema(0,
iceberg.NestedField{
Type: tt.typ,
ID: 1,
Name: "location",
Required: false,
WriteDefault: &defaultValue,
},
)

err := checkSchemaCompatibility(sc, 3)
require.Error(t, err)
require.ErrorContains(t, err, "columns must default to null")
require.ErrorIs(t, err, iceberg.ErrInvalidSchema)
})

t.Run(tt.name+" with null defaults", func(t *testing.T) {
sc := iceberg.NewSchema(0,
iceberg.NestedField{
Type: tt.typ,
ID: 1,
Name: "location",
Required: false,
},
)

err := checkSchemaCompatibility(sc, 3)
require.NoError(t, err)
})

t.Run(tt.name+" in v2 with non-null initial default", func(t *testing.T) {
defaultValue := "POINT(0 0)"
sc := iceberg.NewSchema(0,
iceberg.NestedField{
Type: tt.typ,
ID: 1,
Name: "location",
Required: false,
InitialDefault: &defaultValue,
},
)

err := checkSchemaCompatibility(sc, 2)
require.Error(t, err)
require.ErrorContains(t, err, "is not supported until v3")
require.ErrorIs(t, err, iceberg.ErrInvalidSchema)
})
}
}

func TestComplexTypeDefaultValidation(t *testing.T) {
t.Run("InvalidStructInitialDefault", func(t *testing.T) {
schema := iceberg.NewSchema(1,
Expand Down
44 changes: 36 additions & 8 deletions table/metadata_schema_compatibility.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,17 @@ func (e ErrIncompatibleSchema) Error() string {
fmt.Fprintf(&problems, "\n- invalid type for %s: %s is not supported until v%d", f.ColName, f.Field.Type, f.UnsupportedType.MinFormatVersion)
}
if f.InvalidDefault != nil {
fmt.Fprintf(&problems, "\n- invalid initial default for %s: non-null default (%v) is not supported until v%d", f.ColName, f.Field.InitialDefault, f.InvalidDefault.MinFormatVersion)
switch f.Field.Type.(type) {
case iceberg.GeometryType, iceberg.GeographyType:
if f.Field.InitialDefault != nil {
fmt.Fprintf(&problems, "\n- invalid initial default for %s: %s columns must default to null", f.ColName, f.Field.Type)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When a field has both InitialDefault and WriteDefault, two IncompatibleField entries get appended (see ~line 132 below) and Error() re-prints both default kinds for each — four lines for two problems. Either collapse to one entry per field, or drive the printout from f.InvalidDefault.WriteDefault (the value captured at append time).

}
if f.Field.WriteDefault != nil {
fmt.Fprintf(&problems, "\n- invalid write default for %s: %s columns must default to null", f.ColName, f.Field.Type)
}
default:
fmt.Fprintf(&problems, "\n- invalid initial default for %s: non-null default (%v) is not supported until v%d", f.ColName, f.Field.InitialDefault, f.InvalidDefault.MinFormatVersion)
}
}
}

Expand Down Expand Up @@ -113,12 +123,30 @@ func checkSchemaCompatibility(sc *iceberg.Schema, formatVersion int) error {
})
}

if field.InitialDefault != nil && formatVersion < defaultValuesMinFormatVersion {
problems = append(problems, IncompatibleField{
Field: field,
ColName: colName,
InvalidDefault: &InvalidDefault{MinFormatVersion: defaultValuesMinFormatVersion, WriteDefault: field.InitialDefault},
})
switch field.Type.(type) {
case iceberg.GeometryType, iceberg.GeographyType:
if field.InitialDefault != nil {
problems = append(problems, IncompatibleField{
Field: field,
ColName: colName,
InvalidDefault: &InvalidDefault{MinFormatVersion: formatVersion, WriteDefault: field.InitialDefault},
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

MinFormatVersion = formatVersion (current version) here, but the field's documented as the minimum required version where the feature became supported. IncompatibleField is exported, so a downstream consumer reads a misleading value. Cleaner to add a separate reason flag like MustBeNullForType and switch on that in Error() instead of re-checking f.Field.Type.

})
}
if field.WriteDefault != nil {
problems = append(problems, IncompatibleField{
Field: field,
ColName: colName,
InvalidDefault: &InvalidDefault{MinFormatVersion: formatVersion, WriteDefault: field.WriteDefault},
})
}
default:
if field.InitialDefault != nil && formatVersion < defaultValuesMinFormatVersion {
problems = append(problems, IncompatibleField{
Field: field,
ColName: colName,
InvalidDefault: &InvalidDefault{MinFormatVersion: defaultValuesMinFormatVersion, WriteDefault: field.InitialDefault},
})
}
}
}

Expand All @@ -134,7 +162,7 @@ func checkSchemaCompatibility(sc *iceberg.Schema, formatVersion int) error {
// version number for types that require newer format versions.
func minFormatVersionForType(t iceberg.Type) int {
switch t.(type) {
case iceberg.TimestampNsType, iceberg.TimestampTzNsType, iceberg.UnknownType:
case iceberg.TimestampNsType, iceberg.TimestampTzNsType, iceberg.UnknownType, iceberg.GeometryType, iceberg.GeographyType:
return 3
default:
// All other types supported in v1+
Expand Down
4 changes: 4 additions & 0 deletions table/substrait/substrait.go
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,10 @@ func (convertToSubstrait) VisitUnknown() types.Type {
// Returning nil indicates this type cannot be converted to Substrait
return nil
}
func (convertToSubstrait) VisitGeometry(iceberg.GeometryType) types.Type { return &types.BinaryType{} }
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same dispatch issue as schema.go — these are dead code today because visitField doesn't route to them, and convertToSubstrait.Primitive panics on the fall-through. Minor: VisitGeometry is one-line but VisitGeography is multi-line; surrounding methods are uniformly one-liners.

func (convertToSubstrait) VisitGeography(iceberg.GeographyType) types.Type {
return &types.BinaryType{}
}

var _ iceberg.SchemaVisitorPerPrimitiveType[types.Type] = (*convertToSubstrait)(nil)

Expand Down
Loading
Loading