-
Notifications
You must be signed in to change notification settings - Fork 184
feat(table): Adding geometry and geography type + schema plumbing #984
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -567,6 +567,8 @@ type SchemaVisitorPerPrimitiveType[T any] interface { | |
| VisitBinary() T | ||
| VisitUUID() T | ||
| VisitUnknown() T | ||
| VisitGeometry(GeometryType) T | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These methods get added to the interface but |
||
| VisitGeography(GeographyType) T | ||
| } | ||
|
|
||
| // Visit accepts a visitor and performs a post-order traversal of the given schema. | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -630,6 +630,24 @@ func (c convertToArrow) VisitUnknown() arrow.Field { | |
| } | ||
| } | ||
|
|
||
| func (c convertToArrow) VisitGeometry(iceberg.GeometryType) arrow.Field { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Pass-through binary is fine for now. Worth a one-liner about the GeoArrow migration plan (extension metadata over a binary buffer, no shape change for downstream readers) so we don't trap ourselves once #991 lands. Also note this method is currently unreachable due to the dispatch issue on
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't see why we don't just use https://github.com/geoarrow/geoarrow-go/blob/main/wkb.go#L15 off the bat right now instead of using binary/large binary as the intermediate. https://github.com/geoarrow/geoarrow-go/blob/main/wkb.go#L84 can be used for the large type case |
||
| // Passthrough binary for now, adding geoarrow-go support later | ||
| if c.useLargeTypes { | ||
| return arrow.Field{Type: arrow.BinaryTypes.LargeBinary} | ||
| } | ||
|
|
||
| return arrow.Field{Type: arrow.BinaryTypes.Binary} | ||
| } | ||
|
|
||
| func (c convertToArrow) VisitGeography(iceberg.GeographyType) arrow.Field { | ||
| // Passthrough binary for now, adding geoarrow-go support later | ||
| if c.useLargeTypes { | ||
| return arrow.Field{Type: arrow.BinaryTypes.LargeBinary} | ||
| } | ||
|
|
||
| return arrow.Field{Type: arrow.BinaryTypes.Binary} | ||
|
Comment on lines
+643
to
+648
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same point as above. why not just use https://github.com/geoarrow/geoarrow-go/blob/main/wkb.go#L20 right now instead of the passthrough? |
||
| } | ||
|
|
||
| var _ iceberg.SchemaVisitorPerPrimitiveType[arrow.Field] = convertToArrow{} | ||
|
|
||
| // SchemaToArrowSchema converts an Iceberg schema to an Arrow schema. If the metadata parameter | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1410,6 +1410,8 @@ func TestUnsupportedTypes(t *testing.T) { | |
| TestTypes := []iceberg.Type{ | ||
| iceberg.TimestampNsType{}, | ||
| iceberg.TimestampTzNsType{}, | ||
| iceberg.GeometryType{}, | ||
| iceberg.GeographyType{}, | ||
| } | ||
| for _, typ := range TestTypes { | ||
| for unsupportedVersion := 1; unsupportedVersion < minFormatVersionForType(typ); unsupportedVersion++ { | ||
|
|
@@ -1838,6 +1840,86 @@ func TestUnknownTypeValidation(t *testing.T) { | |
| }) | ||
| } | ||
|
|
||
| func TestGeometryGeographyNullOnlyDefaults(t *testing.T) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The |
||
| testTypes := []struct { | ||
| name string | ||
| typ iceberg.Type | ||
| }{ | ||
| {"geometry", iceberg.GeometryType{}}, | ||
| {"geography", iceberg.GeographyType{}}, | ||
| } | ||
|
|
||
| for _, tt := range testTypes { | ||
| t.Run(tt.name+" with non-null initial default", func(t *testing.T) { | ||
| defaultValue := "POINT(0 0)" | ||
| sc := iceberg.NewSchema(0, | ||
| iceberg.NestedField{ | ||
| Type: tt.typ, | ||
| ID: 1, | ||
| Name: "location", | ||
| Required: false, | ||
| InitialDefault: &defaultValue, | ||
| }, | ||
| ) | ||
|
|
||
| err := checkSchemaCompatibility(sc, 3) | ||
| require.Error(t, err) | ||
| require.ErrorContains(t, err, "columns must default to null") | ||
| require.ErrorIs(t, err, iceberg.ErrInvalidSchema) | ||
| }) | ||
|
|
||
| t.Run(tt.name+" with non-null write default", func(t *testing.T) { | ||
| defaultValue := "POINT(0 0)" | ||
| sc := iceberg.NewSchema(0, | ||
| iceberg.NestedField{ | ||
| Type: tt.typ, | ||
| ID: 1, | ||
| Name: "location", | ||
| Required: false, | ||
| WriteDefault: &defaultValue, | ||
| }, | ||
| ) | ||
|
|
||
| err := checkSchemaCompatibility(sc, 3) | ||
| require.Error(t, err) | ||
| require.ErrorContains(t, err, "columns must default to null") | ||
| require.ErrorIs(t, err, iceberg.ErrInvalidSchema) | ||
| }) | ||
|
|
||
| t.Run(tt.name+" with null defaults", func(t *testing.T) { | ||
| sc := iceberg.NewSchema(0, | ||
| iceberg.NestedField{ | ||
| Type: tt.typ, | ||
| ID: 1, | ||
| Name: "location", | ||
| Required: false, | ||
| }, | ||
| ) | ||
|
|
||
| err := checkSchemaCompatibility(sc, 3) | ||
| require.NoError(t, err) | ||
| }) | ||
|
|
||
| t.Run(tt.name+" in v2 with non-null initial default", func(t *testing.T) { | ||
| defaultValue := "POINT(0 0)" | ||
| sc := iceberg.NewSchema(0, | ||
| iceberg.NestedField{ | ||
| Type: tt.typ, | ||
| ID: 1, | ||
| Name: "location", | ||
| Required: false, | ||
| InitialDefault: &defaultValue, | ||
| }, | ||
| ) | ||
|
|
||
| err := checkSchemaCompatibility(sc, 2) | ||
| require.Error(t, err) | ||
| require.ErrorContains(t, err, "is not supported until v3") | ||
| require.ErrorIs(t, err, iceberg.ErrInvalidSchema) | ||
| }) | ||
| } | ||
| } | ||
|
|
||
| func TestComplexTypeDefaultValidation(t *testing.T) { | ||
| t.Run("InvalidStructInitialDefault", func(t *testing.T) { | ||
| schema := iceberg.NewSchema(1, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -39,7 +39,17 @@ func (e ErrIncompatibleSchema) Error() string { | |
| fmt.Fprintf(&problems, "\n- invalid type for %s: %s is not supported until v%d", f.ColName, f.Field.Type, f.UnsupportedType.MinFormatVersion) | ||
| } | ||
| if f.InvalidDefault != nil { | ||
| fmt.Fprintf(&problems, "\n- invalid initial default for %s: non-null default (%v) is not supported until v%d", f.ColName, f.Field.InitialDefault, f.InvalidDefault.MinFormatVersion) | ||
| switch f.Field.Type.(type) { | ||
| case iceberg.GeometryType, iceberg.GeographyType: | ||
| if f.Field.InitialDefault != nil { | ||
| fmt.Fprintf(&problems, "\n- invalid initial default for %s: %s columns must default to null", f.ColName, f.Field.Type) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When a field has both |
||
| } | ||
| if f.Field.WriteDefault != nil { | ||
| fmt.Fprintf(&problems, "\n- invalid write default for %s: %s columns must default to null", f.ColName, f.Field.Type) | ||
| } | ||
| default: | ||
| fmt.Fprintf(&problems, "\n- invalid initial default for %s: non-null default (%v) is not supported until v%d", f.ColName, f.Field.InitialDefault, f.InvalidDefault.MinFormatVersion) | ||
| } | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -113,12 +123,30 @@ func checkSchemaCompatibility(sc *iceberg.Schema, formatVersion int) error { | |
| }) | ||
| } | ||
|
|
||
| if field.InitialDefault != nil && formatVersion < defaultValuesMinFormatVersion { | ||
| problems = append(problems, IncompatibleField{ | ||
| Field: field, | ||
| ColName: colName, | ||
| InvalidDefault: &InvalidDefault{MinFormatVersion: defaultValuesMinFormatVersion, WriteDefault: field.InitialDefault}, | ||
| }) | ||
| switch field.Type.(type) { | ||
| case iceberg.GeometryType, iceberg.GeographyType: | ||
| if field.InitialDefault != nil { | ||
| problems = append(problems, IncompatibleField{ | ||
| Field: field, | ||
| ColName: colName, | ||
| InvalidDefault: &InvalidDefault{MinFormatVersion: formatVersion, WriteDefault: field.InitialDefault}, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
| }) | ||
| } | ||
| if field.WriteDefault != nil { | ||
| problems = append(problems, IncompatibleField{ | ||
| Field: field, | ||
| ColName: colName, | ||
| InvalidDefault: &InvalidDefault{MinFormatVersion: formatVersion, WriteDefault: field.WriteDefault}, | ||
| }) | ||
| } | ||
| default: | ||
| if field.InitialDefault != nil && formatVersion < defaultValuesMinFormatVersion { | ||
| problems = append(problems, IncompatibleField{ | ||
| Field: field, | ||
| ColName: colName, | ||
| InvalidDefault: &InvalidDefault{MinFormatVersion: defaultValuesMinFormatVersion, WriteDefault: field.InitialDefault}, | ||
| }) | ||
| } | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -134,7 +162,7 @@ func checkSchemaCompatibility(sc *iceberg.Schema, formatVersion int) error { | |
| // version number for types that require newer format versions. | ||
| func minFormatVersionForType(t iceberg.Type) int { | ||
| switch t.(type) { | ||
| case iceberg.TimestampNsType, iceberg.TimestampTzNsType, iceberg.UnknownType: | ||
| case iceberg.TimestampNsType, iceberg.TimestampTzNsType, iceberg.UnknownType, iceberg.GeometryType, iceberg.GeographyType: | ||
| return 3 | ||
| default: | ||
| // All other types supported in v1+ | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -169,6 +169,10 @@ func (convertToSubstrait) VisitUnknown() types.Type { | |
| // Returning nil indicates this type cannot be converted to Substrait | ||
| return nil | ||
| } | ||
| func (convertToSubstrait) VisitGeometry(iceberg.GeometryType) types.Type { return &types.BinaryType{} } | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same dispatch issue as |
||
| func (convertToSubstrait) VisitGeography(iceberg.GeographyType) types.Type { | ||
| return &types.BinaryType{} | ||
| } | ||
|
|
||
| var _ iceberg.SchemaVisitorPerPrimitiveType[types.Type] = (*convertToSubstrait)(nil) | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
should this be https://github.com/geoarrow/geoarrow-go/blob/main/wkb.go#L20 instead of
[]byte?