Skip to content

Commit d7c02c2

Browse files
committed
Impl sanitization
1 parent 4cac691 commit d7c02c2

File tree

2 files changed

+35
-9
lines changed

2 files changed

+35
-9
lines changed

pyiceberg/schema.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,8 @@
7878

7979
INITIAL_SCHEMA_ID = 0
8080

81+
FIELD_ID_PROP = "field-id"
82+
ICEBERG_FIELD_NAME_PROP = "iceberg-field-name"
8183

8284
class Schema(IcebergBaseModel):
8385
"""A table Schema.
@@ -1356,6 +1358,21 @@ def primitive(self, primitive: PrimitiveType) -> PrimitiveType:
13561358

13571359
# Implementation copied from Apache Iceberg repo.
13581360
def make_compatible_name(name: str) -> str:
1361+
"""Make a field name compatible with Avro specification.
1362+
1363+
This function sanitizes field names to comply with Avro naming rules:
1364+
- Names must start with [A-Za-z_]
1365+
- Subsequent characters must be [A-Za-z0-9_]
1366+
1367+
Invalid characters are replaced with _x<HEX>, where <HEX> is the character's code point in uppercase hexadecimal (not zero-padded; e.g. '.' becomes _x2E).
1368+
Names starting with digits get a leading underscore.
1369+
1370+
Args:
1371+
name: The original field name
1372+
1373+
Returns:
1374+
A sanitized name that complies with Avro specification
1375+
"""
13591376
if not _valid_avro_name(name):
13601377
return _sanitize_name(name)
13611378
return name
@@ -1391,7 +1408,9 @@ def _sanitize_name(name: str) -> str:
13911408

13921409

13931410
def _sanitize_char(character: str) -> str:
1394-
return "_" + character if character.isdigit() else "_x" + hex(ord(character))[2:].upper()
1411+
if character.isdigit():
1412+
return "_" + character
1413+
return "_x" + hex(ord(character))[2:].upper()
13951414

13961415

13971416
def sanitize_column_names(schema: Schema) -> Schema:

pyiceberg/utils/schema_conversion.py

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
Union,
2727
)
2828

29-
from pyiceberg.schema import Schema, SchemaVisitorPerPrimitiveType, visit
29+
from pyiceberg.schema import Schema, SchemaVisitorPerPrimitiveType, visit, ICEBERG_FIELD_NAME_PROP, FIELD_ID_PROP, make_compatible_name, _valid_avro_name
3030
from pyiceberg.types import (
3131
BinaryType,
3232
BooleanType,
@@ -225,13 +225,13 @@ def _convert_field(self, field: Dict[str, Any]) -> NestedField:
225225
Returns:
226226
The Iceberg equivalent field.
227227
"""
228-
if "field-id" not in field:
229-
raise ValueError(f"Cannot convert field, missing field-id: {field}")
228+
if FIELD_ID_PROP not in field:
229+
raise ValueError(f"Cannot convert field, missing {FIELD_ID_PROP}: {field}")
230230

231231
plain_type, required = self._resolve_union(field["type"])
232232

233233
return NestedField(
234-
field_id=field["field-id"],
234+
field_id=field[FIELD_ID_PROP],
235235
name=field["name"],
236236
field_type=self._convert_schema(plain_type),
237237
required=required,
@@ -524,12 +524,19 @@ def field(self, field: NestedField, field_result: AvroType) -> AvroType:
524524
if isinstance(field_result, dict) and field_result.get("type") == "record":
525525
field_result["name"] = f"r{field.field_id}"
526526

527+
orig_field_name = field.name
528+
is_valid_field_name = _valid_avro_name(orig_field_name)
529+
field_name = orig_field_name if is_valid_field_name else make_compatible_name(orig_field_name)
530+
527531
result = {
528-
"name": field.name,
529-
"field-id": field.field_id,
532+
"name": field_name,
533+
FIELD_ID_PROP: field.field_id,
530534
"type": field_result if field.required else ["null", field_result],
531535
}
532536

537+
if not is_valid_field_name:
538+
result[ICEBERG_FIELD_NAME_PROP] = orig_field_name
539+
533540
if field.write_default is not None:
534541
result["default"] = field.write_default
535542
elif field.optional:
@@ -564,8 +571,8 @@ def map(self, map_type: MapType, key_result: AvroType, value_result: AvroType) -
564571
"type": "record",
565572
"name": f"k{self.last_map_key_field_id}_v{self.last_map_value_field_id}",
566573
"fields": [
567-
{"name": "key", "type": key_result, "field-id": self.last_map_key_field_id},
568-
{"name": "value", "type": value_result, "field-id": self.last_map_value_field_id},
574+
{"name": "key", "type": key_result, FIELD_ID_PROP: self.last_map_key_field_id},
575+
{"name": "value", "type": value_result, FIELD_ID_PROP: self.last_map_value_field_id},
569576
],
570577
},
571578
"logicalType": "map",

0 commit comments

Comments
 (0)