Skip to content

Commit 70728f6

Browse files
authored
Add option for deterministic defaults (#18)
1 parent f52607f commit 70728f6

File tree

2 files changed

+77
-1
lines changed

2 files changed

+77
-1
lines changed

src/py_avro_schema/_schemas.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,10 @@ class Option(enum.Flag):
147147
#: Add an __id field to records to track the id of mutable objects
148148
ADD_REFERENCE_ID = enum.auto()
149149

150+
# Use deterministic default values for volatile types like datetime or strings like UUIDs. Non-deterministic
151+
# factories might be a problem when comparing schemas, as by definition they produce a different value every time a schema is generated.
152+
DETERMINISTIC_DEFAULTS = enum.auto()
153+
150154

151155
JSON_OPTIONS = [opt for opt in Option if opt.name and opt.name.startswith("JSON_")]
152156

@@ -221,6 +225,7 @@ def _schema_obj(py_type: Type, namespace: Optional[str] = None, options: Option
221225

222226
# See https://avro.apache.org/docs/1.11.1/specification/#names
223227
_AVRO_NAME_PATTERN = re.compile(r"^[A-Za-z]([A-Za-z0-9_])*$")
228+
_UUID_PATTERN = re.compile(r"^[0-9a-f]{8}(?:-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})?$", re.IGNORECASE)
224229

225230

226231
def validate_name(value: str) -> str:
@@ -487,6 +492,8 @@ def data(self, names: NamesType) -> JSONObj:
487492

488493
def make_default(self, py_default: datetime.date) -> int:
489494
"""Return an Avro schema compliant default value for a given Python value"""
495+
if Option.DETERMINISTIC_DEFAULTS in self.options:
496+
return 0
490497
return (py_default - datetime.date(1970, 1, 1)).days
491498

492499

@@ -510,6 +517,8 @@ def data(self, names: NamesType) -> JSONObj:
510517

511518
def make_default(self, py_default: datetime.time) -> int:
512519
"""Return an Avro schema compliant default value for a given Python value"""
520+
if Option.DETERMINISTIC_DEFAULTS in self.options:
521+
return 0
513522
# Force UTC as we're concerned only about time diffs
514523
dt1 = datetime.datetime(1, 1, 1, tzinfo=datetime.timezone.utc)
515524
dt2 = datetime.datetime.combine(datetime.datetime(1, 1, 1), py_default, tzinfo=datetime.timezone.utc)
@@ -532,6 +541,8 @@ def data(self, names: NamesType) -> JSONObj:
532541

533542
def make_default(self, py_default: datetime.datetime) -> int:
534543
"""Return an Avro schema compliant default value for a given Python value"""
544+
if Option.DETERMINISTIC_DEFAULTS in self.options:
545+
return 0
535546
if not py_default.tzinfo:
536547
raise TypeError(f"Default {py_default!r} must be timezone-aware")
537548
return int((py_default - datetime.datetime.fromtimestamp(0, tz=datetime.timezone.utc)).total_seconds() * 1e6)
@@ -1079,7 +1090,18 @@ def data(self, names: NamesType) -> JSONObj:
10791090
if self.aliases:
10801091
field_data["aliases"] = sorted(self.aliases)
10811092
if self.default != dataclasses.MISSING:
1082-
field_data["default"] = self.schema.make_default(self.default)
1093+
default_value = self.schema.make_default(self.default)
1094+
1095+
# When a field is a string but its default value is a UUID-like string, we do not pass through the UUID
1096+
# schema (which already sets an empty default). Here we need to catch the strings that look like a UUID
1097+
# and set a deterministic default.
1098+
if (
1099+
Option.DETERMINISTIC_DEFAULTS in self.options
1100+
and isinstance(default_value, str)
1101+
and _UUID_PATTERN.match(default_value)
1102+
):
1103+
default_value = ""
1104+
field_data["default"] = default_value
10831105
if self.docs and Option.NO_DOC not in self.options:
10841106
field_data["doc"] = self.docs
10851107
return field_data

tests/test_dataclass.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -887,3 +887,57 @@ class PyType:
887887
],
888888
}
889889
assert_schema(PyType, expected, options=Option.ADD_REFERENCE_ID)
890+
891+
892+
def test_deterministic_defaults_datetime():
893+
@dataclasses.dataclass
894+
class PyType:
895+
field_a: datetime.datetime = dataclasses.field(
896+
default_factory=lambda: datetime.datetime.now(tz=datetime.timezone.utc)
897+
)
898+
899+
expected = {
900+
"type": "record",
901+
"name": "PyType",
902+
"fields": [
903+
{
904+
"name": "field_a",
905+
"type": {"type": "long", "logicalType": "timestamp-micros"},
906+
"default": 0,
907+
}
908+
],
909+
}
910+
assert_schema(PyType, expected, options=pas.Option.DETERMINISTIC_DEFAULTS)
911+
912+
913+
def test_deterministic_defaults_uuid_str():
914+
import uuid
915+
916+
def short_uid() -> str:
917+
return str(uuid.uuid4())[0:8]
918+
919+
def long_uid() -> str:
920+
return str(uuid.uuid4())
921+
922+
@dataclasses.dataclass
923+
class PyType:
924+
_long_uid: str = dataclasses.field(default_factory=long_uid)
925+
_short_uid: str = dataclasses.field(default_factory=short_uid)
926+
927+
expected = {
928+
"type": "record",
929+
"name": "PyType",
930+
"fields": [
931+
{
932+
"name": "_long_uid",
933+
"type": "string",
934+
"default": "",
935+
},
936+
{
937+
"name": "_short_uid",
938+
"type": "string",
939+
"default": "",
940+
},
941+
],
942+
}
943+
assert_schema(PyType, expected, options=pas.Option.DETERMINISTIC_DEFAULTS)

0 commit comments

Comments
 (0)