From d1e1bd356087bbcf6a32c4404c814449acdc3826 Mon Sep 17 00:00:00 2001 From: Giovanni Grano Date: Tue, 2 Dec 2025 18:12:23 +0100 Subject: [PATCH 1/4] wip --- src/py_avro_schema/_schemas.py | 11 +++++++++++ tests/test_typed_dict.py | 21 +++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/src/py_avro_schema/_schemas.py b/src/py_avro_schema/_schemas.py index 8ecd499..f34961c 100644 --- a/src/py_avro_schema/_schemas.py +++ b/src/py_avro_schema/_schemas.py @@ -1168,6 +1168,8 @@ def handles_type(cls, py_type: Type) -> bool: not dataclasses.is_dataclass(py_type) # Pydantic models are handled above and not hasattr(py_type, "__pydantic_private__") + # typed_dict handled separately + and not is_typeddict(py_type) # If we are subclassing a string, used the "named string" approach and (inspect.isclass(py_type) and not issubclass(py_type, str)) # and any other class with typed annotations @@ -1240,12 +1242,21 @@ def __init__(self, py_type: Type, namespace: str | None = None, options: Option """ super().__init__(py_type, namespace=namespace, options=options) py_type = _type_from_annotated(py_type) + self.is_total = py_type.__dict__.get("__total__", True) self.py_fields: dict[str, Type] = get_type_hints(py_type, include_extras=True) self.record_fields = [self._record_field(field) for field in self.py_fields.items()] def _record_field(self, py_field: tuple[str, Type]) -> RecordField: """Return an Avro record field object for a given TypedDict field""" aliases, actual_type = get_field_aliases_and_actual_type(py_field[1]) + + if not self.is_total: + # If a TypedDict is marked as total=None, it does not need to contain all the field. However, we need to + # be able to distinguish between the fields that are missing from the ones that are present but set to None. + # To do that, we extend the original type with str. We will later add a special string (e.g., __td_missing__) + # as a marker at deserialization time. 
+ actual_type = Union[actual_type, str] + field_obj = RecordField( py_type=actual_type, name=py_field[0], diff --git a/tests/test_typed_dict.py b/tests/test_typed_dict.py index acbbeb7..d90d4f8 100644 --- a/tests/test_typed_dict.py +++ b/tests/test_typed_dict.py @@ -85,3 +85,24 @@ class User(TypedDict): } assert_schema(User, expected) + + +def test_non_total_typed_dict(): + + class PyType(TypedDict, total=False): + name: str + age: int | None + + expected = { + "type": "record", + "name": "PyType", + "fields": [ + { + "name": "name", + "type":"string", + }, + {"name": "age", "type": ["long", "null", "string"]}, + ] + } + assert_schema(PyType, expected) + From 41185f03a6b585de4d987de85838e60658b299f0 Mon Sep 17 00:00:00 2001 From: Giovanni Grano Date: Wed, 3 Dec 2025 09:12:18 +0100 Subject: [PATCH 2/4] handle strenum with invalid strings --- src/py_avro_schema/_schemas.py | 19 +++++++++++++++---- tests/test_typed_dict.py | 10 +++++++--- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/src/py_avro_schema/_schemas.py b/src/py_avro_schema/_schemas.py index f34961c..c4866a5 100644 --- a/src/py_avro_schema/_schemas.py +++ b/src/py_avro_schema/_schemas.py @@ -804,7 +804,18 @@ def data(self, names: NamesType) -> JSONType: schemas = (item_schema.data(names=names) for item_schema in self.item_schemas) # We need to deduplicate the schemas **after** rendering. This is because **different** Python types might # result in the **same** Avro schema. Preserving order as order may be significant in an Avro schema. - unique_schemas = list(more_itertools.unique_everseen(schemas)) + + def normalize_string_duplicates(_schema): + """We might have cases in which we have a schema both for ``StrSubclassSchema`` (e.g., a ``StrEnum`` with + invalid names is represented as a ``StrSubclassSchema``) and a string. 
These are technically duplicates, + but ``unique_everseen`` won't remove them by default.""" + if _schema == "string": + return "string" + elif isinstance(_schema, dict) and _schema.get("type") == "string": + return "string" + return _schema + + unique_schemas = list(more_itertools.unique_everseen(schemas, key=normalize_string_duplicates)) if len(unique_schemas) > 1: return unique_schemas else: @@ -1253,9 +1264,9 @@ def _record_field(self, py_field: tuple[str, Type]) -> RecordField: if not self.is_total: # If a TypedDict is marked as total=None, it does not need to contain all the field. However, we need to # be able to distinguish between the fields that are missing from the ones that are present but set to None. - # To do that, we extend the original type with str. We will later add a special string (e.g., __td_missing__) - # as a marker at deserialization time. - actual_type = Union[actual_type, str] + # To do that, we extend the original type with str. We will later add a special string + # (e.g., __td_missing__) as a marker at deserialization time. 
+ actual_type = Union[actual_type, str] # type: ignore field_obj = RecordField( py_type=actual_type, diff --git a/tests/test_typed_dict.py b/tests/test_typed_dict.py index d90d4f8..195c61a 100644 --- a/tests/test_typed_dict.py +++ b/tests/test_typed_dict.py @@ -1,3 +1,4 @@ +from enum import StrEnum from typing import Annotated, TypedDict from py_avro_schema._alias import Alias, register_type_alias @@ -88,10 +89,13 @@ class User(TypedDict): def test_non_total_typed_dict(): + class Opt(StrEnum): + val = "invalid-val" class PyType(TypedDict, total=False): name: str age: int | None + opt: Opt | None expected = { "type": "record", @@ -99,10 +103,10 @@ class PyType(TypedDict, total=False): "fields": [ { "name": "name", - "type":"string", + "type": "string", }, {"name": "age", "type": ["long", "null", "string"]}, - ] + {"name": "opt", "type": [{"namedString": "Opt", "type": "string"}, "null"]}, + ], } assert_schema(PyType, expected) - From 77b523b4c387a8fea19c974fc64301dd82b6b7da Mon Sep 17 00:00:00 2001 From: Giovanni Grano Date: Wed, 3 Dec 2025 09:30:03 +0100 Subject: [PATCH 3/4] adding option --- src/py_avro_schema/_schemas.py | 8 +++++++- tests/test_typed_dict.py | 3 ++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/py_avro_schema/_schemas.py b/src/py_avro_schema/_schemas.py index c4866a5..eee82bb 100644 --- a/src/py_avro_schema/_schemas.py +++ b/src/py_avro_schema/_schemas.py @@ -131,6 +131,12 @@ class Option(enum.Flag): #: See https://docs.pydantic.dev/dev/api/fields/#pydantic.fields.Field USE_FIELD_ALIAS = enum.auto() + #: TypedDict marked with ``total=False`` are valid structures when a field is missing. When a field is also + # optional, we need to have a way to distinguish between a `None` and a non-set field. With this option, the type + # of each field is extended with `string`. This way, clients can add markers (e.g., `__td_missing__`) to discern + # the two cases. 
+ MARK_NON_TOTAL_TYPED_DICTS = enum.auto() + JSON_OPTIONS = [opt for opt in Option if opt.name and opt.name.startswith("JSON_")] @@ -1261,7 +1267,7 @@ def _record_field(self, py_field: tuple[str, Type]) -> RecordField: """Return an Avro record field object for a given TypedDict field""" aliases, actual_type = get_field_aliases_and_actual_type(py_field[1]) - if not self.is_total: + if Option.MARK_NON_TOTAL_TYPED_DICTS in self.options and not self.is_total: # If a TypedDict is marked as total=None, it does not need to contain all the field. However, we need to # be able to distinguish between the fields that are missing from the ones that are present but set to None. # To do that, we extend the original type with str. We will later add a special string diff --git a/tests/test_typed_dict.py b/tests/test_typed_dict.py index 195c61a..3b73413 100644 --- a/tests/test_typed_dict.py +++ b/tests/test_typed_dict.py @@ -1,6 +1,7 @@ from enum import StrEnum from typing import Annotated, TypedDict +import py_avro_schema as pas from py_avro_schema._alias import Alias, register_type_alias from py_avro_schema._testing import assert_schema @@ -109,4 +110,4 @@ class PyType(TypedDict, total=False): {"name": "opt", "type": [{"namedString": "Opt", "type": "string"}, "null"]}, ], } - assert_schema(PyType, expected) + assert_schema(PyType, expected, options=pas.Option.MARK_NON_TOTAL_TYPED_DICTS) From c514ae055c20b12e72bbcfe5fc93f89bef7b3841 Mon Sep 17 00:00:00 2001 From: Giovanni Grano Date: Fri, 5 Dec 2025 08:01:16 +0100 Subject: [PATCH 4/4] add str | None field in the test --- tests/test_typed_dict.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_typed_dict.py b/tests/test_typed_dict.py index 3b73413..3e1248b 100644 --- a/tests/test_typed_dict.py +++ b/tests/test_typed_dict.py @@ -95,6 +95,7 @@ class Opt(StrEnum): class PyType(TypedDict, total=False): name: str + nickname: str | None age: int | None opt: Opt | None @@ -106,6 +107,7 @@ class PyType(TypedDict, 
total=False): "name": "name", "type": "string", }, + {"name": "nickname", "type": ["string", "null"]}, {"name": "age", "type": ["long", "null", "string"]}, {"name": "opt", "type": [{"namedString": "Opt", "type": "string"}, "null"]}, ],