Add option to wrap lists and dicts inside Avro records (#21)

giograno · web-flow · commit ffa54acbda71 · 2026-02-24T13:28:23.000+01:00
diff --git a/src/py_avro_schema/_schemas.py b/src/py_avro_schema/_schemas.py
@@ -73,6 +73,7 @@
 
 RUNTIME_TYPE_KEY = "_runtime_type"
 REF_ID_KEY = "__id"
+REF_DATA_KEY = "__data"
 SYMBOL_REGEX = re.compile(r"[A-Za-z_][A-Za-z0-9_]*")
 
 
@@ -151,6 +152,9 @@ class Option(enum.Flag):
     # factories might a problem when comparing schemas, as they change every time a schema is generated by definition.
     DETERMINISTIC_DEFAULTS = enum.auto()
 
+    #: Wraps lists and maps into a record type
+    WRAP_INTO_RECORDS = enum.auto()
+
 
 JSON_OPTIONS = [opt for opt in Option if opt.name and opt.name.startswith("JSON_")]
 
@@ -298,6 +302,28 @@ def make_default(self, py_default: Any) -> Any:
         """
         return py_default
 
+    def _wrap_as_record(self, inner_schema: JSONObj, names: NamesType) -> JSONType:
+        """
+        Build the Avro record that wraps a container schema (array or map).
+
+        The record holds a nullable ``__id`` long (default ``None``) and a ``__data`` field typed as ``inner_schema``.
+        If the record's full name is already in ``names``, that name is returned instead of a new record definition.
+        """
+        wrapper_name = _avro_name_for_type(_type_from_annotated(self.py_type))
+        namespace = self.namespace
+        qualified_name = f"{namespace}.{wrapper_name}" if namespace else wrapper_name
+        # A name that is already registered is referenced rather than defined again.
+        if qualified_name in names:
+            return qualified_name
+        names.append(qualified_name)
+        id_field = {"name": REF_ID_KEY, "type": ["null", "long"], "default": None}
+        data_field = {"name": REF_DATA_KEY, "type": inner_schema}
+        wrapper = {"type": "record", "name": wrapper_name, "fields": [id_field, data_field]}
+        # The namespace key is only emitted when a namespace is set.
+        if namespace:
+            wrapper["namespace"] = namespace
+        return wrapper
+
 
 @register_schema
 class PrimitiveSchema(Schema):
@@ -712,19 +738,22 @@ def __init__(
         args = get_args(py_type)  # TODO: validate if args has exactly 1 item?
         self.items_schema = _schema_obj(args[0], namespace=namespace, options=options)
 
-    def data(self, names: NamesType) -> JSONObj:
+    def data(self, names: NamesType) -> JSONType:
         """Return the schema data"""
-        return {
-            "type": "array",
-            "items": self.items_schema.data(names=names),
-        }
+        array_schema = {"type": "array", "items": self.items_schema.data(names=names)}
+        if Option.WRAP_INTO_RECORDS not in self.options:
+            return array_schema
+        return self._wrap_as_record(array_schema, names)
 
-    def make_default(self, py_default: collections.abc.Sequence) -> JSONArray:
+    def make_default(self, py_default: collections.abc.Sequence) -> JSONType:
         """Return an Avro schema compliant default value for a given Python Sequence
 
         :param py_default: The Python sequence to generate a default value for.
         """
-        return [self.items_schema.make_default(item) for item in py_default]
+        list_default = [self.items_schema.make_default(item) for item in py_default]
+        if Option.WRAP_INTO_RECORDS in self.options:
+            return {REF_ID_KEY: None, REF_DATA_KEY: list_default}
+        return list_default
 
 
 @register_schema
@@ -787,12 +816,18 @@ def __init__(
             raise TypeError(f"Cannot generate Avro mapping schema for Python dictionary {py_type} with non-string keys")
         self.values_schema = _schema_obj(args[1], namespace=namespace, options=options)
 
-    def data(self, names: NamesType) -> JSONObj:
+    def data(self, names: NamesType) -> JSONType:
         """Return the schema data"""
-        return {
-            "type": "map",
-            "values": self.values_schema.data(names=names),
-        }
+        map_schema = {"type": "map", "values": self.values_schema.data(names=names)}
+        if Option.WRAP_INTO_RECORDS not in self.options:
+            return map_schema
+        return self._wrap_as_record(map_schema, names)
+
+    def make_default(self, py_default: Any) -> JSONType:
+        """Return an Avro schema compliant default for a Python mapping; wrapped maps convert each value too"""
+        if Option.WRAP_INTO_RECORDS not in self.options:
+            return py_default  # without wrapping, the given default is passed through unchanged
+        return {REF_ID_KEY: None, REF_DATA_KEY: {k: self.values_schema.make_default(v) for k, v in py_default.items()}}
 
 
 @register_schema
@@ -1429,3 +1464,43 @@ def _type_from_annotated(py_type: Type) -> Type:
         return args[0]
     else:
         return py_type
+
+
+def _avro_name_for_type(py_type: Type) -> str:
+    """
+    Generate an Avro-compatible name for a given Python type. It is used when wrapping container types (mostly lists
+    and maps) into Avro records.
+    Class names are prefixed with their camel-cased module path, so that `list[ClassA]` gets a different record name
+    for each same-named `ClassA` defined in a different module. Hashing the fully qualified names was considered
+    too, but Avro does not seem to limit record name length and the spelled-out form is more readable.
+    See `test_avro_name_for_type` test suite.
+    """
+    py_type = _type_from_annotated(py_type)
+    if py_type is None or py_type is type(None):
+        return "Null"
+    origin, args = get_origin(py_type), get_args(py_type)
+    # Parameterized generics are handled first: before Python 3.11, `inspect.isclass(list[str])` is true.
+    if origin is not None and args:
+        union_type = getattr(types, "UnionType", None)
+        if origin is Union or (union_type and origin is union_type):
+            return "Or".join(sorted(_avro_name_for_type(arg) for arg in args))
+        if _is_class(origin, collections.abc.MutableSet):
+            return _avro_name_for_type(args[0]) + "Set"
+        if _is_class(origin, collections.abc.Sequence):
+            return _avro_name_for_type(args[0]) + "List"
+        if _is_class(origin, collections.abc.Mapping):
+            return _avro_name_for_type(args[1]) + "Map"
+    elif inspect.isclass(py_type):
+        if not (name := py_type.__name__):
+            raise TypeNotSupportedError(
+                f"Cannot generate a wrapper record name for Python type {py_type}: empty class name"
+            )
+        name = name[0].upper() + name[1:]
+        module = py_type.__module__
+        if module and module != "builtins":
+            mod_prefix = "".join(
+                word[0].upper() + word[1:] for part in module.split(".") for word in part.split("_") if word
+            )
+            return mod_prefix + name
+        return name
+    raise TypeNotSupportedError(f"Cannot generate a wrapper record name for Python type {py_type}")
diff --git a/tests/test_avro_name_for_type.py b/tests/test_avro_name_for_type.py
@@ -0,0 +1,131 @@
+"""
+Set of unit tests for the _avro_name_for_type function, as this is a pretty crucial component of our design with
+wrapped records.
+"""
+
+import typing
+
+import pytest
+
+from py_avro_schema._schemas import TypeNotSupportedError, _avro_name_for_type
+
+
+class ClassA:
+    pass
+
+
+class ClassB:
+    pass
+
+
+# Sequences
+
+
+def test_list_str():
+    assert _avro_name_for_type(list[str]) == "StrList"
+
+
+def test_nested_list():
+    assert _avro_name_for_type(list[list[str]]) == "StrListList"
+
+
+def test_list_of_custom_class():
+    assert _avro_name_for_type(list[ClassA]) == "TestAvroNameForTypeClassAList"
+
+
+def test_list_of_union():
+    assert _avro_name_for_type(list[str | int]) == "IntOrStrList"
+
+
+def test_list_of_union_two_custom_classes():
+    assert _avro_name_for_type(list[ClassA | ClassB]) == "TestAvroNameForTypeClassAOrTestAvroNameForTypeClassBList"
+
+
+def test_list_of_optional_custom_class():
+    assert _avro_name_for_type(list[ClassA | None]) == "NullOrTestAvroNameForTypeClassAList"
+
+
+def test_list_of_dict_with_custom_class():
+    assert _avro_name_for_type(list[dict[str, ClassA]]) == "TestAvroNameForTypeClassAMapList"
+
+
+# Sets
+
+
+def test_set_str():
+    assert _avro_name_for_type(set[str]) == "StrSet"
+
+
+def test_set_custom_class():
+    assert _avro_name_for_type(set[ClassA]) == "TestAvroNameForTypeClassASet"
+
+
+# Maps
+
+
+def test_dict_str_str():
+    assert _avro_name_for_type(dict[str, str]) == "StrMap"
+
+
+def test_dict_custom_class_value():
+    assert _avro_name_for_type(dict[str, ClassA]) == "TestAvroNameForTypeClassAMap"
+
+
+def test_dict_with_union_two_custom_classes():
+    assert _avro_name_for_type(dict[str, ClassA | ClassB]) == "TestAvroNameForTypeClassAOrTestAvroNameForTypeClassBMap"
+
+
+def test_dict_with_optional_custom_class():
+    assert _avro_name_for_type(dict[str, ClassA | None]) == "NullOrTestAvroNameForTypeClassAMap"
+
+
+def test_dict_with_list_of_custom_class():
+    assert _avro_name_for_type(dict[str, list[ClassA]]) == "TestAvroNameForTypeClassAListMap"
+
+
+def test_dict_none_value():
+    assert _avro_name_for_type(dict[str, None]) == "NullMap"
+
+
+# Unions
+
+
+def test_union_str_int():
+    assert _avro_name_for_type(str | int) == "IntOrStr"
+
+
+def test_union_str_int_legacy_syntax():
+    assert _avro_name_for_type(typing.Union[str, int]) == "IntOrStr"
+
+
+def test_union_two_custom_classes_order_independent():
+    assert _avro_name_for_type(ClassA | ClassB) == "TestAvroNameForTypeClassAOrTestAvroNameForTypeClassB"
+    assert _avro_name_for_type(ClassB | ClassA) == "TestAvroNameForTypeClassAOrTestAvroNameForTypeClassB"
+
+
+def test_optional_custom_class():
+    assert _avro_name_for_type(ClassA | None) == "NullOrTestAvroNameForTypeClassA"
+
+
+# Special cases
+
+
+def test_same_class_name_different_modules():
+    """Classes sharing a name but declared in different modules must get distinct wrapper names."""
+    ClassFromA = type("MyClass", (), {"__module__": "pkg.mod_a"})
+    ClassFromB = type("MyClass", (), {"__module__": "pkg.mod_b"})
+
+    name_a = _avro_name_for_type(list[ClassFromA])  # noqa
+    name_b = _avro_name_for_type(list[ClassFromB])  # noqa
+
+    assert name_a == "PkgModAMyClassList"
+    assert name_b == "PkgModBMyClassList"
+
+
+# Error cases
+
+
+def test_unknown_type_raises():
+    T = typing.TypeVar("T")
+    with pytest.raises(TypeNotSupportedError, match="Cannot generate a wrapper record name"):
+        _avro_name_for_type(T)
diff --git a/tests/test_primitives.py b/tests/test_primitives.py