Skip to content

Commit 9ed0b9b

Browse files
authored
fix: Update the de/serialization with schema utils (#9526)
* Update the util methods
* Update tests
* fix tests
* schema fix
* Add json schema for tuples and sets
* Add proper conversion for sets and tuples
* Adjust typing
* PR comments
* Linting
* Optimize deserialization
* remove TODO
* PR comments
* PR comments
* Update tests and deserialization error
* Support legacy deserialization
* Update deprecating warning
* Update test
1 parent d14f5dc commit 9ed0b9b

3 files changed

Lines changed: 533 additions & 137 deletions

File tree

haystack/utils/base_serialization.py

Lines changed: 198 additions & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#
33
# SPDX-License-Identifier: Apache-2.0
44

5+
import warnings
56
from typing import Any, Dict
67

78
from haystack.core.errors import DeserializationError, SerializationError
@@ -54,10 +55,9 @@ class does not have a `from_dict` method.
5455
return obj_class.from_dict(data["data"])
5556

5657

57-
# TODO: Make this function public once its implementation is finalized and tested
58-
def _serialize_value_with_schema(payload: Dict[str, Any]) -> Dict[str, Any]:
58+
def _serialize_value_with_schema(payload: Any) -> Dict[str, Any]:
5959
"""
60-
Serializes a dictionary into a schema-aware format suitable for storage or transmission.
60+
Serializes a value into a schema-aware format suitable for storage or transmission.
6161
6262
The output format separates the schema information from the actual data, making it easier
6363
to deserialize complex nested structures correctly.
@@ -66,63 +66,69 @@ def _serialize_value_with_schema(payload: Dict[str, Any]) -> Dict[str, Any]:
6666
- Objects with to_dict() methods (e.g. dataclasses)
6767
- Objects with __dict__ attributes
6868
- Dictionaries
69-
- Lists, tuples, and sets
69+
- Lists, tuples, and sets. The item schema is inferred from the first element only, so sequences with mixed element types are not supported.
7070
- Primitive types (str, int, float, bool, None)
7171
72-
:param value: The value to serialize
72+
:param payload: The value to serialize (can be any type)
7373
:returns: The serialized dict representation of the given value. Contains two keys:
74-
- "schema": Contains type information for each field
75-
- "data": Contains the actual data in a simplified format
74+
- "serialization_schema": Contains type information for each field.
75+
- "serialized_data": Contains the actual data in a simplified format.
7676
7777
"""
78-
schema: Dict[str, Any] = {}
79-
data: Dict[str, Any] = {}
80-
81-
for field, val in payload.items():
82-
# 1) Handle dataclass‐style objects
83-
if hasattr(val, "to_dict") and callable(val.to_dict):
84-
type_name = generate_qualified_class_name(type(val))
85-
pure = _convert_to_basic_types(val.to_dict())
86-
schema[field] = {"type": type_name}
87-
data[field] = pure
88-
89-
# 2) Arbitrary objects w/ __dict__
90-
elif hasattr(val, "__dict__"):
91-
type_name = generate_qualified_class_name(type(val))
92-
pure = _convert_to_basic_types(vars(val))
93-
schema[field] = {"type": type_name}
94-
data[field] = pure
95-
96-
# 3) Dicts → "object"
97-
elif isinstance(val, dict):
98-
pure = _convert_to_basic_types(val)
99-
schema[field] = {"type": "object"}
100-
data[field] = pure
101-
102-
# 4) Sequences → "array"
103-
elif isinstance(val, (list, tuple, set)):
104-
# pure data
105-
pure_list = _convert_to_basic_types(list(val))
106-
# determine item type from first element (if any)
107-
if val:
108-
first = next(iter(val))
109-
if hasattr(first, "to_dict") and callable(first.to_dict) or hasattr(first, "__dict__"):
110-
item_type = generate_qualified_class_name(type(first))
111-
else:
112-
item_type = _primitive_schema_type(first)
113-
else:
114-
item_type = "any"
115-
116-
schema[field] = {"type": "array", "items": {"type": item_type}}
117-
data[field] = pure_list
118-
119-
# 5) Primitives
78+
# Handle dictionary case - iterate through fields
79+
if isinstance(payload, dict):
80+
schema: Dict[str, Any] = {}
81+
data: Dict[str, Any] = {}
82+
83+
for field, val in payload.items():
84+
# Recursively serialize each field
85+
serialized_value = _serialize_value_with_schema(val)
86+
schema[field] = serialized_value["serialization_schema"]
87+
data[field] = serialized_value["serialized_data"]
88+
89+
return {"serialization_schema": {"type": "object", "properties": schema}, "serialized_data": data}
90+
91+
# Handle array case - iterate through elements
92+
elif isinstance(payload, (list, tuple, set)):
93+
# Convert to list for consistent handling
94+
pure_list = _convert_to_basic_types(list(payload))
95+
96+
# Determine item type from first element (if any)
97+
if payload:
98+
first = next(iter(payload))
99+
item_schema = _serialize_value_with_schema(first)
100+
base_schema = {"type": "array", "items": item_schema["serialization_schema"]}
120101
else:
121-
prim_type = _primitive_schema_type(val)
122-
schema[field] = {"type": prim_type}
123-
data[field] = val
124-
125-
return {"serialization_schema": schema, "serialized_data": data}
102+
base_schema = {"type": "array", "items": {}}
103+
104+
# Record JSON Schema keywords (uniqueItems for sets, minItems/maxItems for tuples) so deserialization can tell them apart from lists
105+
if isinstance(payload, set):
106+
base_schema["uniqueItems"] = True
107+
elif isinstance(payload, tuple):
108+
base_schema["minItems"] = len(payload)
109+
base_schema["maxItems"] = len(payload)
110+
111+
return {"serialization_schema": base_schema, "serialized_data": pure_list}
112+
113+
# Handle Haystack style objects (e.g. dataclasses and Components)
114+
elif hasattr(payload, "to_dict") and callable(payload.to_dict):
115+
type_name = generate_qualified_class_name(type(payload))
116+
pure = _convert_to_basic_types(payload)
117+
schema = {"type": type_name}
118+
return {"serialization_schema": schema, "serialized_data": pure}
119+
120+
# Handle arbitrary objects with __dict__
121+
elif hasattr(payload, "__dict__"):
122+
type_name = generate_qualified_class_name(type(payload))
123+
pure = _convert_to_basic_types(vars(payload))
124+
schema = {"type": type_name}
125+
return {"serialization_schema": schema, "serialized_data": pure}
126+
127+
# Handle primitives
128+
else:
129+
prim_type = _primitive_schema_type(payload)
130+
schema = {"type": prim_type}
131+
return {"serialization_schema": schema, "serialized_data": payload}
126132

127133

128134
def _primitive_schema_type(value: Any) -> str:
@@ -172,69 +178,103 @@ def _convert_to_basic_types(value: Any) -> Any:
172178

173179
# sequences
174180
if isinstance(value, (list, tuple, set)):
175-
cls = type(value)
176-
return cls(_convert_to_basic_types(v) for v in value)
181+
return [_convert_to_basic_types(v) for v in value]
177182

178183
# primitive
179184
return value
180185

181186

182-
# TODO: Make this function public once its implementation is finalized and tested
183-
def _deserialize_value_with_schema(serialized: Dict[str, Any]) -> Dict[str, Any]:
187+
def _deserialize_value_with_schema(serialized: Dict[str, Any]) -> Any: # pylint: disable=too-many-return-statements, # noqa: PLR0911, PLR0912
184188
"""
185-
Deserializes a dictionary with schema information and data to original values.
189+
Deserializes a value with schema information back to its original form.
186190
187191
Takes a dict of the form:
188192
{
189-
"schema": {
190-
"numbers": {"type": "integer"},
191-
"messages": {"type": "array", "items": {"type": "haystack.dataclasses.chat_message.ChatMessage"}},
192-
},
193-
"data": {
194-
"numbers": 1,
195-
"messages": [{"role": "user", "meta": {}, "name": None, "content": [{"text": "Hello, world!"}]}],
193+
"serialization_schema": {"type": "integer"} or {"type": "object", "properties": {...}},
194+
"serialized_data": <the actual data>
196195
}
197196
198197
:param serialized: The serialized dict with schema and data.
199-
:returns: The deserialized dict with original values.
198+
:returns: The deserialized value in its original form.
200199
"""
201-
schema = serialized.get("serialization_schema", {})
202-
data = serialized.get("serialized_data", {})
203-
204-
result: Dict[str, Any] = {}
205-
for field, raw in data.items():
206-
info = schema.get(field)
207-
# no schema entry → just deep-deserialize whatever we have
208-
if not info:
209-
result[field] = _deserialize_value(raw)
210-
continue
211-
212-
t = info["type"]
213-
214-
# ARRAY case
215-
if t == "array":
216-
item_type = info["items"]["type"]
217-
reconstructed = []
218-
for item in raw:
219-
envelope = {"type": item_type, "data": item}
220-
reconstructed.append(_deserialize_value(envelope))
221-
result[field] = reconstructed
222200

223-
# PRIMITIVE case
224-
elif t in ("null", "boolean", "integer", "number", "string"):
225-
result[field] = raw
226-
227-
# GENERIC OBJECT
228-
elif t == "object":
229-
envelope = {"type": "object", "data": raw}
230-
result[field] = _deserialize_value(envelope)
201+
if not serialized or "serialization_schema" not in serialized or "serialized_data" not in serialized:
202+
raise DeserializationError(
203+
f"Invalid format of passed serialized payload. Expected a dictionary with keys "
204+
f"'serialization_schema' and 'serialized_data'. Got: {serialized}"
205+
)
206+
schema = serialized["serialization_schema"]
207+
data = serialized["serialized_data"]
208+
209+
schema_type = schema.get("type")
210+
211+
if not schema_type:
212+
# for backward compatibility until Haystack 2.16 we fall back to the legacy implementation
213+
warnings.warn(
214+
"Missing 'type' key in 'serialization_schema'. This likely indicates that you're using a serialized "
215+
"State object created with a version of Haystack older than 2.15.0. "
216+
"Support for the old serialization format will be removed in Haystack 2.16.0. "
217+
"Please upgrade to the new serialization format to ensure forward compatibility.",
218+
DeprecationWarning,
219+
)
220+
return _deserialize_value_with_schema_legacy(serialized)
221+
222+
# Handle object case (dictionary with properties)
223+
if schema_type == "object":
224+
properties = schema.get("properties")
225+
if properties:
226+
result: Dict[str, Any] = {}
227+
228+
if isinstance(data, dict):
229+
for field, raw_value in data.items():
230+
field_schema = properties.get(field)
231+
if field_schema:
232+
# Recursively deserialize each field with its own schema; fields without a schema entry are skipped
233+
result[field] = _deserialize_value_with_schema(
234+
{"serialization_schema": field_schema, "serialized_data": raw_value}
235+
)
236+
237+
return result
238+
else:
239+
return _deserialize_value(data)
240+
241+
# Handle array case
242+
elif schema_type == "array":
243+
# Cache frequently accessed schema properties
244+
item_schema = schema.get("items", {})
245+
item_type = item_schema.get("type", "any")
246+
is_set = schema.get("uniqueItems") is True
247+
is_tuple = schema.get("minItems") is not None and schema.get("maxItems") is not None
248+
249+
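# Handle nested objects/arrays first (most complex case).
# NOTE(review): this branch always returns a list and ignores is_set / is_tuple,
# so e.g. a tuple of dicts or lists is not restored as a tuple - confirm whether that is intended.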
250+
if item_type in ("object", "array"):
251+
return [
252+
_deserialize_value_with_schema({"serialization_schema": item_schema, "serialized_data": item})
253+
for item in data
254+
]
255+
256+
# Helper function to deserialize individual items
257+
def deserialize_item(item):
258+
if item_type == "any":
259+
return _deserialize_value(item)
260+
else:
261+
return _deserialize_value({"type": item_type, "data": item})
231262

232-
# CUSTOM CLASS
263+
# Handle different collection types
264+
if is_set:
265+
return {deserialize_item(item) for item in data}
266+
elif is_tuple:
267+
return tuple(deserialize_item(item) for item in data)
233268
else:
234-
envelope = {"type": t, "data": raw}
235-
result[field] = _deserialize_value(envelope)
269+
return [deserialize_item(item) for item in data]
236270

237-
return result
271+
# Handle primitive types
272+
elif schema_type in ("null", "boolean", "integer", "number", "string"):
273+
return data
274+
275+
# Handle custom class types
276+
else:
277+
return _deserialize_value({"type": schema_type, "data": data})
238278

239279

240280
def _deserialize_value(value: Any) -> Any: # pylint: disable=too-many-return-statements # noqa: PLR0911
@@ -291,3 +331,61 @@ def _deserialize_value(value: Any) -> Any: # pylint: disable=too-many-return-st
291331

292332
# 4) Fallback (shouldn't usually happen with our schema)
293333
return value
334+
335+
336+
def _deserialize_value_with_schema_legacy(serialized: Dict[str, Any]) -> Dict[str, Any]:
337+
"""
338+
Legacy function for deserializing a dictionary with schema information and data to original values.
339+
340+
Kept for backward compatibility till Haystack 2.16.0.
341+
Takes a dict of the form:
342+
{
343+
"serialization_schema": {
344+
"numbers": {"type": "integer"},
345+
"messages": {"type": "array", "items": {"type": "haystack.dataclasses.chat_message.ChatMessage"}},
346+
},
347+
"serialized_data": {
348+
"numbers": 1,
349+
"messages": [{"role": "user", "meta": {}, "name": None, "content": [{"text": "Hello, world!"}]}],
350+
}
351+
352+
:param serialized: The serialized dict with schema and data.
353+
:returns: The deserialized dict with original values.
354+
"""
355+
schema = serialized.get("serialization_schema", {})
356+
data = serialized.get("serialized_data", {})
357+
358+
result: Dict[str, Any] = {}
359+
for field, raw in data.items():
360+
info = schema.get(field)
361+
# no schema entry → just deep-deserialize whatever we have
362+
if not info:
363+
result[field] = _deserialize_value(raw)
364+
continue
365+
366+
t = info["type"]
367+
368+
# ARRAY case
369+
if t == "array":
370+
item_type = info["items"]["type"]
371+
reconstructed = []
372+
for item in raw:
373+
envelope = {"type": item_type, "data": item}
374+
reconstructed.append(_deserialize_value(envelope))
375+
result[field] = reconstructed
376+
377+
# PRIMITIVE case
378+
elif t in ("null", "boolean", "integer", "number", "string"):
379+
result[field] = raw
380+
381+
# GENERIC OBJECT
382+
elif t == "object":
383+
envelope = {"type": "object", "data": raw}
384+
result[field] = _deserialize_value(envelope)
385+
386+
# CUSTOM CLASS
387+
else:
388+
envelope = {"type": t, "data": raw}
389+
result[field] = _deserialize_value(envelope)
390+
391+
return result

0 commit comments

Comments
 (0)