22#
33# SPDX-License-Identifier: Apache-2.0
44
5+ import warnings
56from typing import Any , Dict
67
78from haystack .core .errors import DeserializationError , SerializationError
@@ -54,10 +55,9 @@ class does not have a `from_dict` method.
5455 return obj_class .from_dict (data ["data" ])
5556
5657
def _serialize_value_with_schema(payload: Any) -> Dict[str, Any]:
    """
    Serializes a value into a schema-aware format suitable for storage or transmission.

    The result keeps the type description apart from the plain data so that nested
    structures can be rebuilt later from the schema alone.

    Values are dispatched in this order:
        - Dictionaries (each field is serialized recursively)
        - Lists, tuples, and sets. The item schema is taken from the first element only,
          so lists with mixed types are not supported.
        - Objects with a callable to_dict() method
        - Objects with a __dict__ attribute
        - Anything else is treated as a primitive (str, int, float, bool, None)

    :param payload: The value to serialize (can be any type)
    :returns: The serialized dict representation of the given value. Contains two keys:
        - "serialization_schema": Contains type information for each field.
        - "serialized_data": Contains the actual data in a simplified format.
    """
    # Dictionaries: describe every field separately under "properties".
    if isinstance(payload, dict):
        properties: Dict[str, Any] = {}
        values: Dict[str, Any] = {}
        for key, item in payload.items():
            nested = _serialize_value_with_schema(item)
            properties[key] = nested["serialization_schema"]
            values[key] = nested["serialized_data"]
        return {"serialization_schema": {"type": "object", "properties": properties}, "serialized_data": values}

    # Sequences: data is always emitted as a list; the schema records the container kind.
    if isinstance(payload, (list, tuple, set)):
        plain_items = _convert_to_basic_types(list(payload))

        # Only the first element is inspected to describe the items.
        element_schema: Dict[str, Any] = {}
        if payload:
            element_schema = _serialize_value_with_schema(next(iter(payload)))["serialization_schema"]
        array_schema: Dict[str, Any] = {"type": "array", "items": element_schema}

        # Extra keywords let the deserializer tell sets and tuples apart from lists.
        if isinstance(payload, set):
            array_schema["uniqueItems"] = True
        elif isinstance(payload, tuple):
            size = len(payload)
            array_schema["minItems"] = size
            array_schema["maxItems"] = size

        return {"serialization_schema": array_schema, "serialized_data": plain_items}

    # Objects: the schema type is the qualified class name. Objects exposing to_dict()
    # are converted as-is, other objects through their attribute dictionary.
    has_to_dict = hasattr(payload, "to_dict") and callable(payload.to_dict)
    if has_to_dict or hasattr(payload, "__dict__"):
        qualified_name = generate_qualified_class_name(type(payload))
        source = payload if has_to_dict else vars(payload)
        return {"serialization_schema": {"type": qualified_name}, "serialized_data": _convert_to_basic_types(source)}

    # Primitives are stored unchanged next to their schema type.
    return {"serialization_schema": {"type": _primitive_schema_type(payload)}, "serialized_data": payload}
126132
127133
128134def _primitive_schema_type (value : Any ) -> str :
@@ -172,69 +178,103 @@ def _convert_to_basic_types(value: Any) -> Any:
172178
173179 # sequences
174180 if isinstance (value , (list , tuple , set )):
175- cls = type (value )
176- return cls (_convert_to_basic_types (v ) for v in value )
181+ return [_convert_to_basic_types (v ) for v in value ]
177182
178183 # primitive
179184 return value
180185
181186
def _deserialize_value_with_schema(serialized: Dict[str, Any]) -> Any:  # pylint: disable=too-many-return-statements, # noqa: PLR0911, PLR0912
    """
    Deserializes a value with schema information back to its original form.

    Takes a dict of the form:
    {
        "serialization_schema": {"type": "integer"} or {"type": "object", "properties": {...}},
        "serialized_data": <the actual data>
    }

    Arrays are rebuilt as a set when the schema carries `"uniqueItems": True`, as a tuple when it
    carries both `"minItems"` and `"maxItems"`, and as a list otherwise. This also applies when the
    items are themselves nested objects or arrays.

    :param serialized: The serialized dict with schema and data.
    :returns: The deserialized value in its original form.
    :raises DeserializationError: If `serialized` is not a dictionary containing both the
        'serialization_schema' and 'serialized_data' keys.
    """
    if (
        not isinstance(serialized, dict)
        or "serialization_schema" not in serialized
        or "serialized_data" not in serialized
    ):
        raise DeserializationError(
            "Invalid format of passed serialized payload. Expected a dictionary with keys "
            f"'serialization_schema' and 'serialized_data'. Got: {serialized}"
        )
    schema = serialized["serialization_schema"]
    data = serialized["serialized_data"]

    schema_type = schema.get("type")

    if not schema_type:
        # For backward compatibility till Haystack 2.16 we fall back to the legacy implementation.
        warnings.warn(
            "Missing 'type' key in 'serialization_schema'. This likely indicates that you're using a serialized "
            "State object created with a version of Haystack older than 2.15.0. "
            "Support for the old serialization format will be removed in Haystack 2.16.0. "
            "Please upgrade to the new serialization format to ensure forward compatibility.",
            DeprecationWarning,
            stacklevel=2,
        )
        return _deserialize_value_with_schema_legacy(serialized)

    # Object case (dictionary with per-field schemas)
    if schema_type == "object":
        properties = schema.get("properties")
        if not properties:
            # No per-field schema available: deep-deserialize the raw data.
            return _deserialize_value(data)

        result: Dict[str, Any] = {}
        if isinstance(data, dict):
            for field, raw_value in data.items():
                field_schema = properties.get(field)
                # NOTE: fields without a schema entry are skipped (existing behaviour).
                if field_schema:
                    result[field] = _deserialize_value_with_schema(
                        {"serialization_schema": field_schema, "serialized_data": raw_value}
                    )
        return result

    # Array case (lists, tuples and sets)
    if schema_type == "array":
        item_schema = schema.get("items", {})
        item_type = item_schema.get("type", "any")

        if item_type in ("object", "array"):
            # Nested containers carry their own schema, so recurse with it.
            items = [
                _deserialize_value_with_schema({"serialization_schema": item_schema, "serialized_data": item})
                for item in data
            ]
        elif item_type == "any":
            # Empty/unknown item schema: let the generic deserializer inspect each item.
            items = [_deserialize_value(item) for item in data]
        else:
            items = [_deserialize_value({"type": item_type, "data": item}) for item in data]

        # Restore the original container kind for every item type, including nested ones.
        if schema.get("uniqueItems") is True:
            try:
                return set(items)
            except TypeError:
                # Unhashable items (dicts, lists) cannot be stored in a set; keep them as a list.
                return items
        if schema.get("minItems") is not None and schema.get("maxItems") is not None:
            return tuple(items)
        return items

    # Primitive types are stored as-is
    if schema_type in ("null", "boolean", "integer", "number", "string"):
        return data

    # Custom class types: the schema type is the qualified class name
    return _deserialize_value({"type": schema_type, "data": data})
238278
239279
240280def _deserialize_value (value : Any ) -> Any : # pylint: disable=too-many-return-statements # noqa: PLR0911
@@ -291,3 +331,61 @@ def _deserialize_value(value: Any) -> Any: # pylint: disable=too-many-return-st
291331
292332 # 4) Fallback (shouldn't usually happen with our schema)
293333 return value
334+
335+
def _deserialize_value_with_schema_legacy(serialized: Dict[str, Any]) -> Dict[str, Any]:
    """
    Legacy function for deserializing a dictionary with schema information and data to original values.

    Kept for backward compatibility till Haystack 2.16.0.
    Takes a dict of the form:
    {
        "serialization_schema": {
            "numbers": {"type": "integer"},
            "messages": {"type": "array", "items": {"type": "haystack.dataclasses.chat_message.ChatMessage"}},
        },
        "serialized_data": {
            "numbers": 1,
            "messages": [{"role": "user", "meta": {}, "name": None, "content": [{"text": "Hello, world!"}]}],
        },
    }

    Both keys are optional; a missing key is treated as an empty dict.

    :param serialized: The serialized dict with schema and data.
    :returns: The deserialized dict with original values.
    """
    schema = serialized.get("serialization_schema", {})
    data = serialized.get("serialized_data", {})

    result: Dict[str, Any] = {}
    for field, raw in data.items():
        info = schema.get(field)
        # No schema entry: just deep-deserialize whatever we have.
        if not info:
            result[field] = _deserialize_value(raw)
            continue

        t = info["type"]

        # Primitives are stored as-is.
        if t in ("null", "boolean", "integer", "number", "string"):
            result[field] = raw

        # Arrays: every item is wrapped in an envelope carrying the declared item type.
        elif t == "array":
            item_type = (info.get("items") or {}).get("type")
            if item_type is None:
                # Item type not declared: deep-deserialize each item instead of failing with a KeyError.
                result[field] = [_deserialize_value(item) for item in raw]
            else:
                result[field] = [_deserialize_value({"type": item_type, "data": item}) for item in raw]

        # Generic objects ("object") and custom classes share the same envelope shape.
        else:
            result[field] = _deserialize_value({"type": t, "data": raw})

    return result
0 commit comments