deepset-ai · sjrl · Jun 30, 2025 · Jun 30, 2025 · Jun 30, 2025 · Jun 30, 2025
@@ -79,3 +79,24 @@ def __repr__(self) -> str:
         fields.append(f"mime_type={self.mime_type!r}")
         fields_str = ", ".join(fields)
         return f"{self.__class__.__name__}({fields_str})"
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Serialize this ByteStream into a plain dictionary.
+
+        :returns: A dictionary holding the 'data', 'meta', and 'mime_type' entries.
+        """
+        # JSON has no native bytes type, so the payload is emitted as a list of integers instead.
+        serialized: Dict[str, Any] = {"data": list(self.data)}
+        serialized["meta"] = self.meta
+        serialized["mime_type"] = self.mime_type
+        return serialized
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "ByteStream":
+        """
+        Create a ByteStream from a dictionary representation.
+
+        :param data: A dictionary with the required key 'data' (an iterable of integers, as produced by
+            `to_dict`) and the optional keys 'meta' and 'mime_type'. A missing 'meta' defaults to an empty
+            dictionary and a missing 'mime_type' defaults to `None`.
+
+        :returns: An instance of the class this method is called on.
+        :raises KeyError: If 'data' is missing from the dictionary.
+        """
+        # Build through `cls` rather than a hard-coded `ByteStream` so subclasses deserialize to their own type.
+        return cls(data=bytes(data["data"]), meta=data.get("meta", {}), mime_type=data.get("mime_type"))
@@ -127,8 +127,12 @@ def to_dict(self, flatten: bool = True) -> Dict[str, Any]:
             Whether to flatten `meta` field or not. Defaults to `True` to be backward-compatible with Haystack 1.x.
         """
         data = asdict(self)
-        if (blob := data.get("blob")) is not None:
-            data["blob"] = {"data": list(blob["data"]), "mime_type": blob["mime_type"]}
+
+        # Use `ByteStream` and `SparseEmbedding`'s to_dict methods to convert them to JSON-serializable types.
+        if self.blob is not None:
+            data["blob"] = self.blob.to_dict()
+        if self.sparse_embedding is not None:
+            data["sparse_embedding"] = self.sparse_embedding.to_dict()
 
         if flatten:
             meta = data.pop("meta")
@@ -144,7 +148,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "Document":
         The `blob` field is converted to its original type.
         """
         if blob := data.get("blob"):
-            data["blob"] = ByteStream(data=bytes(blob["data"]), mime_type=blob["mime_type"])
+            data["blob"] = ByteStream.from_dict(blob)
         if sparse_embedding := data.get("sparse_embedding"):
             data["sparse_embedding"] = SparseEmbedding.from_dict(sparse_embedding)
 

@@ -0,0 +1,4 @@
+---
+features:
+  - |
+    Add `to_dict` and `from_dict` methods to `ByteStream` so that it is consistent with our other dataclasses, which already provide serialization and deserialization methods.
@@ -81,3 +81,23 @@ def test_str_truncation():
     assert len(string_repr) < 200
     assert "text/plain" in string_repr
     assert "foo" in string_repr
+
+
+def test_to_dict():
+    text = "Hello, world!"
+    stream = ByteStream.from_string(text, mime_type="text/plain", meta={"foo": "bar"})
+    serialized = stream.to_dict()
+    # The payload must come back as the list form of the encoded string.
+    assert serialized["data"] == list(text.encode())
+    assert serialized["mime_type"] == "text/plain"
+    assert serialized["meta"] == {"foo": "bar"}
+
+
+def test_from_dict():
+    text = "Hello, world!"
+    original = ByteStream.from_string(text, mime_type="text/plain", meta={"foo": "bar"})
+    # Round-trip through the dictionary form and compare the result field by field.
+    restored = ByteStream.from_dict(original.to_dict())
+    assert restored.data == original.data
+    assert restored.mime_type == original.mime_type
+    assert restored.meta == original.meta
+    assert str(restored) == str(original)
@@ -146,7 +146,7 @@ def test_to_dict_without_flattening():
 def test_to_dict_with_custom_parameters():
     doc = Document(
         content="test text",
-        blob=ByteStream(b"some bytes", mime_type="application/pdf"),
+        blob=ByteStream(b"some bytes", mime_type="application/pdf", meta={"foo": "bar"}),
         meta={"some": "values", "test": 10},
         score=0.99,
         embedding=[10.0, 10.0],
@@ -156,7 +156,7 @@ def test_to_dict_with_custom_parameters():
     assert doc.to_dict() == {
         "id": doc.id,
         "content": "test text",
-        "blob": {"data": list(b"some bytes"), "mime_type": "application/pdf"},
+        "blob": {"data": list(b"some bytes"), "mime_type": "application/pdf", "meta": {"foo": "bar"}},
         "some": "values",
         "test": 10,
         "score": 0.99,
@@ -178,10 +178,10 @@ def test_to_dict_with_custom_parameters_without_flattening():
     assert doc.to_dict(flatten=False) == {
         "id": doc.id,
         "content": "test text",
-        "blob": {"data": list(b"some bytes"), "mime_type": "application/pdf"},
+        "blob": {"data": list(b"some bytes"), "mime_type": "application/pdf", "meta": {}},
         "meta": {"some": "values", "test": 10},
         "score": 0.99,
-        "embedding": [10, 10],
+        "embedding": [10.0, 10.0],
         "sparse_embedding": {"indices": [0, 2, 4], "values": [0.1, 0.2, 0.3]},
     }
 
@@ -212,15 +212,15 @@ def from_from_dict_with_parameters():
     assert Document.from_dict(
         {
             "content": "test text",
-            "blob": {"data": list(blob_data), "mime_type": "text/markdown"},
+            "blob": {"data": list(blob_data), "mime_type": "text/markdown", "meta": {"text": "test text"}},
             "meta": {"text": "test text"},
             "score": 0.812,
             "embedding": [0.1, 0.2, 0.3],
             "sparse_embedding": {"indices": [0, 2, 4], "values": [0.1, 0.2, 0.3]},
         }
     ) == Document(
         content="test text",
-        blob=ByteStream(blob_data, mime_type="text/markdown"),
+        blob=ByteStream(blob_data, mime_type="text/markdown", meta={"text": "test text"}),
         meta={"text": "test text"},
         score=0.812,
         embedding=[0.1, 0.2, 0.3],