Skip to content

Commit 97e72b9

Browse files
authored
feat: Add to_dict and from_dict to ByteStream (#9568)
* Add to_dict and from_dict to ByteStream
* Add reno
* Add unit tests
* Fix and expand tests
* Fix typing
* PR comments
1 parent fc64884 commit 97e72b9

5 files changed

Lines changed: 58 additions & 9 deletions

File tree

haystack/dataclasses/byte_stream.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,3 +79,24 @@ def __repr__(self) -> str:
7979
fields.append(f"mime_type={self.mime_type!r}")
8080
fields_str = ", ".join(fields)
8181
return f"{self.__class__.__name__}({fields_str})"
82+
83+
def to_dict(self) -> Dict[str, Any]:
84+
"""
85+
Convert the ByteStream to a dictionary representation.
86+
87+
:returns: A dictionary with keys 'data', 'meta', and 'mime_type'.
88+
"""
89+
# Note: The data is converted to a list of integers for serialization since JSON does not support bytes
90+
# directly.
91+
return {"data": list(self.data), "meta": self.meta, "mime_type": self.mime_type}
92+
93+
@classmethod
94+
def from_dict(cls, data: Dict[str, Any]) -> "ByteStream":
95+
"""
96+
Create a ByteStream from a dictionary representation.
97+
98+
:param data: A dictionary with keys 'data', 'meta', and 'mime_type'.
99+
100+
:returns: A ByteStream instance.
101+
"""
102+
return ByteStream(data=bytes(data["data"]), meta=data.get("meta", {}), mime_type=data.get("mime_type"))

haystack/dataclasses/document.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -127,8 +127,12 @@ def to_dict(self, flatten: bool = True) -> Dict[str, Any]:
127127
Whether to flatten `meta` field or not. Defaults to `True` to be backward-compatible with Haystack 1.x.
128128
"""
129129
data = asdict(self)
130-
if (blob := data.get("blob")) is not None:
131-
data["blob"] = {"data": list(blob["data"]), "mime_type": blob["mime_type"]}
130+
131+
# Use `ByteStream` and `SparseEmbedding`'s to_dict methods to convert them to JSON-serializable types.
132+
if self.blob is not None:
133+
data["blob"] = self.blob.to_dict()
134+
if self.sparse_embedding is not None:
135+
data["sparse_embedding"] = self.sparse_embedding.to_dict()
132136

133137
if flatten:
134138
meta = data.pop("meta")
@@ -144,7 +148,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "Document":
144148
The `blob` field is converted to its original type.
145149
"""
146150
if blob := data.get("blob"):
147-
data["blob"] = ByteStream(data=bytes(blob["data"]), mime_type=blob["mime_type"])
151+
data["blob"] = ByteStream.from_dict(blob)
148152
if sparse_embedding := data.get("sparse_embedding"):
149153
data["sparse_embedding"] = SparseEmbedding.from_dict(sparse_embedding)
150154

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
features:
3+
- |
4+
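Add `to_dict` and `from_dict` methods to `ByteStream` so that, like our other dataclasses, it provides its own serialization and deserialization methods. `Document.to_dict` and `Document.from_dict` now use these methods, so the serialized `blob` field also includes the ByteStream's `meta`.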

test/dataclasses/test_byte_stream.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,3 +81,23 @@ def test_str_truncation():
8181
assert len(string_repr) < 200
8282
assert "text/plain" in string_repr
8383
assert "foo" in string_repr
84+
85+
86+
def test_to_dict():
87+
test_str = "Hello, world!"
88+
b = ByteStream.from_string(test_str, mime_type="text/plain", meta={"foo": "bar"})
89+
d = b.to_dict()
90+
assert d["data"] == list(test_str.encode())
91+
assert d["mime_type"] == "text/plain"
92+
assert d["meta"] == {"foo": "bar"}
93+
94+
95+
def test_from_dict():
96+
test_str = "Hello, world!"
97+
b = ByteStream.from_string(test_str, mime_type="text/plain", meta={"foo": "bar"})
98+
d = b.to_dict()
99+
b2 = ByteStream.from_dict(d)
100+
assert b2.data == b.data
101+
assert b2.mime_type == b.mime_type
102+
assert b2.meta == b.meta
103+
assert str(b2) == str(b)

test/dataclasses/test_document.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,7 @@ def test_to_dict_without_flattening():
146146
def test_to_dict_with_custom_parameters():
147147
doc = Document(
148148
content="test text",
149-
blob=ByteStream(b"some bytes", mime_type="application/pdf"),
149+
blob=ByteStream(b"some bytes", mime_type="application/pdf", meta={"foo": "bar"}),
150150
meta={"some": "values", "test": 10},
151151
score=0.99,
152152
embedding=[10.0, 10.0],
@@ -156,7 +156,7 @@ def test_to_dict_with_custom_parameters():
156156
assert doc.to_dict() == {
157157
"id": doc.id,
158158
"content": "test text",
159-
"blob": {"data": list(b"some bytes"), "mime_type": "application/pdf"},
159+
"blob": {"data": list(b"some bytes"), "mime_type": "application/pdf", "meta": {"foo": "bar"}},
160160
"some": "values",
161161
"test": 10,
162162
"score": 0.99,
@@ -178,10 +178,10 @@ def test_to_dict_with_custom_parameters_without_flattening():
178178
assert doc.to_dict(flatten=False) == {
179179
"id": doc.id,
180180
"content": "test text",
181-
"blob": {"data": list(b"some bytes"), "mime_type": "application/pdf"},
181+
"blob": {"data": list(b"some bytes"), "mime_type": "application/pdf", "meta": {}},
182182
"meta": {"some": "values", "test": 10},
183183
"score": 0.99,
184-
"embedding": [10, 10],
184+
"embedding": [10.0, 10.0],
185185
"sparse_embedding": {"indices": [0, 2, 4], "values": [0.1, 0.2, 0.3]},
186186
}
187187

@@ -212,15 +212,15 @@ def from_from_dict_with_parameters():
212212
assert Document.from_dict(
213213
{
214214
"content": "test text",
215-
"blob": {"data": list(blob_data), "mime_type": "text/markdown"},
215+
"blob": {"data": list(blob_data), "mime_type": "text/markdown", "meta": {"text": "test text"}},
216216
"meta": {"text": "test text"},
217217
"score": 0.812,
218218
"embedding": [0.1, 0.2, 0.3],
219219
"sparse_embedding": {"indices": [0, 2, 4], "values": [0.1, 0.2, 0.3]},
220220
}
221221
) == Document(
222222
content="test text",
223-
blob=ByteStream(blob_data, mime_type="text/markdown"),
223+
blob=ByteStream(blob_data, mime_type="text/markdown", meta={"text": "test text"}),
224224
meta={"text": "test text"},
225225
score=0.812,
226226
embedding=[0.1, 0.2, 0.3],

0 commit comments

Comments
 (0)