Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions haystack/dataclasses/byte_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,3 +79,24 @@ def __repr__(self) -> str:
fields.append(f"mime_type={self.mime_type!r}")
fields_str = ", ".join(fields)
return f"{self.__class__.__name__}({fields_str})"

def to_dict(self) -> Dict[str, Any]:
    """
    Serialize this ByteStream into a plain dictionary.

    :returns: A dictionary holding the entries 'data', 'meta', and 'mime_type'.
    """
    # JSON has no native bytes type, so the payload is expanded into a list of integer byte values.
    byte_values = [*self.data]
    serialized: Dict[str, Any] = {}
    serialized["data"] = byte_values
    serialized["meta"] = self.meta
    serialized["mime_type"] = self.mime_type
    return serialized

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "ByteStream":
    """
    Create a ByteStream from a dictionary representation.

    :param data: A dictionary with a required 'data' entry (passed through `bytes(...)`) and optional
        'meta' and 'mime_type' entries. A missing 'meta' falls back to an empty dictionary and a missing
        'mime_type' falls back to `None`.

    :returns: An instance of the class this method was called on.
    :raises KeyError: If the 'data' entry is missing.
    """
    # Construct through `cls` instead of a hard-coded class name so that calling this on a subclass
    # yields an instance of that subclass.
    return cls(data=bytes(data["data"]), meta=data.get("meta", {}), mime_type=data.get("mime_type"))
10 changes: 7 additions & 3 deletions haystack/dataclasses/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,8 +127,12 @@ def to_dict(self, flatten: bool = True) -> Dict[str, Any]:
Whether to flatten `meta` field or not. Defaults to `True` to be backward-compatible with Haystack 1.x.
"""
data = asdict(self)
if (blob := data.get("blob")) is not None:
data["blob"] = {"data": list(blob["data"]), "mime_type": blob["mime_type"]}

# Use `ByteStream` and `SparseEmbedding`'s to_dict methods to convert them to JSON-serializable types.
if self.blob is not None:
data["blob"] = self.blob.to_dict()
if self.sparse_embedding is not None:
data["sparse_embedding"] = self.sparse_embedding.to_dict()

if flatten:
meta = data.pop("meta")
Expand All @@ -144,7 +148,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "Document":
The `blob` field is converted to its original type.
"""
if blob := data.get("blob"):
data["blob"] = ByteStream(data=bytes(blob["data"]), mime_type=blob["mime_type"])
data["blob"] = ByteStream.from_dict(blob)
if sparse_embedding := data.get("sparse_embedding"):
data["sparse_embedding"] = SparseEmbedding.from_dict(sparse_embedding)

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
features:
- |
Add `to_dict` and `from_dict` methods to `ByteStream`, making it consistent with our other dataclasses, which already provide serialization and deserialization methods.
20 changes: 20 additions & 0 deletions test/dataclasses/test_byte_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,3 +81,23 @@ def test_str_truncation():
assert len(string_repr) < 200
assert "text/plain" in string_repr
assert "foo" in string_repr


def test_to_dict():
    """Check that `ByteStream.to_dict` exposes the payload bytes, MIME type and metadata."""
    text = "Hello, world!"
    stream = ByteStream.from_string(text, mime_type="text/plain", meta={"foo": "bar"})
    serialized = stream.to_dict()
    # The payload is expected as a list of integer byte values, not as raw bytes.
    expected_bytes = list(text.encode())
    assert serialized["data"] == expected_bytes
    assert serialized["mime_type"] == "text/plain"
    assert serialized["meta"] == {"foo": "bar"}


def test_from_dict():
    """Round-trip a ByteStream through `to_dict` / `from_dict` and compare it with the source stream."""
    text = "Hello, world!"
    original = ByteStream.from_string(text, mime_type="text/plain", meta={"foo": "bar"})
    serialized = original.to_dict()
    restored = ByteStream.from_dict(serialized)
    # Every field, as well as the string form, must match the source stream.
    assert restored.data == original.data
    assert restored.mime_type == original.mime_type
    assert restored.meta == original.meta
    assert str(restored) == str(original)
12 changes: 6 additions & 6 deletions test/dataclasses/test_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ def test_to_dict_without_flattening():
def test_to_dict_with_custom_parameters():
doc = Document(
content="test text",
blob=ByteStream(b"some bytes", mime_type="application/pdf"),
blob=ByteStream(b"some bytes", mime_type="application/pdf", meta={"foo": "bar"}),
meta={"some": "values", "test": 10},
score=0.99,
embedding=[10.0, 10.0],
Expand All @@ -156,7 +156,7 @@ def test_to_dict_with_custom_parameters():
assert doc.to_dict() == {
"id": doc.id,
"content": "test text",
"blob": {"data": list(b"some bytes"), "mime_type": "application/pdf"},
"blob": {"data": list(b"some bytes"), "mime_type": "application/pdf", "meta": {"foo": "bar"}},
"some": "values",
"test": 10,
"score": 0.99,
Expand All @@ -178,10 +178,10 @@ def test_to_dict_with_custom_parameters_without_flattening():
assert doc.to_dict(flatten=False) == {
"id": doc.id,
"content": "test text",
"blob": {"data": list(b"some bytes"), "mime_type": "application/pdf"},
"blob": {"data": list(b"some bytes"), "mime_type": "application/pdf", "meta": {}},
"meta": {"some": "values", "test": 10},
"score": 0.99,
"embedding": [10, 10],
"embedding": [10.0, 10.0],
"sparse_embedding": {"indices": [0, 2, 4], "values": [0.1, 0.2, 0.3]},
}

Expand Down Expand Up @@ -212,15 +212,15 @@ def from_from_dict_with_parameters():
assert Document.from_dict(
{
"content": "test text",
"blob": {"data": list(blob_data), "mime_type": "text/markdown"},
"blob": {"data": list(blob_data), "mime_type": "text/markdown", "meta": {"text": "test text"}},
"meta": {"text": "test text"},
"score": 0.812,
"embedding": [0.1, 0.2, 0.3],
"sparse_embedding": {"indices": [0, 2, 4], "values": [0.1, 0.2, 0.3]},
}
) == Document(
content="test text",
blob=ByteStream(blob_data, mime_type="text/markdown"),
blob=ByteStream(blob_data, mime_type="text/markdown", meta={"text": "test text"}),
meta={"text": "test text"},
score=0.812,
embedding=[0.1, 0.2, 0.3],
Expand Down
Loading