Skip to content

Commit 861550f

Browse files
pragnyanramtha, julian-risch, and claude
authored
fix: avoid mutating Document.from_dict input (#11330)
Co-authored-by: Julian Risch <julian.risch@deepset.ai>
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent: 4c89081 · commit: 861550f

3 files changed

Lines changed: 41 additions & 1 deletion

File tree

haystack/dataclasses/document.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -149,6 +149,7 @@ def from_dict(cls, data: dict[str, Any]) -> "Document":
149149
150150
The `blob` field is converted to its original type.
151151
"""
152+
data = data.copy()
152153
if blob := data.get("blob"):
153154
data["blob"] = ByteStream.from_dict(blob)
154155
if sparse_embedding := data.get("sparse_embedding"):
Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,4 @@
1+
---
2+
fixes:
3+
- |
4+
Prevent ``Document.from_dict()`` from mutating the input dictionary during deserialization.

test/dataclasses/test_document.py

Lines changed: 36 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33
# SPDX-License-Identifier: Apache-2.0
44

55
import warnings
6+
from copy import deepcopy
67
from dataclasses import replace
78

89
import pytest
@@ -213,7 +214,7 @@ def test_from_dict():
213214
assert Document.from_dict({}) == Document()
214215

215216

216-
def from_from_dict_with_parameters():
217+
def test_from_dict_with_parameters():
217218
blob_data = b"some bytes"
218219
assert Document.from_dict(
219220
{
@@ -234,6 +235,40 @@ def from_from_dict_with_parameters():
234235
)
235236

236237

238+
def test_from_dict_does_not_mutate_input():
239+
blob_data = b"some bytes"
240+
data = {
241+
"content": "test text",
242+
"blob": {"data": list(blob_data), "mime_type": "text/markdown"},
243+
"score": 0.812,
244+
"embedding": [0.1, 0.2, 0.3],
245+
"sparse_embedding": {"indices": [0, 2, 4], "values": [0.1, 0.2, 0.3]},
246+
"date": "10-10-2023",
247+
"type": "article",
248+
}
249+
original_data = deepcopy(data)
250+
251+
assert Document.from_dict(data) == Document(
252+
content="test text",
253+
blob=ByteStream(blob_data, mime_type="text/markdown"),
254+
score=0.812,
255+
embedding=[0.1, 0.2, 0.3],
256+
sparse_embedding=SparseEmbedding(indices=[0, 2, 4], values=[0.1, 0.2, 0.3]),
257+
meta={"date": "10-10-2023", "type": "article"},
258+
)
259+
assert data == original_data
260+
261+
262+
def test_from_dict_does_not_mutate_input_with_explicit_meta():
263+
data = {"content": "test text", "meta": {"date": "10-10-2023", "type": "article"}, "score": 0.812}
264+
original_data = deepcopy(data)
265+
266+
assert Document.from_dict(data) == Document(
267+
content="test text", meta={"date": "10-10-2023", "type": "article"}, score=0.812
268+
)
269+
assert data == original_data
270+
271+
237272
def test_from_dict_with_legacy_fields():
238273
assert Document.from_dict(
239274
{

0 commit comments

Comments
 (0)