Skip to content

Commit ed36347

Browse files
maxdswain and anakin87 authored
fix: Remove NUL bytes when converting from haystack to pg documents (#2892)
* fix: Remove NUL bytes when converting from haystack to pg documents
* fix: pg_docs index error in updated test
* Update integrations/pgvector/src/haystack_integrations/document_stores/pgvector/converters.py
  Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>
* fmt
* unrelated: dataclasses replace

---------

Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>
1 parent fa61c76 commit ed36347

2 files changed

Lines changed: 22 additions & 1 deletion

File tree

integrations/pgvector/src/haystack_integrations/document_stores/pgvector/converters.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from dataclasses import replace
12
from typing import Any
23

34
from haystack import logging
@@ -22,6 +23,9 @@ def _from_haystack_to_pg_documents(documents: list[Document]) -> list[dict[str,
2223
db_document["blob_meta"] = Jsonb(blob.meta) if blob and blob.meta else None
2324
db_document["blob_mime_type"] = blob.mime_type if blob and blob.mime_type else None
2425
db_document["meta"] = Jsonb(db_document["meta"])
26+
# PostgreSQL text fields cannot contain NUL (0x00) bytes, removing NUL bytes
27+
if content := db_document["content"]:
28+
db_document["content"] = content.replace("\x00", "")
2529

2630
if "sparse_embedding" in db_document:
2731
sparse_embedding = db_document.pop("sparse_embedding", None)
@@ -65,7 +69,7 @@ def _from_pg_to_haystack_documents(documents: list[dict[str, Any]]) -> list[Docu
6569

6670
if blob_data:
6771
blob = ByteStream(data=blob_data, meta=blob_meta, mime_type=blob_mime_type)
68-
haystack_document.blob = blob
72+
haystack_document = replace(haystack_document, blob=blob)
6973

7074
haystack_documents.append(haystack_document)
7175

integrations/pgvector/tests/test_converters.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,13 @@ def test_from_haystack_to_pg_documents():
3030
embedding=[0.7, 0.8, 0.9],
3131
score=0.7,
3232
),
33+
Document(
34+
id="4",
35+
content="This is another text\x00",
36+
meta={"meta_key": "meta_value"},
37+
embedding=[0.7, 0.8, 0.9],
38+
score=0.8,
39+
),
3340
]
3441

3542
pg_docs = _from_haystack_to_pg_documents(haystack_docs)
@@ -64,6 +71,16 @@ def test_from_haystack_to_pg_documents():
6471
assert pg_docs[2]["embedding"] == [0.7, 0.8, 0.9]
6572
assert "score" not in pg_docs[2]
6673

74+
assert pg_docs[3]["id"] == "4"
75+
assert pg_docs[3]["content"] == "This is another text"
76+
assert pg_docs[3]["blob_data"] is None
77+
assert pg_docs[3]["blob_meta"] is None
78+
assert pg_docs[3]["blob_mime_type"] is None
79+
assert "dataframe" not in pg_docs[3]
80+
assert pg_docs[3]["meta"].obj == {"meta_key": "meta_value"}
81+
assert pg_docs[3]["embedding"] == [0.7, 0.8, 0.9]
82+
assert "score" not in pg_docs[3]
83+
6784

6885
def test_from_pg_to_haystack_documents():
6986
pg_docs = [

0 commit comments

Comments
 (0)