diff --git a/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/converters.py b/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/converters.py index 2913c60075..14d3b29fd9 100644 --- a/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/converters.py +++ b/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/converters.py @@ -1,3 +1,4 @@ +from dataclasses import replace from typing import Any from haystack import logging @@ -22,6 +23,9 @@ def _from_haystack_to_pg_documents(documents: list[Document]) -> list[dict[str, db_document["blob_meta"] = Jsonb(blob.meta) if blob and blob.meta else None db_document["blob_mime_type"] = blob.mime_type if blob and blob.mime_type else None db_document["meta"] = Jsonb(db_document["meta"]) + # PostgreSQL text fields cannot contain NUL (0x00) bytes, removing NUL bytes + if content := db_document["content"]: + db_document["content"] = content.replace("\x00", "") if "sparse_embedding" in db_document: sparse_embedding = db_document.pop("sparse_embedding", None) @@ -65,7 +69,7 @@ def _from_pg_to_haystack_documents(documents: list[dict[str, Any]]) -> list[Docu if blob_data: blob = ByteStream(data=blob_data, meta=blob_meta, mime_type=blob_mime_type) - haystack_document.blob = blob + haystack_document = replace(haystack_document, blob=blob) haystack_documents.append(haystack_document) diff --git a/integrations/pgvector/tests/test_converters.py b/integrations/pgvector/tests/test_converters.py index cb72028739..bf3be8c629 100644 --- a/integrations/pgvector/tests/test_converters.py +++ b/integrations/pgvector/tests/test_converters.py @@ -30,6 +30,13 @@ def test_from_haystack_to_pg_documents(): embedding=[0.7, 0.8, 0.9], score=0.7, ), + Document( + id="4", + content="This is another text\x00", + meta={"meta_key": "meta_value"}, + embedding=[0.7, 0.8, 0.9], + score=0.8, + ), ] pg_docs = _from_haystack_to_pg_documents(haystack_docs) @@ -64,6 +71,16 @@ def test_from_haystack_to_pg_documents(): assert pg_docs[2]["embedding"] == [0.7, 0.8, 0.9] assert "score" not in pg_docs[2] + assert pg_docs[3]["id"] == "4" + assert pg_docs[3]["content"] == "This is another text" + assert pg_docs[3]["blob_data"] is None + assert pg_docs[3]["blob_meta"] is None + assert pg_docs[3]["blob_mime_type"] is None + assert "dataframe" not in pg_docs[3] + assert pg_docs[3]["meta"].obj == {"meta_key": "meta_value"} + assert pg_docs[3]["embedding"] == [0.7, 0.8, 0.9] + assert "score" not in pg_docs[3] + def test_from_pg_to_haystack_documents(): pg_docs = [