From 3d72acf4ce41c05e05272825e6cd9d4e7dd8ef41 Mon Sep 17 00:00:00 2001 From: SaraCalla Date: Sun, 4 Jan 2026 17:09:35 +0100 Subject: [PATCH 1/6] Add client_settings param --- .../document_stores/chroma/document_store.py | 21 +++++++++++++-- .../chroma/tests/test_document_store.py | 26 ++++++++++++++++++- .../chroma/tests/test_document_store_async.py | 10 +++++++ integrations/chroma/tests/test_retriever.py | 2 ++ 4 files changed, 56 insertions(+), 3 deletions(-) diff --git a/integrations/chroma/src/haystack_integrations/document_stores/chroma/document_store.py b/integrations/chroma/src/haystack_integrations/document_stores/chroma/document_store.py index c5d792bcbc..9e05fa99a8 100644 --- a/integrations/chroma/src/haystack_integrations/document_stores/chroma/document_store.py +++ b/integrations/chroma/src/haystack_integrations/document_stores/chroma/document_store.py @@ -7,6 +7,7 @@ import chromadb from chromadb.api.models.AsyncCollection import AsyncCollection +from chromadb.config import Settings from chromadb.api.types import GetResult, QueryResult from haystack import default_from_dict, default_to_dict, logging from haystack.dataclasses import Document @@ -40,6 +41,7 @@ def __init__( port: Optional[int] = None, distance_function: Literal["l2", "cosine", "ip"] = "l2", metadata: Optional[dict] = None, + client_settings: Optional[dict[str, Any]] = None, **embedding_function_params: Any, ): """ @@ -67,6 +69,8 @@ def __init__( :param metadata: a dictionary of chromadb collection parameters passed directly to chromadb's client method `create_collection`. If it contains the key `"hnsw:space"`, the value will take precedence over the `distance_function` parameter above. + :param client_settings: a dictionary of Chroma Settings configuration options passed to + `chromadb.config.Settings`. These settings configure the underlying Chroma client behavior. :param embedding_function_params: additional parameters to pass to the embedding function. """ @@ -84,6 +88,7 @@ def __init__( self._embedding_function_params = embedding_function_params self._distance_function = distance_function self._metadata = metadata + self._client_settings = client_settings self._persist_path = persist_path self._host = host @@ -102,18 +107,24 @@ def _ensure_initialized(self): "You cannot specify both options." ) raise ValueError(error_message) + + client_kwargs: dict[str, Any] = {} + if self._client_settings: + client_kwargs["settings"] = Settings(**self._client_settings) + if self._host and self._port is not None: # Remote connection via HTTP client client = chromadb.HttpClient( host=self._host, port=self._port, + **client_kwargs, ) elif self._persist_path is None: # In-memory storage - client = chromadb.Client() + client = chromadb.Client(**client_kwargs) else: # Local persistent storage - client = chromadb.PersistentClient(path=self._persist_path) + client = chromadb.PersistentClient(path=self._persist_path, **client_kwargs) self._client = client # store client for potential future use @@ -148,9 +159,14 @@ async def _ensure_initialized_async(self): ) raise ValueError(error_message) + client_kwargs: dict[str, Any] = {} + if self._client_settings: + client_kwargs["settings"] = Settings(**self._client_settings) + client = await chromadb.AsyncHttpClient( host=self._host, port=self._port, + **client_kwargs, ) self._async_client = client # store client for potential future use @@ -634,6 +650,7 @@ def to_dict(self) -> dict[str, Any]: host=self._host, port=self._port, distance_function=self._distance_function, + client_settings=self._client_settings, **self._embedding_function_params, ) diff --git a/integrations/chroma/tests/test_document_store.py b/integrations/chroma/tests/test_document_store.py index cf5387f285..c395ebec61 100644 --- a/integrations/chroma/tests/test_document_store.py +++ b/integrations/chroma/tests/test_document_store.py @@ -75,6 +75,10 @@ def test_init_http_connection(self): assert store._host == "localhost" assert store._port == 8000 + def test_init_with_client_settings(self): + store = ChromaDocumentStore(client_settings={"anonymized_telemetry": False}) + assert store._client_settings == {"anonymized_telemetry": False} + def test_invalid_initialization_both_host_and_persist_path(self): """ Test that providing both host and persist_path raises an error. @@ -82,10 +86,27 @@ def test_invalid_initialization_both_host_and_persist_path(self): with pytest.raises(ValueError): store = ChromaDocumentStore(persist_path="./path/to/local/store", host="localhost") store._ensure_initialized() + + def test_client_settings_applied(self): + """ + Chroma's in-memory client uses a singleton pattern with an internal cache. + Once a client is created with certain settings, Chroma rejects creating another + with different settings in the same process. We clear the cache before and after + this test to avoid conflicts with other tests that use default settings. + """ + from chromadb.api.shared_system_client import SharedSystemClient + + SharedSystemClient.clear_system_cache() + try: + store = ChromaDocumentStore(client_settings={"anonymized_telemetry": False}) + store._ensure_initialized() + assert store._client.get_settings().anonymized_telemetry is False + finally: + SharedSystemClient.clear_system_cache() def test_to_dict(self, request): ds = ChromaDocumentStore( - collection_name=request.node.name, embedding_function="HuggingFaceEmbeddingFunction", api_key="1234567890" + collection_name=request.node.name, embedding_function="HuggingFaceEmbeddingFunction", api_key="1234567890", client_settings={"anonymized_telemetry": False} ) ds_dict = ds.to_dict() assert ds_dict == { @@ -98,6 +119,7 @@ def test_to_dict(self, request): "port": None, "api_key": "1234567890", "distance_function": "l2", + "client_settings": {"anonymized_telemetry": False}, }, } @@ -114,6 +136,7 @@ def test_from_dict(self): "port": None, "api_key": "1234567890", "distance_function": "l2", + "client_settings": {"anonymized_telemetry": False} }, } @@ -121,6 +144,7 @@ def test_from_dict(self): assert ds._collection_name == collection_name assert ds._embedding_function == function_name assert ds._embedding_function_params == {"api_key": "1234567890"} + assert ds._client_settings == {"anonymized_telemetry": False} def test_same_collection_name_reinitialization(self): ChromaDocumentStore("test_1") diff --git a/integrations/chroma/tests/test_document_store_async.py b/integrations/chroma/tests/test_document_store_async.py index 5127cec152..dbae71869b 100644 --- a/integrations/chroma/tests/test_document_store_async.py +++ b/integrations/chroma/tests/test_document_store_async.py @@ -94,6 +94,16 @@ async def test_comparison_equal_async(self, document_store, filterable_docs): ) self.assert_documents_are_equal(result, [d for d in filterable_docs if d.meta.get("number") == 100]) + async def test_client_settings_applied_async(self): + store = ChromaDocumentStore( + host="localhost", + port=8000, + client_settings={"anonymized_telemetry": False}, + collection_name=f"{uuid.uuid1()}-async-settings", + ) + await store._ensure_initialized_async() + assert store._async_client.get_settings().anonymized_telemetry is False + @pytest.mark.integration async def test_search_async(self): document_store = ChromaDocumentStore(host="localhost", port=8000, collection_name="my_custom_collection") diff --git a/integrations/chroma/tests/test_retriever.py b/integrations/chroma/tests/test_retriever.py index 23c7b0bdcb..a3cf71e884 100644 --- a/integrations/chroma/tests/test_retriever.py +++ b/integrations/chroma/tests/test_retriever.py @@ -41,6 +41,7 @@ def test_to_dict(self, request): "port": None, "api_key": "1234567890", "distance_function": "l2", + "client_settings": None, }, }, }, @@ -131,6 +132,7 @@ def test_to_dict(self, request): "port": None, "api_key": "1234567890", "distance_function": "l2", + "client_settings": None, }, }, }, From 8b3f8ad9d560d11813f6165b5667833fa74bb0da Mon Sep 17 00:00:00 2001 From: SaraCalla Date: Sun, 4 Jan 2026 17:19:00 +0100 Subject: [PATCH 2/6] ruff fixes --- .../document_stores/chroma/document_store.py | 2 +- integrations/chroma/tests/test_document_store.py | 12 +++++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/integrations/chroma/src/haystack_integrations/document_stores/chroma/document_store.py b/integrations/chroma/src/haystack_integrations/document_stores/chroma/document_store.py index 9e05fa99a8..e805acb147 100644 --- a/integrations/chroma/src/haystack_integrations/document_stores/chroma/document_store.py +++ b/integrations/chroma/src/haystack_integrations/document_stores/chroma/document_store.py @@ -7,8 +7,8 @@ import chromadb from chromadb.api.models.AsyncCollection import AsyncCollection -from chromadb.config import Settings from chromadb.api.types import GetResult, QueryResult +from chromadb.config import Settings from haystack import default_from_dict, default_to_dict, logging from haystack.dataclasses import Document from haystack.document_stores.errors import DocumentStoreError diff --git a/integrations/chroma/tests/test_document_store.py b/integrations/chroma/tests/test_document_store.py index c395ebec61..96338a98ab 100644 --- a/integrations/chroma/tests/test_document_store.py +++ b/integrations/chroma/tests/test_document_store.py @@ -9,6 +9,7 @@ from unittest import mock import pytest +from chromadb.api.shared_system_client import SharedSystemClient from haystack.dataclasses import ByteStream, Document from haystack.testing.document_store import ( TEST_EMBEDDING_1, @@ -86,7 +87,7 @@ def test_invalid_initialization_both_host_and_persist_path(self): with pytest.raises(ValueError): store = ChromaDocumentStore(persist_path="./path/to/local/store", host="localhost") store._ensure_initialized() - + def test_client_settings_applied(self): """ Chroma's in-memory client uses a singleton pattern with an internal cache. @@ -94,8 +95,6 @@ def test_client_settings_applied(self): with different settings in the same process. We clear the cache before and after this test to avoid conflicts with other tests that use default settings. """ - from chromadb.api.shared_system_client import SharedSystemClient - SharedSystemClient.clear_system_cache() try: store = ChromaDocumentStore(client_settings={"anonymized_telemetry": False}) @@ -106,7 +105,10 @@ def test_client_settings_applied(self): def test_to_dict(self, request): ds = ChromaDocumentStore( - collection_name=request.node.name, embedding_function="HuggingFaceEmbeddingFunction", api_key="1234567890", client_settings={"anonymized_telemetry": False} + collection_name=request.node.name, + embedding_function="HuggingFaceEmbeddingFunction", + api_key="1234567890", + client_settings={"anonymized_telemetry": False}, ) ds_dict = ds.to_dict() assert ds_dict == { @@ -136,7 +138,7 @@ def test_from_dict(self): "port": None, "api_key": "1234567890", "distance_function": "l2", - "client_settings": {"anonymized_telemetry": False} + "client_settings": {"anonymized_telemetry": False}, }, } From e4e8708e40c8ce8e1c565034305c82ce506af517 Mon Sep 17 00:00:00 2001 From: SaraCalla Date: Sun, 4 Jan 2026 17:35:07 +0100 Subject: [PATCH 3/6] fix integration mark in async tests --- integrations/chroma/README.md | 3 +++ integrations/chroma/tests/test_document_store_async.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/integrations/chroma/README.md b/integrations/chroma/README.md index fadfc4c094..c8b009ff90 100644 --- a/integrations/chroma/README.md +++ b/integrations/chroma/README.md @@ -11,3 +11,6 @@ ## Contributing Refer to the general [Contribution Guidelines](https://github.com/deepset-ai/haystack-core-integrations/blob/main/CONTRIBUTING.md). + +To run integration tests locally, you need a Chroma server running. +Start one with: `docker run -p 8000:8000 chromadb/chroma:latest`. diff --git a/integrations/chroma/tests/test_document_store_async.py b/integrations/chroma/tests/test_document_store_async.py index dbae71869b..78326756bc 100644 --- a/integrations/chroma/tests/test_document_store_async.py +++ b/integrations/chroma/tests/test_document_store_async.py @@ -17,6 +17,7 @@ sys.platform == "win32", reason=("We do not run the Chroma server on Windows and async is only supported with HTTP connections"), ) +@pytest.mark.integration @pytest.mark.asyncio class TestDocumentStoreAsync: @pytest.fixture @@ -104,7 +105,6 @@ async def test_client_settings_applied_async(self): await store._ensure_initialized_async() assert store._async_client.get_settings().anonymized_telemetry is False - @pytest.mark.integration async def test_search_async(self): document_store = ChromaDocumentStore(host="localhost", port=8000, collection_name="my_custom_collection") From 069e9f3cde655b0b0f73c94336580a0c8ce24130 Mon Sep 17 00:00:00 2001 From: SaraCalla Date: Mon, 5 Jan 2026 11:39:13 +0100 Subject: [PATCH 4/6] handle invalid client_settings --- .../document_stores/chroma/document_store.py | 12 ++++++++++-- integrations/chroma/tests/test_document_store.py | 14 ++++++++++++++ .../chroma/tests/test_document_store_async.py | 13 +++++++++++++ 3 files changed, 37 insertions(+), 2 deletions(-) diff --git a/integrations/chroma/src/haystack_integrations/document_stores/chroma/document_store.py b/integrations/chroma/src/haystack_integrations/document_stores/chroma/document_store.py index e805acb147..bb7f74b399 100644 --- a/integrations/chroma/src/haystack_integrations/document_stores/chroma/document_store.py +++ b/integrations/chroma/src/haystack_integrations/document_stores/chroma/document_store.py @@ -110,7 +110,11 @@ def _ensure_initialized(self): client_kwargs: dict[str, Any] = {} if self._client_settings: - client_kwargs["settings"] = Settings(**self._client_settings) + try: + client_kwargs["settings"] = Settings(**self._client_settings) + except Exception as e: + msg = f"Invalid client_settings: {e}" + raise ValueError(msg) from e if self._host and self._port is not None: # Remote connection via HTTP client @@ -161,7 +165,11 @@ async def _ensure_initialized_async(self): client_kwargs: dict[str, Any] = {} if self._client_settings: - client_kwargs["settings"] = Settings(**self._client_settings) + try: + client_kwargs["settings"] = Settings(**self._client_settings) + except Exception as e: + msg = f"Invalid client_settings: {e}" + raise ValueError(msg) from e client = await chromadb.AsyncHttpClient( host=self._host, diff --git a/integrations/chroma/tests/test_document_store.py b/integrations/chroma/tests/test_document_store.py index 96338a98ab..4010de80f0 100644 --- a/integrations/chroma/tests/test_document_store.py +++ b/integrations/chroma/tests/test_document_store.py @@ -103,6 +103,20 @@ def test_client_settings_applied(self): finally: SharedSystemClient.clear_system_cache() + def test_invalid_client_settings(self): + SharedSystemClient.clear_system_cache() + try: + store = ChromaDocumentStore( + client_settings={ + "invalid_setting_name": "some_value", + "another_fake_setting": 123, + } + ) + with pytest.raises(ValueError, match="Invalid client_settings"): + store._ensure_initialized() + finally: + SharedSystemClient.clear_system_cache() + def test_to_dict(self, request): ds = ChromaDocumentStore( collection_name=request.node.name, diff --git a/integrations/chroma/tests/test_document_store_async.py b/integrations/chroma/tests/test_document_store_async.py index 78326756bc..23b6a8fbb2 100644 --- a/integrations/chroma/tests/test_document_store_async.py +++ b/integrations/chroma/tests/test_document_store_async.py @@ -105,6 +105,19 @@ async def test_client_settings_applied_async(self): await store._ensure_initialized_async() assert store._async_client.get_settings().anonymized_telemetry is False + async def test_invalid_client_settings_async(self): + store = ChromaDocumentStore( + host="localhost", + port=8000, + client_settings={ + "invalid_setting_name": "some_value", + "another_fake_setting": 123, + }, + collection_name=f"{uuid.uuid1()}-async-invalid", + ) + with pytest.raises(ValueError, match="Invalid client_settings"): + await store._ensure_initialized_async() + async def test_search_async(self): document_store = ChromaDocumentStore(host="localhost", port=8000, collection_name="my_custom_collection") From d41ab9cafd752f38d8eb454645bee6e0c07c2677 Mon Sep 17 00:00:00 2001 From: SaraCalla Date: Fri, 9 Jan 2026 12:41:52 +0100 Subject: [PATCH 5/6] apply PR comments --- .../document_stores/chroma/document_store.py | 7 ++- .../chroma/tests/test_document_store.py | 47 ++++++++++--------- 2 files changed, 31 insertions(+), 23 deletions(-) diff --git a/integrations/chroma/src/haystack_integrations/document_stores/chroma/document_store.py b/integrations/chroma/src/haystack_integrations/document_stores/chroma/document_store.py index 558ecdb57e..342f0727aa 100644 --- a/integrations/chroma/src/haystack_integrations/document_stores/chroma/document_store.py +++ b/integrations/chroma/src/haystack_integrations/document_stores/chroma/document_store.py @@ -71,6 +71,9 @@ def __init__( `distance_function` parameter above. :param client_settings: a dictionary of Chroma Settings configuration options passed to `chromadb.config.Settings`. These settings configure the underlying Chroma client behavior. + For available options, see [Chroma's config.py](https://github.com/chroma-core/chroma/blob/main/chromadb/config.py). + **Note**: specifying these settings may interfere with standard client initialization parameters. + This option is intended for advanced customization. :param embedding_function_params: additional parameters to pass to the embedding function. """ @@ -112,7 +115,7 @@ def _ensure_initialized(self): if self._client_settings: try: client_kwargs["settings"] = Settings(**self._client_settings) - except Exception as e: + except ValueError as e: msg = f"Invalid client_settings: {e}" raise ValueError(msg) from e @@ -167,7 +170,7 @@ async def _ensure_initialized_async(self): if self._client_settings: try: client_kwargs["settings"] = Settings(**self._client_settings) - except Exception as e: + except ValueError as e: msg = f"Invalid client_settings: {e}" raise ValueError(msg) from e diff --git a/integrations/chroma/tests/test_document_store.py b/integrations/chroma/tests/test_document_store.py index 2babf16459..ac457567ac 100644 --- a/integrations/chroma/tests/test_document_store.py +++ b/integrations/chroma/tests/test_document_store.py @@ -21,6 +21,19 @@ from haystack_integrations.document_stores.chroma import ChromaDocumentStore +@pytest.fixture +def clear_chroma_system_cache(): + """ + Chroma's in-memory client uses a singleton pattern with an internal cache. + Once a client is created with certain settings, Chroma rejects creating another + with different settings in the same process. This fixture clears the cache + before and after tests that use custom client settings. + """ + SharedSystemClient.clear_system_cache() + yield + SharedSystemClient.clear_system_cache() + + class TestDocumentStore(CountDocumentsTest, DeleteDocumentsTest, FilterDocumentsTest): """ Common test cases will be provided by `DocumentStoreBaseTests` but @@ -88,34 +101,26 @@ def test_invalid_initialization_both_host_and_persist_path(self): store = ChromaDocumentStore(persist_path="./path/to/local/store", host="localhost") store._ensure_initialized() - def test_client_settings_applied(self): + def test_client_settings_applied(self, clear_chroma_system_cache): """ Chroma's in-memory client uses a singleton pattern with an internal cache. Once a client is created with certain settings, Chroma rejects creating another with different settings in the same process. We clear the cache before and after this test to avoid conflicts with other tests that use default settings. """ - SharedSystemClient.clear_system_cache() - try: - store = ChromaDocumentStore(client_settings={"anonymized_telemetry": False}) + store = ChromaDocumentStore(client_settings={"anonymized_telemetry": False}) + store._ensure_initialized() + assert store._client.get_settings().anonymized_telemetry is False + + def test_invalid_client_settings(self, clear_chroma_system_cache): + store = ChromaDocumentStore( + client_settings={ + "invalid_setting_name": "some_value", + "another_fake_setting": 123, + } + ) + with pytest.raises(ValueError, match="Invalid client_settings"): store._ensure_initialized() - assert store._client.get_settings().anonymized_telemetry is False - finally: - SharedSystemClient.clear_system_cache() - - def test_invalid_client_settings(self): - SharedSystemClient.clear_system_cache() - try: - store = ChromaDocumentStore( - client_settings={ - "invalid_setting_name": "some_value", - "another_fake_setting": 123, - } - ) - with pytest.raises(ValueError, match="Invalid client_settings"): - store._ensure_initialized() - finally: - SharedSystemClient.clear_system_cache() def test_to_dict(self, request): ds = ChromaDocumentStore( From d41c334b879b77932cd44ec0fe611bc5b87cd2cf Mon Sep 17 00:00:00 2001 From: SaraCalla Date: Fri, 9 Jan 2026 17:09:30 +0100 Subject: [PATCH 6/6] add comments and clearer error messages --- .../document_stores/chroma/document_store.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/integrations/chroma/src/haystack_integrations/document_stores/chroma/document_store.py b/integrations/chroma/src/haystack_integrations/document_stores/chroma/document_store.py index 342f0727aa..2d0633bdeb 100644 --- a/integrations/chroma/src/haystack_integrations/document_stores/chroma/document_store.py +++ b/integrations/chroma/src/haystack_integrations/document_stores/chroma/document_store.py @@ -111,12 +111,13 @@ def _ensure_initialized(self): ) raise ValueError(error_message) + # Use dict to conditionally pass settings because Chroma doesn't accept settings=None client_kwargs: dict[str, Any] = {} if self._client_settings: try: client_kwargs["settings"] = Settings(**self._client_settings) except ValueError as e: - msg = f"Invalid client_settings: {e}" + msg = f"Invalid client_settings ({self._client_settings}): {e}" raise ValueError(msg) from e if self._host and self._port is not None: @@ -166,12 +167,13 @@ async def _ensure_initialized_async(self): ) raise ValueError(error_message) + # Use dict to conditionally pass settings because Chroma doesn't accept settings=None client_kwargs: dict[str, Any] = {} if self._client_settings: try: client_kwargs["settings"] = Settings(**self._client_settings) except ValueError as e: - msg = f"Invalid client_settings: {e}" + msg = f"Invalid client_settings ({self._client_settings}): {e}" raise ValueError(msg) from e client = await chromadb.AsyncHttpClient(