Skip to content

Commit 5c0d0ea

Browse files
feat: complete OracleDocumentStore operations (#3179)
Co-authored-by: David S. Batista <dsbatista@gmail.com>
1 parent 63ff0fd commit 5c0d0ea

8 files changed

Lines changed: 731 additions & 113 deletions

File tree

integrations/oracle/README.md

Lines changed: 0 additions & 103 deletions
Original file line numberDiff line numberDiff line change
@@ -9,109 +9,6 @@ Haystack DocumentStore backed by [Oracle AI Vector Search](https://www.oracle.co
99

1010
---
1111

12-
## Installation
13-
14-
```bash
15-
pip install oracle-haystack
16-
```
17-
18-
Requires Python 3.10+ and Oracle Database 23ai (or later). No Oracle Instant Client is needed for direct TCP connections (thin mode).
19-
20-
## Usage
21-
22-
```python
23-
from haystack.utils import Secret
24-
from haystack_integrations.document_stores.oracle import OracleConnectionConfig, OracleDocumentStore
25-
from haystack_integrations.components.retrievers.oracle import OracleEmbeddingRetriever
26-
27-
# Configure the connection
28-
config = OracleConnectionConfig(
29-
user=Secret.from_env_var("ORACLE_USER"),
30-
password=Secret.from_env_var("ORACLE_PASSWORD"),
31-
dsn=Secret.from_env_var("ORACLE_DSN"),
32-
)
33-
34-
# Create the document store
35-
store = OracleDocumentStore(
36-
connection_config=config,
37-
table_name="my_documents",
38-
embedding_dim=768,
39-
distance_metric="COSINE",
40-
create_table_if_not_exists=True,
41-
)
42-
43-
# Write documents
44-
from haystack.dataclasses import Document
45-
store.write_documents([
46-
Document(content="Oracle 23ai supports native vector search."),
47-
])
48-
49-
# Retrieve by embedding
50-
retriever = OracleEmbeddingRetriever(document_store=store, top_k=5)
51-
results = retriever.run(query_embedding=[0.1] * 768)
52-
print(results["documents"])
53-
```
54-
55-
### Connecting to Oracle Autonomous Database (ADB-S / wallet)
56-
57-
```python
58-
config = OracleConnectionConfig(
59-
user=Secret.from_env_var("ORACLE_USER"),
60-
password=Secret.from_env_var("ORACLE_PASSWORD"),
61-
dsn=Secret.from_env_var("ORACLE_DSN"),
62-
wallet_location="/path/to/wallet",
63-
wallet_password=Secret.from_env_var("WALLET_PASSWORD"),
64-
)
65-
```
66-
67-
### Optional HNSW index
68-
69-
Pass `create_index=True` when constructing the store to build an HNSW vector index, which dramatically speeds up approximate nearest-neighbour search on large collections:
70-
71-
```python
72-
store = OracleDocumentStore(
73-
connection_config=config,
74-
table_name="my_documents",
75-
embedding_dim=768,
76-
create_index=True,
77-
hnsw_neighbors=32,
78-
hnsw_ef_construction=200,
79-
hnsw_accuracy=95,
80-
)
81-
```
82-
8312
## Contributing
8413

8514
Refer to the general [Contribution Guidelines](https://github.com/deepset-ai/haystack-core-integrations/blob/main/CONTRIBUTING.md).
86-
87-
### Running tests
88-
89-
#### Unit tests
90-
91-
```bash
92-
PYTHONPATH=src hatch run test:unit -vvv
93-
```
94-
95-
#### Integration tests against a live Oracle instance
96-
97-
Set `ORACLE_USER`, `ORACLE_PASSWORD`, and `ORACLE_DSN` environment variables to point at your Oracle 23ai instance, then:
98-
99-
```bash
100-
PYTHONPATH=src hatch run test:integration -vvv
101-
```
102-
103-
#### Integration tests via Docker (local Oracle 23ai Free)
104-
105-
A `docker-compose.yml` is provided that runs [`gvenzl/oracle-free:23-slim`](https://hub.docker.com/r/gvenzl/oracle-free) (Oracle Database 23ai Free edition).
106-
107-
```bash
108-
docker compose up -d --wait
109-
```
110-
111-
`--wait` blocks until the Oracle healthcheck passes (the first boot takes 2–4 minutes while Oracle initialises its data files).
112-
113-
Run the full integration test suite:
114-
115-
```bash
116-
PYTHONPATH=src hatch run test:integration -vvv
117-
```

integrations/oracle/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ classifiers = [
2222
"Programming Language :: Python :: Implementation :: CPython",
2323
]
2424
dependencies = [
25-
"haystack-ai>=2.26.1",
25+
"haystack-ai>=2.28.0",
2626
"oracledb>=2.1.0,<3.0.0",
2727
]
2828

integrations/oracle/src/haystack_integrations/components/retrievers/oracle/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,6 @@
33
# SPDX-License-Identifier: Apache-2.0
44

55
from haystack_integrations.components.retrievers.oracle.embedding_retriever import OracleEmbeddingRetriever
6+
from haystack_integrations.components.retrievers.oracle.keyword_retriever import OracleKeywordRetriever
67

7-
__all__ = ["OracleEmbeddingRetriever"]
8+
__all__ = ["OracleEmbeddingRetriever", "OracleKeywordRetriever"]
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
# SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
from typing import Any
6+
7+
from haystack import component, default_from_dict, default_to_dict
8+
from haystack.dataclasses import Document
9+
from haystack.document_stores.types import FilterPolicy
10+
from haystack.document_stores.types.filter_policy import apply_filter_policy
11+
12+
from haystack_integrations.document_stores.oracle import OracleDocumentStore
13+
14+
15+
@component
class OracleKeywordRetriever:
    """
    Retrieves documents from an OracleDocumentStore using keyword-based (BM25) similarity.

    Requires Oracle Database 23ai and an automatically created DBMS_SEARCH index.

    Use inside a Haystack pipeline::

        pipeline.add_component("retriever", OracleKeywordRetriever(document_store=store, top_k=5))
    """

    def __init__(
        self,
        *,
        document_store: OracleDocumentStore,
        filters: dict[str, Any] | None = None,
        top_k: int = 10,
        filter_policy: str | FilterPolicy = FilterPolicy.REPLACE,
    ) -> None:
        """
        Creates the retriever.

        :param document_store:
            The OracleDocumentStore instance to query.
        :param filters:
            Default filters applied to every query. ``None`` is stored as an empty dict.
        :param top_k:
            Default maximum number of documents to return.
        :param filter_policy:
            How runtime filters are combined with ``filters``. Accepts a ``FilterPolicy``
            member or its string form, which is converted with ``FilterPolicy.from_str``.
        :raises TypeError:
            If ``document_store`` is not an ``OracleDocumentStore``.
        """
        if not isinstance(document_store, OracleDocumentStore):
            msg = "document_store must be an instance of OracleDocumentStore"
            raise TypeError(msg)
        self.document_store = document_store
        self.filters = filters or {}
        self.top_k = top_k
        # Strings are accepted so that serialized pipelines (which store the policy's
        # ``.value``) can be fed straight back into the constructor.
        self.filter_policy = FilterPolicy.from_str(filter_policy) if isinstance(filter_policy, str) else filter_policy

    def _resolve_query_args(
        self,
        filters: dict[str, Any] | None,
        top_k: int | None,
    ) -> tuple[dict[str, Any] | None, int]:
        """
        Combines per-call arguments with the constructor defaults.

        Shared by :meth:`run` and :meth:`run_async` so both paths resolve
        filters and ``top_k`` identically.

        :param filters:
            Runtime filters passed to the call, or ``None``.
        :param top_k:
            Runtime ``top_k`` passed to the call, or ``None`` to use the constructor value.
        :returns:
            A ``(filters, top_k)`` tuple ready to hand to the document store.
        """
        effective_filters = apply_filter_policy(self.filter_policy, self.filters, filters)
        # ``is not None`` rather than truthiness: an explicit ``top_k=0`` must not
        # silently fall back to the constructor default.
        effective_top_k = top_k if top_k is not None else self.top_k
        return effective_filters, effective_top_k

    @component.output_types(documents=list[Document])
    def run(
        self,
        query: str,
        filters: dict[str, Any] | None = None,
        top_k: int | None = None,
    ) -> dict[str, list[Document]]:
        """
        Retrieve documents by keyword search.

        :param query:
            The keyword query string.
        :param filters:
            Runtime filters, combined with the constructor filters according to ``filter_policy``.
        :param top_k:
            Overrides the constructor ``top_k`` for this call.
        :returns:
            ``{"documents": [Document, ...]}``
        """
        effective_filters, effective_top_k = self._resolve_query_args(filters, top_k)
        docs = self.document_store._keyword_retrieval(
            query,
            filters=effective_filters,
            top_k=effective_top_k,
        )
        return {"documents": docs}

    @component.output_types(documents=list[Document])
    async def run_async(
        self,
        query: str,
        filters: dict[str, Any] | None = None,
        top_k: int | None = None,
    ) -> dict[str, list[Document]]:
        """
        Async variant of :meth:`run`.

        :param query:
            The keyword query string.
        :param filters:
            Runtime filters, combined with the constructor filters according to ``filter_policy``.
        :param top_k:
            Overrides the constructor ``top_k`` for this call.
        :returns:
            ``{"documents": [Document, ...]}``
        """
        effective_filters, effective_top_k = self._resolve_query_args(filters, top_k)
        docs = await self.document_store._keyword_retrieval_async(
            query,
            filters=effective_filters,
            top_k=effective_top_k,
        )
        return {"documents": docs}

    def to_dict(self) -> dict[str, Any]:
        """
        Serializes the component to a dictionary.

        :returns:
            Dictionary with serialized data.
        """
        return default_to_dict(
            self,
            document_store=self.document_store.to_dict(),
            filters=self.filters,
            top_k=self.top_k,
            # Store the enum's plain value so the dictionary stays serializable.
            filter_policy=self.filter_policy.value,
        )

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "OracleKeywordRetriever":
        """
        Deserializes the component from a dictionary.

        Note that the nested ``init_parameters`` of ``data`` are updated in place
        while the document store and filter policy are rebuilt.

        :param data:
            Dictionary to deserialize from.
        :returns:
            Deserialized component.
        """
        params = data.get("init_parameters", {})
        if "document_store" in params:
            params["document_store"] = OracleDocumentStore.from_dict(params["document_store"])
        filter_policy = params.get("filter_policy")
        # Only convert string forms; mirrors the isinstance check in ``__init__``.
        if filter_policy and isinstance(filter_policy, str):
            params["filter_policy"] = FilterPolicy.from_str(filter_policy)
        return default_from_dict(cls, data)

0 commit comments

Comments
 (0)