Skip to content

Commit 10d7848

Browse files
authored
feat: Add QueryHint support for database optimization (#177)
Add QueryHint support for OceanBase database optimization with parallel execution and query timeout hints.

## Key Features:
- QueryHint dataclass with parallel and query_timeout parameters
- SQL hint generation for OceanBase database optimization
- Added to collection.get(), query(), and hybrid_search() methods
- Comprehensive test coverage across all database modes
- Full documentation with usage examples

## Usage:
    from pyseekdb.client.query_types import QueryHint
    query_hint = QueryHint(parallel=4, query_timeout=5.0)
    collection.get(ids=['1', '2'], query_hint=query_hint)

Made with [Cursor](https://cursor.com)

<!-- This is an auto-generated comment: release notes by coderabbit.ai -->
## Summary by CodeRabbit
* **New Features**
  * Introduced QueryHint as a client-side option to tune collection.query(), get(), and hybrid_search() (parallelism, timeout, vector index control). When provided, hint text is integrated into generated queries and can affect execution plans.
* **Documentation**
  * Docstrings and examples updated to show QueryHint usage.
* **Tests**
  * Added integration tests covering QueryHint scenarios for query, get, and hybrid_search.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
1 parent 14528ed commit 10d7848

8 files changed

Lines changed: 399 additions & 5 deletions

File tree

src/pyseekdb/client/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
get_default_embedding_function,
4747
register_embedding_function,
4848
)
49+
from .query_types import QueryHint
4950
from .schema import Schema
5051
from .sparse_embedding_function import (
5152
SparseEmbeddingFunction,
@@ -172,6 +173,7 @@ def __getattr__(name: str) -> Any:
172173
"K",
173174
"Ngram2FulltextIndexConfig",
174175
"NgramFulltextIndexConfig",
176+
"QueryHint",
175177
"RemoteServerClient",
176178
"Schema",
177179
"SeekdbEmbeddedClient",

src/pyseekdb/client/client_base.py

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -38,14 +38,15 @@
3838
)
3939
from .filters import FilterBuilder
4040
from .meta_info import CollectionFieldNames, CollectionNames
41+
from .query_types import QueryHint
4142
from .schema import Schema, SparseVectorIndexConfig
4243
from .sparse_embedding_function import (
4344
SparseEmbeddingFunction,
4445
SparseEmbeddingFunctionRegistry,
4546
SparseVector,
4647
_sparse_vector_to_sql,
4748
)
48-
from .sql_utils import is_query_sql
49+
from .sql_utils import _query_hint_to_sql, is_query_sql
4950
from .types import K as FieldKey
5051
from .version import Version
5152

@@ -2588,6 +2589,7 @@ def _collection_query( # noqa: C901
25882589
where_document: dict[str, Any] | None = None,
25892590
include: list[str] | None = None,
25902591
query_key: FieldKey | None = None,
2592+
query_hint: QueryHint | None = None,
25912593
**kwargs,
25922594
) -> dict[str, Any]:
25932595
"""
@@ -2647,6 +2649,7 @@ def _collection_query( # noqa: C901
26472649
include=include,
26482650
sparse_config=sparse_config,
26492651
collection_name=collection_name,
2652+
query_hint=query_hint,
26502653
**kwargs,
26512654
)
26522655

@@ -2725,12 +2728,15 @@ def _collection_query( # noqa: C901
27252728
# Convert vector to string format for SQL
27262729
vector_str = _embedding_to_hexstring(query_vector)
27272730

2731+
# Build query hint
2732+
hint_sql = _query_hint_to_sql(query_hint, table_name=table_name)
2733+
27282734
# Build SQL query with vector distance calculation
27292735
# Reference: SELECT id, vec FROM t2 ORDER BY l2_distance(vec, '[0.1, 0.2, 0.3]') APPROXIMATE LIMIT 5;
27302736
# Need to include distance in SELECT for result processing
27312737
# Use the appropriate distance function based on the index configuration
27322738
sql = f"""
2733-
SELECT {select_clause},
2739+
SELECT {hint_sql} {select_clause},
27342740
{distance_func}(embedding, {vector_str}) AS distance
27352741
FROM `{table_name}`
27362742
{where_clause}
@@ -2806,6 +2812,7 @@ def _collection_query_sparse( # noqa: C901
28062812
include: list[str] | None = None,
28072813
sparse_config=None,
28082814
collection_name: str = "",
2815+
query_hint=None,
28092816
**kwargs,
28102817
) -> dict[str, Any]:
28112818
"""
@@ -2858,6 +2865,8 @@ def _collection_query_sparse( # noqa: C901
28582865
if not sparse_query_vectors:
28592866
raise ValueError("No sparse query vectors resolved.")
28602867

2868+
hint_sql = _query_hint_to_sql(query_hint, table_name=table_name)
2869+
28612870
# Normalize include fields
28622871
include_fields = self._normalize_include_fields(include)
28632872

@@ -2885,14 +2894,14 @@ def _collection_query_sparse( # noqa: C901
28852894

28862895
# Build SQL query with sparse vector distance calculation
28872896
sql = f"""
2888-
SELECT {select_clause},
2897+
SELECT {hint_sql} {select_clause},
28892898
{distance_func}(sparse_embedding, {sv_sql}) AS distance
28902899
FROM `{table_name}`
28912900
{where_clause}
28922901
ORDER BY {distance_func}(sparse_embedding, {sv_sql})
28932902
APPROXIMATE
28942903
LIMIT %s
2895-
"""
2904+
""".strip()
28962905

28972906
# Execute query
28982907
query_params = [*params, n_results]
@@ -2960,6 +2969,7 @@ def _collection_get( # noqa: C901
29602969
limit: int | None = None,
29612970
offset: int | None = None,
29622971
include: list[str] | None = None,
2972+
query_hint: QueryHint | None = None,
29632973
**kwargs,
29642974
) -> dict[str, Any]:
29652975
"""
@@ -2974,6 +2984,7 @@ def _collection_get( # noqa: C901
29742984
limit: Maximum number of results (optional)
29752985
offset: Number of results to skip (optional)
29762986
include: Fields to include in results (optional)
2987+
query_hint: Query optimization hints for database execution (optional)
29772988
**kwargs: Additional parameters
29782989
29792990
Returns:
@@ -3015,9 +3026,12 @@ def _collection_get( # noqa: C901
30153026
# Build WHERE clause from filters
30163027
where_clause, params = self._build_where_clause(where, where_document, id_list)
30173028

3029+
# Build query hint
3030+
hint_sql = _query_hint_to_sql(query_hint, table_name=table_name)
3031+
30183032
# Build SQL query
30193033
sql = f"""
3020-
SELECT {select_clause}
3034+
SELECT {hint_sql} {select_clause}
30213035
FROM `{table_name}`
30223036
{where_clause}
30233037
LIMIT %s OFFSET %s
@@ -3073,6 +3087,7 @@ def _collection_hybrid_search(
30733087
rank: dict[str, Any] | None = None,
30743088
n_results: int = 10,
30753089
include: list[str] | None = None,
3090+
query_hint: QueryHint | None = None,
30763091
dimension: int | None = None,
30773092
**kwargs,
30783093
) -> dict[str, Any]:
@@ -3171,6 +3186,12 @@ def _collection_hybrid_search(
31713186
# Remove any surrounding quotes if present
31723187
query_sql = query_sql.strip().strip("'\"")
31733188

3189+
# Add query hint to the generated SQL
3190+
hint_sql = _query_hint_to_sql(query_hint, table_name=table_name)
3191+
if hint_sql and query_sql.upper().startswith("SELECT"):
3192+
# Insert hint after SELECT keyword
3193+
query_sql = f"SELECT {hint_sql} {query_sql[len('SELECT') :]}"
3194+
31743195
logger.debug(f"Executing query SQL: {query_sql}")
31753196

31763197
# Execute the returned SQL query

src/pyseekdb/client/collection.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
if TYPE_CHECKING:
1414
from .embedding_function import Documents as EmbeddingDocuments
1515
from .embedding_function import EmbeddingFunction
16+
from .query_types import QueryHint
1617
from .schema import SparseVectorIndexConfig
1718
from .sparse_embedding_function import SparseEmbeddingFunction
1819

@@ -346,6 +347,7 @@ def query(
346347
where_document: dict[str, Any] | None = None,
347348
include: list[str] | None = None,
348349
query_key: Any | None = None,
350+
query_hint: "QueryHint | None" = None,
349351
**kwargs,
350352
) -> dict[str, Any]:
351353
"""
@@ -370,6 +372,7 @@ def query(
370372
query_key: Specify which index to query. Default is None (dense vector).
371373
Use ``K.SPARSE_EMBEDDING`` (or ``"#sparse_embedding"``)
372374
to query using sparse vector index.
375+
query_hint: Query optimization hints for database execution (optional)
373376
**kwargs: Additional parameters
374377
375378
Returns:
@@ -399,6 +402,15 @@ def query(
399402
query_key=K.SPARSE_EMBEDDING,
400403
n_results=5
401404
)
405+
406+
# Query with query hint
407+
from pyseekdb.client.query_types import QueryHint
408+
results = collection.query(
409+
query_texts=["machine learning"],
410+
n_results=5,
411+
where={"score": {"$gte": 90}},
412+
query_hint=QueryHint(parallel=8, query_timeout=10.0)
413+
)
402414
"""
403415
return self._client._collection_query(
404416
collection_id=self._id,
@@ -409,6 +421,7 @@ def query(
409421
where=where,
410422
where_document=where_document,
411423
include=include,
424+
query_hint=query_hint,
412425
embedding_function=self._embedding_function,
413426
distance=self._distance,
414427
query_key=query_key,
@@ -424,6 +437,7 @@ def get(
424437
limit: int | None = None,
425438
offset: int | None = None,
426439
include: list[str] | None = None,
440+
query_hint: "QueryHint | None" = None,
427441
**kwargs,
428442
) -> dict[str, Any]:
429443
"""
@@ -436,6 +450,7 @@ def get(
436450
limit: Maximum number of results to return (optional)
437451
offset: Number of results to skip (optional)
438452
include: Fields to include in results, e.g., ["metadatas", "documents", "embeddings"] (optional)
453+
query_hint: Query optimization hints for database execution (optional)
439454
**kwargs: Additional parameters
440455
441456
Returns:
@@ -469,6 +484,14 @@ def get(
469484
470485
# Get all data
471486
results = collection.get(limit=100)
487+
488+
# Get with query hint
489+
from pyseekdb.client.query_types import QueryHint
490+
results = collection.get(
491+
where={"category": "AI"},
492+
limit=10,
493+
query_hint=QueryHint(parallel=4, query_timeout=5.0)
494+
)
472495
"""
473496
return self._client._collection_get(
474497
collection_id=self._id,
@@ -479,6 +502,7 @@ def get(
479502
limit=limit,
480503
offset=offset,
481504
include=include,
505+
query_hint=query_hint,
482506
**kwargs,
483507
)
484508

@@ -489,6 +513,7 @@ def hybrid_search(
489513
rank: dict[str, Any] | None = None,
490514
n_results: int = 10,
491515
include: list[str] | None = None,
516+
query_hint: "QueryHint | None" = None,
492517
**kwargs,
493518
) -> dict[str, Any]:
494519
"""
@@ -507,6 +532,7 @@ def hybrid_search(
507532
rank: Ranking configuration dict (e.g., {"rrf": {"rank_window_size": 60, "rank_constant": 60}})
508533
n_results: Final number of results to return after ranking (default: 10)
509534
include: Fields to include in results (e.g., ["documents", "metadatas", "embeddings"])
535+
query_hint: Query optimization hints for database execution (optional)
510536
**kwargs: Additional parameters
511537
512538
Returns:
@@ -537,6 +563,22 @@ def hybrid_search(
537563
# results["ids"][0] contains IDs for the hybrid search
538564
# results["documents"][0] contains documents for the hybrid search
539565
# results["distances"][0] contains distances for the hybrid search
566+
567+
# Hybrid search with query hint
568+
from pyseekdb.client.query_types import QueryHint
569+
results = collection.hybrid_search(
570+
query={
571+
"where_document": {"$contains": "AI"},
572+
"n_results": 8
573+
},
574+
knn={
575+
"query_texts": ["artificial intelligence"],
576+
"n_results": 8
577+
},
578+
rank={"rrf": {"rank_window_size": 60}},
579+
n_results=10,
580+
query_hint=QueryHint(parallel=6, query_timeout=15.0)
581+
)
540582
"""
541583
# When no query/knn provided, return only ids/distances by default
542584
if include is None and not query and not knn:
@@ -550,6 +592,7 @@ def hybrid_search(
550592
rank=rank,
551593
n_results=n_results,
552594
include=include,
595+
query_hint=query_hint,
553596
embedding_function=self._embedding_function,
554597
dimension=self._dimension,
555598
**kwargs,

src/pyseekdb/client/query_types.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
"""
2+
Query-related type definitions and data classes for pyseekdb client.
3+
4+
This module contains types and classes used for query operations,
5+
such as query hints for database optimization.
6+
"""
7+
8+
from dataclasses import dataclass
9+
10+
11+
@dataclass
12+
class QueryHint:
13+
"""
14+
Query hint for database optimization
15+
16+
Args:
17+
parallel: Number of parallel execution threads (optional)
18+
query_timeout: Query timeout in seconds (optional, converted to microseconds for OceanBase)
19+
vector_index: Whether to add vector index hint (optional).
20+
"""
21+
22+
parallel: int | None = None
23+
query_timeout: float | None = None
24+
vector_index: bool | None = None
25+
26+
def __post_init__(self):
27+
if self.parallel is not None and self.parallel <= 0:
28+
raise ValueError(f"parallel must be positive, got {self.parallel}")
29+
if self.query_timeout is not None and self.query_timeout <= 0:
30+
raise ValueError(f"query_timeout must be positive, got {self.query_timeout}")
31+
if self.vector_index is not None and not isinstance(self.vector_index, bool):
32+
raise ValueError(f"vector_index must be a boolean, got {type(self.vector_index)}")

src/pyseekdb/client/sql_utils.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99

1010
from pymysql.converters import escape_string
1111

12+
from .query_types import QueryHint
13+
1214

1315
def escape_percent_for_sql(value: str) -> str:
1416
"""
@@ -61,3 +63,36 @@ def render_sql_with_params(sql: str, params: Sequence[Any]) -> str:
6163
rendered_parts.append(replacement)
6264
rendered_parts.append(part)
6365
return "".join(rendered_parts)
66+
67+
68+
def _query_hint_to_sql(query_hint: QueryHint | None, table_name: str | None = None) -> str:
69+
"""
70+
Convert QueryHint to SQL HINT string for OceanBase/seekdb.
71+
72+
Args:
73+
query_hint: QueryHint object containing hint parameters
74+
75+
Returns:
76+
SQL HINT string in format "/*+ hint1(value1) hint2(value2) */" or empty string if no hints
77+
"""
78+
if query_hint is None:
79+
return ""
80+
81+
hints = []
82+
83+
if query_hint.parallel is not None:
84+
hints.append(f"parallel({query_hint.parallel})")
85+
86+
if query_hint.query_timeout is not None:
87+
# Convert seconds to microseconds for OceanBase
88+
timeout_microseconds = int(query_hint.query_timeout * 1_000_000)
89+
hints.append(f"query_timeout({timeout_microseconds})")
90+
91+
if query_hint.vector_index is True and table_name:
92+
hints.append(f"INDEX({table_name}, idx_vec)")
93+
94+
if not hints:
95+
return ""
96+
97+
hint_str = " ".join(hints)
98+
return f"/*+ {hint_str} */"

0 commit comments

Comments
 (0)