Skip to content

Commit 18ad59e

Browse files
committed
not, in, etc. changes
1 parent 49b2785 commit 18ad59e

9 files changed

Lines changed: 456 additions & 16 deletions

File tree

AGENTS.md

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,10 @@ Redis search results come out as a `QueryResult(rows, count)`.
2828
translator.
2929
- Writes. There is no `INSERT`, `UPDATE`, or `DELETE`. Write through `redis-py`.
3030
- Index creation. Use `FT.CREATE` directly via `redis-py` first.
31-
- Cross-index joins, subqueries, `HAVING`, `DISTINCT`. Not implemented.
31+
- Cross-index joins, subqueries, `HAVING`. Not implemented.
32+
- `SELECT DISTINCT` is supported for bare column projections (routed to
33+
`FT.AGGREGATE` with `GROUPBY`). `SELECT DISTINCT *` and `DISTINCT` mixed
34+
with aggregates raise `ValueError`.
3235

3336
## The minimum useful snippet
3437

@@ -80,8 +83,10 @@ Full reference, generated from docstrings, is at `docs/api/`.
8083
6. **Lazy schema loading is the default.** The first query that touches an
8184
index issues one `FT.INFO`. Pass `schema_cache_strategy="load_all"` to
8285
`create_executor` if you want to fail fast on missing indexes at startup.
83-
7. **No JOIN, subquery, HAVING, or DISTINCT.** The translator raises
84-
`ValueError`; do not retry with rephrasing.
86+
7. **No JOIN, subquery, or HAVING.** The translator raises `ValueError`;
87+
do not retry with rephrasing. `SELECT DISTINCT col1, col2, ...` is
88+
supported for column projections; `DISTINCT *` and `DISTINCT` mixed with
89+
aggregates still raise.
8590
8. **GEO uses `POINT(lon, lat)` order.** Longitude first, matching Redis.
8691

8792
## Error model an agent can expect

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,13 +72,14 @@ Pass `decode_responses=True` to the `Redis` client if you want string keys inste
7272
- [x] Date functions: `YEAR()`, `MONTH()`, `DAY()`, `DATE_FORMAT()`, etc.
7373
- [x] `IS NULL` / `IS NOT NULL` via `ismissing()` (requires Redis 7.4+)
7474
- [x] `exists()` function for field presence checks
75+
- [x] `SELECT DISTINCT col1, col2, ...` on bare column projections (routed to `FT.AGGREGATE` with `GROUPBY`)
7576

7677
## What's not implemented (yet)
7778

7879
- [ ] JOINs (Redis doesn't support cross-index joins)
7980
- [ ] Subqueries
8081
- [ ] HAVING clause
81-
- [ ] DISTINCT
82+
- [ ] `SELECT DISTINCT *` and `DISTINCT` mixed with aggregate functions (both raise `ValueError`)
8283
- [ ] Index creation from SQL (`CREATE INDEX`)
8384

8485
The translator raises `ValueError` for unsupported clauses; do not retry with rephrasing.

docs/api/sql-syntax.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ The complete catalog of SQL clauses, operators, and functions sql-redis recognis
2424
| Date functions (`YEAR`, `MONTH`, `DAY`, `DATE_FORMAT`, ...) | yes |
2525
| `IS NULL` / `IS NOT NULL` (Redis 7.4+) | yes |
2626
| `exists()` for field presence | yes |
27+
| `SELECT DISTINCT col1, col2, ...` | yes (column projections only; routed to `FT.AGGREGATE` with `GROUPBY`) |
2728

2829
## Not supported
2930

@@ -32,7 +33,8 @@ The complete catalog of SQL clauses, operators, and functions sql-redis recognis
3233
| `JOIN` | Redis has no cross-index join. |
3334
| Subqueries | Out of scope for the POC. |
3435
| `HAVING` | Out of scope (use `WHERE` plus `GROUP BY` where possible). |
35-
| `DISTINCT` | Out of scope. |
36+
| `SELECT DISTINCT *` | Cannot group by an unspecified set of columns; list them explicitly. |
37+
| `DISTINCT` with aggregate functions | Use `GROUP BY` explicitly, or `COUNT(DISTINCT col)` for cardinality. |
3638
| `CREATE INDEX` | sql-redis does not create schemas. Use `FT.CREATE`. |
3739

3840
## TEXT search

sql_redis/analyzer.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,15 +125,34 @@ def analyze(self, parsed: ParsedQuery) -> AnalyzedQuery:
125125
for date_func in parsed.date_functions:
126126
referenced_fields.add(date_func.field)
127127

128-
# Collect aliases from date functions and computed fields (for GROUP BY)
128+
# Collect aliases from date functions, computed fields, the score()
129+
# alias, and aggregation aliases. These are computed in the query
130+
# pipeline rather than loaded from documents, so GROUP BY / ORDER BY
131+
# references to them must not be looked up in the schema.
129132
alias_names = {df.alias for df in parsed.date_functions}
130133
alias_names.update(cf.alias for cf in parsed.computed_fields)
134+
if parsed.scoring is not None:
135+
alias_names.add(parsed.scoring.alias)
136+
for agg in parsed.aggregations:
137+
if agg.alias:
138+
alias_names.add(agg.alias)
139+
if parsed.vector_search is not None and parsed.vector_search.alias:
140+
# ORDER BY <vector-distance-alias> is the canonical way to sort by
141+
# KNN similarity; the alias is a computed column, not an indexed
142+
# field, so it must not be looked up in the schema.
143+
alias_names.add(parsed.vector_search.alias)
131144

132145
# Fields from GROUP BY (exclude aliases since they're computed)
133146
for field_name in parsed.groupby_fields:
134147
if field_name not in alias_names:
135148
referenced_fields.add(field_name)
136149

150+
# Fields from ORDER BY (so they are validated against the schema
151+
# and available for LOAD in the FT.AGGREGATE path)
152+
for field_name, _ in parsed.orderby_fields:
153+
if field_name not in alias_names:
154+
referenced_fields.add(field_name)
155+
137156
# Resolve field types
138157
for field_name in referenced_fields:
139158
if field_name not in schema:

sql_redis/parser.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,7 @@ class ParsedQuery:
264264
offset: int | None = None
265265
filters: list[str] = dataclasses.field(default_factory=list)
266266
scoring: ScoringSpec | None = None # Relevance scoring config
267+
distinct: bool = False # SELECT DISTINCT was specified
267268

268269

269270
class SQLParser:
@@ -291,8 +292,30 @@ def parse(self, sql: str) -> ParsedQuery:
291292
# Extract SELECT fields and aggregations
292293
select = ast.find(exp.Select)
293294
if select:
295+
if select.args.get("distinct") is not None:
296+
result.distinct = True
294297
for expression in select.expressions:
295298
self._process_select_expression(expression, result)
299+
if result.distinct:
300+
# Validate DISTINCT shape here; the groupby_fields promotion
301+
# happens after the GROUP BY clause is parsed (below) so an
302+
# explicit GROUP BY does not duplicate the projected columns.
303+
if "*" in result.fields:
304+
raise ValueError(
305+
"SELECT DISTINCT * is not supported; "
306+
"list the columns to deduplicate by explicitly."
307+
)
308+
if result.aggregations:
309+
# AGG(DISTINCT ...) is handled per-aggregate; mixing
310+
# top-level DISTINCT with aggregations has no clean Redis
311+
# mapping. Reject so users do not silently get one or the
312+
# other applied.
313+
raise ValueError(
314+
"SELECT DISTINCT combined with aggregate functions "
315+
"is not supported; use GROUP BY explicitly."
316+
)
317+
if not result.fields:
318+
raise ValueError("SELECT DISTINCT requires at least one column.")
296319

297320
# Extract WHERE clause conditions
298321
where = ast.find(exp.Where)
@@ -311,6 +334,12 @@ def parse(self, sql: str) -> ParsedQuery:
311334
if isinstance(expr, exp.Column):
312335
result.groupby_fields.append(expr.name)
313336

337+
# SELECT DISTINCT: promote the projected columns to GROUP BY so the
338+
# query routes to FT.AGGREGATE and emits GROUPBY @col1 @col2 ...
339+
# An explicit GROUP BY takes precedence so we do not duplicate keys.
340+
if result.distinct and not result.groupby_fields:
341+
result.groupby_fields = list(result.fields)
342+
314343
# Extract HAVING clause — exists() in HAVING → FILTER
315344
having = ast.find(exp.Having)
316345
if having:
@@ -1281,6 +1310,16 @@ def _extract_literal_value(self, expression, convert_dates: bool = False):
12811310
inner_value = self._extract_literal_value(expression.this)
12821311
if inner_value is not None:
12831312
return -inner_value
1313+
elif isinstance(expression, exp.Column):
1314+
# SQL with ANSI quoting parses "active" as an identifier
1315+
# (exp.Column(Identifier(quoted=True))), not a string literal.
1316+
# Users who write `status = "active"` clearly intend a value
1317+
# comparison; silently turning it into None produced
1318+
# `@status:{None}` and crashed on NUMERIC fields. Treat a quoted
1319+
# identifier in value position as its string contents.
1320+
ident = expression.this
1321+
if isinstance(ident, exp.Identifier) and ident.args.get("quoted"):
1322+
return ident.this
12841323
return None
12851324

12861325
def _validate_geo_unit(self, unit_val: object) -> str:

sql_redis/query_builder.py

Lines changed: 27 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,11 @@
4949
class QueryBuilder:
5050
"""Builds RediSearch query syntax from conditions."""
5151

52-
# Characters that need escaping in TAG values
53-
TAG_SPECIAL_CHARS = r".,<>{}[]\"':;!@#$%^&*()-+=~"
52+
# Characters that need escaping in TAG values.
53+
# `|` separates values inside an IN list at the syntax level, so a literal
54+
# pipe inside a value must be escaped; otherwise `status = 'a|b'` parses
55+
# as `status IN ('a', 'b')`.
56+
TAG_SPECIAL_CHARS = r".,<>{}[]\"':;!@#$%^&*()-+=~|"
5457

5558
# Characters that have special meaning in RediSearch free-text queries
5659
# (outside double-quoted phrases). Must be escaped with backslash.
@@ -139,6 +142,13 @@ def build_text_condition(
139142

140143
# Build search_value based on operator — shared by single- and multi-field paths
141144
if operator == "LIKE":
145+
# LIKE '' produces no token and no wildcard, so RediSearch emits a
146+
# bare `@field:` which is a syntax error at runtime. Reject early.
147+
if value == "":
148+
raise ValueError(
149+
"LIKE pattern must not be empty. Use IS NULL / IS NOT NULL "
150+
"to test for field absence, or '%' to match any value."
151+
)
142152
# Escape special chars in the non-wildcard portion, then convert % → *
143153
# Split on %, escape each segment, rejoin with *
144154
parts = value.split("%")
@@ -344,18 +354,21 @@ def build_tag_condition(
344354
field: str,
345355
operator: str,
346356
value: str | list[str],
357+
negated: bool = False,
347358
) -> str:
348359
"""Build query syntax for TAG field conditions.
349360
350361
Args:
351362
field: Field name.
352363
operator: One of =, !=, IN.
353364
value: Tag value or list of values for IN.
365+
negated: If True, prefix with `-` for negation. Covers NOT IN
366+
and NOT field = .... The `!=` operator is also honored.
354367
355368
Returns:
356369
RediSearch query syntax like @field:{value} or @field:{v1|v2}.
357370
"""
358-
prefix = "-" if operator == "!=" else ""
371+
prefix = "-" if negated or operator == "!=" else ""
359372

360373
if isinstance(value, list):
361374
# IN clause - join with |
@@ -371,36 +384,40 @@ def build_numeric_condition(
371384
field: str,
372385
operator: str,
373386
value: int | float | tuple[int | float, int | float],
387+
negated: bool = False,
374388
) -> str:
375389
"""Build query syntax for NUMERIC field conditions.
376390
377391
Args:
378392
field: Field name.
379393
operator: One of =, !=, <, <=, >, >=, BETWEEN.
380394
value: Numeric value or (min, max) tuple for BETWEEN.
395+
negated: If True, prefix the resulting range with - so that
396+
NOT field > x, NOT BETWEEN, etc. are honored. Without this,
397+
NOT on comparison operators was silently dropped.
381398
382399
Returns:
383400
RediSearch query syntax like @field:[min max].
384401
"""
385-
prefix = "-" if operator == "!=" else ""
402+
prefix = "-" if negated or operator == "!=" else ""
386403

387404
if operator == "BETWEEN":
388405
if isinstance(value, tuple):
389406
min_val, max_val = value
390407
return f"{prefix}@{field}:[{min_val} {max_val}]"
391408
raise ValueError("BETWEEN operator requires a tuple (min, max)")
392409
elif operator == "=":
393-
return f"@{field}:[{value} {value}]"
410+
return f"{prefix}@{field}:[{value} {value}]"
394411
elif operator == "!=":
395-
return f"-@{field}:[{value} {value}]"
412+
return f"{prefix}@{field}:[{value} {value}]"
396413
elif operator == ">":
397-
return f"@{field}:[({value} +inf]"
414+
return f"{prefix}@{field}:[({value} +inf]"
398415
elif operator == ">=":
399-
return f"@{field}:[{value} +inf]"
416+
return f"{prefix}@{field}:[{value} +inf]"
400417
elif operator == "<":
401-
return f"@{field}:[-inf ({value}]"
418+
return f"{prefix}@{field}:[-inf ({value}]"
402419
elif operator == "<=":
403-
return f"@{field}:[-inf {value}]"
420+
return f"{prefix}@{field}:[-inf {value}]"
404421
else:
405422
raise ValueError(f"Unknown numeric operator: {operator}")
406423

sql_redis/translator.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,10 @@ def _build_command(self, analyzed: AnalyzedQuery) -> TranslatedQuery:
156156
)
157157

158158
# Determine if we need FT.AGGREGATE
159+
# Multi-key ORDER BY also requires FT.AGGREGATE: FT.SEARCH SORTBY
160+
# accepts a single key, while FT.AGGREGATE SORTBY accepts multiple.
161+
# Routing automatically prevents the silent drop of trailing keys
162+
# that used to happen on the FT.SEARCH path.
159163
use_aggregate = (
160164
len(analyzed.aggregations) > 0
161165
or len(analyzed.groupby_fields) > 0
@@ -165,6 +169,7 @@ def _build_command(self, analyzed: AnalyzedQuery) -> TranslatedQuery:
165169
or len(analyzed.date_functions) > 0
166170
or has_date_func_conditions
167171
or len(parsed.filters) > 0 # exists() in HAVING → FILTER
172+
or len(parsed.orderby_fields) > 1 # multi-key ORDER BY
168173
)
169174

170175
# Build query string from conditions
@@ -310,6 +315,15 @@ def _build_condition(self, condition: Condition, field_type: str | None) -> str:
310315
inorder=condition.inorder,
311316
)
312317
elif field_type == "TAG":
318+
# BETWEEN is meaningless for TAG values; RediSearch tags have no
319+
# ordering, so 'a' <= status <= 'z' has no defined semantics.
320+
# Previously the parser fell through and the builder emitted
321+
# @status:{\('a'\, 'z'\)} (invalid). Surface the limitation.
322+
if operator == "BETWEEN":
323+
raise ValueError(
324+
f"BETWEEN is not supported on TAG fields ('{condition.field}'); "
325+
"TAG values are unordered. Use IN (...) for a set match."
326+
)
313327
# Keep list value for IN clauses, convert scalar to string
314328
value = (
315329
condition.value
@@ -320,8 +334,35 @@ def _build_condition(self, condition: Condition, field_type: str | None) -> str:
320334
condition.field,
321335
operator,
322336
value,
337+
negated=is_negated,
323338
)
324339
elif field_type == "NUMERIC":
340+
# IN (...) on a NUMERIC field was previously handed a list value
341+
# to build_numeric_condition, which then tried float([1,2,3]) and
342+
# crashed. RediSearch has no native IN for NUMERIC; expand to a
343+
# union of equality ranges (negated → AND of NOT-equals).
344+
if operator == "IN":
345+
if not isinstance(condition.value, list) or not condition.value:
346+
raise ValueError(
347+
f"IN on NUMERIC field '{condition.field}' requires a "
348+
"non-empty list of values."
349+
)
350+
parts: list[str] = []
351+
for item in condition.value:
352+
item_num = self._convert_to_numeric(item)
353+
parts.append(
354+
self._query_builder.build_numeric_condition(
355+
condition.field, "=", item_num, negated=is_negated
356+
)
357+
)
358+
if len(parts) == 1:
359+
return parts[0]
360+
# NOT IN (...) → AND of negated equalities (De Morgan)
361+
# IN (...) → OR of equalities
362+
joiner = " " if is_negated else "|"
363+
joined = joiner.join(parts)
364+
return f"({joined})"
365+
325366
# Cast value to expected type for numeric conditions
326367
numeric_value: int | float | tuple[int | float, int | float]
327368
if isinstance(condition.value, tuple):
@@ -345,6 +386,7 @@ def _build_condition(self, condition: Condition, field_type: str | None) -> str:
345386
condition.field,
346387
operator,
347388
numeric_value,
389+
negated=is_negated,
348390
)
349391
else:
350392
# GEO, VECTOR, and unknown field types - default to text search
@@ -425,6 +467,9 @@ def _build_search(
425467
# SORTBY — skip if the ORDER BY field is a score() alias, because
426468
# WITHSCORES already returns results in relevance order and the alias
427469
# is not a sortable indexed field.
470+
# Multi-key ORDER BY is routed to FT.AGGREGATE upstream (see
471+
# use_aggregate in _build_command), so by the time we reach this
472+
# branch parsed.orderby_fields has at most one entry.
428473
score_alias_name = parsed.scoring.alias if parsed.scoring else None
429474
if parsed.orderby_fields:
430475
field_name, direction = parsed.orderby_fields[0]
@@ -522,6 +567,18 @@ def _build_aggregate(
522567
# Load fields referenced in exists() computed fields (SELECT)
523568
for computed in analyzed.computed_fields:
524569
self._extract_exists_fields(computed.expression, load_fields)
570+
# Load ORDER BY fields so multi-key SORTBY works on non-SORTABLE
571+
# columns. (SORTABLE fields are already in scope; loading them
572+
# again is harmless.) Skip computed/derived aliases.
573+
computed_aliases = {cf.alias for cf in analyzed.computed_fields}
574+
computed_aliases.update(df.alias for df in analyzed.date_functions)
575+
computed_aliases.update(gs.alias for gs in parsed.geo_distance_selects)
576+
for field_name, _ in parsed.orderby_fields:
577+
if (
578+
field_name in analyzed.field_types
579+
and field_name not in computed_aliases
580+
):
581+
load_fields.add(field_name)
525582

526583
if load_all:
527584
args.extend(["LOAD", "*"])

0 commit comments

Comments
 (0)