not, in, etc. changes

rbs333 · rbs333 · commit 18ad59eb9d10 · 2026-05-27T07:56:35.000-04:00
diff --git a/AGENTS.md b/AGENTS.md
@@ -28,7 +28,10 @@ Redis search results come out as a `QueryResult(rows, count)`.
   translator.
 - Writes. There is no `INSERT`, `UPDATE`, or `DELETE`. Write through `redis-py`.
 - Index creation. Use `FT.CREATE` directly via `redis-py` first.
-- Cross-index joins, subqueries, `HAVING`, `DISTINCT`. Not implemented.
+- Cross-index joins, subqueries, `HAVING`. Not implemented.
+- `SELECT DISTINCT` is supported for bare column projections (routed to
+  `FT.AGGREGATE` with `GROUPBY`). `SELECT DISTINCT *` and `DISTINCT` mixed
+  with aggregates raise `ValueError`.
 
 ## The minimum useful snippet
 
@@ -80,8 +83,10 @@ Full reference, generated from docstrings, is at `docs/api/`.
 6. **Lazy schema loading is the default.** The first query that touches an
    index issues one `FT.INFO`. Pass `schema_cache_strategy="load_all"` to
    `create_executor` if you want to fail fast on missing indexes at startup.
-7. **No JOIN, subquery, HAVING, or DISTINCT.** The translator raises
-   `ValueError`; do not retry with rephrasing.
+7. **No JOIN, subquery, or HAVING.** The translator raises `ValueError`;
+   do not retry with rephrasing. `SELECT DISTINCT col1, col2, ...` is
+   supported for column projections; `DISTINCT *` and `DISTINCT` mixed with
+   aggregates still raise `ValueError`.
 8. **GEO uses `POINT(lon, lat)` order.** Longitude first, matching Redis.
 
 ## Error model an agent can expect
diff --git a/README.md b/README.md
@@ -72,13 +72,14 @@ Pass `decode_responses=True` to the `Redis` client if you want string keys inste
 - [x] Date functions: `YEAR()`, `MONTH()`, `DAY()`, `DATE_FORMAT()`, etc.
 - [x] `IS NULL` / `IS NOT NULL` via `ismissing()` (requires Redis 7.4+)
 - [x] `exists()` function for field presence checks
+- [x] `SELECT DISTINCT col1, col2, ...` on bare column projections (routed to `FT.AGGREGATE` with `GROUPBY`)
 
 ## What's not implemented (yet)
 
 - [ ] JOINs (Redis doesn't support cross-index joins)
 - [ ] Subqueries
 - [ ] HAVING clause
-- [ ] DISTINCT
+- [ ] `SELECT DISTINCT *` and `DISTINCT` mixed with aggregate functions (both raise `ValueError`)
 - [ ] Index creation from SQL (`CREATE INDEX`)
 
 The translator raises `ValueError` for unsupported clauses; do not retry with rephrasing.
diff --git a/docs/api/sql-syntax.md b/docs/api/sql-syntax.md
@@ -24,6 +24,7 @@ The complete catalog of SQL clauses, operators, and functions sql-redis recognis
 | Date functions (`YEAR`, `MONTH`, `DAY`, `DATE_FORMAT`, ...) | yes |
 | `IS NULL` / `IS NOT NULL` (Redis 7.4+) | yes |
 | `exists()` for field presence | yes |
+| `SELECT DISTINCT col1, col2, ...` | yes (column projections only; routed to `FT.AGGREGATE` with `GROUPBY`) |
 
 ## Not supported
 
@@ -32,7 +33,8 @@ The complete catalog of SQL clauses, operators, and functions sql-redis recognis
 | `JOIN` | Redis has no cross-index join. |
 | Subqueries | Out of scope for the POC. |
 | `HAVING` | Out of scope (use `WHERE` plus `GROUP BY` where possible). |
-| `DISTINCT` | Out of scope. |
+| `SELECT DISTINCT *` | Cannot group by an unspecified set of columns; list them explicitly. |
+| `DISTINCT` with aggregate functions | Use `GROUP BY` explicitly, or `COUNT(DISTINCT col)` for cardinality. |
 | `CREATE INDEX` | sql-redis does not create schemas. Use `FT.CREATE`. |
 
 ## TEXT search
diff --git a/sql_redis/analyzer.py b/sql_redis/analyzer.py
@@ -125,15 +125,34 @@ def analyze(self, parsed: ParsedQuery) -> AnalyzedQuery:
         for date_func in parsed.date_functions:
             referenced_fields.add(date_func.field)
 
-        # Collect aliases from date functions and computed fields (for GROUP BY)
+        # Collect aliases from date functions, computed fields, score(),
+        # aggregations, and the vector-distance column. These are computed in
+        # the query pipeline rather than loaded from documents, so GROUP BY /
+        # ORDER BY references to them must not be looked up in the schema.
         alias_names = {df.alias for df in parsed.date_functions}
         alias_names.update(cf.alias for cf in parsed.computed_fields)
+        if parsed.scoring is not None:
+            alias_names.add(parsed.scoring.alias)
+        for agg in parsed.aggregations:
+            if agg.alias:
+                alias_names.add(agg.alias)
+        if parsed.vector_search is not None and parsed.vector_search.alias:
+            # ORDER BY <vector-distance-alias> is the canonical way to sort by
+            # KNN similarity; the alias is a computed column, not an indexed
+            # field, so it must not be looked up in the schema.
+            alias_names.add(parsed.vector_search.alias)
 
         # Fields from GROUP BY (exclude aliases since they're computed)
         for field_name in parsed.groupby_fields:
             if field_name not in alias_names:
                 referenced_fields.add(field_name)
 
+        # Fields from ORDER BY (so they are validated against the schema
+        # and available for LOAD in the FT.AGGREGATE path)
+        for field_name, _ in parsed.orderby_fields:
+            if field_name not in alias_names:
+                referenced_fields.add(field_name)
+
         # Resolve field types
         for field_name in referenced_fields:
             if field_name not in schema:
diff --git a/sql_redis/parser.py b/sql_redis/parser.py
@@ -264,6 +264,7 @@ class ParsedQuery:
     offset: int | None = None
     filters: list[str] = dataclasses.field(default_factory=list)
     scoring: ScoringSpec | None = None  # Relevance scoring config
+    distinct: bool = False  # SELECT DISTINCT was specified
 
 
 class SQLParser:
@@ -291,8 +292,30 @@ def parse(self, sql: str) -> ParsedQuery:
         # Extract SELECT fields and aggregations
         select = ast.find(exp.Select)
         if select:
+            if select.args.get("distinct") is not None:
+                result.distinct = True
             for expression in select.expressions:
                 self._process_select_expression(expression, result)
+            if result.distinct:
+                # Validate DISTINCT shape here; the groupby_fields promotion
+                # happens after the GROUP BY clause is parsed (below) so an
+                # explicit GROUP BY does not duplicate the projected columns.
+                if "*" in result.fields:
+                    raise ValueError(
+                        "SELECT DISTINCT * is not supported; "
+                        "list the columns to deduplicate by explicitly."
+                    )
+                if result.aggregations:
+                    # AGG(DISTINCT ...) is handled per-aggregate; mixing
+                    # top-level DISTINCT with aggregations has no clean Redis
+                    # mapping. Reject so users do not silently get one or the
+                    # other applied.
+                    raise ValueError(
+                        "SELECT DISTINCT combined with aggregate functions "
+                        "is not supported; use GROUP BY explicitly."
+                    )
+                if not result.fields:
+                    raise ValueError("SELECT DISTINCT requires at least one column.")
 
         # Extract WHERE clause conditions
         where = ast.find(exp.Where)
@@ -311,6 +334,12 @@ def parse(self, sql: str) -> ParsedQuery:
                 if isinstance(expr, exp.Column):
                     result.groupby_fields.append(expr.name)
 
+        # SELECT DISTINCT: promote the projected columns to GROUP BY so the
+        # query routes to FT.AGGREGATE and emits GROUPBY @col1 @col2 ...
+        # An explicit GROUP BY takes precedence so we do not duplicate keys.
+        if result.distinct and not result.groupby_fields:
+            result.groupby_fields = list(result.fields)
+
         # Extract HAVING clause — exists() in HAVING → FILTER
         having = ast.find(exp.Having)
         if having:
@@ -1281,6 +1310,16 @@ def _extract_literal_value(self, expression, convert_dates: bool = False):
             inner_value = self._extract_literal_value(expression.this)
             if inner_value is not None:
                 return -inner_value
+        elif isinstance(expression, exp.Column):
+            # SQL with ANSI quoting parses "active" as an identifier
+            # (exp.Column(Identifier(quoted=True))), not a string literal.
+            # Users who write `status = "active"` clearly intend a value
+            # comparison; silently turning it into None produced
+            # `@status:{None}` and crashed on NUMERIC fields. Treat a quoted
+            # identifier in value position as its string contents.
+            ident = expression.this
+            if isinstance(ident, exp.Identifier) and ident.args.get("quoted"):
+                return ident.this
         return None
 
     def _validate_geo_unit(self, unit_val: object) -> str:
diff --git a/sql_redis/query_builder.py b/sql_redis/query_builder.py
@@ -49,8 +49,11 @@
 class QueryBuilder:
     """Builds RediSearch query syntax from conditions."""
 
-    # Characters that need escaping in TAG values
-    TAG_SPECIAL_CHARS = r".,<>{}[]\"':;!@#$%^&*()-+=~"
+    # Characters that need escaping in TAG values.
+    # `|` separates values inside an IN list at the syntax level, so a literal
+    # pipe inside a value must be escaped; otherwise `status = 'a|b'` parses
+    # as `status IN ('a', 'b')`.
+    TAG_SPECIAL_CHARS = r".,<>{}[]\"':;!@#$%^&*()-+=~|"
 
     # Characters that have special meaning in RediSearch free-text queries
     # (outside double-quoted phrases). Must be escaped with backslash.
@@ -139,6 +142,13 @@ def build_text_condition(
 
         # Build search_value based on operator — shared by single- and multi-field paths
         if operator == "LIKE":
+            # LIKE '' produces no token and no wildcard, so the builder would
+            # emit a bare `@field:`, which RediSearch rejects. Reject early.
+            if value == "":
+                raise ValueError(
+                    "LIKE pattern must not be empty. Use IS NULL / IS NOT NULL "
+                    "to test for field absence, or '%' to match any value."
+                )
             # Escape special chars in the non-wildcard portion, then convert % → *
             # Split on %, escape each segment, rejoin with *
             parts = value.split("%")
@@ -344,18 +354,21 @@ def build_tag_condition(
         field: str,
         operator: str,
         value: str | list[str],
+        negated: bool = False,
     ) -> str:
         """Build query syntax for TAG field conditions.
 
         Args:
             field: Field name.
             operator: One of =, !=, IN.
             value: Tag value or list of values for IN.
+            negated: If True, prefix with `-` (covers NOT IN and NOT field = ...).
+                `!=` adds the same prefix; negated with `!=` emits one `-`.
 
         Returns:
             RediSearch query syntax like @field:{value} or @field:{v1|v2}.
         """
-        prefix = "-" if operator == "!=" else ""
+        prefix = "-" if negated or operator == "!=" else ""
 
         if isinstance(value, list):
             # IN clause - join with |
@@ -371,36 +384,40 @@ def build_numeric_condition(
         field: str,
         operator: str,
         value: int | float | tuple[int | float, int | float],
+        negated: bool = False,
     ) -> str:
         """Build query syntax for NUMERIC field conditions.
 
         Args:
             field: Field name.
             operator: One of =, !=, <, <=, >, >=, BETWEEN.
             value: Numeric value or (min, max) tuple for BETWEEN.
+            negated: If True, prefix the range with `-` so NOT field > x,
+                NOT BETWEEN, etc. are honored (previously NOT was silently
+                dropped). Combined with `!=` it still emits a single `-`.
 
         Returns:
             RediSearch query syntax like @field:[min max].
         """
-        prefix = "-" if operator == "!=" else ""
+        prefix = "-" if negated or operator == "!=" else ""
 
         if operator == "BETWEEN":
             if isinstance(value, tuple):
                 min_val, max_val = value
                 return f"{prefix}@{field}:[{min_val} {max_val}]"
             raise ValueError("BETWEEN operator requires a tuple (min, max)")
         elif operator == "=":
-            return f"@{field}:[{value} {value}]"
+            return f"{prefix}@{field}:[{value} {value}]"
         elif operator == "!=":
-            return f"-@{field}:[{value} {value}]"
+            return f"{prefix}@{field}:[{value} {value}]"
         elif operator == ">":
-            return f"@{field}:[({value} +inf]"
+            return f"{prefix}@{field}:[({value} +inf]"
         elif operator == ">=":
-            return f"@{field}:[{value} +inf]"
+            return f"{prefix}@{field}:[{value} +inf]"
         elif operator == "<":
-            return f"@{field}:[-inf ({value}]"
+            return f"{prefix}@{field}:[-inf ({value}]"
         elif operator == "<=":
-            return f"@{field}:[-inf {value}]"
+            return f"{prefix}@{field}:[-inf {value}]"
         else:
             raise ValueError(f"Unknown numeric operator: {operator}")
 
diff --git a/sql_redis/translator.py b/sql_redis/translator.py
@@ -156,6 +156,10 @@ def _build_command(self, analyzed: AnalyzedQuery) -> TranslatedQuery:
             )
 
         # Determine if we need FT.AGGREGATE
+        # Multi-key ORDER BY also requires FT.AGGREGATE: FT.SEARCH SORTBY
+        # accepts a single key, while FT.AGGREGATE SORTBY accepts multiple.
+        # Routing automatically prevents the silent drop of trailing keys
+        # that used to happen on the FT.SEARCH path.
         use_aggregate = (
             len(analyzed.aggregations) > 0
             or len(analyzed.groupby_fields) > 0
@@ -165,6 +169,7 @@ def _build_command(self, analyzed: AnalyzedQuery) -> TranslatedQuery:
             or len(analyzed.date_functions) > 0
             or has_date_func_conditions
             or len(parsed.filters) > 0  # exists() in HAVING → FILTER
+            or len(parsed.orderby_fields) > 1  # multi-key ORDER BY
         )
 
         # Build query string from conditions
@@ -310,6 +315,15 @@ def _build_condition(self, condition: Condition, field_type: str | None) -> str:
                 inorder=condition.inorder,
             )
         elif field_type == "TAG":
+            # BETWEEN is meaningless for TAG values; RediSearch tags have no
+            # ordering, so 'a' <= status <= 'z' has no defined semantics.
+            # Previously the parser fell through and the builder emitted
+            # @status:{\('a'\, 'z'\)} (invalid). Surface the limitation.
+            if operator == "BETWEEN":
+                raise ValueError(
+                    f"BETWEEN is not supported on TAG fields ('{condition.field}'); "
+                    "TAG values are unordered. Use IN (...) for a set match."
+                )
             # Keep list value for IN clauses, convert scalar to string
             value = (
                 condition.value
@@ -320,8 +334,35 @@ def _build_condition(self, condition: Condition, field_type: str | None) -> str:
                 condition.field,
                 operator,
                 value,
+                negated=is_negated,
             )
         elif field_type == "NUMERIC":
+            # IN (...) on a NUMERIC field was previously handed a list value
+            # to build_numeric_condition, which then tried float([1,2,3]) and
+            # crashed. RediSearch has no native IN for NUMERIC; expand to a
+            # union of equality ranges (negated → AND of NOT-equals).
+            if operator == "IN":
+                if not isinstance(condition.value, list) or not condition.value:
+                    raise ValueError(
+                        f"IN on NUMERIC field '{condition.field}' requires a "
+                        "non-empty list of values."
+                    )
+                parts: list[str] = []
+                for item in condition.value:
+                    item_num = self._convert_to_numeric(item)
+                    parts.append(
+                        self._query_builder.build_numeric_condition(
+                            condition.field, "=", item_num, negated=is_negated
+                        )
+                    )
+                if len(parts) == 1:
+                    return parts[0]
+                # NOT IN (...) → AND of negated equalities (De Morgan)
+                # IN (...) → OR of equalities
+                joiner = " " if is_negated else "|"
+                joined = joiner.join(parts)
+                return f"({joined})"
+
             # Cast value to expected type for numeric conditions
             numeric_value: int | float | tuple[int | float, int | float]
             if isinstance(condition.value, tuple):
@@ -345,6 +386,7 @@ def _build_condition(self, condition: Condition, field_type: str | None) -> str:
                 condition.field,
                 operator,
                 numeric_value,
+                negated=is_negated,
             )
         else:
             # GEO, VECTOR, and unknown field types - default to text search
@@ -425,6 +467,9 @@ def _build_search(
         # SORTBY — skip if the ORDER BY field is a score() alias, because
         # WITHSCORES already returns results in relevance order and the alias
         # is not a sortable indexed field.
+        # Multi-key ORDER BY is routed to FT.AGGREGATE upstream (see
+        # use_aggregate in _build_command), so by the time we reach this
+        # branch parsed.orderby_fields has at most one entry.
         score_alias_name = parsed.scoring.alias if parsed.scoring else None
         if parsed.orderby_fields:
             field_name, direction = parsed.orderby_fields[0]
@@ -522,6 +567,18 @@ def _build_aggregate(
             # Load fields referenced in exists() computed fields (SELECT)
             for computed in analyzed.computed_fields:
                 self._extract_exists_fields(computed.expression, load_fields)
+            # Load ORDER BY fields so multi-key SORTBY works on non-SORTABLE
+            # columns. (SORTABLE fields are already in scope; loading them
+            # again is harmless.) Skip computed/derived aliases.
+            computed_aliases = {cf.alias for cf in analyzed.computed_fields}
+            computed_aliases.update(df.alias for df in analyzed.date_functions)
+            computed_aliases.update(gs.alias for gs in parsed.geo_distance_selects)
+            for field_name, _ in parsed.orderby_fields:
+                if (
+                    field_name in analyzed.field_types
+                    and field_name not in computed_aliases
+                ):
+                    load_fields.add(field_name)
 
         if load_all:
             args.extend(["LOAD", "*"])
diff --git a/tests/test_query_builder.py b/tests/test_query_builder.py
diff --git a/tests/test_translator.py b/tests/test_translator.py