Skip to content

Commit 218f69f

Browse files
GappleBeejoetsoikaedrohoalextatarinov
authored and committed
Fixed #28041 -- Added Lexeme expression to contrib.postgres.search.
This expression automatically escapes its input and allows fine-grained control over prefix matching and term weighting via logical combinations. Thanks Mariusz Felisiak, Adam Zapletal, Paolo Melchiorre, Jacob Walls, Adam Johnson, and Simon Charette for reviews. Co-authored-by: joetsoi <joetsoi@users.noreply.github.com> Co-authored-by: Karl Hobley <karl@kaed.uk> Co-authored-by: Alexandr Tatarinov <tatarinov1997@gmail.com>
1 parent e08fa42 commit 218f69f

4 files changed

Lines changed: 423 additions & 1 deletion

File tree

django/contrib/postgres/search.py

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from django.db.backends.postgresql.psycopg_any import is_psycopg3
12
from django.db.models import (
23
CharField,
34
Expression,
@@ -10,9 +11,45 @@
1011
)
1112
from django.db.models.expressions import CombinedExpression, register_combinable_fields
1213
from django.db.models.functions import Cast, Coalesce
14+
from django.utils.regex_helper import _lazy_re_compile
1315

1416
from .utils import CheckPostgresInstalledMixin
1517

18+
if is_psycopg3:
    from psycopg.adapt import Dumper

    class UTF8Dumper(Dumper):
        """Dumper whose ``dump()`` encodes its argument as UTF-8 bytes."""

        def dump(self, obj):
            return bytes(obj, "utf-8")

    def quote_lexeme(value):
        """
        Run ``value`` through psql_escape() and return the driver-quoted
        result as a ``str``.
        """
        dumper = UTF8Dumper(str)
        quoted_bytes = dumper.quote(psql_escape(value))
        return quoted_bytes.decode()

else:
    from psycopg2.extensions import adapt

    def quote_lexeme(value):
        """
        Run ``value`` through psql_escape() and return the driver-quoted
        result as a ``str``.
        """
        quoted = adapt(psql_escape(value))
        # The adapter is told to use UTF-8 before it produces the quoted
        # bytes, matching the encoding used by the psycopg 3 branch.
        quoted.encoding = "utf-8"
        return quoted.getquoted().decode()
35+
36+
37+
# Matches one character out of: ' NUL [ ] ( ) | & : * ! @ < > and backslash.
# psql_escape() replaces every match with a single space.
spec_chars_re = _lazy_re_compile(r"['\0\[\]()|&:*!@<>\\]")
38+
# Matches a run of two or more whitespace characters; normalize_spaces()
# replaces each run with a single space.
multiple_spaces_re = _lazy_re_compile(r"\s{2,}")
39+
40+
41+
def normalize_spaces(val):
    """
    Strip ``val`` on both sides and squeeze every run of two or more
    whitespace characters into one space.

    Return ``None`` when nothing is left after stripping.
    """
    trimmed = val.strip()
    if trimmed:
        return multiple_spaces_re.sub(" ", trimmed)
    return None
46+
47+
48+
def psql_escape(query):
    """
    Blank out every character matched by ``spec_chars_re`` (characters not
    fit for use in search queries), then normalize the spacing.

    The result is whatever normalize_spaces() returns, so it is ``None`` when
    only whitespace remains after the substitution.
    """
    return normalize_spaces(spec_chars_re.sub(" ", query))
52+
1653

1754
class SearchVectorExact(Lookup):
1855
lookup_name = "exact"
@@ -205,6 +242,9 @@ def __init__(
205242
invert=False,
206243
search_type="plain",
207244
):
245+
if isinstance(value, LexemeCombinable):
246+
search_type = "raw"
247+
208248
self.function = self.SEARCH_TYPES.get(search_type)
209249
if self.function is None:
210250
raise ValueError("Unknown search_type argument '%s'." % search_type)
@@ -383,3 +423,104 @@ class TrigramWordSimilarity(TrigramWordBase):
383423

384424
class TrigramStrictWordSimilarity(TrigramWordBase):
    # NOTE(review): presumably the SQL function name consumed by
    # TrigramWordBase, which is defined outside this view -- confirm there.
    function = "STRICT_WORD_SIMILARITY"
426+
427+
428+
class LexemeCombinable:
    """
    Mixin providing ``&`` and ``|`` for lexeme expressions.

    Each operator returns a CombinedLexeme built from the two operands and
    the matching connector; operands that are not LexemeCombinable are
    rejected with a TypeError.
    """

    BITAND = "&"
    BITOR = "|"

    def _combine(self, other, connector, reversed):
        """
        Build a CombinedLexeme joining ``self`` and ``other`` with
        ``connector``. When ``reversed`` is true, ``other`` becomes the
        left-hand side.
        """
        if isinstance(other, LexemeCombinable):
            lhs, rhs = (other, self) if reversed else (self, other)
            return CombinedLexeme(lhs, connector, rhs)
        raise TypeError(
            "A Lexeme can only be combined with another Lexeme, "
            f"got {other.__class__.__name__}."
        )

    # On Combinable, these are not implemented to reduce confusion with Q. In
    # this case we are actually (ab)using them to do logical combination so
    # it's consistent with other usage in Django.
    def __or__(self, other):
        return self._combine(other, self.BITOR, reversed=False)

    def __ror__(self, other):
        return self._combine(other, self.BITOR, reversed=True)

    def __and__(self, other):
        return self._combine(other, self.BITAND, reversed=False)

    def __rand__(self, other):
        return self._combine(other, self.BITAND, reversed=True)
456+
457+
458+
class Lexeme(LexemeCombinable, Value):
    """
    A single search term whose text is escaped before being placed in a
    query.

    Arguments:
        value: the term text. Must be a non-empty ``str`` that still contains
            something after psql_escape() has blanked out special characters.
        output_field: forwarded unchanged to ``Value``.
        invert: when true, the rendered term is prefixed with ``!``.
        prefix: when true, ``*`` is added to the term's label.
        weight: optional ``str``, one of A, B, C, or D (checked
            case-insensitively), appended to the term's label as given.

    Raises:
        ValueError: if ``value`` is empty, if nothing is left of ``value``
            after escaping, or if ``weight`` is not an accepted letter.
        TypeError: if ``value`` is not a ``str``.
    """

    _output_field = SearchQueryField()

    def __init__(
        self, value, output_field=None, *, invert=False, prefix=False, weight=None
    ):
        if value == "":
            raise ValueError("Lexeme value cannot be empty.")

        if not isinstance(value, str):
            raise TypeError(
                f"Lexeme value must be a string, got {value.__class__.__name__}."
            )

        # psql_escape() returns None when the value holds nothing but
        # whitespace and escaped special characters (e.g. "!!!"). Passing
        # None on to quote_lexeme() would fail with an unrelated error, or be
        # quoted as something other than the user's text, when the query is
        # compiled, so reject such values up front.
        if psql_escape(value) is None:
            raise ValueError(
                "Lexeme value must contain at least one character other than "
                f"whitespace and special search characters, got {value!r}."
            )

        if weight is not None and (
            not isinstance(weight, str) or weight.lower() not in {"a", "b", "c", "d"}
        ):
            raise ValueError(
                f"Weight must be one of 'A', 'B', 'C', and 'D', got {weight!r}."
            )

        self.prefix = prefix
        self.invert = invert
        self.weight = weight
        super().__init__(value, output_field=output_field)

    def as_sql(self, compiler, connection):
        """
        Return a ``"%s"`` placeholder and a one-item params tuple holding
        the rendered term: the quoted value, then ``:<label>`` if a prefix
        flag and/or weight is set, all preceded by ``!`` when inverted.
        """
        param = quote_lexeme(self.value)
        label = ""
        if self.prefix:
            label += "*"
        if self.weight:
            label += self.weight

        if label:
            param = f"{param}:{label}"
        if self.invert:
            param = f"!{param}"

        return "%s", (param,)

    def __invert__(self):
        """Return a copy of this lexeme with its ``invert`` flag toggled."""
        cloned = self.copy()
        cloned.invert = not self.invert
        return cloned
503+
504+
505+
class CombinedLexeme(LexemeCombinable, CombinedExpression):
    """
    Two lexeme expressions joined by a ``&`` or ``|`` connector.
    """

    _output_field = SearchQueryField()

    def as_sql(self, compiler, connection):
        """
        Compile both sides, interpolate their params into a parenthesized
        ``(lhs connector rhs)`` string, and return that string as the single
        param of a ``"%s"`` placeholder.
        """
        lhs_sql, lhs_params = compiler.compile(self.lhs)
        rhs_sql, rhs_params = compiler.compile(self.rhs)

        template = f"({lhs_sql} {self.connector} {rhs_sql})"
        rendered = template % (*lhs_params, *rhs_params)
        return "%s", (rendered,)

    def __invert__(self):
        """Return a negated copy of this combination."""
        # Apply De Morgan's theorem: swap the connector and negate both
        # operands.
        if self.connector == self.BITOR:
            flipped = self.BITAND
        else:
            flipped = self.BITOR
        negated = self.copy()
        negated.connector = flipped
        negated.lhs = ~self.lhs
        negated.rhs = ~self.rhs
        return negated

docs/ref/contrib/postgres/search.txt

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ Examples:
9696

9797
.. code-block:: pycon
9898

99-
>>> from django.contrib.postgres.search import SearchQuery
99+
>>> from django.contrib.postgres.search import SearchQuery, Lexeme
100100
>>> SearchQuery("red tomato") # two keywords
101101
>>> SearchQuery("tomato red") # same results as above
102102
>>> SearchQuery("red tomato", search_type="phrase") # a phrase
@@ -105,6 +105,7 @@ Examples:
105105
>>> SearchQuery(
106106
... "'tomato' ('red' OR 'green')", search_type="websearch"
107107
... ) # websearch operators
108+
>>> SearchQuery(Lexeme("tomato") & (Lexeme("red") | Lexeme("green"))) # Lexeme objects
108109

109110
``SearchQuery`` terms can be combined logically to provide more flexibility:
110111

@@ -118,6 +119,10 @@ Examples:
118119
See :ref:`postgresql-fts-search-configuration` for an explanation of the
119120
``config`` parameter.
120121

122+
.. versionchanged:: 6.0
123+
124+
:class:`Lexeme` objects were added.
125+
121126
``SearchRank``
122127
==============
123128

@@ -276,6 +281,53 @@ floats to :class:`SearchRank` as ``weights`` in the same order above:
276281
>>> rank = SearchRank(vector, query, weights=[0.2, 0.4, 0.6, 0.8])
277282
>>> Entry.objects.annotate(rank=rank).filter(rank__gte=0.3).order_by("-rank")
278283

284+
``Lexeme``
285+
==========
286+
287+
.. versionadded:: 6.0
288+
289+
.. class:: Lexeme(value, output_field=None, *, invert=False, prefix=False, weight=None)
290+
291+
``Lexeme`` objects allow search operators to be safely used with strings from
292+
an untrusted source. The content of each lexeme is escaped so that any
293+
operators that may exist in the string itself will not be interpreted.
294+
295+
You can combine lexemes with other lexemes using the ``&`` and ``|`` operators
296+
and also negate them with the ``~`` operator. For example:
297+
298+
.. code-block:: pycon
299+
300+
>>> from django.contrib.postgres.search import SearchQuery, SearchVector, Lexeme
301+
>>> vector = SearchVector("body_text", "blog__tagline")
302+
>>> Entry.objects.annotate(search=vector).filter(
303+
... search=SearchQuery(Lexeme("fruit") & Lexeme("dessert"))
304+
... )
305+
<QuerySet [<Entry: Apple Crumble Recipes>, <Entry: Banana Split Recipes>]>
306+
307+
.. code-block:: pycon
308+
309+
>>> Entry.objects.annotate(search=vector).filter(
310+
... search=SearchQuery(Lexeme("fruit") & Lexeme("dessert") & ~Lexeme("banana"))
311+
... )
312+
<QuerySet [<Entry: Apple Crumble Recipes>]>
313+
314+
``Lexeme`` objects also support term weighting and prefix matching:
315+
316+
.. code-block:: pycon
317+
318+
>>> Entry.objects.annotate(search=vector).filter(
319+
... search=SearchQuery(Lexeme("Pizza") | Lexeme("Cheese"))
320+
... )
321+
<QuerySet [<Entry: Cheese on Toast recipes>, <Entry: Pizza recipes>]>
322+
>>> Entry.objects.annotate(search=vector).filter(
323+
... search=SearchQuery(Lexeme("Pizza") | Lexeme("Cheese", weight="A"))
324+
... )
325+
<QuerySet [<Entry: Pizza recipes>]>
326+
>>> Entry.objects.annotate(search=vector).filter(
327+
... search=SearchQuery(Lexeme("za", prefix=True))
328+
... )
329+
<QuerySet []>
330+
279331
Performance
280332
===========
281333

docs/releases/6.0.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,12 @@ Minor features
171171
:mod:`django.contrib.postgres`
172172
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
173173

174+
* The new :class:`Lexeme <django.contrib.postgres.search.Lexeme>` expression
175+
for full text search provides fine-grained control over search terms.
176+
``Lexeme`` objects automatically escape their input and support logical
177+
combination operators (``&``, ``|``, ``~``), prefix matching, and term
178+
weighting.
179+
174180
* Model fields, indexes, and constraints from :mod:`django.contrib.postgres`
175181
now include system checks to verify that ``django.contrib.postgres`` is an
176182
installed app.

0 commit comments

Comments
 (0)