1414-- two RANKINGS (not their raw scores — which live on incomparable scales)
1515-- so a job ranked highly by EITHER signal surfaces.
1616--
17- -- PREREQUISITES (operator must do these first — see DEVLOG Day 69
17+ -- PREREQUISITES (operator must do these first — see DEVLOG Day 70
1818-- "OPERATOR ACTION REQUIRED"):
1919-- 1. `docs/sql/supabase-cached-jobs-pgvector.sql` applied — the
2020-- `vector` extension, `cached_jobs.embedding vector(1536)` column,
2525-- This file (the RPC) is applied AFTER those two steps, then the operator
2626-- flips JOB_SEARCH_HYBRID_ENABLED=true.
2727--
28+ -- ARCHITECTURE — HNSW CANDIDATE POOLS (the v2 rewrite, DEVLOG Day 74):
29+ -- Each retriever is its OWN top-N query reading `cached_jobs` directly,
30+ -- so the index can drive candidate selection:
31+ -- - lexical — GIN on search_tsv; `ORDER BY ts_rank DESC LIMIT 200`.
32+ -- - semantic — HNSW vector_cosine_ops on embedding; `ORDER BY <=>
33+ -- LIMIT 200`.
34+ -- The first cut ranked BOTH lists with window functions over one shared
35+ -- `candidates` CTE of every filtered row; the semantic `row_number()`
36+ -- then had to sort all ~14k embeddings with no usable index and hit the
37+ -- statement timeout against the real corpus. Per-retriever top-N queries
38+ -- keep the HNSW/GIN index in the plan — verified via EXPLAIN ANALYZE.
39+ --
40+ -- NOTE ON hnsw.ef_search: left at the pgvector default (40), so the HNSW
41+ -- scan yields at most ~40 semantic candidates regardless of the `LIMIT 200` above
42+ -- (the WHERE filters are applied to the index scan's output, so selective filters leave fewer). That is normally ample fused against 200 lexical candidates for a 20-50 row page.
43+ -- It is deliberately NOT widened in-function: a function-level `SET`
44+ -- clause is rejected (`42501 permission denied to set parameter` — the
45+ -- migration role lacks the privilege) and a body-level `SET LOCAL` is
46+ -- rejected too (`0A000 SET is not allowed in a non-volatile function` —
47+ -- this RPC is STABLE). An operator with privilege can widen recall
48+ -- globally via `ALTER DATABASE postgres SET hnsw.ef_search = <n>` if the
49+ -- semantic pool ever needs to be deeper.
50+ --
2851-- PARAMS: identical to `search_cached_jobs_ranked` (p_query, p_location,
2952-- p_sources, p_remote_only, p_posted_within_days, p_limit, p_work_modes,
3053-- p_employment_types, p_sort_by, p_offset) PLUS:
3659-- syntax string already built by the backend's synonym expander
3760-- (src/job_search_synonyms.py), NOT raw user text. Parsed with
3861-- `to_tsquery` (not websearch_to_tsquery) for the same reason. Empty
39- -- p_query = no lexical filter .
62+ -- p_query = no lexical retriever .
4063--
4164-- RRF FUSION: for each job, with k = 60 (the standard RRF constant):
4265-- rrf = 1.0/(k + lexical_rank) + 1.0/(k + semantic_rank)
4568-- have an embedding.
4669-- - A job present in only ONE ranked list contributes ONLY that list's
4770-- term (the other term is 0) — implemented via a FULL OUTER JOIN and
48- -- treating a missing rank's term as 0 .
71+ -- COALESCE(..., 0.0) on the missing term .
4972--
5073-- DEGENERATE CASES (all must still return sensible rows):
51- -- - empty p_query -> lexical list is "all filtered rows"
52- -- unranked-by-text; effectively the
53- -- semantic ranking (or recency)
54- -- drives ordering. No to_tsquery call.
55- -- - NULL p_query_embedding -> semantic list empty; pure lexical.
56- -- - both empty/NULL -> filtered rows ordered by recency
57- -- (same as browse mode).
74+ -- - query present, NULL embedding -> semantic pool empty; pure lexical.
75+ -- - empty query, embedding present -> lexical pool empty; pure semantic.
76+ -- - both empty/NULL -> browse mode: an early-return branch lists the
77+ -- filtered rows ordered by recency (no retriever / fusion needed).
5878--
5979-- SORTING: same p_sort_by modes as the Tier 1 RPC. 'relevance' (default)
6080-- orders by the fused RRF score DESC; 'newest'/'oldest'/'company_az'
@@ -100,7 +120,7 @@ DECLARE
100120 -- RRF damping constant. 60 is the value from the original RRF paper
101121 -- and the de-facto production default; larger k flattens the
102122 -- contribution curve, smaller k sharpens it toward rank-1.
103- rrf_k CONSTANT integer := 60 ;
123+ rrf_k CONSTANT integer := 60 ;
104124BEGIN
105125 IF has_query THEN
106126 -- Same contract as the Tier 1 RPC: p_query is a to_tsquery-
@@ -109,126 +129,125 @@ BEGIN
109129 tsquery_obj := to_tsquery(' english' , p_query);
110130 END IF;
111131
112- RETURN QUERY
113132 -- ----------------------------------------------------------------
114- -- candidates: every row that survives the (non-text, non-vector)
115- -- FILTERS. Both ranked lists below draw from this same pool, so a
116- -- job's rank reflects its standing among ELIGIBLE jobs only .
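133+ -- Browse mode (no text query, no query embedding): there is no
134+ -- ranking signal, so skip the retrievers / fusion entirely and
135+ -- return the filtered rows ordered by posted_at (newest first; oldest first for 'oldest'). NOTE(review): 'company_az' falls into the ELSE branch of the primary sort key below, so company name only breaks posted_at ties here, whereas the hybrid path uses NULL for that mode so the company key carries — confirm this difference is intended.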
117136 -- ----------------------------------------------------------------
118- WITH candidates AS (
137+ IF NOT has_query AND NOT has_embedding THEN
138+ RETURN QUERY
119139 SELECT cj.*
120140 FROM public .cached_jobs cj
121141 WHERE cj .removed_at IS NULL
122- AND (
123- COALESCE(p_location, ' ' ) = ' '
124- OR cj .location ILIKE ' %' || p_location || ' %'
125- )
126- AND (
127- p_sources IS NULL
128- OR cardinality(p_sources) = 0
129- OR cj .source = ANY (p_sources)
130- )
142+ AND (COALESCE(p_location, ' ' ) = ' ' OR cj .location ILIKE ' %' || p_location || ' %' )
143+ AND (p_sources IS NULL OR cardinality(p_sources) = 0 OR cj .source = ANY (p_sources))
131144 AND (NOT p_remote_only OR cj .work_mode = ' remote' )
132- AND (
133- p_work_modes IS NULL
134- OR cardinality(p_work_modes) = 0
135- OR cj .work_mode = ANY (p_work_modes)
136- )
137- AND (
138- p_employment_types IS NULL
139- OR cardinality(p_employment_types) = 0
140- OR cj .employment_type_norm = ANY (p_employment_types)
141- )
142- AND (
143- p_posted_within_days IS NULL
144- OR cj .posted_at > NOW() - (p_posted_within_days || ' days' )::INTERVAL
145- )
146- ),
145+ AND (p_work_modes IS NULL OR cardinality(p_work_modes) = 0 OR cj .work_mode = ANY (p_work_modes))
146+ AND (p_employment_types IS NULL OR cardinality(p_employment_types) = 0 OR cj .employment_type_norm = ANY (p_employment_types))
147+ AND (p_posted_within_days IS NULL OR cj .posted_at > NOW() - (p_posted_within_days || ' days' )::INTERVAL)
148+ ORDER BY
149+ CASE sort_mode
150+ WHEN ' oldest' THEN - extract(epoch from cj .posted_at )
151+ ELSE extract(epoch from cj .posted_at )
152+ END DESC NULLS LAST,
153+ CASE sort_mode WHEN ' company_az' THEN LOWER (cj .company ) ELSE NULL END ASC NULLS LAST,
154+ cj .posted_at DESC NULLS LAST,
155+ cj .id DESC
156+ LIMIT GREATEST(1 , LEAST(COALESCE(p_limit, 20 ), 50 ))
157+ OFFSET GREATEST(0 , COALESCE(p_offset, 0 ));
158+ RETURN;
159+ END IF;
160+
147161 -- ----------------------------------------------------------------
148- -- lexical: 1-based rank by ts_rank DESC among rows whose tsvector
149- -- matches the query. When p_query is empty there is no FTS filter
150- -- and no meaningful text rank — every candidate is included with a
151- -- NULL lexical_rank so it contributes 0 to the lexical RRF term
152- -- (its placement then comes from the semantic side / recency) .
162+ -- Hybrid path: at least one of (text query, query embedding) is
163+ -- present. Each retriever is its own top-N query on `cached_jobs`
164+ -- so the GIN / HNSW index drives candidate selection (see the
165+ -- ARCHITECTURE note in the header); RRF then fuses the two
166+ -- rankings .
153167 -- ----------------------------------------------------------------
168+ RETURN QUERY
169+ WITH
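170+ -- lexical: top-200 by ts_rank among rows whose tsvector matches the
171+ -- query. Empty when has_query is false (-> pure-semantic fallback).
172+ -- The GIN index on search_tsv accelerates the `@@` filter; the
173+ -- outer row_number() assigns the 1-based lexical rank for RRF. NOTE(review): the inner `ORDER BY ts_rank ... LIMIT 200` has no tie-breaker, so which rows survive the cut among equal ts_rank values is unspecified; only the outer ranking is deterministic.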
154174 lexical AS (
155- SELECT
156- c .id ,
157- CASE
158- WHEN has_query
159- THEN row_number() OVER (
160- ORDER BY ts_rank(c .search_tsv , tsquery_obj) DESC ,
161- c .posted_at DESC NULLS LAST,
162- c .id
163- )
164- ELSE NULL ::bigint
165- END AS lexical_rank
166- FROM candidates c
167- WHERE NOT has_query OR c .search_tsv @@ tsquery_obj
175+ SELECT l .id ,
176+ row_number() OVER (
177+ ORDER BY l .rank_score DESC , l .posted_at DESC NULLS LAST, l .id
178+ ) AS lexical_rank
179+ FROM (
180+ SELECT cj .id , cj .posted_at ,
181+ ts_rank(cj .search_tsv , tsquery_obj) AS rank_score
182+ FROM public .cached_jobs cj
183+ WHERE has_query
184+ AND cj .removed_at IS NULL
185+ AND cj .search_tsv @@ tsquery_obj
186+ AND (COALESCE(p_location, ' ' ) = ' ' OR cj .location ILIKE ' %' || p_location || ' %' )
187+ AND (p_sources IS NULL OR cardinality(p_sources) = 0 OR cj .source = ANY (p_sources))
188+ AND (NOT p_remote_only OR cj .work_mode = ' remote' )
189+ AND (p_work_modes IS NULL OR cardinality(p_work_modes) = 0 OR cj .work_mode = ANY (p_work_modes))
190+ AND (p_employment_types IS NULL OR cardinality(p_employment_types) = 0 OR cj .employment_type_norm = ANY (p_employment_types))
191+ AND (p_posted_within_days IS NULL OR cj .posted_at > NOW() - (p_posted_within_days || ' days' )::INTERVAL)
192+ ORDER BY ts_rank(cj .search_tsv , tsquery_obj) DESC
193+ LIMIT 200
194+ ) l
168195 ),
169- -- ----------------------------------------------------------------
170- -- semantic: 1-based rank by cosine distance ASC among candidates
171- -- that HAVE an embedding. Empty when p_query_embedding is NULL ->
172- -- pure-lexical fallback. `<=>` is pgvector cosine distance; the
173- -- HNSW vector_cosine_ops index from the pgvector schema file
174- -- accelerates this ordering.
175- -- ----------------------------------------------------------------
196+ -- semantic: top-200 by cosine distance among rows that HAVE an
197+ -- embedding. Empty when p_query_embedding is NULL (-> pure-lexical
198+ -- fallback). `<=>` is pgvector cosine distance; the HNSW
199+ -- vector_cosine_ops index serves this ORDER BY ... LIMIT directly.
176200 semantic AS (
177- SELECT
178- c .id ,
179- row_number() OVER (
180- ORDER BY c .embedding <=> p_query_embedding,
181- c .posted_at DESC NULLS LAST,
182- c .id
183- ) AS semantic_rank
184- FROM candidates c
185- WHERE has_embedding
186- AND c .embedding IS NOT NULL
201+ SELECT s .id ,
202+ row_number() OVER (
203+ ORDER BY s .dist ASC , s .posted_at DESC NULLS LAST, s .id
204+ ) AS semantic_rank
205+ FROM (
206+ SELECT cj .id , cj .posted_at ,
207+ cj .embedding <=> p_query_embedding AS dist
208+ FROM public .cached_jobs cj
209+ WHERE has_embedding
210+ AND cj .embedding IS NOT NULL
211+ AND cj .removed_at IS NULL
212+ AND (COALESCE(p_location, ' ' ) = ' ' OR cj .location ILIKE ' %' || p_location || ' %' )
213+ AND (p_sources IS NULL OR cardinality(p_sources) = 0 OR cj .source = ANY (p_sources))
214+ AND (NOT p_remote_only OR cj .work_mode = ' remote' )
215+ AND (p_work_modes IS NULL OR cardinality(p_work_modes) = 0 OR cj .work_mode = ANY (p_work_modes))
216+ AND (p_employment_types IS NULL OR cardinality(p_employment_types) = 0 OR cj .employment_type_norm = ANY (p_employment_types))
217+ AND (p_posted_within_days IS NULL OR cj .posted_at > NOW() - (p_posted_within_days || ' days' )::INTERVAL)
218+ ORDER BY cj .embedding <=> p_query_embedding
219+ LIMIT 200
220+ ) s
187221 ),
188- -- ----------------------------------------------------------------
189222 -- fused: FULL OUTER JOIN the two ranked lists so a job appearing in
190223 -- only one still survives. RRF score sums the per-list terms; a
191224 -- missing rank contributes 0 (COALESCE(..., 0.0)).
192- -- ----------------------------------------------------------------
193225 fused AS (
194226 SELECT
195227 COALESCE(l .id , s .id ) AS id,
196- COALESCE(
197- CASE WHEN l .lexical_rank IS NOT NULL
198- THEN 1 .0 / (rrf_k + l .lexical_rank )
199- ELSE 0 .0 END,
200- 0 .0
201- )
202- +
203- COALESCE(
204- CASE WHEN s .semantic_rank IS NOT NULL
205- THEN 1 .0 / (rrf_k + s .semantic_rank )
206- ELSE 0 .0 END,
207- 0 .0
208- ) AS rrf_score
228+ COALESCE(1 .0 / (rrf_k + l .lexical_rank ), 0 .0 )
229+ + COALESCE(1 .0 / (rrf_k + s .semantic_rank ), 0 .0 ) AS rrf_score
209230 FROM lexical l
210231 FULL OUTER JOIN semantic s ON s .id = l .id
211232 )
212- -- ----------------------------------------------------------------
213- -- Final projection: join the fused scores back to the candidate
214- -- rows (so the SETOF cached_jobs shape is returned) and order by
215- -- the requested sort mode. 'relevance' uses the RRF score.
216- -- ----------------------------------------------------------------
217- SELECT c.*
233+ -- Final projection: join the fused scores back to `cached_jobs` (so
234+ -- the SETOF cached_jobs shape is returned) and order by the
235+ -- requested sort mode. 'relevance' uses the fused RRF score.
236+ SELECT cj.*
218237 FROM fused f
219- JOIN candidates c ON c .id = f .id
238+ JOIN public . cached_jobs cj ON cj .id = f .id
220239 ORDER BY
221240 CASE sort_mode
222- WHEN ' newest' THEN extract(epoch from c .posted_at )
223- WHEN ' oldest' THEN - extract(epoch from c .posted_at )
241+ WHEN ' newest' THEN extract(epoch from cj .posted_at )
242+ WHEN ' oldest' THEN - extract(epoch from cj .posted_at )
224243 WHEN ' company_az' THEN NULL -- secondary key carries
225244 ELSE f .rrf_score -- 'relevance' (default): fused RRF score
226245 END DESC NULLS LAST,
227246 -- Stable secondary keys to break ties — mirrors the Tier 1 RPC.
228- CASE sort_mode WHEN ' company_az' THEN LOWER (c .company ) ELSE NULL END
247+ CASE sort_mode WHEN ' company_az' THEN LOWER (cj .company ) ELSE NULL END
229248 ASC NULLS LAST,
230- c .posted_at DESC NULLS LAST,
231- c .id DESC
249+ cj .posted_at DESC NULLS LAST,
250+ cj .id DESC
232251 LIMIT GREATEST(1 , LEAST(COALESCE(p_limit, 20 ), 50 ))
233252 OFFSET GREATEST(0 , COALESCE(p_offset, 0 ));
234253END;
0 commit comments