Skip to content

Commit d871c30

Browse files
authored
feat: support timescale pg_textsearch as text search extension (#359)
* feat: support timescale pg_textsearch as text search extension
* refactor: deduplicate text search query in retrieve_semantic_bm25_combined

Instead of maintaining three complete copies of the query (native, vchord, pg_textsearch), we now:
- Build the backend-specific parts (score_expr, order_by, where_filter)
- Use a single query template with the backend-specific parts injected

This makes maintenance easier: changes to the semantic CTE or the overall structure only need to be made once.
1 parent d8376ec commit d871c30

10 files changed

Lines changed: 300 additions & 103 deletions

File tree

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# PostgreSQL with pgvector and pg_textsearch extensions
2+
# Note: pg_textsearch requires PostgreSQL 17+
3+
FROM postgres:17
4+
5+
# Install build dependencies
6+
RUN apt-get update && apt-get install -y \
7+
build-essential \
8+
git \
9+
postgresql-server-dev-17 \
10+
libpq-dev \
11+
&& rm -rf /var/lib/apt/lists/*
12+
13+
# Install pgvector
14+
RUN cd /tmp && \
15+
git clone --branch v0.8.0 https://github.com/pgvector/pgvector.git && \
16+
cd pgvector && \
17+
make && \
18+
make install
19+
20+
# Install pg_textsearch
21+
RUN cd /tmp && \
22+
git clone https://github.com/timescale/pg_textsearch.git && \
23+
cd pg_textsearch && \
24+
make && \
25+
make install
26+
27+
# Clean up source files and build dependencies
28+
RUN rm -rf /tmp/pgvector /tmp/pg_textsearch && \
29+
apt-get purge -y --auto-remove build-essential git postgresql-server-dev-17
30+
31+
# Preload pg_textsearch by appending shared_preload_libraries to the sample postgresql.conf
32+
RUN echo "shared_preload_libraries = 'pg_textsearch'" >> /usr/share/postgresql/postgresql.conf.sample
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
name: hindsight
2+
# Docker Compose file for Hindsight with PostgreSQL and Timescale pg_textsearch
3+
# docker compose -f docker/docker-compose/pg_textsearch/docker-compose.yaml down && sleep 2 && docker compose -f docker/docker-compose/pg_textsearch/docker-compose.yaml up -d
4+
# Make sure to set the required environment variables before running:
5+
# - HINDSIGHT_DB_PASSWORD: Password for the PostgreSQL user (falls back to hindsight_password if unset; override it outside local testing)
6+
# - Configure LLM provider variables as needed (see below in the hindsight service)
7+
#
8+
# Usage:
9+
# docker compose up -d
10+
#
11+
# Optional environment variables with defaults:
12+
# - HINDSIGHT_VERSION: Hindsight application version (default: latest)
13+
# - HINDSIGHT_DB_USER: PostgreSQL user (default: hindsight_user)
14+
# - HINDSIGHT_DB_NAME: PostgreSQL database name (default: hindsight_db)
15+
16+
services:
17+
db:
18+
# Use custom PostgreSQL image with pgvector and pg_textsearch extensions
19+
build:
20+
context: .
21+
dockerfile: Dockerfile
22+
container_name: hindsight-db
23+
restart: always
24+
# Expose PostgreSQL on host port 5437 (mapped to container port 5432)
25+
ports:
26+
- "5437:5432"
27+
environment:
28+
POSTGRES_USER: ${HINDSIGHT_DB_USER:-hindsight_user}
29+
POSTGRES_PASSWORD: ${HINDSIGHT_DB_PASSWORD:-hindsight_password}
30+
POSTGRES_DB: ${HINDSIGHT_DB_NAME:-hindsight_db}
31+
volumes:
32+
- pg_data:/var/lib/postgresql/data
33+
networks:
34+
- hindsight-net
35+
36+
pg-textsearch-init:
37+
build:
38+
context: .
39+
dockerfile: Dockerfile
40+
depends_on:
41+
- db
42+
environment:
43+
- PGPASSWORD=${HINDSIGHT_DB_PASSWORD:-hindsight_password}
44+
command: >
45+
bash -c "
46+
echo 'Waiting for PostgreSQL to be ready...';
47+
until pg_isready -h hindsight-db -p 5432 -U hindsight_user; do
48+
echo 'PostgreSQL is unavailable - sleeping';
49+
sleep 2;
50+
done;
51+
echo 'PostgreSQL is ready - creating hindsight_db database';
52+
psql -h hindsight-db -p 5432 -U hindsight_user -c 'CREATE DATABASE hindsight_db;' 2>/dev/null || echo 'Database already exists';
53+
echo 'Creating extensions in hindsight_db database';
54+
psql -h hindsight-db -p 5432 -U hindsight_user -d hindsight_db -c 'CREATE EXTENSION IF NOT EXISTS vector CASCADE;';
55+
psql -h hindsight-db -p 5432 -U hindsight_user -d hindsight_db -c 'CREATE EXTENSION IF NOT EXISTS pg_textsearch CASCADE;';
56+
echo 'Database and extensions created successfully';
57+
"
58+
restart: "no"
59+
networks:
60+
- hindsight-net
61+
62+
hindsight:
63+
image: ghcr.io/vectorize-io/hindsight:${HINDSIGHT_VERSION:-latest}
64+
container_name: hindsight-app
65+
ports:
66+
- "8888:8888"
67+
- "9999:9999"
68+
environment:
69+
# LLM Configuration
70+
HINDSIGHT_API_LLM_PROVIDER: ${HINDSIGHT_API_LLM_PROVIDER:-openai}
71+
HINDSIGHT_API_LLM_API_KEY: ${OPENAI_API_KEY:-your-api-key}
72+
73+
# Database Configuration
74+
HINDSIGHT_API_DATABASE_URL: postgresql://${HINDSIGHT_DB_USER:-hindsight_user}:${HINDSIGHT_DB_PASSWORD:-hindsight_password}@db:5432/${HINDSIGHT_DB_NAME:-hindsight_db}
75+
76+
# Vector and Text Search Extensions
77+
HINDSIGHT_API_VECTOR_EXTENSION: pgvector
78+
HINDSIGHT_API_TEXT_SEARCH_EXTENSION: pg_textsearch
79+
80+
depends_on:
81+
- db
82+
networks:
83+
- hindsight-net
84+
85+
86+
networks:
87+
hindsight-net:
88+
driver: bridge
89+
90+
volumes:
91+
pg_data:

hindsight-api/hindsight_api/alembic/versions/5a366d414dce_initial_schema.py

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ def _detect_vector_extension() -> str:
5151

5252
def _detect_text_search_extension() -> str:
5353
"""
54-
Detect or validate text search extension: 'native' or 'vchord'.
54+
Detect or validate text search extension: 'native', 'vchord', or 'pg_textsearch'.
5555
Respects HINDSIGHT_API_TEXT_SEARCH_EXTENSION env var.
5656
Creates the extension if needed.
5757
"""
@@ -69,11 +69,23 @@ def _detect_text_search_extension() -> str:
6969
# Extension truly doesn't exist - re-raise the error
7070
raise
7171
return "vchord"
72+
elif text_search_extension == "pg_textsearch":
73+
# Create pg_textsearch extension if not exists
74+
try:
75+
op.execute("CREATE EXTENSION IF NOT EXISTS pg_textsearch CASCADE")
76+
except Exception:
77+
# Extension might already exist or user lacks permissions - verify it exists
78+
conn = op.get_bind()
79+
result = conn.execute(text("SELECT 1 FROM pg_extension WHERE extname = 'pg_textsearch'")).fetchone()
80+
if not result:
81+
# Extension truly doesn't exist - re-raise the error
82+
raise
83+
return "pg_textsearch"
7284
elif text_search_extension == "native":
7385
return "native"
7486
else:
7587
raise ValueError(
76-
f"Invalid HINDSIGHT_API_TEXT_SEARCH_EXTENSION: {text_search_extension}. Must be 'native' or 'vchord'"
88+
f"Invalid HINDSIGHT_API_TEXT_SEARCH_EXTENSION: {text_search_extension}. Must be 'native', 'vchord', or 'pg_textsearch'"
7789
)
7890

7991

@@ -232,6 +244,12 @@ def upgrade() -> None:
232244
ALTER TABLE memory_units
233245
ADD COLUMN search_vector bm25_catalog.bm25vector
234246
""")
247+
elif text_search_ext == "pg_textsearch":
248+
# Timescale pg_textsearch: dummy TEXT column for consistency (indexes operate on base columns directly)
249+
op.execute("""
250+
ALTER TABLE memory_units
251+
ADD COLUMN search_vector TEXT
252+
""")
235253
else: # native
236254
# Native PostgreSQL: tsvector with automatic generation
237255
op.execute("""
@@ -295,6 +313,14 @@ def upgrade() -> None:
295313
CREATE INDEX idx_memory_units_text_search ON memory_units
296314
USING bm25 (search_vector bm25_catalog.bm25_ops)
297315
""")
316+
elif text_search_ext == "pg_textsearch":
317+
# Timescale pg_textsearch BM25 index on text column
318+
# Note: pg_textsearch doesn't support expressions, so we index the main text column
319+
op.execute("""
320+
CREATE INDEX idx_memory_units_text_search ON memory_units
321+
USING bm25(text)
322+
WITH (text_config='english')
323+
""")
298324
else: # native
299325
# Native PostgreSQL GIN index
300326
op.execute("""

hindsight-api/hindsight_api/alembic/versions/n9i0j1k2l3m4_learnings_and_pinned_reflections.py

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ def _detect_vector_extension() -> str:
5858

5959
def _detect_text_search_extension() -> str:
6060
"""
61-
Detect or validate text search extension: 'native' or 'vchord'.
61+
Detect or validate text search extension: 'native', 'vchord', or 'pg_textsearch'.
6262
Respects HINDSIGHT_API_TEXT_SEARCH_EXTENSION env var.
6363
Creates the extension if needed.
6464
"""
@@ -76,11 +76,23 @@ def _detect_text_search_extension() -> str:
7676
# Extension truly doesn't exist - re-raise the error
7777
raise
7878
return "vchord"
79+
elif text_search_extension == "pg_textsearch":
80+
# Create pg_textsearch extension if not exists
81+
try:
82+
op.execute("CREATE EXTENSION IF NOT EXISTS pg_textsearch CASCADE")
83+
except Exception:
84+
# Extension might already exist or user lacks permissions - verify it exists
85+
conn = op.get_bind()
86+
result = conn.execute(text("SELECT 1 FROM pg_extension WHERE extname = 'pg_textsearch'")).fetchone()
87+
if not result:
88+
# Extension truly doesn't exist - re-raise the error
89+
raise
90+
return "pg_textsearch"
7991
elif text_search_extension == "native":
8092
return "native"
8193
else:
8294
raise ValueError(
83-
f"Invalid HINDSIGHT_API_TEXT_SEARCH_EXTENSION: {text_search_extension}. Must be 'native' or 'vchord'"
95+
f"Invalid HINDSIGHT_API_TEXT_SEARCH_EXTENSION: {text_search_extension}. Must be 'native', 'vchord', or 'pg_textsearch'"
8496
)
8597

8698

@@ -146,6 +158,15 @@ def upgrade() -> None:
146158
CREATE INDEX idx_learnings_text_search ON {schema}learnings
147159
USING bm25 (search_vector bm25_catalog.bm25_ops)
148160
""")
161+
elif text_search_ext == "pg_textsearch":
162+
# Timescale pg_textsearch: dummy TEXT column for consistency (indexes operate on base columns directly)
163+
op.execute(f"""
164+
ALTER TABLE {schema}learnings ADD COLUMN search_vector TEXT
165+
""")
166+
op.execute(f"""
167+
CREATE INDEX idx_learnings_text_search ON {schema}learnings
168+
USING bm25(text) WITH (text_config='english')
169+
""")
149170
else: # native
150171
# Native PostgreSQL: tsvector with automatic generation
151172
op.execute(f"""
@@ -204,6 +225,16 @@ def upgrade() -> None:
204225
CREATE INDEX idx_pinned_reflections_text_search ON {schema}pinned_reflections
205226
USING bm25 (search_vector bm25_catalog.bm25_ops)
206227
""")
228+
elif text_search_ext == "pg_textsearch":
229+
# Timescale pg_textsearch: dummy TEXT column for consistency (indexes operate on base columns directly)
230+
op.execute(f"""
231+
ALTER TABLE {schema}pinned_reflections ADD COLUMN search_vector TEXT
232+
""")
233+
op.execute(f"""
234+
CREATE INDEX idx_pinned_reflections_text_search ON {schema}pinned_reflections
235+
USING bm25(content)
236+
WITH (text_config='english')
237+
""")
207238
else: # native
208239
# Native PostgreSQL: tsvector with automatic generation
209240
op.execute(f"""

hindsight-api/hindsight_api/config.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -329,8 +329,8 @@ def normalize_config_dict(config: dict[str, Any]) -> dict[str, Any]:
329329
# Vector extension (pgvector vs vchord)
330330
DEFAULT_VECTOR_EXTENSION = "pgvector" # Options: "pgvector", "vchord"
331331

332-
# Text search extension (native PostgreSQL vs vchord BM25)
333-
DEFAULT_TEXT_SEARCH_EXTENSION = "native" # Options: "native", "vchord"
332+
# Text search extension (native PostgreSQL, vchord BM25, or Timescale pg_textsearch)
333+
DEFAULT_TEXT_SEARCH_EXTENSION = "native" # Options: "native", "vchord", "pg_textsearch"
334334

335335
# LiteLLM defaults
336336
DEFAULT_LITELLM_API_BASE = "http://localhost:4000"
@@ -706,7 +706,7 @@ def validate(self) -> None:
706706
)
707707

708708
# Validate text_search_extension
709-
valid_text_search = ("native", "vchord")
709+
valid_text_search = ("native", "vchord", "pg_textsearch")
710710
if self.text_search_extension not in valid_text_search:
711711
raise ValueError(
712712
f"Invalid text_search_extension: {self.text_search_extension}. Must be one of: {', '.join(valid_text_search)}"

hindsight-api/hindsight_api/engine/consolidation/consolidator.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1030,8 +1030,9 @@ async def _create_observation_directly(
10301030
tokenize($3, 'llmlingua2')::bm25_catalog.bm25vector)
10311031
RETURNING id
10321032
"""
1033-
else: # native
1033+
else: # native or pg_textsearch
10341034
# Native PostgreSQL: search_vector is GENERATED ALWAYS, don't include it
1035+
# pg_textsearch: indexes operate on base columns directly, don't populate search_vector
10351036
query = f"""
10361037
INSERT INTO {fq_table("memory_units")} (
10371038
id, bank_id, text, fact_type, embedding, proof_count, source_memory_ids, history,

hindsight-api/hindsight_api/engine/retain/fact_storage.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,8 +97,9 @@ async def insert_facts_batch(
9797
FROM input_data
9898
RETURNING id
9999
"""
100-
else: # native
100+
else: # native or pg_textsearch
101101
# Native PostgreSQL: search_vector is GENERATED ALWAYS, don't include it
102+
# pg_textsearch: indexes operate on base columns directly, don't populate search_vector
102103
query = f"""
103104
WITH input_data AS (
104105
SELECT * FROM unnest(

0 commit comments

Comments
 (0)