Skip to content

Commit d8b068c

Browse files
authored
add paradedb sample app (#125)
* first iteration * second iteration * improve README * add script.js and make other changes
1 parent ada44e2 commit d8b068c

File tree

15 files changed

+4492
-0
lines changed

15 files changed

+4492
-0
lines changed
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# Dependencies
2+
node_modules/
3+
lambda/package-lock.json
4+
5+
# Build output
6+
dist/
7+
cdk.out/
8+
*.d.ts
9+
10+
# Keep the Jest config tracked
11+
!jest.config.js
12+
13+
# IDE
14+
.idea/
15+
.vscode/
16+
*.swp
17+
*.swo
18+
19+
# OS
20+
.DS_Store
21+
Thumbs.db
22+
23+
# Logs
24+
*.log
25+
npm-debug.log*
26+
27+
# Local env
28+
.env
29+
.env.local
30+
31+
# Data
32+
data/
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
# Every target in this Makefile is a command rather than a file it produces,
# so all of them are declared phony.
.PHONY: \
    install deploy init seed destroy clean help \
    web-ui test-search get-api-url download-data

# Archive that download-data fetches, and the directory it is unpacked into.
DATASET_URL := https://docs.aws.amazon.com/opensearch-service/latest/developerguide/samples/sample-movies.zip
DATA_DIR := data
5+
6+
# First rule in the file, so a bare `make` runs it: print the usage summary.
# A single printf emits one line per argument; "" produces the blank lines.
help:
	@printf '%s\n' \
		"ParadeDB Movie Search Sample App" \
		"" \
		"Usage:" \
		" make install - Install all dependencies" \
		" make download-data - Download AWS sample movies dataset" \
		" make deploy - Deploy CDK stack to LocalStack" \
		" make init - Initialize database schema and BM25 index" \
		" make seed - Load movie data from S3 into ParadeDB" \
		" make test-search - Test the search endpoint" \
		" make get-api-url - Get the API Gateway URL" \
		" make web-ui - Run the Web UI on localhost port 3000" \
		" make destroy - Tear down the stack" \
		" make clean - Remove build artifacts" \
		"" \
		"Quick start:" \
		" make install && make download-data && make deploy && make init && make seed && make web-ui"
23+
24+
# Install npm dependencies twice: once in the current directory (the CDK
# project) and once inside lambda/. The `cd` and `npm install` share one
# recipe line because each recipe line runs in its own shell.
install:
	@printf '%s\n' "Installing CDK dependencies..."
	npm install
	@printf '%s\n' "Installing Lambda dependencies..."
	cd lambda && npm install
	@printf '%s\n' "Done!"
30+
31+
# Fetch the sample movies archive from DATASET_URL, unpack it into DATA_DIR and
# write DATA_DIR/movies.bulk, which is sample-movies.bulk minus every line that
# starts with `{ "index"` (the bulk-API action lines). The downloaded archive,
# the unfiltered bulk file and any __MACOSX folder are removed afterwards, and
# the line count of movies.bulk is reported as the number of movies.
#
# curl runs with -f so an HTTP error aborts the recipe instead of saving the
# error page as the zip, and with -S so the reason is still printed despite -s.
# Paths are quoted so a DATA_DIR containing spaces keeps working.
download-data:
	@echo "Downloading AWS sample movies dataset..."
	@mkdir -p "$(DATA_DIR)"
	@curl -fsSL "$(DATASET_URL)" -o "$(DATA_DIR)/sample-movies.zip"
	@echo "Extracting dataset..."
	@unzip -o "$(DATA_DIR)/sample-movies.zip" -d "$(DATA_DIR)/"
	@echo "Pre-processing bulk file (removing index instructions)..."
	@grep -v '^{ "index"' "$(DATA_DIR)/sample-movies.bulk" > "$(DATA_DIR)/movies.bulk"
	@rm -rf "$(DATA_DIR)/sample-movies.zip" "$(DATA_DIR)/sample-movies.bulk" "$(DATA_DIR)/__MACOSX"
	@echo "Dataset ready: $(DATA_DIR)/movies.bulk"
	@wc -l "$(DATA_DIR)/movies.bulk" | awk '{print "Total movies: " $$1}'
42+
43+
# Bootstrap the CDK environment and deploy the stack through cdklocal.
# --require-approval never skips the interactive confirmation prompt.
deploy:
	@printf '%s\n' "Deploying MovieSearchStack to LocalStack..."
	cdklocal bootstrap
	cdklocal deploy --require-approval never
	@printf '\n%s\n' "Deployment complete!"
49+
50+
# Look up the ApiEndpoint output of MovieSearchStack via awslocal, then POST to
# <ApiEndpoint>admin/init and pretty-print the JSON response with jq.
#
# The lookup, the check and the request share one shell (joined with `\`)
# because API_URL is a shell variable and would not survive into another recipe
# line. The recipe stops with an error when:
#   - the lookup yields nothing, or the literal "None" that the CLI's text
#     output prints for a null result;
#   - curl itself fails (e.g. connection refused). The response is captured
#     first because piping curl straight into jq would discard curl's exit
#     status and report success on an empty response.
init:
	@echo "Initializing database schema and BM25 index..."
	@API_URL=$$(awslocal cloudformation describe-stacks \
		--stack-name MovieSearchStack \
		--query 'Stacks[0].Outputs[?OutputKey==`ApiEndpoint`].OutputValue' \
		--output text 2>/dev/null); \
	if [ -z "$$API_URL" ] || [ "$$API_URL" = "None" ]; then \
		echo "Error: Stack not deployed. Run 'make deploy' first."; \
		exit 1; \
	fi; \
	RESPONSE=$$(curl -sS -X POST "$${API_URL}admin/init") || { \
		echo "Error: request to $${API_URL}admin/init failed."; \
		exit 1; \
	}; \
	printf '%s\n' "$$RESPONSE" | jq .
	@echo "Database initialized!"
62+
63+
# Look up the ApiEndpoint output of MovieSearchStack via awslocal, then POST to
# <ApiEndpoint>admin/seed and pretty-print the JSON response with jq.
#
# The lookup, the check and the request share one shell (joined with `\`)
# because API_URL is a shell variable and would not survive into another recipe
# line. The recipe stops with an error when:
#   - the lookup yields nothing, or the literal "None" that the CLI's text
#     output prints for a null result;
#   - curl itself fails (e.g. connection refused). The response is captured
#     first because piping curl straight into jq would discard curl's exit
#     status and report success on an empty response.
seed:
	@echo "Seeding movie data from S3..."
	@API_URL=$$(awslocal cloudformation describe-stacks \
		--stack-name MovieSearchStack \
		--query 'Stacks[0].Outputs[?OutputKey==`ApiEndpoint`].OutputValue' \
		--output text 2>/dev/null); \
	if [ -z "$$API_URL" ] || [ "$$API_URL" = "None" ]; then \
		echo "Error: Stack not deployed. Run 'make deploy' first."; \
		exit 1; \
	fi; \
	RESPONSE=$$(curl -sS -X POST "$${API_URL}admin/seed") || { \
		echo "Error: request to $${API_URL}admin/seed failed."; \
		exit 1; \
	}; \
	printf '%s\n' "$$RESPONSE" | jq .
	@echo "Data seeded!"
75+
76+
# Smoke-test the search route: look up the ApiEndpoint output of
# MovieSearchStack via awslocal, GET <ApiEndpoint>search?q=redemption and
# pretty-print the JSON response with jq.
#
# Everything after the first echo runs in one shell (joined with `\`) because
# API_URL is a shell variable. The recipe stops with an error when the lookup
# yields nothing (or the literal "None" that the CLI's text output prints for a
# null result), or when curl itself fails. The response is captured before
# being handed to jq because a direct `curl | jq` pipeline would discard curl's
# exit status.
test-search:
	@echo "Testing search endpoint..."
	@API_URL=$$(awslocal cloudformation describe-stacks \
		--stack-name MovieSearchStack \
		--query 'Stacks[0].Outputs[?OutputKey==`ApiEndpoint`].OutputValue' \
		--output text 2>/dev/null); \
	if [ -z "$$API_URL" ] || [ "$$API_URL" = "None" ]; then \
		echo "Error: Stack not deployed. Run 'make deploy' first."; \
		exit 1; \
	fi; \
	echo "Searching for 'redemption'..."; \
	RESPONSE=$$(curl -sS "$${API_URL}search?q=redemption") || { \
		echo "Error: request to $${API_URL}search failed."; \
		exit 1; \
	}; \
	printf '%s\n' "$$RESPONSE" | jq .
88+
89+
# Tear the stack down through cdklocal; --force skips the confirmation prompt.
destroy:
	@printf '%s\n' "Destroying MovieSearchStack..."
	cdklocal destroy --force
	@printf '%s\n' "Stack destroyed!"
93+
94+
# Remove installed dependencies, CDK/TypeScript build output and the
# pre-processed dataset. The dataset path goes through DATA_DIR so that it
# matches where download-data wrote the file when DATA_DIR is overridden
# (with the default DATA_DIR this is still data/movies.bulk).
clean:
	rm -rf node_modules lambda/node_modules cdk.out dist "$(DATA_DIR)/movies.bulk"
	@echo "Cleaned!"
97+
98+
# Print the value of the ApiEndpoint output of MovieSearchStack, as plain text.
# The backticks inside the single-quoted --query are JMESPath literal
# delimiters, not shell command substitution.
get-api-url:
	@awslocal cloudformation describe-stacks --stack-name MovieSearchStack --query 'Stacks[0].Outputs[?OutputKey==`ApiEndpoint`].OutputValue' --output text
103+
104+
# Serve the static files in ./web on port 3000 using the `serve` npm package,
# installing it globally first when it is not already on PATH.
#
# The availability check uses the POSIX `command -v` instead of `which`, which
# is an external tool that is not present on every system.
web-ui:
	@echo "Starting Movie Search Web UI..."
	@echo "API endpoint: http://movie-search-api.execute-api.localhost.localstack.cloud:4566/dev"
	@echo ""
	@command -v serve > /dev/null 2>&1 || (echo "Installing serve..." && npm i -g serve)
	serve -s ./web -l 3000
Lines changed: 209 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,209 @@
1+
# ParadeDB Movie Search Sample App
2+
3+
A CDK application demonstrating ParadeDB's full-text search capabilities with LocalStack.
4+
5+
## Overview
6+
7+
This sample app deploys a serverless movie search application using:
8+
9+
- **AWS Lambda** - Handles search and data operations
10+
- **Amazon API Gateway** - REST API endpoints
11+
- **Amazon S3** - Stores movie dataset
12+
- **ParadeDB** - Full-text search engine (runs as LocalStack extension)
13+
14+
### Dataset
15+
16+
Uses the official [AWS OpenSearch sample movies dataset](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/samples/sample-movies.zip) containing **5,000 movies** with metadata including:
17+
18+
- Title, year, genres, rating
19+
- Directors and actors
20+
- Plot descriptions
21+
- Movie poster images
22+
- Runtime duration
23+
24+
### Features Demonstrated
25+
26+
| Feature | Description |
27+
|---------|-------------|
28+
| **BM25 Ranking** | Industry-standard relevance scoring |
29+
| **Fuzzy Matching** | Handles typos (e.g., "Godfater" finds "Godfather") |
30+
| **Highlighting** | Returns matched text with highlighted terms |
31+
| **Movie Posters** | Rich UI with movie poster images |
32+
33+
### API Endpoints
34+
35+
| Method | Endpoint | Description |
36+
|--------|----------|-------------|
37+
| GET | `/search?q=<query>` | Search movies with BM25 ranking |
38+
| GET | `/movies/{id}` | Get movie details by ID |
39+
| POST | `/admin/init` | Initialize database schema |
40+
| POST | `/admin/seed` | Load movie data from S3 |
41+
42+
## Prerequisites
43+
44+
- [LocalStack](https://localstack.cloud/) installed and running
45+
- [Node.js](https://nodejs.org/) 18+ installed
46+
- [AWS CDK Local](https://github.com/localstack/aws-cdk-local) (`npm install -g aws-cdk-local`)
47+
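- [AWS CLI](https://aws.amazon.com/cli/) configured, plus the [`awslocal`](https://github.com/localstack/awscli-local) wrapper, `jq`, `curl`, and `unzip` (all invoked by the Makefile targets)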
48+
- ParadeDB extension installed in LocalStack
49+
50+
## Setup
51+
52+
### 1. Start LocalStack with ParadeDB Extension
53+
54+
```bash
55+
# Install the ParadeDB extension
56+
localstack extensions install localstack-extension-paradedb
57+
58+
# Start LocalStack
59+
localstack start
60+
```
61+
62+
### 2. Install Dependencies and Download Dataset
63+
64+
```bash
65+
cd paradedb/sample-movie-search
66+
make install
67+
make download-data
68+
```
69+
70+
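The `download-data` target downloads the AWS sample movies dataset (~5000 movies), extracts it into `data/`, and preprocesses it for ParadeDB ingestion by removing the `{ "index" ... }` bulk-action lines, leaving the result in `data/movies.bulk`.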
71+
72+
### 3. Deploy the Stack
73+
74+
```bash
75+
make deploy
76+
```
77+
78+
Or manually:
79+
80+
```bash
81+
cdklocal bootstrap
82+
cdklocal deploy
83+
```
84+
85+
After deployment, you'll see output similar to:
86+
87+
```
88+
Outputs:
89+
MovieSearchStack.ApiEndpoint = https://movie-search-api.execute-api.localhost.localstack.cloud:4566/dev/
90+
MovieSearchStack.DataBucketName = movie-search-data
91+
MovieSearchStack.InitEndpoint = https://movie-search-api.execute-api.localhost.localstack.cloud:4566/dev/admin/init
92+
MovieSearchStack.MovieSearchApiEndpointB25066EC = https://movie-search-api.execute-api.localhost.localstack.cloud:4566/dev/
93+
MovieSearchStack.MoviesEndpoint = https://movie-search-api.execute-api.localhost.localstack.cloud:4566/dev/movies/{id}
94+
MovieSearchStack.SearchEndpoint = https://movie-search-api.execute-api.localhost.localstack.cloud:4566/dev/search
95+
MovieSearchStack.SeedEndpoint = https://movie-search-api.execute-api.localhost.localstack.cloud:4566/dev/admin/seed
96+
```
97+
98+
### 4. Initialize Database
99+
100+
Create the movies table and BM25 search index:
101+
102+
```bash
103+
make init
104+
```
105+
106+
### 5. Seed Data
107+
108+
Load movie data from S3 into ParadeDB:
109+
110+
```bash
111+
make seed
112+
```
113+
114+
## Usage
115+
116+
### Search Movies
117+
118+
```bash
119+
# Basic search
120+
curl "http://movie-search-api.execute-api.localhost.localstack.cloud:4566/dev/search?q=redemption"
121+
122+
# With pagination
123+
curl "http://movie-search-api.execute-api.localhost.localstack.cloud:4566/dev/search?q=dark%20knight&limit=5&offset=0"
124+
125+
# Fuzzy search (handles typos)
126+
curl "http://movie-search-api.execute-api.localhost.localstack.cloud:4566/dev/search?q=godfater"
127+
```
128+
129+
### Get Movie Details
130+
131+
```bash
132+
curl "http://movie-search-api.execute-api.localhost.localstack.cloud:4566/dev/movies/tt0111161"
133+
```
134+
135+
### Example Response
136+
137+
```json
138+
{
139+
"success": true,
140+
"data": {
141+
"id": "tt0111161",
142+
"title": "The Shawshank Redemption",
143+
"year": 1994,
144+
"genres": [
145+
"Crime",
146+
"Drama"
147+
],
148+
"rating": 9.3,
149+
"directors": [
150+
"Frank Darabont"
151+
],
152+
"actors": [
153+
"Tim Robbins",
154+
"Morgan Freeman",
155+
"Bob Gunton"
156+
],
157+
"plot": "Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.",
158+
"image_url": "https://m.media-amazon.com/images/M/MV5BODU4MjU4NjIwNl5BMl5BanBnXkFtZTgwMDU2MjEyMDE@._V1_SX400_.jpg",
159+
"release_date": "1994-09-10T00:00:00.000Z",
160+
"rank": 80,
161+
"running_time_secs": 8520
162+
}
163+
}
164+
```
165+
166+
## Web UI
167+
168+
A web UI with movie posters is included in the `web/` directory.
169+
170+
### Quick Start
171+
172+
```bash
173+
make web-ui
174+
```
175+
176+
This starts a local web server at http://localhost:3000. The UI automatically connects to the API Gateway at `http://movie-search-api.execute-api.localhost.localstack.cloud:4566/dev`.
177+
178+
<img width="2880" height="1402" alt="image" src="https://gist.github.com/user-attachments/assets/63986bfe-709b-4bde-bac8-4df2b15bd41a" />
179+
180+
## How It Works
181+
182+
1. **Dataset Preparation**: Download and preprocess the AWS OpenSearch sample movies dataset
183+
184+
2. **Deployment**: CDK creates the Lambda functions, the API Gateway, and an S3 bucket containing the movie data (bulk format)
185+
186+
3. **Initialization**: The init Lambda creates the movies table and ParadeDB BM25 index:
187+
```sql
188+
CREATE INDEX movies_search_idx ON movies
189+
USING bm25 (id, title, plot)
190+
WITH (key_field = 'id');
191+
```
192+
193+
4. **Data Loading**: The seed Lambda reads `movies.bulk` from S3 (newline-delimited JSON) and inserts 5000 movies into ParadeDB
194+
195+
5. **Search**: Queries use ParadeDB's BM25 search with fuzzy matching:
196+
```sql
197+
SELECT id, title, year, genres, rating, directors, actors, image_url, running_time_secs,
198+
pdb.snippet(plot, start_tag => '<mark>', end_tag => '</mark>') as highlight,
199+
pdb.score(id) as score
200+
FROM movies
201+
WHERE title ||| $1::pdb.fuzzy(1) OR plot ||| $1::pdb.fuzzy(1)
202+
ORDER BY score DESC
203+
```
204+
205+
## References
206+
207+
- [ParadeDB Documentation](https://docs.paradedb.com/)
208+
- [LocalStack Extensions](https://docs.localstack.cloud/aws/tooling/extensions/)
209+
- [AWS CDK Local](https://github.com/localstack/aws-cdk-local)
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
#!/usr/bin/env node
2+
import "source-map-support/register";
3+
import * as cdk from "aws-cdk-lib";
4+
import { MovieSearchStack } from "../lib/movie-search-stack";
5+
6+
// Create the CDK application and register the movie search stack on it under
// the construct ID "MovieSearchStack", passing empty stack props.
const cdkApp = new cdk.App();
new MovieSearchStack(cdkApp, "MovieSearchStack", {});
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
{
2+
"app": "npx ts-node --prefer-ts-exts bin/app.ts",
3+
"watch": {
4+
"include": ["**"],
5+
"exclude": [
6+
"README.md",
7+
"cdk*.json",
8+
"**/*.d.ts",
9+
"**/*.js",
10+
"tsconfig.json",
11+
"package*.json",
12+
"node_modules",
13+
"lambda/node_modules"
14+
]
15+
},
16+
"context": {
17+
"@aws-cdk/aws-lambda:recognizeLayerVersion": true,
18+
"@aws-cdk/core:checkSecretUsage": true,
19+
"@aws-cdk/core:target-partitions": ["aws", "aws-cn"]
20+
}
21+
}

0 commit comments

Comments
 (0)