Skip to content

Commit 864036a

Browse files
committed
feat: refactor embedder pkg to use db
1 parent 5f69ce1 commit 864036a

22 files changed

Lines changed: 725 additions & 489 deletions

.env.example

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
# Database connection
2-
DATABASE_URL=postgres://postgres:postgres@localhost:5432/corpus
1+
# Database connection (use docker-compose up to start local postgres)
2+
DATABASE_URL=postgres://postgres:postgres@localhost:5432/docx_corpus
33

44
# Local storage path
55
STORAGE_PATH=./corpus
@@ -20,6 +20,12 @@ EXTRACT_OUTPUT_PREFIX=extracted # Output directory prefix
2020
EXTRACT_BATCH_SIZE=100 # Documents per batch
2121
EXTRACT_WORKERS=4 # Parallel worker processes
2222

23+
# Embedder settings
24+
EMBED_INPUT_PREFIX=extracted # Input directory prefix (extracted text)
25+
EMBED_MODEL=minilm # Model: minilm, bge-m3, or voyage-lite
26+
EMBED_BATCH_SIZE=100 # Documents per batch
27+
VOYAGE_API_KEY= # Required for voyage-lite model
28+
2329
# Cloudflare R2 (optional - for cloud storage)
2430
CLOUDFLARE_ACCOUNT_ID=
2531
R2_ACCESS_KEY_ID=

README.md

Lines changed: 77 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -69,22 +69,35 @@ bun install
6969

7070
```
7171
packages/
72-
shared/ # Shared utilities (progress bars, formatting)
72+
shared/ # Shared utilities (DB client, storage, formatting)
7373
scraper/ # Core scraper logic (downloads WARC, validates .docx)
7474
extractor/ # Text extraction using Docling (Python)
75+
embedder/ # Document embeddings using sentence-transformers (Python)
7576
apps/
7677
cli/ # Unified CLI - corpus <command>
7778
cdx-filter/ # AWS Lambda - filters CDX indexes for .docx URLs
7879
web/ # Landing page - docxcorp.us
80+
db/
81+
schema.sql # PostgreSQL schema (with pgvector)
82+
migrations/ # Database migrations
7983
```
8084

81-
| Package/App | Purpose | Runtime |
82-
| -------------- | --------------------------------- | -------------------- |
83-
| **cli** | Unified CLI entry point | Bun |
84-
| **scraper** | Download and validate .docx files | Bun |
85-
| **extractor** | Extract text from .docx files | Bun + Python |
86-
| **cdx-filter** | Filter Common Crawl CDX indexes | AWS Lambda (Node.js) |
87-
| **web** | Landing page | Static HTML |
85+
**Apps** (entry points)
86+
87+
| App | Purpose | Uses |
88+
| -------------- | ------------------------------- | ------------------------ |
89+
| **cli** | `corpus` command | scraper, extractor, embedder |
90+
| **cdx-filter** | Filter CDX indexes (Lambda) | - |
91+
| **web** | Landing page | - |
92+
93+
**Packages** (libraries)
94+
95+
| Package | Purpose | Runtime |
96+
| -------------- | --------------------------------- | ------------ |
97+
| **shared** | DB client, storage, formatting | Bun |
98+
| **scraper** | Download and validate .docx files | Bun |
99+
| **extractor** | Extract text (Docling) | Bun + Python |
100+
| **embedder** | Generate embeddings | Bun + Python |
88101

89102
## Usage
90103

@@ -134,6 +147,20 @@ bun run corpus extract --batch 50 --workers 8
134147
bun run corpus extract --verbose
135148
```
136149

150+
### 4. Generate embeddings
151+
152+
```bash
153+
# Embed all extracted documents (default: minilm, 384 dims)
154+
bun run corpus embed
155+
156+
# Use a different model
157+
bun run corpus embed --model bge-m3 # 1024 dims
158+
bun run corpus embed --model voyage-lite # requires VOYAGE_API_KEY
159+
160+
# Embed with batch limit
161+
bun run corpus embed --batch 100 --verbose
162+
```
163+
137164
### Docker
138165

139166
Run the CLI in a container:
@@ -173,35 +200,62 @@ export R2_SECRET_ACCESS_KEY=xxx
173200
bun run corpus scrape --crawl CC-MAIN-2025-51 --batch 1000
174201
```
175202

203+
## Local Development
204+
205+
Start PostgreSQL with pgvector locally:
206+
207+
```bash
208+
docker compose up -d
209+
210+
# Verify
211+
docker exec docx-corpus-postgres-1 psql -U postgres -d docx_corpus -c "\dt"
212+
```
213+
214+
Run commands against local database:
215+
216+
```bash
217+
DATABASE_URL=postgres://postgres:postgres@localhost:5432/docx_corpus \
218+
CLOUDFLARE_ACCOUNT_ID='' \
219+
bun run corpus status
220+
```
221+
176222
## Configuration
177223

178224
All configuration via environment variables (`.env`):
179225

180226
```bash
181-
# Cloudflare R2 (required for both Lambda and scraper)
227+
# Database (required)
228+
DATABASE_URL=postgres://user:pass@host:5432/dbname
229+
230+
# Cloudflare R2 (required for cloud storage)
182231
CLOUDFLARE_ACCOUNT_ID=
183232
R2_ACCESS_KEY_ID=
184233
R2_SECRET_ACCESS_KEY=
185234
R2_BUCKET_NAME=docx-corpus
186235

187-
# Scraping
236+
# Local storage (used when R2 not configured)
188237
STORAGE_PATH=./corpus
189-
CRAWL_ID=CC-MAIN-2025-51
190238

191-
# Performance tuning
192-
CONCURRENCY=50 # Parallel downloads
193-
RATE_LIMIT_RPS=50 # Requests per second (initial)
194-
MAX_RPS=100 # Max requests per second
195-
MIN_RPS=10 # Min requests per second
196-
TIMEOUT_MS=45000 # Request timeout in ms
197-
MAX_RETRIES=10 # Max retry attempts
198-
MAX_BACKOFF_MS=60000 # Max backoff delay (ms)
239+
# Scraping
240+
CRAWL_ID=CC-MAIN-2025-51
241+
CONCURRENCY=50
242+
RATE_LIMIT_RPS=50
243+
MAX_RPS=100
244+
MIN_RPS=10
245+
TIMEOUT_MS=45000
246+
MAX_RETRIES=10
199247

200248
# Extractor
201-
EXTRACT_INPUT_PREFIX=documents # Input directory prefix
202-
EXTRACT_OUTPUT_PREFIX=extracted # Output directory prefix
203-
EXTRACT_BATCH_SIZE=100 # Documents per batch
204-
EXTRACT_WORKERS=4 # Parallel workers
249+
EXTRACT_INPUT_PREFIX=documents
250+
EXTRACT_OUTPUT_PREFIX=extracted
251+
EXTRACT_BATCH_SIZE=100
252+
EXTRACT_WORKERS=4
253+
254+
# Embedder
255+
EMBED_INPUT_PREFIX=extracted
256+
EMBED_MODEL=minilm # minilm | bge-m3 | voyage-lite
257+
EMBED_BATCH_SIZE=100
258+
VOYAGE_API_KEY= # Required for voyage-lite model
205259
```
206260

207261
### Rate Limiting

apps/cli/commands/embed.ts

Lines changed: 22 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,11 @@ import {
66
type EmbedConfig,
77
type EmbeddingModel,
88
} from "@docx-corpus/embedder";
9-
import { createLocalStorage, createR2Storage } from "@docx-corpus/shared";
9+
import { createDb, createLocalStorage, createR2Storage } from "@docx-corpus/shared";
1010

1111
interface ParsedFlags {
1212
model?: EmbeddingModel;
1313
batchSize?: number;
14-
workers?: number;
1514
verbose: boolean;
1615
}
1716

@@ -35,11 +34,6 @@ function parseFlags(args: string[]): ParsedFlags {
3534
flags.batchSize = parseInt(next || "", 10);
3635
i++;
3736
break;
38-
case "--workers":
39-
case "-w":
40-
flags.workers = parseInt(next || "", 10);
41-
i++;
42-
break;
4337
case "--verbose":
4438
case "-v":
4539
flags.verbose = true;
@@ -56,32 +50,31 @@ corpus embed - Generate embeddings for extracted documents
5650
Usage
5751
corpus embed [options]
5852
53+
Reads extracted text from storage and writes embeddings to the database.
5954
Storage is auto-selected based on environment:
60-
- With R2 credentials: reads from r2://extracted/, writes to r2://embeddings/
61-
- Without R2 credentials: reads from ./corpus/extracted/, writes to ./corpus/embeddings/
55+
- With R2 credentials: reads from r2://extracted/
56+
- Without R2 credentials: reads from ./corpus/extracted/
6257
63-
Already-embedded files are automatically skipped (tracked in index.jsonl).
58+
Embedding progress is tracked in the database (embedded_at column).
6459
6560
Options
6661
--model, -m <name> Embedding model (default: minilm)
6762
minilm - all-MiniLM-L6-v2 (fast, 384 dims)
6863
bge-m3 - BAAI/bge-m3 (better quality, 1024 dims)
6964
voyage-lite - Voyage 3.5 lite (best, requires API key)
7065
--batch, -b <n> Limit to n documents (default: all)
71-
--workers, -w <n> Number of parallel workers (default: 4)
7266
--verbose, -v Show detailed progress
7367
--help, -h Show this help
7468
7569
Environment Variables
70+
DATABASE_URL PostgreSQL connection string (required)
7671
STORAGE_PATH Local storage path (default: ./corpus)
7772
CLOUDFLARE_ACCOUNT_ID Cloudflare account ID (enables R2)
7873
R2_ACCESS_KEY_ID R2 access key
7974
R2_SECRET_ACCESS_KEY R2 secret key
8075
R2_BUCKET_NAME R2 bucket (default: docx-corpus)
8176
EMBED_INPUT_PREFIX Input prefix (default: extracted)
82-
EMBED_OUTPUT_PREFIX Output prefix (default: embeddings)
8377
EMBED_MODEL Default model (default: minilm)
84-
EMBED_WORKERS Worker count (default: 4)
8578
VOYAGE_API_KEY Voyage AI API key (required for voyage-lite)
8679
8780
Examples
@@ -99,6 +92,13 @@ export async function runEmbed(args: string[]) {
9992

10093
const flags = parseFlags(args);
10194
const envConfig = loadEmbedderConfig();
95+
96+
// Validate database URL
97+
if (!envConfig.database.url) {
98+
console.error("Error: DATABASE_URL environment variable is required");
99+
process.exit(1);
100+
}
101+
102102
const useCloud = hasCloudflareCredentials(envConfig);
103103
const model = flags.model ?? envConfig.embed.model;
104104

@@ -108,6 +108,9 @@ export async function runEmbed(args: string[]) {
108108
process.exit(1);
109109
}
110110

111+
// Create database client
112+
const db = await createDb(envConfig.database.url);
113+
111114
// Create storage based on credentials
112115
const storage = useCloud
113116
? createR2Storage({
@@ -119,12 +122,11 @@ export async function runEmbed(args: string[]) {
119122
: createLocalStorage(envConfig.storage.localPath);
120123

121124
const config: EmbedConfig = {
125+
db,
122126
storage,
123127
inputPrefix: envConfig.embed.inputPrefix,
124-
outputPrefix: envConfig.embed.outputPrefix,
125128
model,
126-
batchSize: flags.batchSize ?? Infinity,
127-
workers: flags.workers ?? envConfig.embed.workers,
129+
batchSize: flags.batchSize ?? 1000000,
128130
};
129131

130132
console.log("Document Embedder");
@@ -133,16 +135,17 @@ export async function runEmbed(args: string[]) {
133135
`Storage: ${useCloud ? `R2 (${envConfig.cloudflare.r2BucketName})` : `local (${envConfig.storage.localPath})`}`
134136
);
135137
console.log(`Input: ${config.inputPrefix}/`);
136-
console.log(`Output: ${config.outputPrefix}/`);
138+
console.log(`Output: database (embedding column)`);
137139
console.log(`Model: ${config.model}`);
138-
console.log(`Workers: ${config.workers}`);
139-
console.log(`Batch: ${config.batchSize === Infinity ? "all" : config.batchSize}`);
140+
console.log(`Batch: ${config.batchSize >= 1000000 ? "all" : config.batchSize}`);
140141
console.log("");
141142

142143
try {
143144
await processEmbeddings(config, flags.verbose);
144145
} catch (err) {
145146
console.error("Fatal error:", err);
146147
process.exit(1);
148+
} finally {
149+
await db.close();
147150
}
148151
}

apps/cli/commands/extract.ts

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ import {
44
hasCloudflareCredentials,
55
type ExtractConfig,
66
} from "@docx-corpus/extractor";
7-
import { createLocalStorage, createR2Storage } from "@docx-corpus/shared";
7+
import { createDb, createLocalStorage, createR2Storage } from "@docx-corpus/shared";
88

99
interface ParsedFlags {
1010
batchSize?: number;
@@ -52,7 +52,7 @@ Storage is auto-selected based on environment:
5252
- With R2 credentials: reads from r2://documents/, writes to r2://extracted/
5353
- Without R2 credentials: reads from ./corpus/documents/, writes to ./corpus/extracted/
5454
55-
Already-extracted files are automatically skipped (tracked in index.jsonl).
55+
Extraction progress is tracked in the database (extracted_at column).
5656
5757
Options
5858
--batch, -b <n> Limit to n documents (default: all)
@@ -61,6 +61,7 @@ Options
6161
--help, -h Show this help
6262
6363
Environment Variables
64+
DATABASE_URL PostgreSQL connection string (required)
6465
STORAGE_PATH Local storage path (default: ./corpus)
6566
CLOUDFLARE_ACCOUNT_ID Cloudflare account ID (enables R2)
6667
R2_ACCESS_KEY_ID R2 access key
@@ -85,8 +86,18 @@ export async function runExtract(args: string[]) {
8586

8687
const flags = parseFlags(args);
8788
const envConfig = loadExtractorConfig();
89+
90+
// Validate database URL
91+
if (!envConfig.database.url) {
92+
console.error("Error: DATABASE_URL environment variable is required");
93+
process.exit(1);
94+
}
95+
8896
const useCloud = hasCloudflareCredentials(envConfig);
8997

98+
// Create database client
99+
const db = await createDb(envConfig.database.url);
100+
90101
// Create storage based on credentials
91102
const storage = useCloud
92103
? createR2Storage({
@@ -98,10 +109,11 @@ export async function runExtract(args: string[]) {
98109
: createLocalStorage(envConfig.storage.localPath);
99110

100111
const config: ExtractConfig = {
112+
db,
101113
storage,
102114
inputPrefix: envConfig.extract.inputPrefix,
103115
outputPrefix: envConfig.extract.outputPrefix,
104-
batchSize: flags.batchSize ?? Infinity,
116+
batchSize: flags.batchSize ?? 1000000,
105117
workers: flags.workers ?? envConfig.extract.workers,
106118
};
107119

@@ -113,13 +125,15 @@ export async function runExtract(args: string[]) {
113125
console.log(`Input: ${config.inputPrefix}/`);
114126
console.log(`Output: ${config.outputPrefix}/`);
115127
console.log(`Workers: ${config.workers}`);
116-
console.log(`Batch: ${config.batchSize === Infinity ? "all" : config.batchSize}`);
128+
console.log(`Batch: ${config.batchSize >= 1000000 ? "all" : config.batchSize}`);
117129
console.log("");
118130

119131
try {
120132
await processDirectory(config, flags.verbose);
121133
} catch (err) {
122134
console.error("Fatal error:", err);
123135
process.exit(1);
136+
} finally {
137+
await db.close();
124138
}
125139
}

0 commit comments

Comments
 (0)