-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy path.env.example
More file actions
38 lines (32 loc) · 1.44 KB
/
.env.example
File metadata and controls
38 lines (32 loc) · 1.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# Example environment configuration.
#
# NOTE: every comment sits on its own line on purpose. Some env-file loaders
# (e.g. `docker run --env-file`, older dotenv releases) only treat `#` as a
# comment at the start of a line, so a trailing "# ..." after a value would
# become part of that value.

# Database connection (run `docker-compose up` to start a local Postgres)
DATABASE_URL=postgres://postgres:postgres@localhost:5432/docx_corpus

# Local storage path
STORAGE_PATH=./corpus

# --- Common Crawl settings ---
# Optional: specific crawl ID (e.g., CC-MAIN-2025-51)
CRAWL_ID=
# Parallel downloads
CONCURRENCY=1
# Requests per second
RATE_LIMIT_RPS=2
# Maximum RPS after recovery
MAX_RPS=5
# Minimum RPS during backoff
MIN_RPS=1
# Request timeout (ms)
TIMEOUT_MS=45000
# Max retry attempts
MAX_RETRIES=5
# Max backoff delay (ms)
MAX_BACKOFF_MS=60000

# --- Extractor settings ---
# Input directory prefix
EXTRACT_INPUT_PREFIX=documents
# Output directory prefix
EXTRACT_OUTPUT_PREFIX=extracted
# Documents per batch
EXTRACT_BATCH_SIZE=100
# Parallel worker processes
EXTRACT_WORKERS=4

# --- Embedder settings ---
# Input directory prefix (extracted text)
EMBED_INPUT_PREFIX=extracted
# Documents per batch
EMBED_BATCH_SIZE=100
# Parallel API requests
EMBED_CONCURRENCY=20
# Required - get from https://aistudio.google.com/apikey
GOOGLE_API_KEY=

# --- Cloudflare R2 (optional - for cloud storage) ---
CLOUDFLARE_ACCOUNT_ID=
R2_ACCESS_KEY_ID=
R2_SECRET_ACCESS_KEY=
R2_BUCKET_NAME=docx-corpus

# --- AWS credentials ---
# NOTE(review): this file does not say which component reads these - confirm.
AWS_REGION=us-east-1
AWS_ACCESS_KEY_ID=
AWS_SECRET_ACCESS_KEY=