-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdocker-compose.yml
More file actions
188 lines (165 loc) · 5.31 KB
/
docker-compose.yml
File metadata and controls
188 lines (165 loc) · 5.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
# Compose file format declaration.
# NOTE(review): recent Docker Compose releases report this key as obsolete;
# confirm whether any tooling still depends on it before removing it.
version: '3.8'
# Service definitions: redis, playwright-service, firecrawl-api and
# firecrawl-worker, all attached to firecrawl-network.
services:
# Redis: cache and job queue backend (reached by the API and the worker
# through REDIS_URL / REDIS_RATE_LIMIT_URL).
  redis:
    image: redis:7-alpine
    container_name: firecrawl-redis
    # This instance also stores queue data, so keys must never be evicted:
    # with an LRU policy Redis may drop queue keys under memory pressure and
    # jobs are lost silently. With `noeviction`, writes fail loudly once the
    # 2gb limit is reached instead.
    # `--appendonly yes` persists data to the redis_data volume.
    command: >-
      redis-server
      --maxmemory 2gb
      --maxmemory-policy noeviction
      --appendonly yes
    volumes:
      - redis_data:/data
    restart: unless-stopped
    # Considered healthy once `redis-cli ping` succeeds
    # (checked every 10s, 5s timeout, up to 5 retries).
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 10s
      timeout: 5s
      retries: 5
    networks:
      - firecrawl-network
# Playwright service used for scraping. At start-up the container generates a
# small Express server exposing POST /scrape, which loads the requested URL in
# Chromium through the configured proxy and returns the page HTML.
  playwright-service:
    image: mcr.microsoft.com/playwright:v1.40.0-focal
    container_name: firecrawl-playwright
    # Keep the generated package.json, node_modules and server.js out of the
    # image's default working directory.
    working_dir: /app
    environment:
      PORT: "3000"
      # Proxy credentials are read from the environment / .env file and are
      # never stored in this file. The values previously hard-coded here were
      # committed in plaintext and should be rotated.
      PROXY_SERVER: "${PROXY_SERVER:?set PROXY_SERVER in .env}"
      PROXY_USERNAME: "${PROXY_USERNAME:?set PROXY_USERNAME in .env}"
      PROXY_PASSWORD: "${PROXY_PASSWORD:?set PROXY_PASSWORD in .env}"
    # The script is passed to `sh -c` inside double quotes, so the embedded
    # JavaScript must not contain double quotes, `$` or backticks.
    # The playwright npm package is pinned to the image version (1.40.0) so
    # that it matches the browser builds shipped in the image.
    command: |
      sh -c "
      npm init -y &&
      npm install express playwright@1.40.0 &&
      cat > server.js << 'EOF'
      const express = require('express');
      const { chromium } = require('playwright');
      const app = express();
      app.use(express.json());
      app.post('/scrape', async (req, res) => {
        const { url } = req.body || {};
        if (!url) {
          return res.status(400).json({ error: 'url is required' });
        }
        let browser;
        try {
          // Launch inside the try block so a launch failure is reported as a
          // 500 response instead of leaving the request hanging.
          browser = await chromium.launch({
            proxy: {
              server: process.env.PROXY_SERVER,
              username: process.env.PROXY_USERNAME,
              password: process.env.PROXY_PASSWORD
            }
          });
          const page = await browser.newPage();
          await page.goto(url, { waitUntil: 'networkidle' });
          const content = await page.content();
          res.json({ content });
        } catch (error) {
          res.status(500).json({ error: error.message });
        } finally {
          if (browser) {
            await browser.close();
          }
        }
      });
      app.listen(3000, () => console.log('Playwright service running on port 3000'));
      EOF
      node server.js
      "
    restart: unless-stopped
    networks:
      - firecrawl-network
# Firecrawl main API. Built from the local Dockerfile, listens on port 3002
# and is routed by Traefik for firecrawl.wmappliances.cloud.
#
# Every secret below is injected from the environment / .env file using
# required-variable interpolation, so `docker compose` fails fast when one is
# missing. The values previously hard-coded here were committed in plaintext
# and should be rotated.
  firecrawl-api:
    build: .
    image: firecrawl-production:latest
    container_name: firecrawl-api
    ports:
      # NOTE(review): this publishes the API on every host interface in
      # addition to the Traefik route below; confirm that is intended.
      - "3002:3002"
    environment:
      # Core
      PORT: "3002"
      HOST: "0.0.0.0"
      NODE_ENV: "production"
      USE_DB_AUTHENTICATION: "false"
      TEST_API_KEY: "${TEST_API_KEY:?set TEST_API_KEY in .env}"
      # Gemini 2.5 Flash
      # NOTE(review): confirm this model id is accepted by the Gemini API.
      GEMINI_API_KEY: "${GEMINI_API_KEY:?set GEMINI_API_KEY in .env}"
      GEMINI_MODEL: "gemini-2.5-flash-latest"
      MODEL_NAME: "gemini-2.5-flash-latest"
      # Ollama for embeddings (runs on the Docker host, see extra_hosts)
      OLLAMA_BASE_URL: "http://host.docker.internal:11434"
      MODEL_EMBEDDING_NAME: "nomic-embed-text"
      # AI strategy
      AI_STRATEGY: "hybrid"
      AI_PRIMARY: "gemini"
      AI_EMBEDDINGS: "ollama"
      # Redis
      REDIS_URL: "redis://redis:6379"
      REDIS_RATE_LIMIT_URL: "redis://redis:6379"
      # Playwright
      PLAYWRIGHT_MICROSERVICE_URL: "http://playwright-service:3000/scrape"
      # Webshare proxy
      PROXY_SERVER: "${PROXY_SERVER:?set PROXY_SERVER in .env}"
      PROXY_USERNAME: "${PROXY_USERNAME:?set PROXY_USERNAME in .env}"
      PROXY_PASSWORD: "${PROXY_PASSWORD:?set PROXY_PASSWORD in .env}"
      WEBSHARE_API_KEY: "${WEBSHARE_API_KEY:?set WEBSHARE_API_KEY in .env}"
      PROXY_ROTATION_ENABLED: "true"
      PROXY_ROTATION_INTERVAL: "100"
      # Rate limiting (300k/month): 411 requests per 3,600,000 ms (1 hour)
      RATE_LIMIT_ENABLED: "true"
      RATE_LIMIT_MAX_REQUESTS: "411"
      RATE_LIMIT_WINDOW_MS: "3600000"
      # Performance
      MAX_CONCURRENT_REQUESTS: "25"
      TIMEOUT_MS: "30000"
      RETRY_ATTEMPTS: "3"
      NODE_OPTIONS: "--max-old-space-size=8192"
      # Queue
      BULL_AUTH_KEY: "${BULL_AUTH_KEY:?set BULL_AUTH_KEY in .env}"
      QUEUE_CONCURRENCY: "50"
    volumes:
      - firecrawl_data:/app/data
    # Wait for Redis to pass its healthcheck rather than merely be started.
    depends_on:
      redis:
        condition: service_healthy
      playwright-service:
        condition: service_started
    restart: unless-stopped
    # Makes the Docker host reachable as host.docker.internal (used by
    # OLLAMA_BASE_URL).
    extra_hosts:
      - "host.docker.internal:host-gateway"
    networks:
      - firecrawl-network
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.firecrawl.rule=Host(`firecrawl.wmappliances.cloud`)"
      - "traefik.http.routers.firecrawl.entrypoints=websecure"
      - "traefik.http.routers.firecrawl.tls.certresolver=letsencrypt"
      - "traefik.http.services.firecrawl.loadbalancer.server.port=3002"
# Firecrawl worker. Same image as the API, started with `npm run worker`.
#
# Secrets are injected from the environment / .env file using
# required-variable interpolation. The values previously hard-coded here were
# committed in plaintext and should be rotated.
  firecrawl-worker:
    build: .
    image: firecrawl-production:latest
    container_name: firecrawl-worker
    command: ["npm", "run", "worker"]
    environment:
      # Connection settings (same values as the API service)
      REDIS_URL: "redis://redis:6379"
      REDIS_RATE_LIMIT_URL: "redis://redis:6379"
      PLAYWRIGHT_MICROSERVICE_URL: "http://playwright-service:3000/scrape"
      # Gemini for processing
      # NOTE(review): confirm this model id is accepted by the Gemini API.
      GEMINI_API_KEY: "${GEMINI_API_KEY:?set GEMINI_API_KEY in .env}"
      MODEL_NAME: "gemini-2.5-flash-latest"
      # Ollama for embeddings (runs on the Docker host, see extra_hosts)
      OLLAMA_BASE_URL: "http://host.docker.internal:11434"
      MODEL_EMBEDDING_NAME: "nomic-embed-text"
      # Proxy
      PROXY_SERVER: "${PROXY_SERVER:?set PROXY_SERVER in .env}"
      # Performance
      MAX_CONCURRENT_REQUESTS: "15"
      WORKER_THREADS: "4"
    volumes:
      - firecrawl_data:/app/data
    # Wait for Redis to pass its healthcheck rather than merely be started.
    depends_on:
      redis:
        condition: service_healthy
      playwright-service:
        condition: service_started
    restart: unless-stopped
    # Makes the Docker host reachable as host.docker.internal (used by
    # OLLAMA_BASE_URL).
    extra_hosts:
      - "host.docker.internal:host-gateway"
    networks:
      - firecrawl-network
volumes:
  # Redis data directory (mounted at /data in the redis service)
  redis_data: {}
  # Application data shared by firecrawl-api and firecrawl-worker
  # (mounted at /app/data in both)
  firecrawl_data: {}
networks:
  # Bridge network that every service in this file is attached to
  firecrawl-network:
    driver: bridge