Skip to content

Commit 8a2fb3a

Browse files
authored
Merge pull request #73 from fuzziecoder/codex/implement-error-monitoring-with-sentry
Add observability stack: Prometheus, Grafana, ELK and Sentry integration
2 parents 3e71665 + 7235294 commit 8a2fb3a

9 files changed

Lines changed: 220 additions & 1 deletion

File tree

pipeline/README.md

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@ A production-ready pipeline automation system built with:
77
- **Redis** - State management, locks, caching
88
- **PostgreSQL** - Persistence
99
- **AI Safety Module** - Failure prediction & anomaly handling
10+
- **Prometheus + Grafana** - Metrics collection and dashboards
11+
- **ELK Stack (Elasticsearch, Logstash, Kibana)** - Centralized logging
12+
- **Sentry** - Error monitoring and tracing
1013

1114
## Architecture Overview
1215

@@ -98,6 +101,10 @@ docker-compose up -d
98101
| FastAPI Docs | http://localhost:8000/api/docs | - |
99102
| PostgreSQL | localhost:5432 | airflow / airflow |
100103
| Redis | localhost:6379 | - |
104+
| Prometheus | http://localhost:9090 | - |
105+
| Grafana | http://localhost:3000 | admin / admin |
106+
| Kibana | http://localhost:5601 | - |
107+
| Elasticsearch | http://localhost:9200 | - |
101108

102109
### 4. Create Your First Pipeline
103110

@@ -176,6 +183,26 @@ curl -X POST http://localhost:8000/api/executions/pipeline-xxx/execute
176183
| GET | `/metrics` | Dashboard metrics |
177184
| GET | `/insights` | AI insights |
178185

186+
187+
## Observability & Monitoring
188+
189+
### Metrics (Prometheus)
190+
- Backend exposes Prometheus metrics at `/metrics` via `prometheus-fastapi-instrumentator`.
191+
- Prometheus scrapes `backend:8000/metrics` every 15 seconds.
192+
193+
### Dashboards (Grafana)
194+
- Grafana runs on port `3000` and can connect to Prometheus (`http://prometheus:9090`) as a data source.
195+
- Default credentials are `admin/admin` (change in production).
196+
197+
### Centralized Logging (ELK Stack)
198+
- Elasticsearch stores indexed logs.
199+
- Logstash listens on `5000` (TCP JSON) and `5044` (beats) and forwards to Elasticsearch.
200+
- Kibana provides visualization for indices like `flexiroaster-backend-*`.
201+
202+
### Error Monitoring (Sentry)
203+
- Configure `SENTRY_DSN` to enable Sentry for FastAPI exception capture and tracing.
204+
- Optional tuning: `SENTRY_ENVIRONMENT`, `SENTRY_TRACES_SAMPLE_RATE`, and `SENTRY_PROFILES_SAMPLE_RATE`.
205+
179206
## Configuration
180207

181208
### Environment Variables
@@ -188,6 +215,9 @@ curl -X POST http://localhost:8000/api/executions/pipeline-xxx/execute
188215
| `EXECUTOR_STAGE_TIMEOUT` | `120` | Stage timeout in seconds |
189216
| `AI_BLOCK_HIGH_RISK` | `false` | Block high-risk executions |
190217
| `AI_RISK_THRESHOLD_HIGH` | `0.7` | High risk threshold |
218+
| `SENTRY_DSN` | `""` | Enables Sentry when set |
219+
| `SENTRY_ENVIRONMENT` | `development` | Sentry environment label (the bundled `docker-compose.yml` defaults it to `local`) |
220+
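| `SENTRY_TRACES_SAMPLE_RATE` | `0.1` | Fraction of requests sampled for tracing (0.0–1.0) |
| `SENTRY_PROFILES_SAMPLE_RATE` | `0.1` | Fraction of sampled traces that are also profiled (0.0–1.0) |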
191221

192222
### Airflow Variables
193223

pipeline/backend/config.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,14 @@ class Settings(BaseSettings):
9090
LOG_FORMAT: str = "json" # "json" or "text"
9191
LOG_FILE: Optional[str] = None
9292

93+
# ===================
94+
# Observability Settings
95+
# ===================
96+
SENTRY_DSN: str = ""
97+
SENTRY_ENVIRONMENT: str = "development"
98+
SENTRY_TRACES_SAMPLE_RATE: float = 0.1
99+
SENTRY_PROFILES_SAMPLE_RATE: float = 0.1
100+
93101
# ===================
94102
# Airflow Integration
95103
# ===================

pipeline/backend/main.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
from core.redis_state import redis_state_manager
1919
from core.executor import pipeline_executor
2020
from api.routes import pipelines, executions, health, monitoring, ai_automation
21-
from api.routes import pipelines, executions, health, monitoring
21+
from observability import setup_observability
2222

2323

2424
# ===================
@@ -134,6 +134,9 @@ async def lifespan(app: FastAPI):
134134
allow_headers=["*"],
135135
)
136136

137+
# Configure metrics and error monitoring
138+
setup_observability(app)
139+
137140

138141
# ===================
139142
# Exception Handlers

pipeline/backend/observability.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
"""Observability setup for metrics and error monitoring."""
2+
import logging
3+
4+
import sentry_sdk
5+
from fastapi import FastAPI
6+
from prometheus_fastapi_instrumentator import Instrumentator
7+
from sentry_sdk.integrations.asgi import SentryAsgiMiddleware
8+
9+
from config import settings
10+
11+
logger = logging.getLogger(__name__)
12+
13+
14+
def setup_observability(app: FastAPI) -> None:
    """Configure Prometheus metrics and Sentry error monitoring.

    Prometheus instrumentation is always installed and the metrics endpoint
    is exposed on ``app`` (hidden from the OpenAPI schema). Sentry is only
    initialised when ``settings.SENTRY_DSN`` is non-empty.

    Args:
        app: The FastAPI application to instrument.
    """
    instrumentator = Instrumentator(
        should_group_status_codes=False,
        should_ignore_untemplated=True,
        # Metrics are unconditional: with should_respect_env_var=False the
        # env_var_name below is not consulted (kept for an easy opt-in later).
        should_respect_env_var=False,
        should_instrument_requests_inprogress=True,
        # Do not record the scrape endpoint or health probes as traffic.
        excluded_handlers=["/metrics", "/health"],
        env_var_name="ENABLE_METRICS",
        inprogress_name="flexiroaster_inprogress",
        inprogress_labels=True,
    )
    instrumentator.instrument(app).expose(app, include_in_schema=False)

    if not settings.SENTRY_DSN:
        logger.info("Sentry disabled - SENTRY_DSN not configured")
        return

    # sentry-sdk auto-enables its FastAPI/Starlette integrations when those
    # packages are installed, and they already wrap the application in the
    # Sentry ASGI middleware. Adding SentryAsgiMiddleware by hand on top of
    # that is redundant, so init() alone is sufficient here.
    sentry_sdk.init(
        dsn=settings.SENTRY_DSN,
        environment=settings.SENTRY_ENVIRONMENT,
        release=f"{settings.APP_NAME}@{settings.APP_VERSION}",
        traces_sample_rate=settings.SENTRY_TRACES_SAMPLE_RATE,
        profiles_sample_rate=settings.SENTRY_PROFILES_SAMPLE_RATE,
    )
    logger.info("Sentry error monitoring enabled")

pipeline/backend/requirements.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ pyyaml==6.0.1
2727
httpx==0.25.2
2828
structlog==23.2.0
2929
tenacity==8.2.3
30+
sentry-sdk[fastapi]==2.19.2
31+
prometheus-fastapi-instrumentator==7.0.0
3032

3133
# Async support
3234
anyio==4.1.0

pipeline/docker-compose.yml

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,12 @@ services:
9494

9595
# Logging
9696
LOG_LEVEL: INFO
97+
98+
# Error Monitoring
99+
SENTRY_DSN: ${SENTRY_DSN:-}
100+
SENTRY_ENVIRONMENT: ${SENTRY_ENVIRONMENT:-local}
101+
SENTRY_TRACES_SAMPLE_RATE: ${SENTRY_TRACES_SAMPLE_RATE:-0.1}
102+
SENTRY_PROFILES_SAMPLE_RATE: ${SENTRY_PROFILES_SAMPLE_RATE:-0.1}
97103
ports:
98104
- "8000:8000"
99105
depends_on:
@@ -198,9 +204,91 @@ services:
198204
- -c
199205
- airflow
200206

207+
# Prometheus metrics store
208+
prometheus:
209+
image: prom/prometheus:v2.54.1
210+
container_name: flexiroaster-prometheus
211+
command:
212+
- "--config.file=/etc/prometheus/prometheus.yml"
213+
volumes:
214+
- ./monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
215+
- prometheus-data:/prometheus
216+
ports:
217+
- "9090:9090"
218+
depends_on:
219+
backend:
220+
condition: service_healthy
221+
restart: always
222+
223+
# Grafana dashboards
224+
grafana:
225+
image: grafana/grafana:11.2.2
226+
container_name: flexiroaster-grafana
227+
ports:
228+
- "3000:3000"
229+
environment:
230+
GF_SECURITY_ADMIN_USER: admin
231+
GF_SECURITY_ADMIN_PASSWORD: admin
232+
GF_USERS_ALLOW_SIGN_UP: "false"
233+
volumes:
234+
- grafana-data:/var/lib/grafana
235+
depends_on:
236+
- prometheus
237+
restart: always
238+
239+
# ELK Stack - Elasticsearch
240+
elasticsearch:
241+
image: docker.elastic.co/elasticsearch/elasticsearch:8.15.1
242+
container_name: flexiroaster-elasticsearch
243+
environment:
244+
ES_JAVA_OPTS: "-Xms512m -Xmx512m"
245+
discovery.type: single-node
246+
xpack.security.enabled: "false"
247+
volumes:
248+
- ./monitoring/elasticsearch/elasticsearch.yml:/usr/share/elasticsearch/config/elasticsearch.yml:ro
249+
- elasticsearch-data:/usr/share/elasticsearch/data
250+
ports:
251+
- "9200:9200"
252+
healthcheck:
253+
test: ["CMD-SHELL", "curl -fsS http://localhost:9200/_cluster/health || exit 1"]
254+
interval: 30s
255+
timeout: 10s
256+
retries: 10
257+
restart: always
258+
259+
# ELK Stack - Logstash
260+
logstash:
261+
image: docker.elastic.co/logstash/logstash:8.15.1
262+
container_name: flexiroaster-logstash
263+
volumes:
264+
- ./monitoring/logstash/logstash.conf:/usr/share/logstash/pipeline/logstash.conf:ro
265+
ports:
266+
- "5000:5000"
267+
- "5044:5044"
268+
depends_on:
269+
elasticsearch:
270+
condition: service_healthy
271+
restart: always
272+
273+
# ELK Stack - Kibana
274+
kibana:
275+
image: docker.elastic.co/kibana/kibana:8.15.1
276+
container_name: flexiroaster-kibana
277+
environment:
278+
ELASTICSEARCH_HOSTS: http://elasticsearch:9200
279+
ports:
280+
- "5601:5601"
281+
depends_on:
282+
elasticsearch:
283+
condition: service_healthy
284+
restart: always
285+
201286
volumes:
202287
postgres-data:
203288
redis-data:
289+
prometheus-data:
290+
grafana-data:
291+
elasticsearch-data:
204292

205293
networks:
206294
default:
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
cluster.name: flexiroaster-observability
2+
node.name: flexiroaster-es01
3+
network.host: 0.0.0.0
4+
discovery.type: single-node
5+
xpack.security.enabled: false
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# Inputs: Beats shippers on 5044 and JSON events over raw TCP on 5000.
input {
  beats {
    port => 5044
  }

  tcp {
    port => 5000
    # TCP is a byte stream, so several events can arrive in one read or a
    # single event can be split across reads. json_lines frames events on
    # "\n" before decoding; senders must terminate each JSON event with a
    # newline.
    codec => json_lines
  }
}

# Route events to a per-service daily index. The index name is kept in
# @metadata so it is not stored as a field on the indexed document.
filter {
  if [service] == "flexiroaster-backend" {
    mutate {
      add_field => { "[@metadata][target_index]" => "flexiroaster-backend-%{+YYYY.MM.dd}" }
    }
  } else {
    mutate {
      add_field => { "[@metadata][target_index]" => "flexiroaster-logs-%{+YYYY.MM.dd}" }
    }
  }
}

output {
  elasticsearch {
    hosts => ["http://elasticsearch:9200"]
    index => "%{[@metadata][target_index]}"
  }

  # Echo every event to the container log for debugging.
  stdout { codec => rubydebug }
}
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
global:
2+
scrape_interval: 15s
3+
evaluation_interval: 15s
4+
5+
scrape_configs:
6+
- job_name: 'flexiroaster-backend'
7+
metrics_path: /metrics
8+
static_configs:
9+
- targets: ['backend:8000']
10+
11+
- job_name: 'prometheus'
12+
static_configs:
13+
- targets: ['prometheus:9090']

0 commit comments

Comments
 (0)