Skip to content

Commit 8834b5c

Browse files
committed
Add support for opentelemetry
1 parent 7f81a54 commit 8834b5c

29 files changed

Lines changed: 2039 additions & 747 deletions

deployment/common/README.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
## Running telemetry stack
2+
3+
To run the telemetry stack, run:
4+
```
5+
cd deployment/common/telemetry
6+
docker compose -f docker-compose.otel.yml up -d
7+
```
8+
9+
If you want to use your own OpenTelemetry collector, modify the variables in `.otel.env`, which are used by the merginmaps server and the celery workers.
10+
11+
The Grafana UI is accessible on port 3000, but it can also be exposed via the mergin nginx proxy (uncomment the Grafana server block in nginx.conf).

deployment/common/nginx.conf

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ server {
3636
# redirects, we set the Host: header above already.
3737
proxy_redirect off;
3838
proxy_pass http://merginmaps-server:5000;
39+
proxy_hide_header X-Trace-Id;
3940

4041
# disable buffering
4142
client_max_body_size 0; # No maximum client body size
@@ -49,6 +50,7 @@ server {
4950
proxy_set_header X-Forwarded-Proto $scheme;
5051
proxy_set_header Host $http_host;
5152
proxy_pass http://merginmaps-server:5000;
53+
proxy_hide_header X-Trace-Id;
5254
}
5355

5456
location /download/ {
@@ -59,3 +61,25 @@ server {
5961
}
6062
}
6163

64+
# Uncomment the server block below if Grafana needs to be exposed through this proxy
65+
# server {
66+
# listen 8082;
67+
# listen [::]:8082;
68+
# server_name _;
69+
70+
# client_max_body_size 4G;
71+
72+
# # Don't show version information
73+
# server_tokens off;
74+
75+
# location / {
76+
# proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
77+
# proxy_set_header X-Forwarded-Proto $scheme;
78+
# proxy_set_header Host $http_host;
79+
# # we don't want nginx trying to do something clever with
80+
# # redirects, we set the Host: header above already.
81+
# proxy_redirect off;
82+
# proxy_pass http://merginmaps-grafana:3000;
83+
# }
84+
# }
85+
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
loki-data
2+
dashboards
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# OpenTelemetry SDK settings shared by the merginmaps server and celery workers.

# Send every signal type through the OTLP exporter.
OTEL_TRACES_EXPORTER=otlp
OTEL_METRICS_EXPORTER=otlp
OTEL_LOGS_EXPORTER=otlp

# Collector endpoint; point these at your own collector if you do not run
# the bundled merginmaps-otel-collector container.
OTEL_EXPORTER_OTLP_ENDPOINT=http://merginmaps-otel-collector:4317
OTEL_EXPORTER_OTLP_LOGS_ENDPOINT=http://merginmaps-otel-collector:4317

# Metric export period in milliseconds (10000 ms = 10 s).
OTEL_METRIC_EXPORT_INTERVAL=10000
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
# Telemetry stack: OpenTelemetry Collector + Tempo (traces), Prometheus
# (metrics), Loki (logs) and Grafana (UI).
# All services join the pre-existing external "mergin" network so they can
# reach, and be reached by, the Mergin Maps containers via container name.
networks:
  merginmaps:
    name: mergin
    external: true

services:
  otel-collector:
    # Contrib version is required for StatsD
    image: otel/opentelemetry-collector-contrib:0.90.0
    container_name: merginmaps-otel-collector
    volumes:
      # Configuration is mounted read-only; the collector never writes to it.
      - ./otel-config.yaml:/etc/otelcol-contrib/config.yaml:ro
      # Docker JSON log files, tailed by the filelog receiver.
      - /var/lib/docker/containers:/var/lib/docker/containers:ro
      - /var/run/docker.sock:/var/run/docker.sock:ro
    networks:
      - merginmaps
    # Runs as root so the host's Docker log files are readable.
    # NOTE(review): `privileged: true` grants far more than read access to
    # the mounts above — confirm it is really required.
    user: "0:0"
    privileged: true
    ports:
      - "8125:8125/udp" # StatsD (Metrics)
      - "4317:4317" # OTLP gRPC
      - "8889:8889" # Prometheus Scrape Port
      # - "8888:8889"
      # NOTE(review): otel-config.yaml enables only the health_check
      # extension — confirm that something actually listens on 55679.
      - "55679:55679"
    depends_on:
      tempo:
        condition: service_started
      # The logs pipeline exports to Loki, so start it before the collector.
      loki:
        condition: service_started

  tempo:
    image: grafana/tempo:2.8.3
    container_name: merginmaps-tempo
    command: ["-config.file=/etc/tempo.yaml", "-target=all"]
    networks:
      - merginmaps
    volumes:
      - ./tempo.yaml:/etc/tempo.yaml:ro
      # - ./tempo-data:/tmp/tempo
    ports:
      - "3200:3200" # Tempo UI/API

  prometheus:
    image: prom/prometheus:v3.9.0
    container_name: merginmaps-prometheus
    networks:
      - merginmaps
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
    ports:
      - "9090:9090"

  loki:
    image: grafana/loki:3.6.4
    container_name: merginmaps-loki
    ports:
      - "3100:3100"
    user: "root"
    volumes:
      - ./loki-config.yaml:/etc/loki/local-config.yaml:ro
      - ./loki-data:/loki # Persistent storage
    command: -config.file=/etc/loki/local-config.yaml
    networks:
      - merginmaps

  grafana:
    image: grafana/grafana:12.3.2
    container_name: merginmaps-grafana
    networks:
      - merginmaps
    ports:
      - "3000:3000"
    environment:
      # WARNING: every unauthenticated visitor gets the Admin role. Keep
      # port 3000 off untrusted networks, or lower the role before exposing
      # Grafana (e.g. through the nginx proxy).
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
    volumes:
      - ./grafana-datasource.yaml:/etc/grafana/provisioning/datasources/datasource.yaml:ro
      - ./dashboards:/var/lib/grafana/dashboards
    depends_on:
      - loki
      - prometheus
      - tempo
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# Grafana data source provisioning for the telemetry stack.
apiVersion: 1
datasources:
  - name: Prometheus
    type: prometheus
    url: http://merginmaps-prometheus:9090
    isDefault: true
  - name: Tempo
    type: tempo
    # Explicit UID: the Loki derived field below links to this data source
    # by UID. Without it Grafana generates a random UID and the link from
    # logs to traces cannot be resolved.
    uid: Tempo
    url: http://merginmaps-tempo:3200
  - name: Loki
    type: loki
    url: http://merginmaps-loki:3100
    jsonData:
      derivedFields:
        # Turns the trace id found in a log line into a link to Tempo.
        - datasourceUid: Tempo
          # Captures the 32-character id that follows the bracketed level,
          # e.g. "[INFO] [<trace id>]".
          matcherRegex: '\[(?:ACCESS|INFO|ERROR)\]\s\[(\w{32})\]'
          name: TraceID
          # The raw ID value used to query Tempo. "$$" escapes "$" so that
          # provisioning does not treat it as an environment variable.
          url: '$${__value.raw}'
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# Loki configuration for a single instance: local filesystem storage,
# in-memory ring, authentication disabled.
auth_enabled: false

server:
  http_listen_port: 3100
  grpc_listen_port: 9096

common:
  instance_addr: 0.0.0.0
  # Everything is kept under /loki (the ./loki-data volume in compose).
  path_prefix: /loki
  storage:
    filesystem:
      chunks_directory: /loki/chunks
      rules_directory: /loki/rules
  replication_factor: 1
  ring:
    kvstore:
      store: inmemory

schema_config:
  configs:
    - from: 2024-01-01
      store: tsdb
      object_store: filesystem
      schema: v13
      index:
        prefix: index_
        period: 24h

limits_config:
  allow_structured_metadata: true
  reject_old_samples: false
  otlp_config:
    resource_attributes:
      attributes_config:
        # Index the container_id resource attribute as a label.
        - action: index_label
          attributes:
            - container_id
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
extensions:
  health_check:

# Inputs of the collector: OTLP from the instrumented services, StatsD,
# Redis statistics and the Docker container log files.
receivers:
  otlp:
    protocols:
      grpc:
        endpoint: '0.0.0.0:4317'
      http:
        endpoint: '0.0.0.0:4318'

  statsd:
    endpoint: '0.0.0.0:8125'
    aggregation_interval: 10s
    enable_metric_type: true

  redis:
    endpoint: 'merginmaps-redis:6379'
    collection_interval: 10s
    #password: "${REDIS_PASSWORD}"

  # This replaces Promtail
  filelog:
    include:
      - '/var/lib/docker/containers/*/*.log'
    # Only read lines appended after the collector starts.
    start_at: end
    include_file_path: true
    operators:
      # Parse each line as JSON.
      - type: json_parser
      # Use the parsed "log" field as the record body.
      - type: move
        from: attributes.log
        to: body
      # Take the first 12 hex characters of the container directory name
      # as container_id; on failure the record is still sent.
      - type: regex_parser
        parse_from: 'attributes["log.file.path"]'
        regex: '/var/lib/docker/containers/(?P<container_id>[a-f0-9]{12})'
        on_error: send
34+
35+
processors:
  resourcedetection/docker:
    detectors:
      - env
      - docker
    timeout: 2s
    override: true

  # Drops log records whose body matches any of these patterns.
  filter/exclude_collector:
    logs:
      log_record:
        - 'IsMatch(body, ".*exporterhelper.*")'
        - 'IsMatch(body, ".*Debug.*")'
        - 'IsMatch(body, ".*kind.*exporter.*")'

  batch:

  # Copies the container_id parsed by the filelog receiver onto the resource.
  transform/logs:
    log_statements:
      - context: log
        statements:
          - 'set(resource.attributes["container_id"], attributes["container_id"]) where attributes["container_id"] != nil'

  # Renames the gunicorn StatsD metrics to mergin_gunicorn_* names.
  transform:
    metric_statements:
      - context: metric
        statements:
          - 'set(name, "mergin_gunicorn_workers") where name == "app.gunicorn.workers"'
          - 'set(name, "mergin_gunicorn_request_duration") where name == "app.gunicorn.request.duration"'
          # these metrics are not working
          - 'set(name, "mergin_gunicorn_request_rate") where name == "app.gunicorn.requests"'
          - 'set(name, "mergin_gunicorn_log_critical") where name == "gunicorn.log.critical"'
          - 'set(name, "mergin_gunicorn_log_error") where name == "gunicorn.log.error"'
          - 'set(name, "mergin_gunicorn_log_warning") where name == "gunicorn.log.warning"'
          - 'set(name, "mergin_gunicorn_log_exception") where name == "gunicorn.log.exception"'
          - 'set(name, "mergin_gunicorn_response_code_200") where name == "app.gunicorn.request.status.200"'
    # log_statements:
    #   - context: log
    #     statements:
    #       - set(attributes["service_name"], attributes["docker_id"])
    #       - set(resource.attributes["service.name"], attributes["docker_id"])

  # Adds service.name when it is not already present.
  attributes:
    actions:
      - key: service.name
        action: insert
        value: 'merginmaps'
75+
76+
77+
# Outputs of the collector. Backends are addressed by container name
# (merginmaps-*), like every other endpoint in the telemetry stack.
exporters:
  prometheus:
    endpoint: "0.0.0.0:8889" # The Collector will "host" metrics here
    resource_to_telemetry_conversion:
      enabled: true # Converts OTel resource attributes to Prometheus labels
    add_metric_suffixes: true

  # Traces go to Tempo over OTLP/gRPC without TLS.
  otlp:
    endpoint: "merginmaps-tempo:4317"
    tls:
      insecure: true

  # Loki push-API exporter.
  # NOTE(review): no pipeline in this file references it; logs are shipped
  # through otlphttp/loki below. Consider removing it if it stays unused.
  loki:
    endpoint: "http://merginmaps-loki:3100/loki/api/v1/push"
    # Selects which of the exporter's default labels are attached to logs.
    default_labels_enabled:
      exporter: true
      job: true
      instance: true
      level: true

  debug:
    verbosity: detailed

  # Logs go to Loki's native OTLP endpoint.
  otlphttp/loki:
    endpoint: "http://merginmaps-loki:3100/otlp"
    tls:
      insecure: true
101+
102+
# Wires receivers, processors and exporters into one pipeline per signal.
service:
  extensions:
    - health_check
  telemetry:
    metrics:
      address: 0.0.0.0:8888 # This enables the /metrics port
    logs:
      level: warn
  pipelines:
    metrics:
      receivers:
        - otlp
        - statsd
        - redis
      processors:
        - transform
        - batch
      exporters:
        - prometheus
    traces:
      receivers:
        - otlp
      processors:
        - batch
      exporters:
        - otlp
    logs:
      receivers:
        - filelog
      # Processors run in the order listed.
      processors:
        - filter/exclude_collector
        - resourcedetection/docker
        - attributes
        - transform/logs
        - batch
      exporters:
        - otlphttp/loki
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
global:
  scrape_interval: 15s # How often to scrape targets
  evaluation_interval: 15s

# Enable the Exemplars feature in the storage engine
# storage:
#   tsdb:
#     out_of_order_time_window: 0s # Standard for local dev
#   exemplars:
#     max_exemplars: 100000

scrape_configs:
  # 1. Scrape the OTel Collector (where Gunicorn metrics are hosted)
  - job_name: "otel-collector"
    static_configs:
      - targets:
          - "merginmaps-otel-collector:8889"
    # Optional filter: uncomment to keep only the mergin_gunicorn* series
    # scraped from the collector.
    # metric_relabel_configs:
    #   - source_labels: [__name__]
    #     separator: ;
    #     regex: 'mergin_gunicorn.*'
    #     replacement: $1
    #     action: keep

  # 2. Optional: Scrape Prometheus itself
  - job_name: "prometheus"
    static_configs:
      - targets:
          - "localhost:9090"

  # 3. Optional: Scrape Tempo (to monitor your tracing backend)
  - job_name: "tempo"
    static_configs:
      - targets:
          - "merginmaps-tempo:3200"

0 commit comments

Comments
 (0)