---
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Observability stack for Dynamo: metrics, tracing, and visualization.
# Requires deploy/docker-compose.yml to be running for NATS and etcd connectivity.
#
# Usage:
# docker compose -f deploy/docker-observability.yml up -d
# Compose file format version. Recent Docker Compose releases treat this key as
# informational only; it is kept for compatibility with older tooling.
version: '3.8'

networks:
  # Attach to an already-existing network called deploy_server instead of
  # creating a new one.
  # NOTE(review): presumably this is the network created by
  # deploy/docker-compose.yml (project "deploy", network "server") - confirm.
  server:
    external: true
    name: deploy_server

volumes:
  # Named volumes with default driver settings; mounted by the grafana and
  # tempo services respectively.
  grafana-data: {}
  tempo-data: {}

services:
# DCGM stands for Data Center GPU Manager: https://developer.nvidia.com/dcgm
# dcgm-exporter is a tool from NVIDIA that exposes DCGM metrics in Prometheus format.
  dcgm-exporter:
    image: nvidia/dcgm-exporter:4.2.3-4.1.3-ubi9
    ports:
      # Expose dcgm-exporter on port 9401 both inside and outside the container
      # to avoid conflicts with other dcgm-exporter instances in distributed
      # environments. To access DCGM metrics:
      #   Outside the container: curl http://localhost:9401/metrics (or the host IP)
      #   Inside the container (container-to-container): curl http://dcgm-exporter:9401/metrics
      # Quoted so the mapping is always read as a string, never as a number.
      - "9401:9401"
    cap_add:
      - SYS_ADMIN
    deploy:
      resources:
        reservations:
          # Reserve every NVIDIA GPU on the host for this container.
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    environment:
      # dcgm uses NVIDIA_VISIBLE_DEVICES variable but normally it is CUDA_VISIBLE_DEVICES.
      # Falls back to "all" when CUDA_VISIBLE_DEVICES is unset or empty.
      - NVIDIA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-all}
      # Must agree with the container-side port published above.
      - DCGM_EXPORTER_LISTEN=:9401
    runtime: nvidia  # Specify the NVIDIA runtime
    networks:
      - server
# The exporter translates from /varz and other stats to Prometheus metrics
  nats-prometheus-exporter:
    image: natsio/prometheus-nats-exporter:0.17.3
    # One flag per NATS monitoring endpoint to export, followed by the URL of
    # the NATS monitoring port.
    # NOTE(review): "nats-server" is presumably the NATS service from
    # deploy/docker-compose.yml, reachable over the shared network - confirm.
    command:
      - "-varz"
      - "-connz"
      - "-routez"
      - "-subz"
      - "-gatewayz"
      - "-leafz"
      - "-jsz=all"
      - "http://nats-server:8222"
    ports:
      # Quoted so the mapping is always read as a string, never as a number.
      - "7777:7777"
    networks:
      - server
# To access Prometheus from another machine, you may need to open port 9090 in
# the firewall on your host. On Ubuntu:
#   sudo ufw allow 9090/tcp
  prometheus:
    image: prom/prometheus:v3.4.1
    container_name: prometheus
    volumes:
      # Scrape configuration, resolved relative to this compose file.
      - ./observability/prometheus.yml:/etc/prometheus/prometheus.yml
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      # These provide the web console functionality
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      # Enables the HTTP lifecycle endpoints (e.g. POST /-/reload to re-read
      # the configuration without restarting the container).
      - '--web.enable-lifecycle'
    restart: unless-stopped
    # Example to pull from the /query endpoint:
    # {__name__=~"DCGM.*", job="dcgm-exporter"}
    ports:
      - "9090:9090"
    networks:
      - server
    extra_hosts:
      # Makes the Docker host resolvable as host.docker.internal from inside
      # the container.
      # NOTE(review): presumably for scrape targets running directly on the
      # host - verify against observability/prometheus.yml.
      - "host.docker.internal:host-gateway"
    depends_on:
      - dcgm-exporter
      - nats-prometheus-exporter
# Tempo - Distributed tracing backend
  tempo:
    image: grafana/tempo:2.8.2
    command:
      - "-config.file=/etc/tempo.yaml"
    # NOTE(review): presumably runs as root so the process can write to the
    # tempo-data volume mounted at /tmp/tempo - confirm before changing.
    user: root
    volumes:
      # Tempo configuration, resolved relative to this compose file.
      - ./observability/tempo.yaml:/etc/tempo.yaml
      # Named volume so stored data survives container re-creation.
      - tempo-data:/tmp/tempo
    ports:
      - "3200:3200"  # Tempo HTTP
      - "4317:4317"  # OTLP gRPC receiver (accessible from host)
      - "4318:4318"  # OTLP HTTP receiver (accessible from host)
    networks:
      - server
# Grafana - Visualization and dashboards
# Supports both Prometheus (metrics) and Tempo (tracing) datasources
# Default credentials: dynamo/dynamo
# (override with GRAFANA_ADMIN_USER / GRAFANA_ADMIN_PASSWORD in the environment)
# To access Grafana from another machine, you may need to open port 3000 in
# the firewall on your host. On Ubuntu:
#   sudo ufw allow 3000/tcp
  grafana:
    image: grafana/grafana:12.2.0
    container_name: grafana
    volumes:
      # Named volume for Grafana's own state.
      - grafana-data:/var/lib/grafana
      # Provisioning inputs, resolved relative to this compose file.
      - ./observability/grafana_dashboards:/etc/grafana/provisioning/dashboards
      - ./observability/grafana-datasources.yml:/etc/grafana/provisioning/datasources/prometheus.yml
      - ./observability/tempo-datasource.yml:/etc/grafana/provisioning/datasources/tempo.yml
    environment:
      # Must agree with the container-side port published below.
      - GF_SERVER_HTTP_PORT=3000
      # do not make it admin/admin, because you will be prompted to change the password every time
      # Defaults stay dynamo/dynamo; set GRAFANA_ADMIN_USER / GRAFANA_ADMIN_PASSWORD
      # before `docker compose up` to use different credentials.
      - GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN_USER:-dynamo}
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-dynamo}
      - GF_USERS_ALLOW_SIGN_UP=false
      - GF_FEATURE_TOGGLES_ENABLE=traceqlEditor
      - GF_INSTALL_PLUGINS=grafana-piechart-panel
      # Default min interval is 5s, but can be configured lower
      - GF_DASHBOARDS_MIN_REFRESH_INTERVAL=2s
      # Disable password change requirement
      # NOTE(review): all four values below are "false"; verify that each is a
      # recognised Grafana option and is actually needed.
      - GF_SECURITY_DISABLE_INITIAL_ADMIN_CREATION=false
      - GF_SECURITY_ADMIN_PASSWORD_POLICY=false
      - GF_AUTH_DISABLE_LOGIN_FORM=false
      - GF_AUTH_DISABLE_SIGNOUT_MENU=false
    restart: unless-stopped
    ports:
      - "3000:3000"
    networks:
      - server
    depends_on:
      - prometheus
      - tempo