Skip to content

Commit e403418

Browse files
authored
Expose GPU metrics collected by runner as Prometheus metrics (#2916)
Closes: #2800
1 parent 7b16083 commit e403418

File tree

3 files changed

+45
-2
lines changed

3 files changed

+45
-2
lines changed

docs/docs/guides/metrics.md

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,9 @@ To enable exporting metrics to Prometheus, set the
4343
In addition to the essential metrics available via the CLI and UI, `dstack` exports additional metrics to Prometheus, including data on fleets, runs, jobs, and DCGM metrics.
4444

4545
??? info "NVIDIA DCGM"
46-
NVIDIA DCGM metrics are automatically collected for `aws`, `azure`, `gcp`, and `oci` backends,
46+
NVIDIA DCGM metrics are automatically collected for `aws`, `azure`, `gcp`, and `oci` backends,
4747
as well as for [SSH fleets](../concepts/fleets.md#ssh).
48-
48+
4949
To collect NVIDIA DCGM metrics from SSH fleets, make sure the `datacenter-gpu-manager-4-core`,
5050
`datacenter-gpu-manager-4-proprietary`, and `datacenter-gpu-manager-exporter` packages are installed on the hosts.
5151

@@ -112,6 +112,9 @@ telemetry, and more.
112112
| `dstack_job_memory_total_bytes` | *gauge* | Total memory allocated for the job, bytes | `4009754624.0` |
113113
| `dstack_job_memory_usage_bytes` | *gauge* | Memory used by the job (including cache), bytes | `339017728.0` |
114114
| `dstack_job_memory_working_set_bytes` | *gauge* | Memory used by the job (not including cache), bytes | `147251200.0` |
115+
| `dstack_job_gpu_usage_ratio` | *gauge* | Job GPU usage, percent (as 0.0-1.0) | `0.93` |
116+
| `dstack_job_gpu_memory_total_bytes` | *gauge* | Total GPU memory allocated for the job, bytes | `8589934592.0` |
117+
| `dstack_job_gpu_memory_usage_bytes` | *gauge* | GPU memory used by the job, bytes | `1048576.0` |
115118
| `DCGM_FI_DEV_GPU_UTIL` | *gauge* | GPU utilization (in %) | |
116119
| `DCGM_FI_DEV_MEM_COPY_UTIL` | *gauge* | Memory utilization (in %) | |
117120
| `DCGM_FI_DEV_ENC_UTIL` | *gauge* | Encoder utilization (in %) | |
@@ -176,6 +179,9 @@ telemetry, and more.
176179
| `dstack_run_type` | *string* | Run configuration type | `task`, `dev-environment` |
177180
| `dstack_backend` | *string* | Backend | `aws`, `runpod` |
178181
| `dstack_gpu` | *string?* | GPU name | `H100` |
182+
| `dstack_gpu_num`[^1] | *integer* | GPU number (0-based) | `0` |
183+
184+
[^1]: For `dstack_job_gpu_*` metrics only.
179185

180186
### Server health metrics
181187

src/dstack/_internal/server/services/prometheus/custom_metrics.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import itertools
2+
import json
23
from collections import defaultdict
34
from collections.abc import Generator, Iterable
45
from datetime import timezone
@@ -177,6 +178,19 @@ async def get_job_metrics(session: AsyncSession) -> Iterable[Metric]:
177178
metrics.add_sample(_JOB_CPU_TIME, labels, jmp.cpu_usage_micro / 1_000_000)
178179
metrics.add_sample(_JOB_MEMORY_USAGE, labels, jmp.memory_usage_bytes)
179180
metrics.add_sample(_JOB_MEMORY_WORKING_SET, labels, jmp.memory_working_set_bytes)
181+
if gpus:
182+
gpu_memory_total = gpus[0].memory_mib * 1024 * 1024
183+
for gpu_num, (gpu_util, gpu_memory_usage) in enumerate(
184+
zip(
185+
json.loads(jmp.gpus_util_percent),
186+
json.loads(jmp.gpus_memory_usage_bytes),
187+
)
188+
):
189+
gpu_labels = labels.copy()
190+
gpu_labels["dstack_gpu_num"] = gpu_num
191+
metrics.add_sample(_JOB_GPU_USAGE_RATIO, gpu_labels, gpu_util / 100)
192+
metrics.add_sample(_JOB_GPU_MEMORY_TOTAL, gpu_labels, gpu_memory_total)
193+
metrics.add_sample(_JOB_GPU_MEMORY_USAGE, gpu_labels, gpu_memory_usage)
180194
jpm = job_prometheus_metrics.get(job.id)
181195
if jpm is not None:
182196
for metric in text_string_to_metric_families(jpm.text):
@@ -202,6 +216,9 @@ async def get_job_metrics(session: AsyncSession) -> Iterable[Metric]:
202216
_JOB_MEMORY_TOTAL = "dstack_job_memory_total_bytes"
203217
_JOB_MEMORY_USAGE = "dstack_job_memory_usage_bytes"
204218
_JOB_MEMORY_WORKING_SET = "dstack_job_memory_working_set_bytes"
219+
_JOB_GPU_USAGE_RATIO = "dstack_job_gpu_usage_ratio"
220+
_JOB_GPU_MEMORY_TOTAL = "dstack_job_gpu_memory_total_bytes"
221+
_JOB_GPU_MEMORY_USAGE = "dstack_job_gpu_memory_usage_bytes"
205222

206223

207224
class _Metrics(dict[str, Metric]):
@@ -259,6 +276,9 @@ class _JobMetrics(_Metrics):
259276
(_JOB_MEMORY_TOTAL, _GAUGE, "Total memory allocated for the job, bytes"),
260277
(_JOB_MEMORY_USAGE, _GAUGE, "Memory used by the job (including cache), bytes"),
261278
(_JOB_MEMORY_WORKING_SET, _GAUGE, "Memory used by the job (not including cache), bytes"),
279+
(_JOB_GPU_USAGE_RATIO, _GAUGE, "Job GPU usage, percent (as 0.0-1.0)"),
280+
(_JOB_GPU_MEMORY_TOTAL, _GAUGE, "Total GPU memory allocated for the job, bytes"),
281+
(_JOB_GPU_MEMORY_USAGE, _GAUGE, "GPU memory used by the job, bytes"),
262282
]
263283

264284

src/tests/_internal/server/routers/test_prometheus.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ async def test_returns_metrics(self, session: AsyncSession, client: AsyncClient)
109109
memory_gib=128,
110110
gpu_count=2,
111111
gpu_name="V4",
112+
gpu_memory_gib=16,
112113
price=12,
113114
)
114115
project_2 = await _create_project(session, "project-2", user)
@@ -140,6 +141,8 @@ async def test_returns_metrics(self, session: AsyncSession, client: AsyncClient)
140141
"""),
141142
)
142143
project_1 = await _create_project(session, "project-1", user)
144+
# jrd.offer.instance.resources has higher priority than jpd.instance_type.resources,
145+
# so the jpd resources below (gpu_count=4, gpu_name="T4") should be ignored
143146
jpd_1_1 = get_job_provisioning_data(backend=BackendType.AWS, gpu_count=4, gpu_name="T4")
144147
jrd_1_1 = get_job_runtime_data(offer=offer)
145148
job_1_1 = await _create_job(
@@ -176,6 +179,8 @@ async def test_returns_metrics(self, session: AsyncSession, client: AsyncClient)
176179
cpu_usage_micro=3_500_000,
177180
memory_working_set_bytes=3_221_225_472,
178181
memory_usage_bytes=4_294_967_296,
182+
gpus_util_percent=[80, 90],
183+
gpus_memory_usage_bytes=[1_073_741_824, 2_147_483_648],
179184
)
180185
# Older, ignored
181186
await create_job_metrics_point(
@@ -316,6 +321,18 @@ async def test_returns_metrics(self, session: AsyncSession, client: AsyncClient)
316321
# HELP dstack_job_memory_working_set_bytes Memory used by the job (not including cache), bytes
317322
# TYPE dstack_job_memory_working_set_bytes gauge
318323
dstack_job_memory_working_set_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 3221225472.0
324+
# HELP dstack_job_gpu_usage_ratio Job GPU usage, percent (as 0.0-1.0)
325+
# TYPE dstack_job_gpu_usage_ratio gauge
326+
dstack_job_gpu_usage_ratio{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4",dstack_gpu_num="0"}} 0.8
327+
dstack_job_gpu_usage_ratio{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4",dstack_gpu_num="1"}} 0.9
328+
# HELP dstack_job_gpu_memory_total_bytes Total GPU memory allocated for the job, bytes
329+
# TYPE dstack_job_gpu_memory_total_bytes gauge
330+
dstack_job_gpu_memory_total_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4",dstack_gpu_num="0"}} 17179869184.0
331+
dstack_job_gpu_memory_total_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4",dstack_gpu_num="1"}} 17179869184.0
332+
# HELP dstack_job_gpu_memory_usage_bytes GPU memory used by the job, bytes
333+
# TYPE dstack_job_gpu_memory_usage_bytes gauge
334+
dstack_job_gpu_memory_usage_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4",dstack_gpu_num="0"}} 1073741824.0
335+
dstack_job_gpu_memory_usage_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4",dstack_gpu_num="1"}} 2147483648.0
319336
# HELP FIELD_1 Test field 1
320337
# TYPE FIELD_1 gauge
321338
FIELD_1{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 350.0

0 commit comments

Comments
 (0)