Expose GPU metrics collected by runner as Prometheus metrics (#2916)

un-def · web-flow · commit e4034184118d · 2025-07-21T07:14:26.000Z
Closes: #2800
diff --git a/docs/docs/guides/metrics.md b/docs/docs/guides/metrics.md
@@ -43,9 +43,9 @@ To enable exporting metrics to Prometheus, set the
 In addition to the essential metrics available via the CLI and UI, `dstack` exports additional metrics to Prometheus, including data on fleets, runs, jobs, and DCGM metrics.
 
 ??? info "NVIDIA DCGM"
-    NVIDIA DCGM metrics are automatically collected for `aws`, `azure`, `gcp`, and `oci` backends, 
+    NVIDIA DCGM metrics are automatically collected for `aws`, `azure`, `gcp`, and `oci` backends,
     as well as for [SSH fleets](../concepts/fleets.md#ssh).
-    
+
     To ensure NVIDIA DCGM metrics are collected from SSH fleets, ensure the `datacenter-gpu-manager-4-core`,
     `datacenter-gpu-manager-4-proprietary`, and `datacenter-gpu-manager-exporter` packages are installed on the hosts.
 
@@ -112,6 +112,9 @@ telemetry, and more.
     | `dstack_job_memory_total_bytes`                 | *gauge*   | Total memory allocated for the job, bytes                                                  | `4009754624.0` |
     | `dstack_job_memory_usage_bytes`                 | *gauge*   | Memory used by the job (including cache), bytes                                            | `339017728.0`  |
     | `dstack_job_memory_working_set_bytes`           | *gauge*   | Memory used by the job (not including cache), bytes                                        | `147251200.0`  |
+    | `dstack_job_gpu_usage_ratio`                    | *gauge*   | Job GPU usage, percent (as 0.0-1.0)                                                        | `0.93`         |
+    | `dstack_job_gpu_memory_total_bytes`             | *gauge*   | Total GPU memory allocated for the job, bytes                                              | `8589934592.0` |
+    | `dstack_job_gpu_memory_usage_bytes`             | *gauge*   | GPU memory used by the job, bytes                                                          | `1048576.0`    |
     | `DCGM_FI_DEV_GPU_UTIL`                          | *gauge*   | GPU utilization (in %)                                                                     |                |
     | `DCGM_FI_DEV_MEM_COPY_UTIL`                     | *gauge*   | Memory utilization (in %)                                                                  |                |
     | `DCGM_FI_DEV_ENC_UTIL`                          | *gauge*   | Encoder utilization (in %)                                                                 |                |
@@ -176,6 +179,9 @@ telemetry, and more.
     | `dstack_run_type`     | *string*  | Run configuration type | `task`, `dev-environment`              |
     | `dstack_backend`      | *string*  | Backend                | `aws`, `runpod`                        |
     | `dstack_gpu`          | *string?* | GPU name               | `H100`                                 |
+    | `dstack_gpu_num`[^1]  | *integer* | GPU number (0-based)   | `0`                                    |
+
+    [^1]: For `dstack_job_gpu_*` metrics only.
 
 ### Server health metrics
 
diff --git a/src/dstack/_internal/server/services/prometheus/custom_metrics.py b/src/dstack/_internal/server/services/prometheus/custom_metrics.py
@@ -1,4 +1,5 @@
 import itertools
+import json
 from collections import defaultdict
 from collections.abc import Generator, Iterable
 from datetime import timezone
@@ -177,6 +178,19 @@ async def get_job_metrics(session: AsyncSession) -> Iterable[Metric]:
             metrics.add_sample(_JOB_CPU_TIME, labels, jmp.cpu_usage_micro / 1_000_000)
             metrics.add_sample(_JOB_MEMORY_USAGE, labels, jmp.memory_usage_bytes)
             metrics.add_sample(_JOB_MEMORY_WORKING_SET, labels, jmp.memory_working_set_bytes)
+            if gpus:
+                gpu_memory_total = gpus[0].memory_mib * 1024 * 1024
+                for gpu_num, (gpu_util, gpu_memory_usage) in enumerate(
+                    zip(
+                        json.loads(jmp.gpus_util_percent),
+                        json.loads(jmp.gpus_memory_usage_bytes),
+                    )
+                ):
+                    gpu_labels = labels.copy()
+                    gpu_labels["dstack_gpu_num"] = gpu_num
+                    metrics.add_sample(_JOB_GPU_USAGE_RATIO, gpu_labels, gpu_util / 100)
+                    metrics.add_sample(_JOB_GPU_MEMORY_TOTAL, gpu_labels, gpu_memory_total)
+                    metrics.add_sample(_JOB_GPU_MEMORY_USAGE, gpu_labels, gpu_memory_usage)
         jpm = job_prometheus_metrics.get(job.id)
         if jpm is not None:
             for metric in text_string_to_metric_families(jpm.text):
@@ -202,6 +216,9 @@ async def get_job_metrics(session: AsyncSession) -> Iterable[Metric]:
 _JOB_MEMORY_TOTAL = "dstack_job_memory_total_bytes"
 _JOB_MEMORY_USAGE = "dstack_job_memory_usage_bytes"
 _JOB_MEMORY_WORKING_SET = "dstack_job_memory_working_set_bytes"
+_JOB_GPU_USAGE_RATIO = "dstack_job_gpu_usage_ratio"
+_JOB_GPU_MEMORY_TOTAL = "dstack_job_gpu_memory_total_bytes"
+_JOB_GPU_MEMORY_USAGE = "dstack_job_gpu_memory_usage_bytes"
 
 
 class _Metrics(dict[str, Metric]):
@@ -259,6 +276,9 @@ class _JobMetrics(_Metrics):
         (_JOB_MEMORY_TOTAL, _GAUGE, "Total memory allocated for the job, bytes"),
         (_JOB_MEMORY_USAGE, _GAUGE, "Memory used by the job (including cache), bytes"),
         (_JOB_MEMORY_WORKING_SET, _GAUGE, "Memory used by the job (not including cache), bytes"),
+        (_JOB_GPU_USAGE_RATIO, _GAUGE, "Job GPU usage, percent (as 0.0-1.0)"),
+        (_JOB_GPU_MEMORY_TOTAL, _GAUGE, "Total GPU memory allocated for the job, bytes"),
+        (_JOB_GPU_MEMORY_USAGE, _GAUGE, "GPU memory used by the job, bytes"),
     ]
 
 
diff --git a/src/tests/_internal/server/routers/test_prometheus.py b/src/tests/_internal/server/routers/test_prometheus.py
@@ -109,6 +109,7 @@ async def test_returns_metrics(self, session: AsyncSession, client: AsyncClient)
             memory_gib=128,
             gpu_count=2,
             gpu_name="V4",
+            gpu_memory_gib=16,
             price=12,
         )
         project_2 = await _create_project(session, "project-2", user)
@@ -140,6 +141,8 @@ async def test_returns_metrics(self, session: AsyncSession, client: AsyncClient)
             """),
         )
         project_1 = await _create_project(session, "project-1", user)
+        # jrd.offer.instance.resources takes priority over jpd.instance_type.resources,
+        # so the jpd GPU values below (4x T4) should be ignored
         jpd_1_1 = get_job_provisioning_data(backend=BackendType.AWS, gpu_count=4, gpu_name="T4")
         jrd_1_1 = get_job_runtime_data(offer=offer)
         job_1_1 = await _create_job(
@@ -176,6 +179,8 @@ async def test_returns_metrics(self, session: AsyncSession, client: AsyncClient)
             cpu_usage_micro=3_500_000,
             memory_working_set_bytes=3_221_225_472,
             memory_usage_bytes=4_294_967_296,
+            gpus_util_percent=[80, 90],
+            gpus_memory_usage_bytes=[1_073_741_824, 2_147_483_648],
         )
         # Older, ignored
         await create_job_metrics_point(
@@ -316,6 +321,18 @@ async def test_returns_metrics(self, session: AsyncSession, client: AsyncClient)
             # HELP dstack_job_memory_working_set_bytes Memory used by the job (not including cache), bytes
             # TYPE dstack_job_memory_working_set_bytes gauge
             dstack_job_memory_working_set_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 3221225472.0
+            # HELP dstack_job_gpu_usage_ratio Job GPU usage, percent (as 0.0-1.0)
+            # TYPE dstack_job_gpu_usage_ratio gauge
+            dstack_job_gpu_usage_ratio{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4",dstack_gpu_num="0"}} 0.8
+            dstack_job_gpu_usage_ratio{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4",dstack_gpu_num="1"}} 0.9
+            # HELP dstack_job_gpu_memory_total_bytes Total GPU memory allocated for the job, bytes
+            # TYPE dstack_job_gpu_memory_total_bytes gauge
+            dstack_job_gpu_memory_total_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4",dstack_gpu_num="0"}} 17179869184.0
+            dstack_job_gpu_memory_total_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4",dstack_gpu_num="1"}} 17179869184.0
+            # HELP dstack_job_gpu_memory_usage_bytes GPU memory used by the job, bytes
+            # TYPE dstack_job_gpu_memory_usage_bytes gauge
+            dstack_job_gpu_memory_usage_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4",dstack_gpu_num="0"}} 1073741824.0
+            dstack_job_gpu_memory_usage_bytes{{dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4",dstack_gpu_num="1"}} 2147483648.0
             # HELP FIELD_1 Test field 1
             # TYPE FIELD_1 gauge
             FIELD_1{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 350.0