Skip to content

Commit e31b609

Browse files
authored
Add new metrics (#2434)
* `memory_total_bytes`
* `gpu_memory_total_bytes`
* `cpus_detected_num`
1 parent 8d2cf6d commit e31b609

File tree

5 files changed

+101
-28
lines changed

5 files changed

+101
-28
lines changed

src/dstack/_internal/server/routers/metrics.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -40,14 +40,16 @@ async def get_job_metrics(
4040
By default, returns one latest sample. To control time window/number of samples, use
4141
`limit`, `after`, `before`.
4242
43-
Supported metrics: [
44-
"cpu_usage_percent",
45-
"memory_usage_bytes",
46-
"memory_working_set_bytes",
47-
"gpus_detected_num",
48-
"gpu_memory_usage_bytes_gpu{i}",
49-
"gpu_util_percent_gpu{i}"
50-
]
43+
Supported metrics (all optional):
44+
* `cpus_detected_num`
45+
* `cpu_usage_percent`
46+
* `memory_total_bytes`
47+
* `memory_usage_bytes`
48+
* `memory_working_set_bytes`
49+
* `gpus_detected_num`
50+
* `gpu_memory_total_bytes`
51+
* `gpu_memory_usage_bytes_gpu{i}`
52+
* `gpu_util_percent_gpu{i}`
5153
"""
5254
_, project = user_project
5355

src/dstack/_internal/server/services/metrics.py

Lines changed: 39 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,11 @@
77
from sqlalchemy import select
88
from sqlalchemy.ext.asyncio import AsyncSession
99

10+
from dstack._internal.core.models.instances import Resources
1011
from dstack._internal.core.models.metrics import JobMetrics, Metric
1112
from dstack._internal.server.models import JobMetricsPoint, JobModel
13+
from dstack._internal.server.services.jobs import get_job_provisioning_data, get_job_runtime_data
14+
from dstack._internal.utils.common import get_or_error
1215
from dstack._internal.utils.logging import get_logger
1316

1417
logger = get_logger(__name__)
@@ -47,17 +50,34 @@ async def get_job_metrics(
4750
# we need at least 2 points to calculate cpu_usage_percent
4851
if len(points) < 2:
4952
return JobMetrics(metrics=[])
50-
return _calculate_job_metrics(points)
53+
return _calculate_job_metrics(job_model, points)
5154

5255

53-
def _calculate_job_metrics(points: Sequence[JobMetricsPoint]) -> JobMetrics:
56+
def _calculate_job_metrics(job_model: JobModel, points: Sequence[JobMetricsPoint]) -> JobMetrics:
5457
timestamps: list[datetime] = []
5558
cpu_usage_points: list[int] = []
5659
memory_usage_points: list[int] = []
5760
memory_working_set_points: list[int] = []
5861
gpus_memory_usage_points: defaultdict[int, list[int]] = defaultdict(list)
5962
gpus_util_points: defaultdict[int, list[int]] = defaultdict(list)
6063

64+
cpus_detected_num: Optional[int] = None
65+
memory_total: Optional[int] = None
66+
gpu_memory_total: Optional[int] = None
67+
resources: Optional[Resources] = None
68+
jrd = get_job_runtime_data(job_model)
69+
if jrd is not None and jrd.offer is not None:
70+
resources = jrd.offer.instance.resources
71+
else:
72+
jpd = get_job_provisioning_data(job_model)
73+
if jpd is not None:
74+
resources = jpd.instance_type.resources
75+
if resources is not None:
76+
cpus_detected_num = resources.cpus
77+
memory_total = resources.memory_mib * 1024 * 1024
78+
if len(resources.gpus) > 0:
79+
gpu_memory_total = resources.gpus[0].memory_mib * 1024 * 1024
80+
6181
gpus_detected_num: Optional[int] = None
6282
gpus_detected_num_mismatch: bool = False
6383
for point, prev_point in zip(points, points[1:]):
@@ -93,25 +113,23 @@ def _calculate_job_metrics(points: Sequence[JobMetricsPoint]) -> JobMetrics:
93113
values=memory_working_set_points,
94114
),
95115
]
116+
if cpus_detected_num is not None:
117+
metrics.append(_make_constant_metric("cpus_detected_num", timestamps, cpus_detected_num))
118+
if memory_total is not None:
119+
metrics.append(_make_constant_metric("memory_total_bytes", timestamps, memory_total))
96120
if gpus_detected_num_mismatch:
97121
# If number of GPUs changed in the time window, skip GPU metrics altogether, otherwise
98122
# results can be unpredictable (e.g., one GPU takes the place of another, as they are
99123
# identified by an array index only).
100124
logger.warning("gpus_detected_num mismatch, skipping GPU metrics")
101125
else:
102126
metrics.append(
103-
# As gpus_detected_num expected to be constant, we add only two points — the latest
104-
# and the earliest in the batch
105-
Metric(
106-
name="gpus_detected_num",
107-
timestamps=[timestamps[0], timestamps[-1]]
108-
if len(timestamps) > 1
109-
else [timestamps[0]],
110-
values=[gpus_detected_num, gpus_detected_num]
111-
if len(timestamps) > 1
112-
else [gpus_detected_num],
113-
)
127+
_make_constant_metric("gpus_detected_num", timestamps, get_or_error(gpus_detected_num))
114128
)
129+
if gpu_memory_total is not None:
130+
metrics.append(
131+
_make_constant_metric("gpu_memory_total_bytes", timestamps, gpu_memory_total)
132+
)
115133
for index, gpu_memory_usage_points in gpus_memory_usage_points.items():
116134
metrics.append(
117135
Metric(
@@ -131,6 +149,14 @@ def _calculate_job_metrics(points: Sequence[JobMetricsPoint]) -> JobMetrics:
131149
return JobMetrics(metrics=metrics)
132150

133151

152+
def _make_constant_metric(name: str, timestamps: list[datetime], value: float) -> Metric:
153+
return Metric(
154+
name=name,
155+
timestamps=timestamps,
156+
values=[value] * len(timestamps),
157+
)
158+
159+
134160
def _get_cpu_usage(last_point: JobMetricsPoint, prev_point: JobMetricsPoint) -> int:
135161
window = last_point.timestamp_micro - prev_point.timestamp_micro
136162
if window == 0:

src/dstack/_internal/server/testing/common.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -329,13 +329,20 @@ def get_job_provisioning_data(
329329
backend: BackendType = BackendType.AWS,
330330
region: str = "us-east-1",
331331
gpu_count: int = 0,
332+
gpu_memory_gib: float = 16,
332333
cpu_count: int = 1,
333334
memory_gib: float = 0.5,
334335
spot: bool = False,
335336
hostname: str = "127.0.0.4",
336337
internal_ip: Optional[str] = "127.0.0.4",
337338
) -> JobProvisioningData:
338-
gpus = [Gpu(name="T4", memory_mib=16384, vendor=gpuhunt.AcceleratorVendor.NVIDIA)] * gpu_count
339+
gpus = [
340+
Gpu(
341+
name="T4",
342+
memory_mib=int(gpu_memory_gib * 1024),
343+
vendor=gpuhunt.AcceleratorVendor.NVIDIA,
344+
)
345+
] * gpu_count
339346
return JobProvisioningData(
340347
backend=backend,
341348
instance_type=InstanceType(
@@ -597,6 +604,7 @@ def get_instance_offer_with_availability(
597604
backend: BackendType = BackendType.AWS,
598605
region: str = "eu-west",
599606
gpu_count: int = 0,
607+
gpu_memory_gib: float = 16,
600608
cpu_count: int = 2,
601609
memory_gib: float = 12,
602610
disk_gib: float = 100.0,
@@ -605,7 +613,13 @@ def get_instance_offer_with_availability(
605613
total_blocks: int = 1,
606614
availability_zones: Optional[List[str]] = None,
607615
):
608-
gpus = [Gpu(name="T4", memory_mib=16384, vendor=gpuhunt.AcceleratorVendor.NVIDIA)] * gpu_count
616+
gpus = [
617+
Gpu(
618+
name="T4",
619+
memory_mib=int(gpu_memory_gib * 1024),
620+
vendor=gpuhunt.AcceleratorVendor.NVIDIA,
621+
)
622+
] * gpu_count
609623
return InstanceOfferWithAvailability(
610624
backend=backend,
611625
instance=InstanceType(

src/tests/_internal/server/routers/test_metrics.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,9 @@
1414
create_run,
1515
create_user,
1616
get_auth_headers,
17+
get_instance_offer_with_availability,
18+
get_job_provisioning_data,
19+
get_job_runtime_data,
1720
)
1821

1922
pytestmark = pytest.mark.usefixtures("image_config_mock")
@@ -51,9 +54,18 @@ async def test_returns_metrics(self, test_db, session: AsyncSession, client: Asy
5154
repo=repo,
5255
user=user,
5356
)
57+
jpd = get_job_provisioning_data(
58+
cpu_count=128, memory_gib=256, gpu_count=2, gpu_memory_gib=32
59+
)
60+
offer = get_instance_offer_with_availability(
61+
cpu_count=64, memory_gib=128, gpu_count=1, gpu_memory_gib=32
62+
)
63+
jrd = get_job_runtime_data(offer=offer)
5464
job = await create_job(
5565
session=session,
5666
run=run,
67+
job_provisioning_data=jpd,
68+
job_runtime_data=jrd,
5769
)
5870
await create_job_metrics_point(
5971
session=session,
@@ -108,11 +120,26 @@ async def test_returns_metrics(self, test_db, session: AsyncSession, client: Asy
108120
"timestamps": ["2023-01-02T03:04:25+00:00"],
109121
"values": [512],
110122
},
123+
{
124+
"name": "cpus_detected_num",
125+
"timestamps": ["2023-01-02T03:04:25+00:00"],
126+
"values": [64],
127+
},
128+
{
129+
"name": "memory_total_bytes",
130+
"timestamps": ["2023-01-02T03:04:25+00:00"],
131+
"values": [137438953472],
132+
},
111133
{
112134
"name": "gpus_detected_num",
113135
"timestamps": ["2023-01-02T03:04:25+00:00"],
114136
"values": [1],
115137
},
138+
{
139+
"name": "gpu_memory_total_bytes",
140+
"timestamps": ["2023-01-02T03:04:25+00:00"],
141+
"values": [34359738368],
142+
},
116143
{
117144
"name": "gpu_memory_usage_bytes_gpu0",
118145
"timestamps": ["2023-01-02T03:04:25+00:00"],

src/tests/_internal/server/services/test_metrics.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
create_repo,
1313
create_run,
1414
create_user,
15+
get_job_provisioning_data,
1516
)
1617

1718

@@ -129,9 +130,13 @@ async def test_get_metrics(
129130
repo=repo,
130131
user=user,
131132
)
133+
jpd = get_job_provisioning_data(
134+
cpu_count=64, memory_gib=128, gpu_count=2, gpu_memory_gib=32
135+
)
132136
job = await create_job(
133137
session=session,
134138
run=run,
139+
job_provisioning_data=jpd,
135140
)
136141
for dt, _cpu, _mem, _mem_ws, _gpu0_mem, _gpu0_util, _gpu1_mem, _gpu1_util in self.points:
137142
await create_job_metrics_point(
@@ -151,11 +156,10 @@ async def test_get_metrics(
151156
Metric(name="cpu_usage_percent", timestamps=ts, values=cpu),
152157
Metric(name="memory_usage_bytes", timestamps=ts, values=mem),
153158
Metric(name="memory_working_set_bytes", timestamps=ts, values=mem_ws),
154-
Metric(
155-
name="gpus_detected_num",
156-
timestamps=[ts[0], ts[-1]] if len(ts) > 1 else ts,
157-
values=[2, 2] if len(ts) > 1 else [2],
158-
),
159+
Metric(name="cpus_detected_num", timestamps=ts, values=[64] * len(ts)),
160+
Metric(name="memory_total_bytes", timestamps=ts, values=[137438953472] * len(ts)),
161+
Metric(name="gpus_detected_num", timestamps=ts, values=[2] * len(ts)),
162+
Metric(name="gpu_memory_total_bytes", timestamps=ts, values=[34359738368] * len(ts)),
159163
Metric(name="gpu_memory_usage_bytes_gpu0", timestamps=ts, values=gpu0_mem),
160164
Metric(name="gpu_memory_usage_bytes_gpu1", timestamps=ts, values=gpu1_mem),
161165
Metric(name="gpu_util_percent_gpu0", timestamps=ts, values=gpu0_util),

0 commit comments

Comments
 (0)