Skip to content

Commit a5e1429

Browse files
committed
Add basic http metrics
1 parent b9677fa commit a5e1429

4 files changed

Lines changed: 103 additions & 5 deletions

File tree

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,7 @@ dependencies = [
3535
"gpuhunt==0.1.6",
3636
"argcomplete>=3.5.0",
3737
"ignore-python>=0.2.0",
38+
"prometheus-fastapi-instrumentator>=7.1.0",
3839
]
3940

4041
[project.urls]

src/dstack/_internal/server/app.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@
1111
from fastapi.datastructures import URL
1212
from fastapi.responses import HTMLResponse, JSONResponse, RedirectResponse
1313
from fastapi.staticfiles import StaticFiles
14+
from prometheus_fastapi_instrumentator import Instrumentator
1415

1516
from dstack._internal.cli.utils.common import console
1617
from dstack._internal.core.errors import ForbiddenError, ServerClientError
@@ -77,6 +78,7 @@ def create_app() -> FastAPI:
7778

7879
app = FastAPI(docs_url="/api/docs", lifespan=lifespan)
7980
app.state.proxy_dependency_injector = ServerProxyDependencyInjector()
81+
Instrumentator().instrument(app, metric_namespace="dstack", metric_subsystem="server")
8082
return app
8183

8284

src/dstack/_internal/server/routers/prometheus.py

Lines changed: 9 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,8 @@
22
from typing import Annotated
33

44
from fastapi import APIRouter, Depends
5-
from fastapi.responses import PlainTextResponse
5+
from fastapi.responses import PlainTextResponse, Response
6+
from prometheus_client import CONTENT_TYPE_LATEST, generate_latest
67
from sqlalchemy.ext.asyncio import AsyncSession
78

89
from dstack._internal.server import settings
@@ -23,7 +24,12 @@
2324
@router.get("/metrics")
async def get_prometheus_metrics(
    session: Annotated[AsyncSession, Depends(get_session)],
) -> Response:
    """Serve the custom dstack metrics followed by the default Prometheus registry dump.

    Args:
        session: Database session injected by FastAPI; forwarded to
            ``prometheus.get_metrics``.

    Returns:
        A ``Response`` whose body is the text returned by
        ``prometheus.get_metrics`` with the decoded output of
        ``generate_latest()`` appended, sent with ``CONTENT_TYPE_LATEST``
        as its media type.

    Raises:
        The exception built by ``error_not_found()`` when
        ``settings.ENABLE_PROMETHEUS_METRICS`` is falsy.
    """
    if not settings.ENABLE_PROMETHEUS_METRICS:
        raise error_not_found()
    custom_metrics = await prometheus.get_metrics(session=session)
    # generate_latest() yields bytes; decode so it can be appended to the str payload.
    # NOTE(review): presumably this is where the Instrumentator's HTTP metrics
    # end up (default registry) — confirm against the app setup.
    instrumentator_metrics = generate_latest().decode()
    return Response(
        custom_metrics + instrumentator_metrics,
        media_type=CONTENT_TYPE_LATEST,
    )

src/tests/_internal/server/routers/test_prometheus.py

Lines changed: 91 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
from datetime import datetime, timedelta, timezone
22
from textwrap import dedent
33
from typing import Optional
4+
from unittest.mock import patch
45

56
import pytest
67
from freezegun import freeze_time
@@ -35,6 +36,87 @@
3536
get_run_spec,
3637
)
3738

39+
BASE_HTTP_METRICS = b"""
40+
# HELP python_gc_objects_collected_total Objects collected during gc
41+
# TYPE python_gc_objects_collected_total counter
42+
python_gc_objects_collected_total{generation="0"} 16262.0
43+
python_gc_objects_collected_total{generation="1"} 3588.0
44+
python_gc_objects_collected_total{generation="2"} 325.0
45+
# HELP python_gc_objects_uncollectable_total Uncollectable objects found during GC
46+
# TYPE python_gc_objects_uncollectable_total counter
47+
python_gc_objects_uncollectable_total{generation="0"} 0.0
48+
python_gc_objects_uncollectable_total{generation="1"} 0.0
49+
python_gc_objects_uncollectable_total{generation="2"} 0.0
50+
# HELP python_gc_collections_total Number of times this generation was collected
51+
# TYPE python_gc_collections_total counter
52+
python_gc_collections_total{generation="0"} 1687.0
53+
python_gc_collections_total{generation="1"} 153.0
54+
python_gc_collections_total{generation="2"} 10.0
55+
# HELP python_info Python platform information
56+
# TYPE python_info gauge
57+
python_info{implementation="CPython",major="3",minor="12",patchlevel="2",version="3.12.2"} 1.0
58+
# HELP dstack_server_http_requests_total Total number of requests by method, status and handler.
59+
# TYPE dstack_server_http_requests_total counter
60+
dstack_server_http_requests_total{handler="/metrics",method="GET",status="2xx"} 1.0
61+
# HELP dstack_server_http_requests_created Total number of requests by method, status and handler.
62+
# TYPE dstack_server_http_requests_created gauge
63+
dstack_server_http_requests_created{handler="/metrics",method="GET",status="2xx"} 1.67262864e+09
64+
# HELP dstack_server_http_request_size_bytes Content length of incoming requests by handler. Only value of header is respected. Otherwise ignored. No percentile calculated.
65+
# TYPE dstack_server_http_request_size_bytes summary
66+
dstack_server_http_request_size_bytes_count{handler="/metrics"} 1.0
67+
dstack_server_http_request_size_bytes_sum{handler="/metrics"} 0.0
68+
# HELP dstack_server_http_request_size_bytes_created Content length of incoming requests by handler. Only value of header is respected. Otherwise ignored. No percentile calculated.
69+
# TYPE dstack_server_http_request_size_bytes_created gauge
70+
dstack_server_http_request_size_bytes_created{handler="/metrics"} 1.67262864e+09
71+
# HELP dstack_server_http_response_size_bytes Content length of outgoing responses by handler. Only value of header is respected. Otherwise ignored. No percentile calculated.
72+
# TYPE dstack_server_http_response_size_bytes summary
73+
dstack_server_http_response_size_bytes_count{handler="/metrics"} 1.0
74+
dstack_server_http_response_size_bytes_sum{handler="/metrics"} 17846.0
75+
# HELP dstack_server_http_response_size_bytes_created Content length of outgoing responses by handler. Only value of header is respected. Otherwise ignored. No percentile calculated.
76+
# TYPE dstack_server_http_response_size_bytes_created gauge
77+
dstack_server_http_response_size_bytes_created{handler="/metrics"} 1.67262864e+09
78+
# HELP dstack_server_http_request_duration_highr_seconds Latency with many buckets but no API specific labels. Made for more accurate percentile calculations.
79+
# TYPE dstack_server_http_request_duration_highr_seconds histogram
80+
dstack_server_http_request_duration_highr_seconds_bucket{le="0.01"} 1.0
81+
dstack_server_http_request_duration_highr_seconds_bucket{le="0.025"} 1.0
82+
dstack_server_http_request_duration_highr_seconds_bucket{le="0.05"} 1.0
83+
dstack_server_http_request_duration_highr_seconds_bucket{le="0.075"} 1.0
84+
dstack_server_http_request_duration_highr_seconds_bucket{le="0.1"} 1.0
85+
dstack_server_http_request_duration_highr_seconds_bucket{le="0.25"} 1.0
86+
dstack_server_http_request_duration_highr_seconds_bucket{le="0.5"} 1.0
87+
dstack_server_http_request_duration_highr_seconds_bucket{le="0.75"} 1.0
88+
dstack_server_http_request_duration_highr_seconds_bucket{le="1.0"} 1.0
89+
dstack_server_http_request_duration_highr_seconds_bucket{le="1.5"} 1.0
90+
dstack_server_http_request_duration_highr_seconds_bucket{le="2.0"} 1.0
91+
dstack_server_http_request_duration_highr_seconds_bucket{le="2.5"} 1.0
92+
dstack_server_http_request_duration_highr_seconds_bucket{le="3.0"} 1.0
93+
dstack_server_http_request_duration_highr_seconds_bucket{le="3.5"} 1.0
94+
dstack_server_http_request_duration_highr_seconds_bucket{le="4.0"} 1.0
95+
dstack_server_http_request_duration_highr_seconds_bucket{le="4.5"} 1.0
96+
dstack_server_http_request_duration_highr_seconds_bucket{le="5.0"} 1.0
97+
dstack_server_http_request_duration_highr_seconds_bucket{le="7.5"} 1.0
98+
dstack_server_http_request_duration_highr_seconds_bucket{le="10.0"} 1.0
99+
dstack_server_http_request_duration_highr_seconds_bucket{le="30.0"} 1.0
100+
dstack_server_http_request_duration_highr_seconds_bucket{le="60.0"} 1.0
101+
dstack_server_http_request_duration_highr_seconds_bucket{le="+Inf"} 1.0
102+
dstack_server_http_request_duration_highr_seconds_count 1.0
103+
dstack_server_http_request_duration_highr_seconds_sum 0.0
104+
# HELP dstack_server_http_request_duration_highr_seconds_created Latency with many buckets but no API specific labels. Made for more accurate percentile calculations.
105+
# TYPE dstack_server_http_request_duration_highr_seconds_created gauge
106+
dstack_server_http_request_duration_highr_seconds_created 1.67262864e+09
107+
# HELP dstack_server_http_request_duration_seconds Latency with only few buckets by handler. Made to be only used if aggregation by handler is important.
108+
# TYPE dstack_server_http_request_duration_seconds histogram
109+
dstack_server_http_request_duration_seconds_bucket{handler="/metrics",le="0.1",method="GET"} 1.0
110+
dstack_server_http_request_duration_seconds_bucket{handler="/metrics",le="0.5",method="GET"} 1.0
111+
dstack_server_http_request_duration_seconds_bucket{handler="/metrics",le="1.0",method="GET"} 1.0
112+
dstack_server_http_request_duration_seconds_bucket{handler="/metrics",le="+Inf",method="GET"} 1.0
113+
dstack_server_http_request_duration_seconds_count{handler="/metrics",method="GET"} 1.0
114+
dstack_server_http_request_duration_seconds_sum{handler="/metrics",method="GET"} 0.0
115+
# HELP dstack_server_http_request_duration_seconds_created Latency with only few buckets by handler. Made to be only used if aggregation by handler is important.
116+
# TYPE dstack_server_http_request_duration_seconds_created gauge
117+
dstack_server_http_request_duration_seconds_created{handler="/metrics",method="GET"} 1.67262864e+09
118+
"""
119+
38120

39121
@pytest.fixture
40122
def enable_metrics(monkeypatch: pytest.MonkeyPatch):
@@ -50,6 +132,7 @@ def enable_metrics(monkeypatch: pytest.MonkeyPatch):
50132
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
51133
@pytest.mark.usefixtures("image_config_mock", "test_db", "enable_metrics")
52134
class TestGetPrometheusMetrics:
135+
@patch("dstack._internal.server.routers.prometheus.generate_latest", lambda: BASE_HTTP_METRICS)
53136
async def test_returns_metrics(self, session: AsyncSession, client: AsyncClient):
54137
user = await create_user(session=session, name="test-user", global_role=GlobalRole.USER)
55138
offer = get_instance_offer_with_availability(
@@ -200,7 +283,8 @@ async def test_returns_metrics(self, session: AsyncSession, client: AsyncClient)
200283
response = await client.get("/metrics")
201284

202285
assert response.status_code == 200
203-
assert response.text == dedent(f"""\
286+
actual = (
287+
dedent(f"""\
204288
# HELP dstack_instance_duration_seconds_total Total seconds the instance is running
205289
# TYPE dstack_instance_duration_seconds_total counter
206290
dstack_instance_duration_seconds_total{{dstack_project_name="project-1",dstack_fleet_name="test-fleet",dstack_fleet_id="{fleet.id}",dstack_instance_name="test-instance",dstack_instance_id="{instance.id}",dstack_instance_type="test-type",dstack_backend="aws",dstack_gpu="V4"}} 3600.0
@@ -278,11 +362,16 @@ async def test_returns_metrics(self, session: AsyncSession, client: AsyncClient)
278362
FIELD_2{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 337325.0 1395066363000
279363
FIELD_2{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 987169.0 1395066363010
280364
""")
365+
+ "\n"
366+
+ BASE_HTTP_METRICS.decode().strip()
367+
)
368+
assert response.text.strip() == actual
281369

370+
@patch("dstack._internal.server.routers.prometheus.generate_latest", lambda: BASE_HTTP_METRICS)
async def test_returns_empty_response_if_no_runs(self, client: AsyncClient):
    """With nothing created beforehand, /metrics carries only the patched registry payload."""
    # generate_latest is patched above, so the expected body is exactly the fixture text.
    expected_body = BASE_HTTP_METRICS.decode().strip()
    response = await client.get("/metrics")
    assert response.status_code == 200
    assert response.text.strip() == expected_body
286375

287376
async def test_returns_404_if_not_enabled(
288377
self, monkeypatch: pytest.MonkeyPatch, client: AsyncClient

0 commit comments

Comments
 (0)