Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions docs/docs/guides/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -176,3 +176,21 @@ telemetry, and more.
| `dstack_run_type` | *string* | Run configuration type | `task`, `dev-environment` |
| `dstack_backend` | *string* | Backend | `aws`, `runpod` |
| `dstack_gpu` | *string?* | GPU name | `H100` |

### Server health metrics

These are operational metrics to monitor the health of the dstack server. For now, these only include HTTP metrics, but more will be added later.

=== "Metrics"
| Name | Type | Description | Examples |
|------------------------------------------|-----------|-----------------------------------|--------------|
| `dstack_server_requests_total` | *counter* | Total number of HTTP requests | `100.0` |
| `dstack_server_request_duration_seconds` | *histogram* | HTTP request duration in seconds | `1.0`|

=== "Labels"
| Name | Type | Description | Examples |
|------------------------|-----------|:--------------|----------------------------------------|
| `method` | *string* | HTTP method | `POST` |
| `endpoint` | *string* | Endpoint path | `/api/project/main/repos/get` |
| `http_status` | *string* | HTTP status code | `200` |
| `project_name` | *string?* | Project name | `main` |
45 changes: 45 additions & 0 deletions src/dstack/_internal/server/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from fastapi.datastructures import URL
from fastapi.responses import HTMLResponse, JSONResponse, RedirectResponse
from fastapi.staticfiles import StaticFiles
from prometheus_client import Counter, Histogram

from dstack._internal.cli.utils.common import console
from dstack._internal.core.errors import ForbiddenError, ServerClientError
Expand Down Expand Up @@ -63,6 +64,18 @@

logger = get_logger(__name__)

# Server HTTP metrics
REQUESTS_TOTAL = Counter(
"dstack_server_requests_total",
"Total number of HTTP requests",
["method", "endpoint", "http_status", "project_name"],
)
REQUEST_DURATION = Histogram(
"dstack_server_request_duration_seconds",
"HTTP request duration in seconds",
["method", "endpoint", "http_status", "project_name"],
)


def create_app() -> FastAPI:
if settings.SENTRY_DSN is not None:
Expand Down Expand Up @@ -216,6 +229,8 @@ async def log_request(request: Request, call_next):
start_time = time.time()
response: Response = await call_next(request)
process_time = time.time() - start_time
# log process_time to be used in the log_http_metrics middleware
request.state.process_time = process_time
logger.debug(
"Processed request %s %s in %s. Status: %s",
request.method,
Expand All @@ -225,6 +240,36 @@ async def log_request(request: Request, call_next):
)
return response

# this middleware must be defined after the log_request middleware
@app.middleware("http")
async def log_http_metrics(request: Request, call_next):
def _extract_project_name(request: Request):
project_name = None
prefix = "/api/project/"
if request.url.path.startswith(prefix):
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of manually parsing this out, I wonder if the parsing might already be done for us... anything interesting in the request object, e.g. in path_params?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No path_params isn't available at the time the middleware is called. Middlewares are executed before FastAPI routes the request to an endpoint, so we only have access to the raw path.

A "middleware" is a function that works with every request before it is processed by any specific path operation. And also with every response before returning it.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the request object when it gets to the middleware:
request.__dict__ {'scope': {'type': 'http', 'asgi': {'version': '3.0', 'spec_version': '2.3'}, 'http_version': '1.1', 'server': ('127.0.0.1', 3000), 'client': ('127.0.0.1', 55612), 'scheme': 'http', 'method': 'POST', 'root_path': '', 'path': '/api/project/main/repos/get', 'raw_path': b'/api/project/main/repos/get', 'query_string': b'', 'headers': [(b'host', b'127.0.0.1:3000'), (b'user-agent', b'python-requests/2.32.3'), (b'accept-encoding', b'gzip, deflate, br'), (b'accept', b'*/*'), (b'connection', b'keep-alive'), (b'authorization', b'Bearer REDACTED'), (b'x-api-version', b'0.0.0'), (b'content-type', b'application/json'), (b'content-length', b'60')], 'state': {}, 'app': <fastapi.applications.FastAPI object at 0x1182e7680>}, '_receive': <function BaseHTTPMiddleware.__call__.<locals>.call_next.<locals>.receive_or_disconnect at 0x12b1494e0>, '_send': <function empty_send at 0x1186fbe20>, '_stream_consumed': False, '_is_disconnected': False, '_form': None, '_wrapped_rcv_disconnected': False, '_wrapped_rcv_consumed': False, '_wrapped_rc_stream': <async_generator object Request.stream at 0x12b151380>}

rest = request.url.path[len(prefix) :]
project_name = rest.split("/", 1)[0] if rest else None

return project_name

project_name = _extract_project_name(request)
response: Response = await call_next(request)

REQUEST_DURATION.labels(
method=request.method,
endpoint=request.url.path,
http_status=response.status_code,
project_name=project_name,
).observe(request.state.process_time)

REQUESTS_TOTAL.labels(
method=request.method,
endpoint=request.url.path,
http_status=response.status_code,
project_name=project_name,
).inc()
return response

@app.middleware("http")
async def check_client_version(request: Request, call_next):
if (
Expand Down
5 changes: 4 additions & 1 deletion src/dstack/_internal/server/routers/prometheus.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from fastapi import APIRouter, Depends
from fastapi.responses import PlainTextResponse
from prometheus_client import generate_latest
from sqlalchemy.ext.asyncio import AsyncSession

from dstack._internal.server import settings
Expand All @@ -26,4 +27,6 @@ async def get_prometheus_metrics(
) -> str:
if not settings.ENABLE_PROMETHEUS_METRICS:
raise error_not_found()
return await prometheus.get_metrics(session=session)
custom_metrics = await prometheus.get_metrics(session=session)
prometheus_metrics = generate_latest()
return custom_metrics + prometheus_metrics.decode()
61 changes: 59 additions & 2 deletions src/tests/_internal/server/routers/test_prometheus.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from datetime import datetime, timedelta, timezone
from textwrap import dedent
from typing import Optional
from unittest.mock import patch

import pytest
from freezegun import freeze_time
Expand Down Expand Up @@ -35,6 +36,55 @@
get_run_spec,
)

BASE_HTTP_METRICS = b"""
# HELP python_gc_objects_collected_total Objects collected during gc
# TYPE python_gc_objects_collected_total counter
python_gc_objects_collected_total{generation="0"} 13159.0
python_gc_objects_collected_total{generation="1"} 1583.0
python_gc_objects_collected_total{generation="2"} 81.0
# HELP python_gc_objects_uncollectable_total Uncollectable objects found during GC
# TYPE python_gc_objects_uncollectable_total counter
python_gc_objects_uncollectable_total{generation="0"} 0.0
python_gc_objects_uncollectable_total{generation="1"} 0.0
python_gc_objects_uncollectable_total{generation="2"} 0.0
# HELP python_gc_collections_total Number of times this generation was collected
# TYPE python_gc_collections_total counter
python_gc_collections_total{generation="0"} 1609.0
python_gc_collections_total{generation="1"} 146.0
python_gc_collections_total{generation="2"} 9.0
# HELP python_info Python platform information
# TYPE python_info gauge
python_info{implementation="CPython",major="3",minor="12",patchlevel="2",version="3.12.2"} 1.0
# HELP dstack_server_requests_total Total number of HTTP requests
# TYPE dstack_server_requests_total counter
dstack_server_requests_total{endpoint="/metrics",http_status="200",method="GET",project_name="None"} 1.0
# HELP dstack_server_requests_created Total number of HTTP requests
# TYPE dstack_server_requests_created gauge
dstack_server_requests_created{endpoint="/metrics",http_status="200",method="GET",project_name="None"} 1.67262864e+09
# HELP dstack_server_request_duration_seconds HTTP request duration in seconds
# TYPE dstack_server_request_duration_seconds histogram
dstack_server_request_duration_seconds_bucket{endpoint="/metrics",http_status="200",le="0.005",method="GET",project_name="None"} 1.0
dstack_server_request_duration_seconds_bucket{endpoint="/metrics",http_status="200",le="0.01",method="GET",project_name="None"} 1.0
dstack_server_request_duration_seconds_bucket{endpoint="/metrics",http_status="200",le="0.025",method="GET",project_name="None"} 1.0
dstack_server_request_duration_seconds_bucket{endpoint="/metrics",http_status="200",le="0.05",method="GET",project_name="None"} 1.0
dstack_server_request_duration_seconds_bucket{endpoint="/metrics",http_status="200",le="0.075",method="GET",project_name="None"} 1.0
dstack_server_request_duration_seconds_bucket{endpoint="/metrics",http_status="200",le="0.1",method="GET",project_name="None"} 1.0
dstack_server_request_duration_seconds_bucket{endpoint="/metrics",http_status="200",le="0.25",method="GET",project_name="None"} 1.0
dstack_server_request_duration_seconds_bucket{endpoint="/metrics",http_status="200",le="0.5",method="GET",project_name="None"} 1.0
dstack_server_request_duration_seconds_bucket{endpoint="/metrics",http_status="200",le="0.75",method="GET",project_name="None"} 1.0
dstack_server_request_duration_seconds_bucket{endpoint="/metrics",http_status="200",le="1.0",method="GET",project_name="None"} 1.0
dstack_server_request_duration_seconds_bucket{endpoint="/metrics",http_status="200",le="2.5",method="GET",project_name="None"} 1.0
dstack_server_request_duration_seconds_bucket{endpoint="/metrics",http_status="200",le="5.0",method="GET",project_name="None"} 1.0
dstack_server_request_duration_seconds_bucket{endpoint="/metrics",http_status="200",le="7.5",method="GET",project_name="None"} 1.0
dstack_server_request_duration_seconds_bucket{endpoint="/metrics",http_status="200",le="10.0",method="GET",project_name="None"} 1.0
dstack_server_request_duration_seconds_bucket{endpoint="/metrics",http_status="200",le="+Inf",method="GET",project_name="None"} 1.0
dstack_server_request_duration_seconds_count{endpoint="/metrics",http_status="200",method="GET",project_name="None"} 1.0
dstack_server_request_duration_seconds_sum{endpoint="/metrics",http_status="200",method="GET",project_name="None"} 0.0
# HELP dstack_server_request_duration_seconds_created HTTP request duration in seconds
# TYPE dstack_server_request_duration_seconds_created gauge
dstack_server_request_duration_seconds_created{endpoint="/metrics",http_status="200",method="GET",project_name="None"} 1.67262864e+09
"""


@pytest.fixture
def enable_metrics(monkeypatch: pytest.MonkeyPatch):
Expand All @@ -50,6 +100,7 @@ def enable_metrics(monkeypatch: pytest.MonkeyPatch):
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
@pytest.mark.usefixtures("image_config_mock", "test_db", "enable_metrics")
class TestGetPrometheusMetrics:
@patch("dstack._internal.server.routers.prometheus.generate_latest", lambda: BASE_HTTP_METRICS)
async def test_returns_metrics(self, session: AsyncSession, client: AsyncClient):
user = await create_user(session=session, name="test-user", global_role=GlobalRole.USER)
offer = get_instance_offer_with_availability(
Expand Down Expand Up @@ -200,7 +251,8 @@ async def test_returns_metrics(self, session: AsyncSession, client: AsyncClient)
response = await client.get("/metrics")

assert response.status_code == 200
assert response.text == dedent(f"""\
expected = (
dedent(f"""\
# HELP dstack_instance_duration_seconds_total Total seconds the instance is running
# TYPE dstack_instance_duration_seconds_total counter
dstack_instance_duration_seconds_total{{dstack_project_name="project-1",dstack_fleet_name="test-fleet",dstack_fleet_id="{fleet.id}",dstack_instance_name="test-instance",dstack_instance_id="{instance.id}",dstack_instance_type="test-type",dstack_backend="aws",dstack_gpu="V4"}} 3600.0
Expand Down Expand Up @@ -278,11 +330,16 @@ async def test_returns_metrics(self, session: AsyncSession, client: AsyncClient)
FIELD_2{{gpu="0",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 337325.0 1395066363000
FIELD_2{{gpu="1",dstack_project_name="project-1",dstack_user_name="test-user",dstack_run_name="run-1",dstack_run_id="{job_1_1.run_id}",dstack_job_name="run-1-0-0",dstack_job_id="{job_1_1.id}",dstack_job_num="0",dstack_replica_num="0",dstack_run_type="dev-environment",dstack_backend="aws",dstack_gpu="V4"}} 987169.0 1395066363010
""")
+ "\n"
+ BASE_HTTP_METRICS.decode().strip()
)
assert response.text.strip() == expected

@patch("dstack._internal.server.routers.prometheus.generate_latest", lambda: BASE_HTTP_METRICS)
async def test_returns_empty_response_if_no_runs(self, client: AsyncClient):
response = await client.get("/metrics")
assert response.status_code == 200
assert response.text == "\n"
assert response.text.strip() == BASE_HTTP_METRICS.decode().strip()

async def test_returns_404_if_not_enabled(
self, monkeypatch: pytest.MonkeyPatch, client: AsyncClient
Expand Down