Skip to content

Commit c8d7bdd

Browse files
[Feature]: Allow listing available key resources such as gpu, region, and backends #2142 (WIP)
Moved the /gpus API to a separate router; also moved GPU-related logic out of runs.
1 parent 5cf1052 commit c8d7bdd

13 files changed

Lines changed: 961 additions & 893 deletions

File tree

src/dstack/_internal/cli/commands/gpu.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ def _command(self, args: argparse.Namespace):
7676
status = contextlib.nullcontext()
7777

7878
with status:
79-
gpu_response = self.api.client.runs.get_gpus(
79+
gpu_response = self.api.client.gpus.get_gpus(
8080
self.api.project,
8181
run_spec,
8282
group_by=args.group_by,

src/dstack/_internal/server/app.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
files,
3030
fleets,
3131
gateways,
32+
gpus,
3233
instances,
3334
logs,
3435
metrics,
@@ -204,6 +205,7 @@ def register_routes(app: FastAPI, ui: bool = True):
204205
app.include_router(repos.router)
205206
app.include_router(runs.root_router)
206207
app.include_router(runs.project_router)
208+
app.include_router(gpus.project_router)
207209
app.include_router(metrics.router)
208210
app.include_router(logs.router)
209211
app.include_router(secrets.router)
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
from typing import Tuple
2+
3+
from fastapi import APIRouter, Depends
4+
from sqlalchemy.ext.asyncio import AsyncSession
5+
6+
from dstack._internal.server.db import get_session
7+
from dstack._internal.server.models import ProjectModel, UserModel
8+
from dstack._internal.server.schemas.gpus import GetRunGpusRequest, RunGpusResponse
9+
from dstack._internal.server.security.permissions import ProjectMember
10+
from dstack._internal.server.services.gpus import get_run_gpus_grouped
11+
from dstack._internal.server.utils.routers import get_base_api_additional_responses
12+
13+
# Project-scoped router for GPU listing endpoints. Registered in
# register_routes() (server/app.py) alongside the runs routers.
project_router = APIRouter(
    prefix="/api/project/{project_name}/gpus",
    tags=["gpus"],
    responses=get_base_api_additional_responses(),
)
18+
19+
20+
@project_router.post("/list", response_model=RunGpusResponse, response_model_exclude_none=True)
async def get_run_gpus(
    body: GetRunGpusRequest,
    session: AsyncSession = Depends(get_session),
    user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()),
) -> RunGpusResponse:
    """Return GPUs available for the given run spec, optionally grouped.

    Grouping is controlled by ``body.group_by``; project membership is
    enforced by the ``ProjectMember`` dependency.
    """
    # Only the project is needed here; the user is validated by the dependency.
    user, project = user_project
    return await get_run_gpus_grouped(
        session=session,
        project=project,
        run_spec=body.run_spec,
        group_by=body.group_by,
    )

src/dstack/_internal/server/routers/runs.py

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,9 @@
1010
from dstack._internal.server.schemas.runs import (
1111
ApplyRunPlanRequest,
1212
DeleteRunsRequest,
13-
GetRunGpusRequest,
1413
GetRunPlanRequest,
1514
GetRunRequest,
1615
ListRunsRequest,
17-
RunGpusResponse,
1816
StopRunsRequest,
1917
SubmitRunRequest,
2018
)
@@ -181,18 +179,6 @@ async def delete_runs(
181179
await runs.delete_runs(session=session, project=project, runs_names=body.runs_names)
182180

183181

184-
@project_router.post("/gpus", response_model=RunGpusResponse, response_model_exclude_none=True)
185-
async def get_run_gpus(
186-
body: GetRunGpusRequest,
187-
session: AsyncSession = Depends(get_session),
188-
user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()),
189-
) -> RunGpusResponse:
190-
_, project = user_project
191-
return await runs.get_run_gpus_grouped(
192-
session=session, project=project, run_spec=body.run_spec, group_by=body.group_by
193-
)
194-
195-
196182
# apply_plan replaces submit_run since it can create new runs.
197183
@project_router.post("/submit", deprecated=True)
198184
async def submit_run(
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
from typing import List, Literal, Optional
2+
3+
import gpuhunt
4+
from pydantic import Field
5+
6+
from dstack._internal.core.models.backends.base import BackendType
7+
from dstack._internal.core.models.common import CoreModel
8+
from dstack._internal.core.models.instances import InstanceAvailability
9+
from dstack._internal.core.models.resources import Range
10+
from dstack._internal.core.models.runs import RunSpec
11+
12+
13+
class BackendGpu(CoreModel):
    """GPU specification from a single backend offer."""

    name: str  # GPU model name
    memory_mib: int  # GPU memory, in MiB
    vendor: gpuhunt.AcceleratorVendor
    availability: InstanceAvailability
    spot: bool  # True when the offer is for a spot instance
    count: int  # number of GPUs in the offer
    price: float  # offer price — presumably $/hour; confirm against services.gpus
23+
24+
25+
class BackendGpus(CoreModel):
    """GPU specifications collected from one backend."""

    backend_type: BackendType
    gpus: List[BackendGpu]  # GPU offers from this backend
    regions: List[str]  # regions the offers cover — presumably backend regions; verify in services.gpus
31+
32+
33+
class GetRunGpusRequest(CoreModel):
    """Request for getting run GPUs with optional grouping."""

    # Run spec whose offers are inspected for available GPUs.
    run_spec: RunSpec
    # None means no grouping; multiple fields may be combined.
    group_by: Optional[List[Literal["backend", "region", "count"]]] = Field(
        default=None,
        description="List of fields to group by. Valid values: 'backend', 'region', 'count'",
    )
41+
42+
43+
class GpuGroup(CoreModel):
    """GPU group that can handle all grouping scenarios."""

    name: str  # GPU model name
    memory_mib: int  # GPU memory, in MiB
    vendor: gpuhunt.AcceleratorVendor
    availability: List[InstanceAvailability]  # availabilities seen across grouped offers
    spot: List[Literal["spot", "on-demand"]]  # which pricing kinds appear in the group
    count: Range[int]  # min/max GPU count across grouped offers
    price: Range[float]  # min/max price across grouped offers
    # NOTE(review): the plural/singular pairs below appear to depend on group_by —
    # presumably the singular field is set when grouping by that key and the plural
    # otherwise; confirm against services.gpus.get_run_gpus_grouped.
    backends: Optional[List[BackendType]] = None
    backend: Optional[BackendType] = None
    regions: Optional[List[str]] = None
    region: Optional[str] = None
57+
58+
59+
class RunGpusResponse(CoreModel):
    """Response containing GPU specifications."""

    # One GpuGroup per group; with group_by=None, grouping fields stay unset
    # and are omitted from the response (response_model_exclude_none).
    gpus: List[GpuGroup] = Field(
        description="List of GPU specifications, grouped according to the group_by parameter"
    )
Lines changed: 1 addition & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,10 @@
11
from datetime import datetime
2-
from typing import Annotated, List, Literal, Optional
2+
from typing import Annotated, List, Optional
33
from uuid import UUID
44

5-
import gpuhunt
65
from pydantic import Field
76

8-
from dstack._internal.core.models.backends.base import BackendType
97
from dstack._internal.core.models.common import CoreModel
10-
from dstack._internal.core.models.instances import InstanceAvailability
11-
from dstack._internal.core.models.resources import Range
128
from dstack._internal.core.models.runs import ApplyRunPlanInput, RunSpec
139

1410

@@ -68,57 +64,3 @@ class StopRunsRequest(CoreModel):
6864

6965
class DeleteRunsRequest(CoreModel):
7066
runs_names: List[str]
71-
72-
73-
class BackendGpu(CoreModel):
74-
"""GPU specification from a backend offer."""
75-
76-
name: str
77-
memory_mib: int
78-
vendor: gpuhunt.AcceleratorVendor
79-
availability: InstanceAvailability
80-
spot: bool
81-
count: int
82-
price: float
83-
84-
85-
class BackendGpus(CoreModel):
86-
"""Backend GPU specifications."""
87-
88-
backend_type: BackendType
89-
gpus: List[BackendGpu]
90-
regions: List[str]
91-
92-
93-
class GetRunGpusRequest(CoreModel):
94-
"""Request for getting run GPUs with optional grouping."""
95-
96-
run_spec: RunSpec
97-
group_by: Optional[List[Literal["backend", "region", "count"]]] = Field(
98-
default=None,
99-
description="List of fields to group by. Valid values: 'backend', 'region', 'count'",
100-
)
101-
102-
103-
class GpuGroup(CoreModel):
104-
"""GPU group that can handle all grouping scenarios."""
105-
106-
name: str
107-
memory_mib: int
108-
vendor: gpuhunt.AcceleratorVendor
109-
availability: List[InstanceAvailability]
110-
spot: List[Literal["spot", "on-demand"]]
111-
count: Range[int]
112-
price: Range[float]
113-
backends: Optional[List[BackendType]] = None
114-
backend: Optional[BackendType] = None
115-
regions: Optional[List[str]] = None
116-
region: Optional[str] = None
117-
118-
119-
class RunGpusResponse(CoreModel):
120-
"""Response containing GPU specifications."""
121-
122-
gpus: List[GpuGroup] = Field(
123-
description="List of GPU specifications, grouped according to the group_by parameter"
124-
)

0 commit comments

Comments
 (0)