Skip to content

Commit f9744d4

Browse files
[Feature]: Allow listing available key resources such as gpu, region, and backends #2142
Fixed group_by bug; also added validation (disallow grouping by region without backend)
1 parent: 5725924 · commit: f9744d4

4 files changed

Lines changed: 128 additions & 57 deletions

File tree

src/dstack/_internal/cli/commands/gpu.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -31,7 +31,8 @@ def register_args(
3131
"--group-by",
3232
action="append",
3333
choices=["backend", "region", "count"],
34-
help="Group GPUs by backend, region, and/or count. Can be specified multiple times.",
34+
help="Group GPUs by backend, region, and/or count. Can be specified multiple times. "
35+
"Note: 'region' can only be used together with 'backend'.",
3536
)
3637

3738

src/dstack/_internal/server/schemas/gpus.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,7 @@ class BackendGpu(CoreModel):
2020
spot: bool
2121
count: int
2222
price: float
23+
region: str
2324

2425

2526
class BackendGpus(CoreModel):
@@ -36,7 +37,8 @@ class ListGpusRequest(CoreModel):
3637
run_spec: RunSpec
3738
group_by: Optional[List[Literal["backend", "region", "count"]]] = Field(
3839
default=None,
39-
description="List of fields to group by. Valid values: 'backend', 'region', 'count'",
40+
description="List of fields to group by. Valid values: 'backend', 'region', 'count'. "
41+
"Note: 'region' can only be used together with 'backend'.",
4042
)
4143

4244

src/dstack/_internal/server/services/gpus.py

Lines changed: 46 additions & 42 deletions
Original file line number | Diff line number | Diff line change
@@ -87,6 +87,7 @@ def _process_offers_into_backend_gpus(
8787
spot=offer.instance.resources.spot,
8888
count=gpu_count_in_offer,
8989
price=offer.price,
90+
region=offer.region,
9091
)
9192

9293
backend_gpus_list = []
@@ -204,24 +205,23 @@ def _get_gpus_grouped_by_backend_and_region(backend_gpus: List[BackendGpus]) ->
204205
"""Aggregates GPU specs, grouping them by both backend and region."""
205206
gpu_rows: Dict[Tuple, GpuGroup] = {}
206207
for backend in backend_gpus:
207-
for region in backend.regions:
208-
for gpu in backend.gpus:
209-
key = (gpu.name, gpu.memory_mib, gpu.vendor, backend.backend_type, region)
210-
if key not in gpu_rows:
211-
per_gpu_price = gpu.price / gpu.count
212-
gpu_rows[key] = GpuGroup(
213-
name=gpu.name,
214-
memory_mib=gpu.memory_mib,
215-
vendor=gpu.vendor,
216-
availability=[gpu.availability],
217-
spot=["spot" if gpu.spot else "on-demand"],
218-
count=Range[int](min=gpu.count, max=gpu.count),
219-
price=Range[float](min=per_gpu_price, max=per_gpu_price),
220-
backend=backend.backend_type,
221-
region=region,
222-
)
223-
else:
224-
_update_gpu_group(gpu_rows[key], gpu, backend.backend_type)
208+
for gpu in backend.gpus:
209+
key = (gpu.name, gpu.memory_mib, gpu.vendor, backend.backend_type, gpu.region)
210+
if key not in gpu_rows:
211+
per_gpu_price = gpu.price / gpu.count
212+
gpu_rows[key] = GpuGroup(
213+
name=gpu.name,
214+
memory_mib=gpu.memory_mib,
215+
vendor=gpu.vendor,
216+
availability=[gpu.availability],
217+
spot=["spot" if gpu.spot else "on-demand"],
218+
count=Range[int](min=gpu.count, max=gpu.count),
219+
price=Range[float](min=per_gpu_price, max=per_gpu_price),
220+
backend=backend.backend_type,
221+
region=gpu.region,
222+
)
223+
else:
224+
_update_gpu_group(gpu_rows[key], gpu, backend.backend_type)
225225

226226
return sorted(
227227
list(gpu_rows.values()),
@@ -313,31 +313,30 @@ def _get_gpus_grouped_by_backend_region_and_count(
313313
"""Aggregates GPU specs, grouping them by backend, region, and GPU count."""
314314
gpu_rows: Dict[Tuple, GpuGroup] = {}
315315
for backend in backend_gpus:
316-
for region in backend.regions:
317-
for gpu in backend.gpus:
318-
key = (
319-
gpu.name,
320-
gpu.memory_mib,
321-
gpu.vendor,
322-
backend.backend_type,
323-
region,
324-
gpu.count,
316+
for gpu in backend.gpus:
317+
key = (
318+
gpu.name,
319+
gpu.memory_mib,
320+
gpu.vendor,
321+
backend.backend_type,
322+
gpu.region,
323+
gpu.count,
324+
)
325+
if key not in gpu_rows:
326+
per_gpu_price = gpu.price / gpu.count
327+
gpu_rows[key] = GpuGroup(
328+
name=gpu.name,
329+
memory_mib=gpu.memory_mib,
330+
vendor=gpu.vendor,
331+
availability=[gpu.availability],
332+
spot=["spot" if gpu.spot else "on-demand"],
333+
count=Range[int](min=gpu.count, max=gpu.count),
334+
price=Range[float](min=per_gpu_price, max=per_gpu_price),
335+
backend=backend.backend_type,
336+
region=gpu.region,
325337
)
326-
if key not in gpu_rows:
327-
per_gpu_price = gpu.price / gpu.count
328-
gpu_rows[key] = GpuGroup(
329-
name=gpu.name,
330-
memory_mib=gpu.memory_mib,
331-
vendor=gpu.vendor,
332-
availability=[gpu.availability],
333-
spot=["spot" if gpu.spot else "on-demand"],
334-
count=Range[int](min=gpu.count, max=gpu.count),
335-
price=Range[float](min=per_gpu_price, max=per_gpu_price),
336-
backend=backend.backend_type,
337-
region=region,
338-
)
339-
else:
340-
_update_gpu_group(gpu_rows[key], gpu, backend.backend_type)
338+
else:
339+
_update_gpu_group(gpu_rows[key], gpu, backend.backend_type)
341340

342341
return sorted(
343342
list(gpu_rows.values()),
@@ -366,6 +365,11 @@ async def list_gpus_grouped(
366365

367366
group_by_set = set(group_by) if group_by else set()
368367

368+
if "region" in group_by_set and "backend" not in group_by_set:
369+
from dstack._internal.core.errors import ServerClientError
370+
371+
raise ServerClientError("Cannot group by 'region' without also grouping by 'backend'")
372+
369373
# Determine grouping strategy based on combination
370374
has_backend = "backend" in group_by_set
371375
has_region = "region" in group_by_set

src/tests/_internal/server/routers/test_gpus.py

Lines changed: 77 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -198,6 +198,19 @@ async def test_invalid_group_by_rejected(
198198
assert response.status_code == 422
199199
assert "validation error" in response.text.lower() or "invalid" in response.text.lower()
200200

201+
@pytest.mark.asyncio
202+
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
203+
async def test_region_without_backend_rejected(
204+
self, test_db, session: AsyncSession, client: AsyncClient
205+
):
206+
user, project, repo, run_spec = await gpu_test_setup(session)
207+
208+
response = await call_gpus_api(
209+
client, project.name, user.token, run_spec, group_by=["region"]
210+
)
211+
212+
assert response.status_code == 400
213+
201214
@pytest.mark.asyncio
202215
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
203216
async def test_exact_aggregation_values(
@@ -422,28 +435,79 @@ async def test_exact_aggregation_values(
422435
assert t4_runpod["price"] == {"min": 0.25, "max": 0.25}
423436
assert rtx_runpod["price"] == {"min": 0.65, "max": 0.75}
424437

425-
# Test region grouping to validate multi-region, multi-backend setup
426438
response_region = await client.post(
427439
f"/api/project/{project.name}/gpus/list",
428440
headers=get_auth_headers(user.token),
429-
json={"run_spec": run_spec.dict(), "group_by": ["region"]},
441+
json={"run_spec": run_spec.dict(), "group_by": ["backend", "region"]},
430442
)
431443
assert response_region.status_code == 200
432444
region_data = response_region.json()
433445

434-
assert len(region_data["gpus"]) == 2
446+
assert len(region_data["gpus"]) == 5
435447

436-
t4_region_group = next(
437-
(gpu for gpu in region_data["gpus"] if gpu["name"] == "T4"), None
448+
t4_aws_uswest2 = next(
449+
(
450+
gpu
451+
for gpu in region_data["gpus"]
452+
if gpu["name"] == "T4"
453+
and gpu.get("backend") == "aws"
454+
and gpu.get("region") == "us-west-2"
455+
),
456+
None,
438457
)
439-
rtx_region_group = next(
440-
(gpu for gpu in region_data["gpus"] if gpu["name"] == "RTX4090"), None
458+
t4_runpod_useast1 = next(
459+
(
460+
gpu
461+
for gpu in region_data["gpus"]
462+
if gpu["name"] == "T4"
463+
and gpu.get("backend") == "runpod"
464+
and gpu.get("region") == "us-east-1"
465+
),
466+
None,
441467
)
442468

443-
assert t4_region_group is not None
444-
assert rtx_region_group is not None
469+
rtx_runpod_useast1 = next(
470+
(
471+
gpu
472+
for gpu in region_data["gpus"]
473+
if gpu["name"] == "RTX4090"
474+
and gpu.get("backend") == "runpod"
475+
and gpu.get("region") == "us-east-1"
476+
),
477+
None,
478+
)
479+
rtx_runpod_euwest1 = next(
480+
(
481+
gpu
482+
for gpu in region_data["gpus"]
483+
if gpu["name"] == "RTX4090"
484+
and gpu.get("backend") == "runpod"
485+
and gpu.get("region") == "eu-west-1"
486+
),
487+
None,
488+
)
445489

446-
assert set(t4_region_group["backends"]) == {"aws", "runpod"}
447-
assert set(rtx_region_group["backends"]) == {"runpod"}
448-
assert t4_region_group["price"] == {"min": 0.25, "max": 0.60}
449-
assert rtx_region_group["price"] == {"min": 0.65, "max": 0.75}
490+
assert t4_aws_uswest2 is not None
491+
assert t4_runpod_useast1 is not None
492+
assert rtx_runpod_useast1 is not None
493+
assert rtx_runpod_euwest1 is not None
494+
495+
assert t4_aws_uswest2["backend"] == "aws"
496+
assert t4_aws_uswest2["region"] == "us-west-2"
497+
assert t4_aws_uswest2["price"]["min"] == 0.30
498+
assert t4_aws_uswest2["price"]["max"] == 0.60
499+
500+
assert t4_runpod_useast1["backend"] == "runpod"
501+
assert t4_runpod_useast1["region"] == "us-east-1"
502+
assert t4_runpod_useast1["price"]["min"] == 0.25
503+
assert t4_runpod_useast1["price"]["max"] == 0.25
504+
505+
assert rtx_runpod_useast1["backend"] == "runpod"
506+
assert rtx_runpod_useast1["region"] == "us-east-1"
507+
assert rtx_runpod_useast1["price"]["min"] == 0.75
508+
assert rtx_runpod_useast1["price"]["max"] == 0.75
509+
510+
assert rtx_runpod_euwest1["backend"] == "runpod"
511+
assert rtx_runpod_euwest1["region"] == "eu-west-1"
512+
assert rtx_runpod_euwest1["price"]["min"] == 0.65
513+
assert rtx_runpod_euwest1["price"]["max"] == 0.65

0 commit comments

Comments
 (0)