[Feature]: Allow listing available key resources such as gpu, region, and backends #2142

peterschmidt85 · peterschmidt85 · commit f9744d4489db · 2025-08-18T14:54:47.000+02:00
Fixed group_by bug; also added validation (disallow grouping by region without backend)
diff --git a/src/dstack/_internal/cli/commands/gpu.py b/src/dstack/_internal/cli/commands/gpu.py
@@ -31,7 +31,8 @@ def register_args(
             "--group-by",
             action="append",
             choices=["backend", "region", "count"],
-            help="Group GPUs by backend, region, and/or count. Can be specified multiple times.",
+            help="Group GPUs by backend, region, and/or count. Can be specified multiple times. "
+            "Note: 'region' can only be used together with 'backend'.",
         )
 
 
diff --git a/src/dstack/_internal/server/schemas/gpus.py b/src/dstack/_internal/server/schemas/gpus.py
@@ -20,6 +20,7 @@ class BackendGpu(CoreModel):
     spot: bool
     count: int
     price: float
+    region: str
 
 
 class BackendGpus(CoreModel):
@@ -36,7 +37,8 @@ class ListGpusRequest(CoreModel):
     run_spec: RunSpec
     group_by: Optional[List[Literal["backend", "region", "count"]]] = Field(
         default=None,
-        description="List of fields to group by. Valid values: 'backend', 'region', 'count'",
+        description="List of fields to group by. Valid values: 'backend', 'region', 'count'. "
+        "Note: 'region' can only be used together with 'backend'.",
     )
 
 
diff --git a/src/dstack/_internal/server/services/gpus.py b/src/dstack/_internal/server/services/gpus.py
@@ -87,6 +87,7 @@ def _process_offers_into_backend_gpus(
                     spot=offer.instance.resources.spot,
                     count=gpu_count_in_offer,
                     price=offer.price,
+                    region=offer.region,
                 )
 
     backend_gpus_list = []
@@ -204,24 +205,23 @@ def _get_gpus_grouped_by_backend_and_region(backend_gpus: List[BackendGpus]) ->
     """Aggregates GPU specs, grouping them by both backend and region."""
     gpu_rows: Dict[Tuple, GpuGroup] = {}
     for backend in backend_gpus:
-        for region in backend.regions:
-            for gpu in backend.gpus:
-                key = (gpu.name, gpu.memory_mib, gpu.vendor, backend.backend_type, region)
-                if key not in gpu_rows:
-                    per_gpu_price = gpu.price / gpu.count
-                    gpu_rows[key] = GpuGroup(
-                        name=gpu.name,
-                        memory_mib=gpu.memory_mib,
-                        vendor=gpu.vendor,
-                        availability=[gpu.availability],
-                        spot=["spot" if gpu.spot else "on-demand"],
-                        count=Range[int](min=gpu.count, max=gpu.count),
-                        price=Range[float](min=per_gpu_price, max=per_gpu_price),
-                        backend=backend.backend_type,
-                        region=region,
-                    )
-                else:
-                    _update_gpu_group(gpu_rows[key], gpu, backend.backend_type)
+        for gpu in backend.gpus:
+            key = (gpu.name, gpu.memory_mib, gpu.vendor, backend.backend_type, gpu.region)
+            if key not in gpu_rows:
+                per_gpu_price = gpu.price / gpu.count
+                gpu_rows[key] = GpuGroup(
+                    name=gpu.name,
+                    memory_mib=gpu.memory_mib,
+                    vendor=gpu.vendor,
+                    availability=[gpu.availability],
+                    spot=["spot" if gpu.spot else "on-demand"],
+                    count=Range[int](min=gpu.count, max=gpu.count),
+                    price=Range[float](min=per_gpu_price, max=per_gpu_price),
+                    backend=backend.backend_type,
+                    region=gpu.region,
+                )
+            else:
+                _update_gpu_group(gpu_rows[key], gpu, backend.backend_type)
 
     return sorted(
         list(gpu_rows.values()),
@@ -313,31 +313,30 @@ def _get_gpus_grouped_by_backend_region_and_count(
     """Aggregates GPU specs, grouping them by backend, region, and GPU count."""
     gpu_rows: Dict[Tuple, GpuGroup] = {}
     for backend in backend_gpus:
-        for region in backend.regions:
-            for gpu in backend.gpus:
-                key = (
-                    gpu.name,
-                    gpu.memory_mib,
-                    gpu.vendor,
-                    backend.backend_type,
-                    region,
-                    gpu.count,
+        for gpu in backend.gpus:
+            key = (
+                gpu.name,
+                gpu.memory_mib,
+                gpu.vendor,
+                backend.backend_type,
+                gpu.region,
+                gpu.count,
+            )
+            if key not in gpu_rows:
+                per_gpu_price = gpu.price / gpu.count
+                gpu_rows[key] = GpuGroup(
+                    name=gpu.name,
+                    memory_mib=gpu.memory_mib,
+                    vendor=gpu.vendor,
+                    availability=[gpu.availability],
+                    spot=["spot" if gpu.spot else "on-demand"],
+                    count=Range[int](min=gpu.count, max=gpu.count),
+                    price=Range[float](min=per_gpu_price, max=per_gpu_price),
+                    backend=backend.backend_type,
+                    region=gpu.region,
                 )
-                if key not in gpu_rows:
-                    per_gpu_price = gpu.price / gpu.count
-                    gpu_rows[key] = GpuGroup(
-                        name=gpu.name,
-                        memory_mib=gpu.memory_mib,
-                        vendor=gpu.vendor,
-                        availability=[gpu.availability],
-                        spot=["spot" if gpu.spot else "on-demand"],
-                        count=Range[int](min=gpu.count, max=gpu.count),
-                        price=Range[float](min=per_gpu_price, max=per_gpu_price),
-                        backend=backend.backend_type,
-                        region=region,
-                    )
-                else:
-                    _update_gpu_group(gpu_rows[key], gpu, backend.backend_type)
+            else:
+                _update_gpu_group(gpu_rows[key], gpu, backend.backend_type)
 
     return sorted(
         list(gpu_rows.values()),
@@ -366,6 +365,11 @@ async def list_gpus_grouped(
 
     group_by_set = set(group_by) if group_by else set()
 
+    if "region" in group_by_set and "backend" not in group_by_set:
+        from dstack._internal.core.errors import ServerClientError
+
+        raise ServerClientError("Cannot group by 'region' without also grouping by 'backend'")
+
     # Determine grouping strategy based on combination
     has_backend = "backend" in group_by_set
     has_region = "region" in group_by_set
diff --git a/src/tests/_internal/server/routers/test_gpus.py b/src/tests/_internal/server/routers/test_gpus.py
@@ -198,6 +198,19 @@ async def test_invalid_group_by_rejected(
         assert response.status_code == 422
         assert "validation error" in response.text.lower() or "invalid" in response.text.lower()
 
+    @pytest.mark.asyncio
+    @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
+    async def test_region_without_backend_rejected(
+        self, test_db, session: AsyncSession, client: AsyncClient
+    ):
+        user, project, repo, run_spec = await gpu_test_setup(session)
+
+        response = await call_gpus_api(
+            client, project.name, user.token, run_spec, group_by=["region"]
+        )
+
+        assert response.status_code == 400
+
     @pytest.mark.asyncio
     @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
     async def test_exact_aggregation_values(
@@ -422,28 +435,79 @@ async def test_exact_aggregation_values(
             assert t4_runpod["price"] == {"min": 0.25, "max": 0.25}
             assert rtx_runpod["price"] == {"min": 0.65, "max": 0.75}
 
-            # Test region grouping to validate multi-region, multi-backend setup
             response_region = await client.post(
                 f"/api/project/{project.name}/gpus/list",
                 headers=get_auth_headers(user.token),
-                json={"run_spec": run_spec.dict(), "group_by": ["region"]},
+                json={"run_spec": run_spec.dict(), "group_by": ["backend", "region"]},
             )
             assert response_region.status_code == 200
             region_data = response_region.json()
 
-            assert len(region_data["gpus"]) == 2
+            assert len(region_data["gpus"]) == 5
 
-            t4_region_group = next(
-                (gpu for gpu in region_data["gpus"] if gpu["name"] == "T4"), None
+            t4_aws_uswest2 = next(
+                (
+                    gpu
+                    for gpu in region_data["gpus"]
+                    if gpu["name"] == "T4"
+                    and gpu.get("backend") == "aws"
+                    and gpu.get("region") == "us-west-2"
+                ),
+                None,
             )
-            rtx_region_group = next(
-                (gpu for gpu in region_data["gpus"] if gpu["name"] == "RTX4090"), None
+            t4_runpod_useast1 = next(
+                (
+                    gpu
+                    for gpu in region_data["gpus"]
+                    if gpu["name"] == "T4"
+                    and gpu.get("backend") == "runpod"
+                    and gpu.get("region") == "us-east-1"
+                ),
+                None,
             )
 
-            assert t4_region_group is not None
-            assert rtx_region_group is not None
+            rtx_runpod_useast1 = next(
+                (
+                    gpu
+                    for gpu in region_data["gpus"]
+                    if gpu["name"] == "RTX4090"
+                    and gpu.get("backend") == "runpod"
+                    and gpu.get("region") == "us-east-1"
+                ),
+                None,
+            )
+            rtx_runpod_euwest1 = next(
+                (
+                    gpu
+                    for gpu in region_data["gpus"]
+                    if gpu["name"] == "RTX4090"
+                    and gpu.get("backend") == "runpod"
+                    and gpu.get("region") == "eu-west-1"
+                ),
+                None,
+            )
 
-            assert set(t4_region_group["backends"]) == {"aws", "runpod"}
-            assert set(rtx_region_group["backends"]) == {"runpod"}
-            assert t4_region_group["price"] == {"min": 0.25, "max": 0.60}
-            assert rtx_region_group["price"] == {"min": 0.65, "max": 0.75}
+            assert t4_aws_uswest2 is not None
+            assert t4_runpod_useast1 is not None
+            assert rtx_runpod_useast1 is not None
+            assert rtx_runpod_euwest1 is not None
+
+            assert t4_aws_uswest2["backend"] == "aws"
+            assert t4_aws_uswest2["region"] == "us-west-2"
+            assert t4_aws_uswest2["price"]["min"] == 0.30
+            assert t4_aws_uswest2["price"]["max"] == 0.60
+
+            assert t4_runpod_useast1["backend"] == "runpod"
+            assert t4_runpod_useast1["region"] == "us-east-1"
+            assert t4_runpod_useast1["price"]["min"] == 0.25
+            assert t4_runpod_useast1["price"]["max"] == 0.25
+
+            assert rtx_runpod_useast1["backend"] == "runpod"
+            assert rtx_runpod_useast1["region"] == "us-east-1"
+            assert rtx_runpod_useast1["price"]["min"] == 0.75
+            assert rtx_runpod_useast1["price"]["max"] == 0.75
+
+            assert rtx_runpod_euwest1["backend"] == "runpod"
+            assert rtx_runpod_euwest1["region"] == "eu-west-1"
+            assert rtx_runpod_euwest1["price"]["min"] == 0.65
+            assert rtx_runpod_euwest1["price"]["max"] == 0.65

Original file line number	Diff line number	Diff line change
`@@ -31,7 +31,8 @@ def register_args(`
`31`	`31`	`"--group-by",`
`32`	`32`	`action="append",`
`33`	`33`	`choices=["backend", "region", "count"],`
`34`		`- help="Group GPUs by backend, region, and/or count. Can be specified multiple times.",`
	`34`	`+ help="Group GPUs by backend, region, and/or count. Can be specified multiple times. "`
	`35`	`+ "Note: 'region' can only be used together with 'backend'.",`
`35`	`36`	`)`
`36`	`37`
`37`	`38`