Skip to content

Commit 0d0e654

Browse files
[Feature]: Allow dstack offer to aggregate GPU information (#2992)
1 parent 9a1557c commit 0d0e654

File tree

12 files changed

+1400
-63
lines changed

12 files changed

+1400
-63
lines changed

docs/docs/guides/protips.md

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -427,6 +427,36 @@ Getting offers...
427427

428428
</div>
429429

430+
??? info "Grouping offers"
431+
Use `--group-by` to aggregate offers. Accepted values: `gpu`, `backend`, `region`, and `count`.
432+
433+
<div class="termy">
434+
435+
```shell
436+
$ dstack offer --gpu b200 --group-by gpu,backend,region
437+
Project main
438+
User admin
439+
Resources cpu=2.. mem=8GB.. disk=100GB.. b200:1..
440+
Spot policy auto
441+
Max price -
442+
Reservation -
443+
Group by gpu, backend, region
444+
445+
# GPU SPOT $/GPU BACKEND REGION
446+
1 B200:180GB:1..8 spot, on-demand 3.59..5.99 runpod EU-RO-1
447+
2 B200:180GB:1..8 spot, on-demand 3.59..5.99 runpod US-CA-2
448+
3 B200:180GB:8 on-demand 4.99 lambda us-east-1
449+
4 B200:180GB:8 on-demand 5.5 nebius us-central1
450+
```
451+
452+
</div>
453+
454+
When using `--group-by`, `gpu` must always be included.
455+
The `region` value can only be used together with `backend`.
456+
457+
The `offer` command allows you to filter and group offers with various [advanced options](../reference/cli/dstack/offer.md#usage).
458+
459+
430460
## Metrics
431461

432462
`dstack` tracks essential metrics accessible via the CLI and UI. To access advanced metrics like DCGM, configure the server to export metrics to Prometheus. See [Metrics](metrics.md) for details.

docs/docs/reference/cli/dstack/offer.md

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
# dstack offer
22

3-
Displays available offers (hardware configurations) with the configured backends (or offers that match already provisioned fleets).
3+
Displays available offers (hardware configurations) from configured backends or from fleets you’ve already provisioned. Supports filtering and grouping.
44

5-
The output includes backend, region, instance type, resources, spot availability, and pricing details.
5+
The output shows backend, region, instance type, resources, spot availability, and pricing.
66

77
## Usage
88

@@ -19,7 +19,7 @@ $ dstack offer --help
1919

2020
## Examples
2121

22-
### List GPU offers
22+
### Filtering offers
2323

2424
The `--gpu` flag accepts the same specification format as the `gpu` property in [`dev environment`](../../../concepts/dev-environments.md), [`task`](../../../concepts/tasks.md),
2525
[`service`](../../../concepts/services.md), and [`fleet`](../../../concepts/fleets.md) configurations.
@@ -71,6 +71,34 @@ Getting offers...
7171

7272
</div>
7373

74+
### Grouping offers
75+
76+
Use `--group-by` to aggregate offers. Accepted values: `gpu`, `backend`, `region`, and `count`.
77+
78+
<div class="termy">
79+
80+
```shell
81+
$ dstack offer --gpu b200 --group-by gpu,backend,region
82+
Project main
83+
User admin
84+
Resources cpu=2.. mem=8GB.. disk=100GB.. b200:1..
85+
Spot policy auto
86+
Max price -
87+
Reservation -
88+
Group by gpu, backend, region
89+
90+
# GPU SPOT $/GPU BACKEND REGION
91+
1 B200:180GB:1..8 spot, on-demand 3.59..5.99 runpod EU-RO-1
92+
2 B200:180GB:1..8 spot, on-demand 3.59..5.99 runpod US-CA-2
93+
3 B200:180GB:8 on-demand 4.99 lambda us-east-1
94+
4 B200:180GB:8 on-demand 5.5 nebius us-central1
95+
```
96+
97+
</div>
98+
99+
When using `--group-by`, `gpu` must always be included.
100+
The `region` value can only be used together with `backend`.
101+
74102
### JSON format
75103

76104
Use `--json` to output offers in the JSON format.
Lines changed: 68 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,20 @@
11
import argparse
2-
import contextlib
3-
import json
42
from pathlib import Path
3+
from typing import List
54

65
from dstack._internal.cli.commands import APIBaseCommand
7-
from dstack._internal.cli.services.configurators.run import (
8-
BaseRunConfigurator,
9-
)
6+
from dstack._internal.cli.services.configurators.run import BaseRunConfigurator
107
from dstack._internal.cli.utils.common import console
11-
from dstack._internal.cli.utils.run import print_run_plan
12-
from dstack._internal.core.models.configurations import (
13-
ApplyConfigurationType,
14-
TaskConfiguration,
15-
)
8+
from dstack._internal.cli.utils.gpu import print_gpu_json, print_gpu_table
9+
from dstack._internal.cli.utils.run import print_offers_json, print_run_plan
10+
from dstack._internal.core.errors import CLIError
11+
from dstack._internal.core.models.configurations import ApplyConfigurationType, TaskConfiguration
1612
from dstack._internal.core.models.runs import RunSpec
13+
from dstack._internal.server.schemas.gpus import GpuGroup
1714
from dstack.api.utils import load_profile
1815

1916

2017
class OfferConfigurator(BaseRunConfigurator):
21-
# TODO: The command currently uses `BaseRunConfigurator` to register arguments.
22-
# This includes --env, --retry-policy, and other arguments that are unnecessary for this command.
23-
# Eventually, we should introduce a base `OfferConfigurator` that doesn't include those arguments—
24-
# `BaseRunConfigurator` will inherit from `OfferConfigurator`.
25-
#
26-
# Additionally, it should have its own type: `ApplyConfigurationType.OFFER`.
2718
TYPE = ApplyConfigurationType.TASK
2819

2920
@classmethod
@@ -32,10 +23,18 @@ def register_args(
3223
parser: argparse.ArgumentParser,
3324
):
3425
super().register_args(parser, default_max_offers=50)
26+
parser.add_argument(
27+
"--group-by",
28+
action="append",
29+
help=(
30+
"Group results by fields ([code]gpu[/code], [code]backend[/code], [code]region[/code], [code]count[/code]). "
31+
"Optional, but if used, must include [code]gpu[/code]. "
32+
"The use of [code]region[/code] also requires [code]backend[/code]. "
33+
"Can be repeated or comma-separated (e.g. [code]--group-by gpu,backend[/code])."
34+
),
35+
)
3536

3637

37-
# TODO: Support aggregated offers
38-
# TODO: Add tests
3938
class OfferCommand(APIBaseCommand):
4039
NAME = "offer"
4140
DESCRIPTION = "List offers"
@@ -70,49 +69,58 @@ def _command(self, args: argparse.Namespace):
7069
ssh_key_pub="(dummy)",
7170
profile=profile,
7271
)
72+
73+
if args.group_by:
74+
args.group_by = self._process_group_by_args(args.group_by)
75+
76+
if args.group_by and "gpu" not in args.group_by:
77+
group_values = ", ".join(args.group_by)
78+
raise CLIError(f"Cannot group by '{group_values}' without also grouping by 'gpu'")
79+
7380
if args.format == "plain":
74-
status = console.status("Getting offers...")
81+
with console.status("Getting offers..."):
82+
if args.group_by:
83+
gpus = self._list_gpus(args, run_spec)
84+
print_gpu_table(gpus, run_spec, args.group_by, self.api.project)
85+
else:
86+
run_plan = self.api.client.runs.get_plan(
87+
self.api.project,
88+
run_spec,
89+
max_offers=args.max_offers,
90+
)
91+
print_run_plan(run_plan, include_run_properties=False)
7592
else:
76-
status = contextlib.nullcontext()
77-
with status:
78-
run_plan = self.api.client.runs.get_plan(
79-
self.api.project,
80-
run_spec,
81-
max_offers=args.max_offers,
82-
)
83-
84-
job_plan = run_plan.job_plans[0]
85-
86-
if args.format == "json":
87-
# FIXME: Should use effective_run_spec from run_plan,
88-
# since the spec can be changed by the server and plugins
89-
output = {
90-
"project": run_plan.project_name,
91-
"user": run_plan.user,
92-
"resources": job_plan.job_spec.requirements.resources.dict(),
93-
"max_price": (job_plan.job_spec.requirements.max_price),
94-
"spot": run_spec.configuration.spot_policy,
95-
"reservation": run_plan.run_spec.configuration.reservation,
96-
"offers": [],
97-
"total_offers": job_plan.total_offers,
98-
}
99-
100-
for offer in job_plan.offers:
101-
output["offers"].append(
102-
{
103-
"backend": (
104-
"ssh" if offer.backend.value == "remote" else offer.backend.value
105-
),
106-
"region": offer.region,
107-
"instance_type": offer.instance.name,
108-
"resources": offer.instance.resources.dict(),
109-
"spot": offer.instance.resources.spot,
110-
"price": float(offer.price),
111-
"availability": offer.availability.value,
112-
}
93+
if args.group_by:
94+
gpus = self._list_gpus(args, run_spec)
95+
print_gpu_json(gpus, run_spec, args.group_by, self.api.project)
96+
else:
97+
run_plan = self.api.client.runs.get_plan(
98+
self.api.project,
99+
run_spec,
100+
max_offers=args.max_offers,
113101
)
102+
print_offers_json(run_plan, run_spec)
114103

115-
print(json.dumps(output, indent=2))
116-
return
117-
else:
118-
print_run_plan(run_plan, include_run_properties=False)
104+
def _process_group_by_args(self, group_by_args: List[str]) -> List[str]:
105+
valid_choices = {"gpu", "backend", "region", "count"}
106+
processed = []
107+
108+
for arg in group_by_args:
109+
values = [v.strip() for v in arg.split(",") if v.strip()]
110+
for value in values:
111+
if value in valid_choices:
112+
processed.append(value)
113+
else:
114+
raise CLIError(
115+
f"Invalid group-by value: '{value}'. Valid choices are: {', '.join(sorted(valid_choices))}"
116+
)
117+
118+
return processed
119+
120+
def _list_gpus(self, args: argparse.Namespace, run_spec: RunSpec) -> List[GpuGroup]:
    """Request GPU groups for ``run_spec`` from the API client.

    Args:
        args: Parsed CLI arguments; only ``args.group_by`` is read here and
            it must be a non-empty iterable of field names.
        run_spec: Run spec passed through unchanged to the API call.

    Returns:
        The result of ``self.api.client.gpus.list_gpus`` for the current
        project.
    """
    # "gpu" is removed from the fields sent to the API; if nothing else is
    # left, ``None`` is sent instead of an empty list.
    # NOTE(review): presumably the endpoint always groups by GPU, which would
    # make the explicit "gpu" key redundant - confirm against the server API.
    group_by = [g for g in args.group_by if g != "gpu"] or None
    return self.api.client.gpus.list_gpus(
        self.api.project,
        run_spec,
        group_by=group_by,
    )

0 commit comments

Comments
 (0)