Skip to content

Commit f730365

Browse files
Clarify why GPU vendor default inference is split between client and server; add TODOs on how this should change in the future (move resource defaults to the server). (#3588)
1 parent f7dacf7 commit f730365

File tree

2 files changed

+21
-12
lines changed

2 files changed

+21
-12
lines changed

src/dstack/_internal/cli/services/configurators/run.py

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -391,10 +391,14 @@ def validate_gpu_vendor_and_image(self, conf: RunConfigurationT) -> None:
391391
Infers GPU vendor if not set. Defaults to Nvidia when using the default
392392
CUDA image. Requires explicit `image` if the vendor is AMD or Tenstorrent.
393393
394-
NOTE: We don't set the inferred vendor on gpu_spec for compatibility with
395-
older servers. Servers set the vendor using the same logic in
396-
set_resources_defaults(). The inferred vendor is used here only for
397-
validation and display (see _infer_gpu_vendor).
394+
When vendor is inferred from GPU name (e.g. A100 -> nvidia), it is written to
395+
gpu_spec. When vendor is inferred from image context (no name, no vendor, default
396+
CUDA image -> nvidia), it is NOT written to gpu_spec because 0.19.x servers
397+
(gpuhunt <0.1.12) break on vendor=nvidia + min_gpu_count=0. The server applies
398+
the same default in set_gpu_vendor_default().
399+
400+
TODO: This entire method should move to the server (set_resources_defaults)
401+
so that defaults and validation are equal for CLI and API users.
398402
"""
399403
gpu_spec = conf.resources.gpu
400404
if gpu_spec is None:
@@ -439,11 +443,8 @@ def validate_gpu_vendor_and_image(self, conf: RunConfigurationT) -> None:
439443
# Set vendor inferred from name on the spec (server needs it for filtering).
440444
gpu_spec.vendor = vendor
441445
else:
442-
# No vendor or name specified. Default to Nvidia if using the default
443-
# CUDA image, since it's only compatible with Nvidia GPUs.
444-
# We don't set the inferred vendor on the spec — the server does the
445-
# same inference in set_resources_defaults() for compatibility with
446-
# older servers that don't handle vendor + count.min=0 correctly.
446+
# No vendor or name specified. Default to Nvidia if using the
447+
# default CUDA image, since it's only compatible with Nvidia GPUs.
447448
if conf.image is None and conf.docker is not True:
448449
vendor = gpuhunt.AcceleratorVendor.NVIDIA
449450
has_amd_gpu = False

src/dstack/_internal/server/services/resources.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,17 @@ def set_gpu_vendor_default(
2929
docker: Optional[bool],
3030
) -> None:
3131
"""Default GPU vendor to Nvidia when using the default CUDA image,
32-
since it's only compatible with Nvidia GPUs.
33-
Mirrors the client-side logic in validate_gpu_vendor_and_image().
34-
Should only be called for runs (not fleets) since fleets don't have image context."""
32+
since it's only compatible with Nvidia GPUs. Only called for runs
33+
(not fleets) since fleets don't have image context.
34+
35+
The client infers the same default for display and validation
36+
(see validate_gpu_vendor_and_image) but does not write it to the spec
37+
for 0.19.x server compatibility. This server-side function is what
38+
actually sets the vendor before offer matching.
39+
40+
TODO: All resource defaults and validation (gpu vendor, cpu arch, memory,
41+
disk, etc.) should be set here on the server, not split between client
42+
and model-level defaults."""
3543
gpu = resources.gpu
3644
if (
3745
gpu is not None

0 commit comments

Comments
 (0)