Skip to content

Commit b50a3c9

Browse files
committed
Select legacy image conditionally
1 parent c48a07b commit b50a3c9

File tree

8 files changed

+90
-25
lines changed

8 files changed

+90
-25
lines changed

src/dstack/_internal/core/backends/aws/compute.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -276,7 +276,11 @@ def create_instance(
276276
image_id, username = self._get_image_id_and_username(
277277
ec2_client=ec2_client,
278278
region=instance_offer.region,
279-
cuda=len(instance_offer.instance.resources.gpus) > 0,
279+
gpu_name=(
280+
instance_offer.instance.resources.gpus[0].name
281+
if len(instance_offer.instance.resources.gpus) > 0
282+
else None
283+
),
280284
instance_type=instance_offer.instance.name,
281285
image_config=self.config.os_images,
282286
)
@@ -882,11 +886,13 @@ def _get_image_id_and_username_cache_key(
882886
self,
883887
ec2_client: botocore.client.BaseClient,
884888
region: str,
885-
cuda: bool,
889+
gpu_name: Optional[str],
886890
instance_type: str,
887891
image_config: Optional[AWSOSImageConfig] = None,
888892
) -> tuple:
889-
return hashkey(region, cuda, instance_type, image_config.json() if image_config else None)
893+
return hashkey(
894+
region, gpu_name, instance_type, image_config.json() if image_config else None
895+
)
890896

891897
@cachedmethod(
892898
cache=lambda self: self._get_image_id_and_username_cache,
@@ -897,13 +903,13 @@ def _get_image_id_and_username(
897903
self,
898904
ec2_client: botocore.client.BaseClient,
899905
region: str,
900-
cuda: bool,
906+
gpu_name: Optional[str],
901907
instance_type: str,
902908
image_config: Optional[AWSOSImageConfig] = None,
903909
) -> tuple[str, str]:
904910
return aws_resources.get_image_id_and_username(
905911
ec2_client=ec2_client,
906-
cuda=cuda,
912+
gpu_name=gpu_name,
907913
instance_type=instance_type,
908914
image_config=image_config,
909915
)

src/dstack/_internal/core/backends/aws/resources.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66

77
import dstack.version as version
88
from dstack._internal.core.backends.aws.models import AWSOSImageConfig
9+
from dstack._internal.core.backends.base.compute import requires_nvidia_proprietary_kernel_modules
10+
from dstack._internal.core.consts import DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES
911
from dstack._internal.core.errors import BackendError, ComputeError, ComputeResourceNotFoundError
1012
from dstack._internal.utils.logging import get_logger
1113

@@ -17,14 +19,14 @@
1719

1820
def get_image_id_and_username(
1921
ec2_client: botocore.client.BaseClient,
20-
cuda: bool,
22+
gpu_name: Optional[str],
2123
instance_type: str,
2224
image_config: Optional[AWSOSImageConfig] = None,
2325
) -> tuple[str, str]:
2426
if image_config is not None:
25-
image = image_config.nvidia if cuda else image_config.cpu
27+
image = image_config.nvidia if gpu_name else image_config.cpu
2628
if image is None:
27-
logger.warning("%s image not configured", "nvidia" if cuda else "cpu")
29+
logger.warning("%s image not configured", "nvidia" if gpu_name else "cpu")
2830
raise ComputeResourceNotFoundError()
2931
image_name = image.name
3032
image_owner = image.owner
@@ -35,9 +37,12 @@ def get_image_id_and_username(
3537
image_owner = DLAMI_OWNER_ACCOUNT_ID
3638
username = "ubuntu"
3739
else:
38-
image_name = (
39-
f"dstack-{version.base_image}" if not cuda else f"dstack-cuda-{version.base_image}"
40-
)
40+
if gpu_name is None:
41+
image_name = f"dstack-{version.base_image}"
42+
elif not requires_nvidia_proprietary_kernel_modules(gpu_name):
43+
image_name = f"dstack-cuda-{version.base_image}"
44+
else:
45+
image_name = f"dstack-cuda-{DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES}"
4146
image_owner = DSTACK_ACCOUNT_ID
4247
username = "ubuntu"
4348
response = ec2_client.describe_images(

src/dstack/_internal/core/backends/azure/compute.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,10 @@
4747
get_gateway_user_data,
4848
get_user_data,
4949
merge_tags,
50+
requires_nvidia_proprietary_kernel_modules,
5051
)
5152
from dstack._internal.core.backends.base.offers import get_catalog_offers
53+
from dstack._internal.core.consts import DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES
5254
from dstack._internal.core.errors import ComputeError, NoCapacityError
5355
from dstack._internal.core.models.backends.base import BackendType
5456
from dstack._internal.core.models.gateways import (
@@ -369,25 +371,32 @@ def _parse_config_vpc_id(vpc_id: str) -> Tuple[str, str]:
369371
class VMImageVariant(enum.Enum):
    """Kind of dstack VM image to use for an instance."""

    GRID = enum.auto()
    CUDA = enum.auto()
    CUDA_WITH_PROPRIETARY_KERNEL_MODULES = enum.auto()
    STANDARD = enum.auto()

    @classmethod
    def from_instance_type(cls, instance: InstanceType) -> "VMImageVariant":
        """
        Pick the image variant matching the given instance type.

        Returns:
            GRID if the instance name contains "_A10_v5"; otherwise STANDARD
            when the instance has no GPUs; otherwise one of the CUDA variants,
            chosen by whether the first GPU requires NVIDIA proprietary
            kernel modules.
        """
        # The name check takes precedence over any GPU inspection.
        if "_A10_v5" in instance.name:
            return cls.GRID
        gpus = instance.resources.gpus
        if len(gpus) == 0:
            return cls.STANDARD
        # Only the first GPU is consulted.
        if requires_nvidia_proprietary_kernel_modules(gpus[0].name):
            return cls.CUDA_WITH_PROPRIETARY_KERNEL_MODULES
        return cls.CUDA

    def get_image_name(self) -> str:
        """
        Build the image name for this variant.

        Returns:
            The image name: a variant-specific prefix followed by
            `version.base_image`, or by the pinned legacy image version for
            CUDA_WITH_PROPRIETARY_KERNEL_MODULES.

        Raises:
            ValueError: If the variant is not one of the known members.
        """
        if self is VMImageVariant.CUDA_WITH_PROPRIETARY_KERNEL_MODULES:
            # This variant is pinned to the legacy image, not the current base image.
            return f"dstack-cuda-{DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES}"
        prefix_by_variant = {
            VMImageVariant.GRID: "dstack-grid-",
            VMImageVariant.CUDA: "dstack-cuda-",
            VMImageVariant.STANDARD: "dstack-",
        }
        if self not in prefix_by_variant:
            raise ValueError(f"Unexpected image variant {self!r}")
        return f"{prefix_by_variant[self]}{version.base_image}"
391400

392401

393402
_SUPPORTED_VM_SERIES_PATTERNS = [

src/dstack/_internal/core/backends/base/compute.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,22 @@
4747
DSTACK_SHIM_BINARY_NAME = "dstack-shim"
4848
DSTACK_RUNNER_BINARY_NAME = "dstack-runner"
4949
DEFAULT_PRIVATE_SUBNETS = ("10.0.0.0/8", "172.16.0.0/12", "192.168.0.0/16")
50+
# Lowercase names of NVIDIA GPUs that need the proprietary kernel modules.
# No NVIDIA architecture prior to Turing supports Open Kernel Modules, so such
# GPUs require the proprietary ones. This list is incomplete, update when necessary.
NVIDIA_GPUS_REQUIRING_PROPRIETARY_KERNEL_MODULES = frozenset(
    (
        "v100",
        "p100", "p40", "p4",
        "m60", "m40", "m4",
        "k80", "k40", "k20",
    )
)
5066

5167
GoArchType = Literal["amd64", "arm64"]
5268

@@ -887,3 +903,12 @@ def merge_tags(
887903
for k, v in resource_tags.items():
888904
res.setdefault(k, v)
889905
return res
906+
907+
908+
def requires_nvidia_proprietary_kernel_modules(gpu_name: str) -> bool:
    """
    Check whether an NVIDIA GPU needs the proprietary kernel modules.

    The name is lowercased before the lookup, so the check is
    case-insensitive.

    Args:
        gpu_name: Name of the NVIDIA GPU.

    Returns:
        Whether this NVIDIA GPU requires NVIDIA proprietary kernel modules
        instead of open kernel modules, i.e. whether its lowercased name is
        listed in NVIDIA_GPUS_REQUIRING_PROPRIETARY_KERNEL_MODULES.
    """
    normalized_name = gpu_name.lower()
    return normalized_name in NVIDIA_GPUS_REQUIRING_PROPRIETARY_KERNEL_MODULES

src/dstack/_internal/core/backends/gcp/compute.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,12 @@
3030
get_shim_commands,
3131
get_user_data,
3232
merge_tags,
33+
requires_nvidia_proprietary_kernel_modules,
3334
)
3435
from dstack._internal.core.backends.base.offers import get_catalog_offers
3536
from dstack._internal.core.backends.gcp.features import tcpx as tcpx_features
3637
from dstack._internal.core.backends.gcp.models import GCPConfig
38+
from dstack._internal.core.consts import DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES
3739
from dstack._internal.core.errors import (
3840
ComputeError,
3941
ComputeResourceNotFoundError,
@@ -288,7 +290,11 @@ def create_instance(
288290

289291
image = _get_image(
290292
instance_type_name=instance_offer.instance.name,
291-
cuda=len(instance_offer.instance.resources.gpus) > 0,
293+
gpu_name=(
294+
instance_offer.instance.resources.gpus[0].name
295+
if len(instance_offer.instance.resources.gpus) > 0
296+
else None
297+
),
292298
)
293299

294300
for zone in zones:
@@ -899,7 +905,7 @@ class GCPImage:
899905
is_ufw_installed: bool
900906

901907

902-
def _get_image(instance_type_name: str, cuda: bool) -> GCPImage:
908+
def _get_image(instance_type_name: str, gpu_name: Optional[str]) -> GCPImage:
903909
if instance_type_name == "a3-megagpu-8g":
904910
image_name = "dstack-a3mega-5"
905911
is_ufw_installed = False
@@ -908,8 +914,11 @@ def _get_image(instance_type_name: str, cuda: bool) -> GCPImage:
908914
id="projects/cos-cloud/global/images/cos-105-17412-535-78",
909915
is_ufw_installed=False,
910916
)
911-
elif cuda:
912-
image_name = f"dstack-cuda-{version.base_image}"
917+
elif gpu_name is not None:
918+
if not requires_nvidia_proprietary_kernel_modules(gpu_name):
919+
image_name = f"dstack-cuda-{version.base_image}"
920+
else:
921+
image_name = f"dstack-cuda-{DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES}"
913922
is_ufw_installed = True
914923
else:
915924
image_name = f"dstack-{version.base_image}"

src/dstack/_internal/core/backends/oci/compute.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,11 @@ def create_instance(
115115
availability_domain = instance_offer.availability_zones[0]
116116

117117
listing, package = resources.get_marketplace_listing_and_package(
118-
cuda=len(instance_offer.instance.resources.gpus) > 0,
118+
gpu_name=(
119+
instance_offer.instance.resources.gpus[0].name
120+
if len(instance_offer.instance.resources.gpus) > 0
121+
else None
122+
),
119123
client=region.marketplace_client,
120124
)
121125
resources.accept_marketplace_listing_agreements(

src/dstack/_internal/core/backends/oci/resources.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,9 @@
2323
from oci.object_storage.models import CreatePreauthenticatedRequestDetails
2424

2525
from dstack import version
26+
from dstack._internal.core.backends.base.compute import requires_nvidia_proprietary_kernel_modules
2627
from dstack._internal.core.backends.oci.region import OCIRegionClient
28+
from dstack._internal.core.consts import DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES
2729
from dstack._internal.core.errors import BackendError
2830
from dstack._internal.core.models.instances import InstanceOffer
2931
from dstack._internal.utils.common import batched
@@ -352,11 +354,14 @@ def terminate_instance_if_exists(client: oci.core.ComputeClient, instance_id: st
352354

353355

354356
def get_marketplace_listing_and_package(
355-
cuda: bool, client: oci.marketplace.MarketplaceClient
357+
gpu_name: Optional[str], client: oci.marketplace.MarketplaceClient
356358
) -> Tuple[oci.marketplace.models.Listing, oci.marketplace.models.ImageListingPackage]:
357359
listing_name = f"dstack-{version.base_image}"
358-
if cuda:
359-
listing_name = f"dstack-cuda-{version.base_image}"
360+
if gpu_name is not None:
361+
if not requires_nvidia_proprietary_kernel_modules(gpu_name):
362+
listing_name = f"dstack-cuda-{version.base_image}"
363+
else:
364+
listing_name = f"dstack-cuda-{DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES}"
360365

361366
listing_summaries = list_marketplace_listings(listing_name, client)
362367
if len(listing_summaries) != 1:

src/dstack/_internal/core/consts.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,5 @@
44
DSTACK_RUNNER_HTTP_PORT = 10999
55
# ssh server (runs alongside the runner inside a container) listen port
66
DSTACK_RUNNER_SSH_PORT = 10022
7+
# version of the legacy AWS, Azure, GCP, and OCI OS image used for older GPUs that require proprietary NVIDIA kernel modules
8+
DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES = "0.10"

0 commit comments

Comments
 (0)