Skip to content

Commit 5c8450f

Browse files
[Nebius] Support spot instances and B200 (#2965)
* [Nebius] Support spot instances and B200 #2954
* [Nebius] Ensure the SDK supports spot instances
1 parent f2ae93b commit 5c8450f

File tree

4 files changed

+19
-3
lines changed

4 files changed

+19
-3
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -176,7 +176,7 @@ oci = [
176176
"dstack[server]",
177177
]
178178
nebius = [
179-
"nebius>=0.2.19,<0.3; python_version >= '3.10'",
179+
"nebius>=0.2.40,<0.3; python_version >= '3.10'",
180180
"dstack[server]",
181181
]
182182
all = [

src/dstack/_internal/core/backends/nebius/compute.py

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -74,6 +74,7 @@
7474
SUPPORTED_PLATFORMS = [
7575
"gpu-h100-sxm",
7676
"gpu-h200-sxm",
77+
"gpu-b200-sxm",
7778
"gpu-l40s-a",
7879
"gpu-l40s-d",
7980
"cpu-d3",
@@ -150,12 +151,16 @@ def create_instance(
150151
)
151152
if backend_data.cluster is not None:
152153
cluster_id = backend_data.cluster.id
154+
155+
gpus = instance_offer.instance.resources.gpus
153156
create_disk_op = resources.create_disk(
154157
sdk=self._sdk,
155158
name=instance_name,
156159
project_id=self._region_to_project_id[instance_offer.region],
157160
size_mib=instance_offer.instance.resources.disk.size_mib,
158-
image_family="ubuntu22.04-cuda12",
161+
image_family="ubuntu24.04-cuda12"
162+
if gpus and gpus[0].name == "B200"
163+
else "ubuntu22.04-cuda12",
159164
)
160165
create_instance_op = None
161166
try:
@@ -180,6 +185,7 @@ def create_instance(
180185
cluster_id=cluster_id,
181186
disk_id=create_disk_op.resource_id,
182187
subnet_id=self._get_subnet_id(instance_offer.region),
188+
preemptible=instance_offer.instance.resources.spot,
183189
)
184190
_wait_for_instance(self._sdk, create_instance_op)
185191
except BaseException:
@@ -367,4 +373,4 @@ def _wait_for_instance(sdk: SDK, op: SDKOperation[Operation]) -> None:
367373

368374
def _supported_instances(offer: InstanceOffer) -> bool:
369375
platform, _ = offer.instance.name.split()
370-
return platform in SUPPORTED_PLATFORMS and not offer.instance.resources.spot
376+
return platform in SUPPORTED_PLATFORMS

src/dstack/_internal/core/backends/nebius/fabrics.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,7 @@ class InfinibandFabric:
2121
InfinibandFabric("fabric-6", "gpu-h100-sxm", "eu-north1"),
2222
InfinibandFabric("fabric-7", "gpu-h200-sxm", "eu-north1"),
2323
InfinibandFabric("us-central1-a", "gpu-h200-sxm", "us-central1"),
24+
InfinibandFabric("us-central1-b", "gpu-b200-sxm", "us-central1"),
2425
]
2526

2627

src/dstack/_internal/core/backends/nebius/resources.py

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -28,10 +28,12 @@
2828
GpuClusterSpec,
2929
Instance,
3030
InstanceGpuClusterSpec,
31+
InstanceRecoveryPolicy,
3132
InstanceServiceClient,
3233
InstanceSpec,
3334
IPAddress,
3435
NetworkInterfaceSpec,
36+
PreemptibleSpec,
3537
PublicIPAddress,
3638
ResourcesSpec,
3739
SourceImageFamily,
@@ -283,6 +285,7 @@ def create_instance(
283285
cluster_id: Optional[str],
284286
disk_id: str,
285287
subnet_id: str,
288+
preemptible: bool,
286289
) -> SDKOperation[Operation]:
287290
client = InstanceServiceClient(sdk)
288291
request = CreateInstanceRequest(
@@ -306,6 +309,12 @@ def create_instance(
306309
public_ip_address=PublicIPAddress(static=True),
307310
)
308311
],
312+
preemptible=PreemptibleSpec(
313+
priority=1, on_preemption=PreemptibleSpec.PreemptionPolicy.STOP
314+
)
315+
if preemptible
316+
else None,
317+
recovery_policy=InstanceRecoveryPolicy.FAIL if preemptible else None,
309318
),
310319
)
311320
with wrap_capacity_errors():

0 commit comments

Comments
 (0)