Skip to content

Commit ac3c337

Browse files
committed
Allow configuring Nebius InfiniBand fabrics
Add an option in Nebius backend settings to limit the list of allowed fabrics for InfiniBand clusters. This can be useful for larger customers that have capacity reservations tied to a specific fabric.
1 parent 004b91e commit ac3c337

4 files changed

Lines changed: 75 additions & 33 deletions

File tree

src/dstack/_internal/core/backends/nebius/compute.py

Lines changed: 9 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
import random
33
import shlex
44
import time
5-
from dataclasses import dataclass
65
from functools import cached_property
76
from typing import List, Optional
87

@@ -21,6 +20,7 @@
2120
)
2221
from dstack._internal.core.backends.base.offers import get_catalog_offers
2322
from dstack._internal.core.backends.nebius import resources
23+
from dstack._internal.core.backends.nebius.fabrics import get_suitable_infiniband_fabrics
2424
from dstack._internal.core.backends.nebius.models import NebiusConfig, NebiusServiceAccountCreds
2525
from dstack._internal.core.errors import (
2626
BackendError,
@@ -81,24 +81,6 @@
8181
]
8282

8383

84-
@dataclass(frozen=True)
85-
class InfinibandFabric:
86-
name: str
87-
platform: str
88-
region: str
89-
90-
91-
# https://docs.nebius.com/compute/clusters/gpu#fabrics
92-
INFINIBAND_FABRICS = [
93-
InfinibandFabric("fabric-2", "gpu-h100-sxm", "eu-north1"),
94-
InfinibandFabric("fabric-3", "gpu-h100-sxm", "eu-north1"),
95-
InfinibandFabric("fabric-4", "gpu-h100-sxm", "eu-north1"),
96-
InfinibandFabric("fabric-5", "gpu-h200-sxm", "eu-west1"),
97-
InfinibandFabric("fabric-6", "gpu-h100-sxm", "eu-north1"),
98-
InfinibandFabric("fabric-7", "gpu-h200-sxm", "eu-north1"),
99-
]
100-
101-
10284
class NebiusCompute(
10385
ComputeWithCreateInstanceSupport,
10486
ComputeWithMultinodeSupport,
@@ -280,7 +262,9 @@ def create_placement_group(
280262
backend_data = NebiusPlacementGroupBackendData(cluster=None)
281263
# Only create a Nebius cluster if the instance supports it.
282264
# For other instances, return dummy PlacementGroupProvisioningData.
283-
if fabrics := _get_suitable_infiniband_fabrics(master_instance_offer):
265+
if fabrics := get_suitable_infiniband_fabrics(
266+
master_instance_offer, allowed_fabrics=self.config.fabrics
267+
):
284268
fabric = random.choice(fabrics)
285269
op = resources.create_cluster(
286270
self._sdk,
@@ -319,7 +303,11 @@ def is_suitable_placement_group(
319303
)
320304
return (
321305
backend_data.cluster is None
322-
or backend_data.cluster.fabric in _get_suitable_infiniband_fabrics(instance_offer)
306+
or backend_data.cluster.fabric
307+
in get_suitable_infiniband_fabrics(
308+
instance_offer,
309+
allowed_fabrics=None, # enforced at cluster creation time, no need to enforce here
310+
)
323311
)
324312

325313

@@ -380,15 +368,3 @@ def _wait_for_instance(sdk: SDK, op: SDKOperation[Operation]) -> None:
380368
def _supported_instances(offer: InstanceOffer) -> bool:
    """Tell whether an offer is a non-spot instance on a supported platform.

    The instance name is expected to consist of exactly two
    whitespace-separated parts, the first being the platform; any other
    shape raises ``ValueError`` from the tuple unpacking.
    """
    platform, _rest = offer.instance.name.split()
    if platform not in SUPPORTED_PLATFORMS:
        return False
    # Spot offers are filtered out.
    return not offer.instance.resources.spot
383-
384-
385-
def _get_suitable_infiniband_fabrics(offer: InstanceOffer) -> list[str]:
386-
if len(offer.instance.resources.gpus) < 8:
387-
# From the create VM page in the Nebius Console:
388-
# > Only virtual machines with at least 8 NVIDIA® Hopper® H100 or H200 GPUs
389-
# > can be added to the cluster
390-
return []
391-
platform, _ = offer.instance.name.split()
392-
return [
393-
f.name for f in INFINIBAND_FABRICS if f.platform == platform and f.region == offer.region
394-
]

src/dstack/_internal/core/backends/nebius/configurator.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
)
1010
from dstack._internal.core.backends.nebius import resources
1111
from dstack._internal.core.backends.nebius.backend import NebiusBackend
12+
from dstack._internal.core.backends.nebius.fabrics import get_all_infiniband_fabrics
1213
from dstack._internal.core.backends.nebius.models import (
1314
AnyNebiusBackendConfig,
1415
NebiusBackendConfig,
@@ -38,6 +39,16 @@ def validate_config(self, config: NebiusBackendConfigWithCreds, default_creds_en
3839
fields=[["creds"]],
3940
details=str(e),
4041
)
42+
valid_fabrics = get_all_infiniband_fabrics()
43+
if invalid_fabrics := set(config.fabrics or []) - valid_fabrics:
44+
raise_invalid_credentials_error(
45+
fields=[["fabrics"]],
46+
details=(
47+
"These InfiniBand fabrics do not exist or are not known to dstack:"
48+
f" {sorted(invalid_fabrics)}. Omit `fabrics` to allow all fabrics or select"
49+
f" some of the valid options: {sorted(valid_fabrics)}"
50+
),
51+
)
4152

4253
def create_backend(
4354
self, project_name: str, config: NebiusBackendConfigWithCreds
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
from collections.abc import Container
2+
from dataclasses import dataclass
3+
from typing import Optional
4+
5+
from dstack._internal.core.models.instances import InstanceOffer
6+
7+
8+
@dataclass(frozen=True)
class InfinibandFabric:
    """Immutable record describing a single InfiniBand fabric.

    A fabric is matched against an instance offer by ``platform`` and
    ``region``; ``name`` is the identifier reported for a matching fabric.
    """

    # Fabric identifier, e.g. "fabric-2".
    name: str
    # Platform the fabric belongs to, e.g. "gpu-h100-sxm".
    platform: str
    # Region the fabric is located in, e.g. "eu-north1".
    region: str
13+
14+
15+
# InfiniBand fabrics known to dstack, each tied to one platform and one region.
# Source: https://docs.nebius.com/compute/clusters/gpu#fabrics
INFINIBAND_FABRICS = [
    InfinibandFabric(name="fabric-2", platform="gpu-h100-sxm", region="eu-north1"),
    InfinibandFabric(name="fabric-3", platform="gpu-h100-sxm", region="eu-north1"),
    InfinibandFabric(name="fabric-4", platform="gpu-h100-sxm", region="eu-north1"),
    InfinibandFabric(name="fabric-5", platform="gpu-h200-sxm", region="eu-west1"),
    InfinibandFabric(name="fabric-6", platform="gpu-h100-sxm", region="eu-north1"),
    InfinibandFabric(name="fabric-7", platform="gpu-h200-sxm", region="eu-north1"),
]
24+
25+
26+
def get_suitable_infiniband_fabrics(
    offer: InstanceOffer, allowed_fabrics: Optional[Container[str]]
) -> list[str]:
    """Return the names of the InfiniBand fabrics usable by ``offer``.

    Args:
        offer: The instance offer to match. Its instance name must consist
            of exactly two whitespace-separated parts, the first being the
            platform.
        allowed_fabrics: Fabric names to restrict the result to, or ``None``
            to apply no restriction.

    Returns:
        Names of the fabrics in ``INFINIBAND_FABRICS`` whose platform and
        region match the offer and that pass the ``allowed_fabrics`` filter,
        in list order. Empty if the offer has fewer than 8 GPUs.

    Raises:
        ValueError: If the instance name does not split into exactly two parts.
    """
    gpu_count = len(offer.instance.resources.gpus)
    if gpu_count < 8:
        # From the create VM page in the Nebius Console:
        # > Only virtual machines with at least 8 NVIDIA® Hopper® H100 or H200 GPUs
        # > can be added to the cluster
        return []

    platform, _rest = offer.instance.name.split()
    suitable: list[str] = []
    for fabric in INFINIBAND_FABRICS:
        if fabric.platform != platform or fabric.region != offer.region:
            continue
        if allowed_fabrics is not None and fabric.name not in allowed_fabrics:
            continue
        suitable.append(fabric.name)
    return suitable
44+
45+
46+
def get_all_infiniband_fabrics() -> set[str]:
    """Return the names of all fabrics listed in ``INFINIBAND_FABRICS``."""
    names: set[str] = set()
    for fabric in INFINIBAND_FABRICS:
        names.add(fabric.name)
    return names

src/dstack/_internal/core/backends/nebius/models.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,14 @@ class NebiusBackendConfig(CoreModel):
8787
Optional[list[str]],
8888
Field(description="The list of allowed Nebius regions. Omit to allow all regions"),
8989
] = None
90+
fabrics: Annotated[
91+
Optional[list[str]],
92+
Field(
93+
description=(
94+
"The list of allowed fabrics for InfiniBand clusters. Omit to allow all fabrics"
95+
)
96+
),
97+
] = None
9098

9199

92100
class NebiusBackendConfigWithCreds(NebiusBackendConfig):

0 commit comments

Comments
 (0)