Skip to content

Commit 004b91e

Browse files
authored
Support Nebius InfiniBand clusters (#2604)
An InfiniBand cluster is now created automatically when provisioning 8xH100 or 8xH200 instances using a fleet configuration with `placement: cluster`. Nebius clusters were supported using `dstack` placement groups. Several changes were made to the placement group management logic: - The offer for the master instance of the fleet is passed to `Compute.create_placement_group`, which allows to set different placement group settings based on the offer. Nebius requires different settings for H100 and H200 clusters. - `Compute.is_suitable_placement_group` is introduced to allow choosing an appropriate placement group when creating the master instance and filtering offers for non-master instances based on backend-specific placement group properties. Nebius currently only provides homogeneous clusters, so offers need to be filtered based on the placement group. - The placement group object is passed to `Compute.create_instance` to allow adding the instance to the placement group using its backend-specific properties, such as cluster ID on Nebius. - The placement group name is generated at master instance provisioning time, not at fleet creation time. This allows to have different placement group names for the same fleet and avoid name conflicts, since multiple placement groups can be created while `dstack` is trying different offers for the master instance. - Placement groups that were created during master instance provisioning but didn't end up being used are now cleaned up. Nebius quotas limit the number of clusters, so unused clusters need to be cleaned up quickly, without waiting for fleet deletion. - If all offers failed for the master instance, `dstack` will no longer attempt to provision other fleet instances to avoid them being provisioned without a placement group or without connectivity at all. - Placement group creation errors are now handled gracefully, so that `dstack` can move on to other master instance offers, which may lead to creating different placement groups. For example, if `dstack` cannot create a cluster in one Nebius region because of a missing quota, it may attempt to create a cluster in another region.
1 parent 81305d0 commit 004b91e

File tree

27 files changed

+715
-117
lines changed

27 files changed

+715
-117
lines changed

src/dstack/_internal/core/backends/aws/compute.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,7 @@ def create_instance(
159159
self,
160160
instance_offer: InstanceOfferWithAvailability,
161161
instance_config: InstanceConfiguration,
162+
placement_group: Optional[PlacementGroup],
162163
) -> JobProvisioningData:
163164
project_name = instance_config.project_name
164165
ec2_resource = self.session.resource("ec2", region_name=instance_offer.region)
@@ -248,7 +249,7 @@ def create_instance(
248249
spot=instance_offer.instance.resources.spot,
249250
subnet_id=subnet_id,
250251
allocate_public_ip=allocate_public_ip,
251-
placement_group_name=instance_config.placement_group_name,
252+
placement_group_name=placement_group.name if placement_group else None,
252253
enable_efa=enable_efa,
253254
max_efa_interfaces=max_efa_interfaces,
254255
reservation_id=instance_config.reservation,
@@ -291,6 +292,7 @@ def create_instance(
291292
def create_placement_group(
292293
self,
293294
placement_group: PlacementGroup,
295+
master_instance_offer: InstanceOffer,
294296
) -> PlacementGroupProvisioningData:
295297
ec2_client = self.session.client("ec2", region_name=placement_group.configuration.region)
296298
logger.debug("Creating placement group %s...", placement_group.name)
@@ -323,6 +325,16 @@ def delete_placement_group(
323325
raise e
324326
logger.debug("Deleted placement group %s", placement_group.name)
325327

328+
def is_suitable_placement_group(
329+
self,
330+
placement_group: PlacementGroup,
331+
instance_offer: InstanceOffer,
332+
) -> bool:
333+
return (
334+
placement_group.configuration.backend == BackendType.AWS
335+
and placement_group.configuration.region == instance_offer.region
336+
)
337+
326338
def create_gateway(
327339
self,
328340
configuration: GatewayComputeConfiguration,

src/dstack/_internal/core/backends/azure/compute.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@
6262
InstanceOfferWithAvailability,
6363
InstanceType,
6464
)
65+
from dstack._internal.core.models.placement import PlacementGroup
6566
from dstack._internal.core.models.resources import Memory, Range
6667
from dstack._internal.core.models.runs import JobProvisioningData, Requirements
6768
from dstack._internal.utils.logging import get_logger
@@ -109,6 +110,7 @@ def create_instance(
109110
self,
110111
instance_offer: InstanceOfferWithAvailability,
111112
instance_config: InstanceConfiguration,
113+
placement_group: Optional[PlacementGroup],
112114
) -> JobProvisioningData:
113115
instance_name = generate_unique_instance_name(
114116
instance_config, max_length=azure_resources.MAX_RESOURCE_NAME_LEN

src/dstack/_internal/core/backends/base/compute.py

Lines changed: 42 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
)
2626
from dstack._internal.core.models.instances import (
2727
InstanceConfiguration,
28+
InstanceOffer,
2829
InstanceOfferWithAvailability,
2930
SSHKey,
3031
)
@@ -144,6 +145,7 @@ def create_instance(
144145
self,
145146
instance_offer: InstanceOfferWithAvailability,
146147
instance_config: InstanceConfiguration,
148+
placement_group: Optional[PlacementGroup],
147149
) -> JobProvisioningData:
148150
"""
149151
Launches a new instance. It should return `JobProvisioningData` ASAP.
@@ -176,7 +178,7 @@ def run_job(
176178
)
177179
instance_offer = instance_offer.copy()
178180
self._restrict_instance_offer_az_to_volumes_az(instance_offer, volumes)
179-
return self.create_instance(instance_offer, instance_config)
181+
return self.create_instance(instance_offer, instance_config, placement_group=None)
180182

181183
def _restrict_instance_offer_az_to_volumes_az(
182184
self,
@@ -225,9 +227,15 @@ class ComputeWithPlacementGroupSupport(ABC):
225227
def create_placement_group(
226228
self,
227229
placement_group: PlacementGroup,
230+
master_instance_offer: InstanceOffer,
228231
) -> PlacementGroupProvisioningData:
229232
"""
230233
Creates a placement group.
234+
235+
Args:
236+
placement_group: details about the placement group to be created
237+
master_instance_offer: the first instance dstack will attempt to add
238+
to the placement group
231239
"""
232240
pass
233241

@@ -242,10 +250,27 @@ def delete_placement_group(
242250
"""
243251
pass
244252

253+
@abstractmethod
254+
def is_suitable_placement_group(
255+
self,
256+
placement_group: PlacementGroup,
257+
instance_offer: InstanceOffer,
258+
) -> bool:
259+
"""
260+
Checks if the instance offer can be provisioned in the placement group.
261+
262+
Should return immediately, without performing API calls.
263+
264+
Can be called with an offer originating from a different backend, because some backends
265+
(BackendType.DSTACK) produce offers on behalf of other backends. Should return `False`
266+
in that case.
267+
"""
268+
pass
269+
245270

246271
class ComputeWithGatewaySupport(ABC):
247272
"""
248-
Must be subclassed and imlemented to support gateways.
273+
Must be subclassed and implemented to support gateways.
249274
"""
250275

251276
@abstractmethod
@@ -418,6 +443,21 @@ def generate_unique_volume_name(
418443
)
419444

420445

446+
def generate_unique_placement_group_name(
447+
project_name: str,
448+
fleet_name: str,
449+
max_length: int = _DEFAULT_MAX_RESOURCE_NAME_LEN,
450+
) -> str:
451+
"""
452+
Generates a unique placement group name valid across all backends.
453+
"""
454+
return generate_unique_backend_name(
455+
resource_name=fleet_name,
456+
project_name=project_name,
457+
max_length=max_length,
458+
)
459+
460+
421461
def generate_unique_backend_name(
422462
resource_name: str,
423463
project_name: Optional[str],

src/dstack/_internal/core/backends/cudo/compute.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
InstanceConfiguration,
1919
InstanceOfferWithAvailability,
2020
)
21+
from dstack._internal.core.models.placement import PlacementGroup
2122
from dstack._internal.core.models.runs import JobProvisioningData, Requirements
2223
from dstack._internal.utils.logging import get_logger
2324

@@ -58,6 +59,7 @@ def create_instance(
5859
self,
5960
instance_offer: InstanceOfferWithAvailability,
6061
instance_config: InstanceConfiguration,
62+
placement_group: Optional[PlacementGroup],
6163
) -> JobProvisioningData:
6264
vm_id = generate_unique_instance_name(instance_config, max_length=MAX_RESOURCE_NAME_LEN)
6365
public_keys = instance_config.get_public_keys()

src/dstack/_internal/core/backends/datacrunch/compute.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
InstanceOffer,
2121
InstanceOfferWithAvailability,
2222
)
23+
from dstack._internal.core.models.placement import PlacementGroup
2324
from dstack._internal.core.models.resources import Memory, Range
2425
from dstack._internal.core.models.runs import JobProvisioningData, Requirements
2526
from dstack._internal.utils.logging import get_logger
@@ -85,6 +86,7 @@ def create_instance(
8586
self,
8687
instance_offer: InstanceOfferWithAvailability,
8788
instance_config: InstanceConfiguration,
89+
placement_group: Optional[PlacementGroup],
8890
) -> JobProvisioningData:
8991
instance_name = generate_unique_instance_name(
9092
instance_config, max_length=MAX_INSTANCE_NAME_LEN

src/dstack/_internal/core/backends/gcp/auth.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ def authenticate(creds: AnyGCPCreds, project_id: Optional[str] = None) -> Tuple[
1919
credentials, credentials_project_id = get_credentials(creds)
2020
if project_id is None:
2121
# If project_id is not specified explicitly, try using credentials' project_id.
22-
# Explicit project_id takes precedence bacause credentials' project_id may be irrelevant.
22+
# Explicit project_id takes precedence because credentials' project_id may be irrelevant.
2323
# For example, with Workload Identity Federation for GKE, it's cluster project_id.
2424
project_id = credentials_project_id
2525
if project_id is None:

src/dstack/_internal/core/backends/gcp/compute.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,7 @@ def create_instance(
166166
self,
167167
instance_offer: InstanceOfferWithAvailability,
168168
instance_config: InstanceConfiguration,
169+
placement_group: Optional[PlacementGroup],
169170
) -> JobProvisioningData:
170171
instance_name = generate_unique_instance_name(
171172
instance_config, max_length=gcp_resources.MAX_RESOURCE_NAME_LEN
@@ -199,11 +200,11 @@ def create_instance(
199200
instance_type_name=instance_offer.instance.name,
200201
)
201202
placement_policy = None
202-
if instance_config.placement_group_name is not None:
203+
if placement_group is not None:
203204
placement_policy = gcp_resources.get_placement_policy_resource_name(
204205
project_id=self.config.project_id,
205206
region=instance_offer.region,
206-
placement_policy=instance_config.placement_group_name,
207+
placement_policy=placement_group.name,
207208
)
208209
labels = {
209210
"owner": "dstack",
@@ -406,6 +407,7 @@ def update_provisioning_data(
406407
def create_placement_group(
407408
self,
408409
placement_group: PlacementGroup,
410+
master_instance_offer: InstanceOffer,
409411
) -> PlacementGroupProvisioningData:
410412
policy = compute_v1.ResourcePolicy(
411413
name=placement_group.name,
@@ -440,6 +442,16 @@ def delete_placement_group(
440442
raise PlacementGroupInUseError()
441443
raise
442444

445+
def is_suitable_placement_group(
446+
self,
447+
placement_group: PlacementGroup,
448+
instance_offer: InstanceOffer,
449+
) -> bool:
450+
return (
451+
placement_group.configuration.backend == BackendType.GCP
452+
and placement_group.configuration.region == instance_offer.region
453+
)
454+
443455
def create_gateway(
444456
self,
445457
configuration: GatewayComputeConfiguration,

src/dstack/_internal/core/backends/lambdalabs/compute.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
InstanceOffer,
2121
InstanceOfferWithAvailability,
2222
)
23+
from dstack._internal.core.models.placement import PlacementGroup
2324
from dstack._internal.core.models.runs import JobProvisioningData, Requirements
2425

2526
MAX_INSTANCE_NAME_LEN = 60
@@ -46,7 +47,10 @@ def get_offers(
4647
return offers_with_availability
4748

4849
def create_instance(
49-
self, instance_offer: InstanceOfferWithAvailability, instance_config: InstanceConfiguration
50+
self,
51+
instance_offer: InstanceOfferWithAvailability,
52+
instance_config: InstanceConfiguration,
53+
placement_group: Optional[PlacementGroup],
5054
) -> JobProvisioningData:
5155
instance_name = generate_unique_instance_name(
5256
instance_config, max_length=MAX_INSTANCE_NAME_LEN

src/dstack/_internal/core/backends/local/compute.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
InstanceType,
1616
Resources,
1717
)
18+
from dstack._internal.core.models.placement import PlacementGroup
1819
from dstack._internal.core.models.runs import Job, JobProvisioningData, Requirements, Run
1920
from dstack._internal.core.models.volumes import Volume, VolumeProvisioningData
2021
from dstack._internal.utils.logging import get_logger
@@ -53,6 +54,7 @@ def create_instance(
5354
self,
5455
instance_offer: InstanceOfferWithAvailability,
5556
instance_config: InstanceConfiguration,
57+
placement_group: Optional[PlacementGroup],
5658
) -> JobProvisioningData:
5759
return JobProvisioningData(
5860
backend=instance_offer.backend,

0 commit comments

Comments
 (0)