Skip to content

Commit cbde51d

Browse files
committed
Use AS_COMPACT collocation for GCP A3 clusters
1 parent f00aeae commit cbde51d

1 file changed

Lines changed: 42 additions & 6 deletions

File tree

src/dstack/_internal/core/backends/gcp/compute.py

Lines changed: 42 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,10 @@ class GCPVolumeDiskBackendData(CoreModel):
7676
disk_type: str
7777

7878

79+
class GCPPlacementGroupBackendData(CoreModel):
    """Backend-specific data recorded for a GCP placement group.

    Serialized to JSON and stored as the ``backend_data`` of the placement
    group's provisioning data, then parsed back when checking whether an
    existing group is suitable for an instance offer.
    """

    # Collocation mode the group's resource policy was created with,
    # e.g. "COLLOCATED" or "AS_COMPACT".
    collocation: str
81+
82+
7983
class GCPCompute(
8084
ComputeWithCreateInstanceSupport,
8185
ComputeWithMultinodeSupport,
@@ -409,20 +413,34 @@ def create_placement_group(
409413
placement_group: PlacementGroup,
410414
master_instance_offer: InstanceOffer,
411415
) -> PlacementGroupProvisioningData:
416+
group_placement_policy = compute_v1.ResourcePolicyGroupPlacementPolicy(
417+
availability_domain_count=1,
418+
collocation="COLLOCATED",
419+
)
420+
if _instance_supports_as_compact_placement(master_instance_offer):
421+
group_placement_policy = compute_v1.ResourcePolicyGroupPlacementPolicy(
422+
# GCP documents only collocation="COLLOCATED"
423+
# but collocation="AS_COMPACT" actually places VMs on the same host
424+
# and improves networking performance. Discovered with Gemini.
425+
# Tested to work with A3 instances.
426+
collocation="AS_COMPACT",
427+
)
412428
policy = compute_v1.ResourcePolicy(
413429
name=placement_group.name,
414430
region=placement_group.configuration.region,
415-
group_placement_policy=compute_v1.ResourcePolicyGroupPlacementPolicy(
416-
availability_domain_count=1,
417-
collocation="COLLOCATED",
418-
),
431+
group_placement_policy=group_placement_policy,
419432
)
420433
self.resource_policies_client.insert(
421434
project=self.config.project_id,
422435
region=placement_group.configuration.region,
423436
resource_policy_resource=policy,
424437
)
425-
return PlacementGroupProvisioningData(backend=BackendType.GCP)
438+
return PlacementGroupProvisioningData(
439+
backend=BackendType.GCP,
440+
backend_data=GCPPlacementGroupBackendData(
441+
collocation=group_placement_policy.collocation
442+
).json(),
443+
)
426444

427445
def delete_placement_group(
428446
self,
@@ -447,10 +465,20 @@ def is_suitable_placement_group(
447465
placement_group: PlacementGroup,
448466
instance_offer: InstanceOffer,
449467
) -> bool:
450-
return (
468+
if not (
451469
placement_group.configuration.backend == BackendType.GCP
452470
and placement_group.configuration.region == instance_offer.region
471+
):
472+
return False
473+
provisioning_data = get_or_error(placement_group.provisioning_data)
474+
if provisioning_data.backend_data is None:
475+
return True
476+
backend_data_parsed = GCPPlacementGroupBackendData.parse_raw(
477+
provisioning_data.backend_data
453478
)
479+
if backend_data_parsed.collocation == "AS_COMPACT":
480+
return _instance_supports_as_compact_placement(instance_offer)
481+
return True
454482

455483
def create_gateway(
456484
self,
@@ -916,6 +944,14 @@ def _get_user_data(authorized_keys: List[str], instance_type_name: str) -> str:
916944
)
917945

918946

947+
def _instance_supports_as_compact_placement(instance_offer: InstanceOffer) -> bool:
    """Return whether the offered instance type may use AS_COMPACT placement.

    Only the explicitly listed A3 instance types are accepted; any other
    instance type name yields ``False``.

    Args:
        instance_offer: Offer whose ``instance.name`` is checked.

    Returns:
        ``True`` if the instance type name is one of the listed A3 types.
    """
    # Immutable tuple: the allow-list is fixed and never mutated.
    return instance_offer.instance.name in (
        "a3-edgegpu-8g",
        "a3-highgpu-8g",
        "a3-megagpu-8g",
    )
953+
954+
919955
def _get_backend_specific_commands(instance_type_name: str) -> List[str]:
920956
if instance_type_name == "a3-megagpu-8g":
921957
return tcpx_features.get_backend_specific_commands_tcpxo()

0 commit comments

Comments
 (0)