Skip to content

Commit 91d59f6

Browse files
authored
Kubernetes: standardize object labeling (#3891)
Now all dstack-managed resources (jump pod, job pod, gateway, volume, registry-auth secret, services) get the same set of labels:

* `app.kubernetes.io/name=dstack-{ssh-proxy|job|gateway|volume}`
* `app.kubernetes.io/instance={unique_generated_name}`
* `app.kubernetes.io/managed-by=dstack`
* `k8s.dstack.ai/project`
* `k8s.dstack.ai/name` (if applicable)
* `k8s.dstack.ai/user` (if applicable)
1 parent 0bc4300 commit 91d59f6

2 files changed

Lines changed: 108 additions & 28 deletions

File tree

src/dstack/_internal/core/backends/kubernetes/compute.py

Lines changed: 84 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -36,16 +36,17 @@
3636
AMD_GPU_NAME_TO_DEVICE_IDS,
3737
AMD_GPU_NODE_TAINT,
3838
AMD_GPU_RESOURCE,
39+
LABEL_VALUE_MAX_LENGTH,
3940
NVIDIA_GPU_NAME_TO_GPU_INFO,
4041
NVIDIA_GPU_NODE_TAINT,
4142
NVIDIA_GPU_PRODUCT_LABEL,
4243
NVIDIA_GPU_RESOURCE,
4344
OBJECT_NAME_MAX_LENGTH,
4445
PodPhase,
4546
TaintEffect,
47+
build_base_labels,
4648
build_dockerconfigjson,
4749
filter_invalid_labels,
48-
format_dstack_label_key,
4950
format_memory,
5051
get_amd_gpu_from_node_labels,
5152
get_gpu_request_from_gpu_spec,
@@ -191,20 +192,38 @@ def run_job(
191192
_create_jump_pod_service_if_not_exists(
192193
api=api,
193194
namespace=namespace,
195+
project_name=run.project_name,
194196
jump_pod_name=jump_pod_name,
195197
jump_pod_service_name=jump_pod_service_name,
196198
jump_pod_port=cluster.proxy_jump.port,
197199
project_ssh_public_key=project_ssh_public_key.strip(),
198200
)
199201

200-
pod_name = generate_unique_instance_name_for_job(run, job)
202+
pod_name = generate_unique_instance_name_for_job(
203+
run, job, max_length=LABEL_VALUE_MAX_LENGTH
204+
)
205+
206+
base_labels = build_base_labels(
207+
component="job",
208+
unique_name=pod_name,
209+
project=run.project_name,
210+
name=job.job_spec.job_name,
211+
user=run.user,
212+
)
213+
labels = merge_tags(
214+
base_tags=base_labels,
215+
resource_tags=run.run_spec.configuration.tags,
216+
)
217+
labels = filter_invalid_labels(labels)
218+
201219
registry_auth_secret_name: Optional[str] = None
202220
with ExitStack() as exit_stack:
203221
if job.job_spec.registry_auth is not None:
204222
registry_auth_secret_name = _get_registry_auth_secret_name(pod_name)
205223
_create_registry_auth_secret(
206224
api=api,
207225
namespace=namespace,
226+
labels=labels,
208227
secret_name=registry_auth_secret_name,
209228
image_name=job.job_spec.image_name,
210229
username=job.job_spec.registry_auth.username,
@@ -224,6 +243,7 @@ def run_job(
224243
_create_job_pod(
225244
api=api,
226245
namespace=namespace,
246+
labels=labels,
227247
pod_name=pod_name,
228248
registry_auth_secret_name=registry_auth_secret_name,
229249
run_spec=run.run_spec,
@@ -264,10 +284,13 @@ def run_job(
264284
api.create_namespaced_service(
265285
namespace=namespace,
266286
body=client.V1Service(
267-
metadata=client.V1ObjectMeta(name=pod_service_name),
287+
metadata=client.V1ObjectMeta(
288+
name=pod_service_name,
289+
labels=labels,
290+
),
268291
spec=client.V1ServiceSpec(
269292
type="ClusterIP",
270-
selector={"app.kubernetes.io/name": pod_name},
293+
selector=_build_service_selector_from_labels(base_labels),
271294
ports=[client.V1ServicePort(port=DSTACK_RUNNER_SSH_PORT)],
272295
),
273296
),
@@ -444,14 +467,30 @@ def create_gateway(
444467
"The `kubernetes` backend does not support the `instance_type`"
445468
" gateway configuration property"
446469
)
447-
instance_name = generate_unique_gateway_instance_name(configuration)
470+
471+
instance_name = generate_unique_gateway_instance_name(
472+
configuration, max_length=LABEL_VALUE_MAX_LENGTH
473+
)
474+
475+
base_labels = build_base_labels(
476+
component="gateway",
477+
unique_name=instance_name,
478+
project=configuration.project_name,
479+
name=configuration.instance_name,
480+
)
481+
labels = merge_tags(
482+
base_tags=base_labels,
483+
resource_tags=configuration.tags,
484+
)
485+
labels = filter_invalid_labels(labels)
486+
448487
commands = _get_gateway_commands(
449488
authorized_keys=[configuration.ssh_key_pub], router=configuration.router
450489
)
451490
pod = client.V1Pod(
452491
metadata=client.V1ObjectMeta(
453492
name=instance_name,
454-
labels={"app.kubernetes.io/name": instance_name},
493+
labels=labels,
455494
),
456495
spec=client.V1PodSpec(
457496
containers=[
@@ -486,10 +525,11 @@ def create_gateway(
486525
service = client.V1Service(
487526
metadata=client.V1ObjectMeta(
488527
name=_get_pod_service_name(instance_name),
528+
labels=labels,
489529
),
490530
spec=client.V1ServiceSpec(
491531
type="LoadBalancer",
492-
selector={"app.kubernetes.io/name": instance_name},
532+
selector=_build_service_selector_from_labels(base_labels),
493533
ports=[
494534
client.V1ServicePort(
495535
name="ssh",
@@ -608,6 +648,7 @@ def register_volume(self, volume: Volume) -> VolumeProvisioningData:
608648

609649
def create_volume(self, volume: Volume) -> VolumeProvisioningData:
610650
assert isinstance(volume.configuration, KubernetesVolumeConfiguration)
651+
assert volume.configuration.size is not None
611652

612653
region = volume.configuration.region
613654
cluster = self.region_cluster_map.get(region)
@@ -618,21 +659,21 @@ def create_volume(self, volume: Volume) -> VolumeProvisioningData:
618659
api = client.CoreV1Api(cluster.api_client)
619660
namespace = cluster.namespace
620661

621-
labels = {
622-
format_dstack_label_key("owner"): "dstack",
623-
format_dstack_label_key("project"): volume.project_name,
624-
format_dstack_label_key("name"): volume.name,
625-
format_dstack_label_key("user"): volume.user,
626-
}
662+
pvc_name = generate_unique_volume_name(volume, max_length=LABEL_VALUE_MAX_LENGTH)
663+
664+
base_labels = build_base_labels(
665+
component="volume",
666+
unique_name=pvc_name,
667+
project=volume.project_name,
668+
name=volume.name,
669+
user=volume.user,
670+
)
627671
labels = merge_tags(
628-
base_tags=labels,
672+
base_tags=base_labels,
629673
resource_tags=volume.configuration.tags,
630674
)
631675
labels = filter_invalid_labels(labels)
632676

633-
assert volume.configuration.size is not None
634-
635-
pvc_name = generate_unique_volume_name(volume, max_length=OBJECT_NAME_MAX_LENGTH)
636677
pvc = client.V1PersistentVolumeClaim(
637678
metadata=client.V1ObjectMeta(
638679
name=pvc_name,
@@ -789,11 +830,19 @@ def _gpu_matches_gpu_spec(gpu: Gpu, gpu_spec: GPUSpec) -> bool:
789830
def _create_jump_pod_service_if_not_exists(
790831
api: client.CoreV1Api,
791832
namespace: str,
833+
project_name: str,
792834
jump_pod_name: str,
793835
jump_pod_service_name: str,
794836
jump_pod_port: Optional[int],
795837
project_ssh_public_key: str,
796838
) -> None:
839+
base_labels = build_base_labels(
840+
component="ssh-proxy",
841+
unique_name=jump_pod_name,
842+
project=project_name,
843+
)
844+
labels = filter_invalid_labels(base_labels)
845+
797846
service: Optional[client.V1Service] = None
798847
pod: Optional[client.V1Pod] = None
799848
_namespace = call_api_method(
@@ -805,7 +854,6 @@ def _create_jump_pod_service_if_not_exists(
805854
_namespace = client.V1Namespace(
806855
metadata=client.V1ObjectMeta(
807856
name=namespace,
808-
labels={"app.kubernetes.io/name": namespace},
809857
),
810858
)
811859
api.create_namespace(body=_namespace)
@@ -867,7 +915,7 @@ def _create_jump_pod_service_if_not_exists(
867915
pod = client.V1Pod(
868916
metadata=client.V1ObjectMeta(
869917
name=jump_pod_name,
870-
labels={"app.kubernetes.io/name": jump_pod_name},
918+
labels=labels,
871919
),
872920
spec=client.V1PodSpec(
873921
containers=[
@@ -897,10 +945,13 @@ def _create_jump_pod_service_if_not_exists(
897945
name=jump_pod_service_name,
898946
)
899947
service = client.V1Service(
900-
metadata=client.V1ObjectMeta(name=jump_pod_service_name),
948+
metadata=client.V1ObjectMeta(
949+
name=jump_pod_service_name,
950+
labels=labels,
951+
),
901952
spec=client.V1ServiceSpec(
902953
type="NodePort",
903-
selector={"app.kubernetes.io/name": jump_pod_name},
954+
selector=_build_service_selector_from_labels(base_labels),
904955
ports=[
905956
client.V1ServicePort(
906957
port=JUMP_POD_SSH_PORT,
@@ -1038,6 +1089,7 @@ def _get_jump_pod_commands(authorized_keys: list[str]) -> list[str]:
10381089
def _create_registry_auth_secret(
10391090
api: client.CoreV1Api,
10401091
namespace: str,
1092+
labels: dict[str, str],
10411093
secret_name: str,
10421094
image_name: str,
10431095
username: str,
@@ -1049,7 +1101,10 @@ def _create_registry_auth_secret(
10491101
password=password,
10501102
)
10511103
secret = client.V1Secret(
1052-
metadata=client.V1ObjectMeta(name=secret_name),
1104+
metadata=client.V1ObjectMeta(
1105+
name=secret_name,
1106+
labels=labels,
1107+
),
10531108
type="kubernetes.io/dockerconfigjson",
10541109
string_data={".dockerconfigjson": dockerconfigjson},
10551110
)
@@ -1062,6 +1117,7 @@ def _create_registry_auth_secret(
10621117
def _create_job_pod(
10631118
api: client.CoreV1Api,
10641119
namespace: str,
1120+
labels: dict[str, str],
10651121
pod_name: str,
10661122
registry_auth_secret_name: Optional[str],
10671123
run_spec: RunSpec,
@@ -1186,7 +1242,7 @@ def _create_job_pod(
11861242
pod = client.V1Pod(
11871243
metadata=client.V1ObjectMeta(
11881244
name=pod_name,
1189-
labels={"app.kubernetes.io/name": pod_name},
1245+
labels=labels,
11901246
),
11911247
spec=client.V1PodSpec(
11921248
containers=[
@@ -1399,6 +1455,11 @@ def _run_ssh_command(
13991455
return proc.returncode, proc.stdout
14001456

14011457

1458+
def _build_service_selector_from_labels(labels: dict[str, str]) -> dict[str, str]:
1459+
label_key = "app.kubernetes.io/instance"
1460+
return {label_key: labels[label_key]}
1461+
1462+
14021463
def _get_pod_service_name(pod_name: str) -> str:
14031464
return f"{pod_name}-service"
14041465

src/dstack/_internal/core/backends/kubernetes/resources.py

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from collections.abc import Mapping
66
from decimal import Decimal
77
from enum import Enum
8-
from typing import Callable, Optional, Union, cast
8+
from typing import Callable, Literal, Optional, Union, cast
99

1010
from gpuhunt import KNOWN_AMD_GPUS, KNOWN_NVIDIA_GPUS, AcceleratorVendor
1111

@@ -135,6 +135,29 @@ def __sub__(self, other: Self) -> Self:
135135
return type(self)(**dct)
136136

137137

138+
def build_base_labels(
    *,
    component: Literal["ssh-proxy", "job", "gateway", "volume"],
    unique_name: str,
    project: str,
    name: Optional[str] = None,
    user: Optional[str] = None,
) -> dict[str, str]:
    """Build the common label set for a dstack-managed Kubernetes object.

    All arguments are keyword-only. Values are inserted as given; this
    function performs no validation or truncation of label values.

    Args:
        component: Kind of resource; becomes the suffix of the
            ``app.kubernetes.io/name`` value (``dstack-<component>``).
        unique_name: Value for ``app.kubernetes.io/instance``.
        project: Value for ``k8s.dstack.ai/project``.
        name: Value for ``k8s.dstack.ai/name``; the label is omitted when
            this is ``None``.
        user: Value for ``k8s.dstack.ai/user``; the label is omitted when
            this is ``None``.

    Returns:
        A new dict mapping label keys to label values.
    """
    labels = {
        "app.kubernetes.io/name": f"dstack-{component}",
        # app.kubernetes.io/component would be redundant as app.kubernetes.io/name already includes
        # it with dstack- prefix
        "app.kubernetes.io/instance": unique_name,
        "app.kubernetes.io/managed-by": "dstack",
        "k8s.dstack.ai/project": project,
    }
    # Only None suppresses an optional label; an empty string is still emitted.
    if name is not None:
        labels["k8s.dstack.ai/name"] = name
    if user is not None:
        labels["k8s.dstack.ai/user"] = user
    return labels
159+
160+
138161
def filter_invalid_labels(labels: dict[str, str]) -> dict[str, str]:
139162
filtered_labels: dict[str, str] = {}
140163
for k, v in labels.items():
@@ -178,10 +201,6 @@ def validate_label_value(value: str) -> None:
178201
raise ValueError("Invalid value")
179202

180203

181-
def format_dstack_label_key(name: str) -> str:
182-
return f"k8s.dstack.ai/{name}"
183-
184-
185204
def build_dockerconfigjson(image_name: str, username: str, password: str) -> str:
186205
registry = docker_utils.parse_image_name(image_name).registry
187206
if registry is None or docker_utils.is_default_registry(registry):

0 commit comments

Comments
 (0)