3636 AMD_GPU_NAME_TO_DEVICE_IDS ,
3737 AMD_GPU_NODE_TAINT ,
3838 AMD_GPU_RESOURCE ,
39+ LABEL_VALUE_MAX_LENGTH ,
3940 NVIDIA_GPU_NAME_TO_GPU_INFO ,
4041 NVIDIA_GPU_NODE_TAINT ,
4142 NVIDIA_GPU_PRODUCT_LABEL ,
4243 NVIDIA_GPU_RESOURCE ,
4344 OBJECT_NAME_MAX_LENGTH ,
4445 PodPhase ,
4546 TaintEffect ,
47+ build_base_labels ,
4648 build_dockerconfigjson ,
4749 filter_invalid_labels ,
48- format_dstack_label_key ,
4950 format_memory ,
5051 get_amd_gpu_from_node_labels ,
5152 get_gpu_request_from_gpu_spec ,
@@ -191,20 +192,38 @@ def run_job(
191192 _create_jump_pod_service_if_not_exists (
192193 api = api ,
193194 namespace = namespace ,
195+ project_name = run .project_name ,
194196 jump_pod_name = jump_pod_name ,
195197 jump_pod_service_name = jump_pod_service_name ,
196198 jump_pod_port = cluster .proxy_jump .port ,
197199 project_ssh_public_key = project_ssh_public_key .strip (),
198200 )
199201
200- pod_name = generate_unique_instance_name_for_job (run , job )
202+ pod_name = generate_unique_instance_name_for_job (
203+ run , job , max_length = LABEL_VALUE_MAX_LENGTH
204+ )
205+
206+ base_labels = build_base_labels (
207+ component = "job" ,
208+ unique_name = pod_name ,
209+ project = run .project_name ,
210+ name = job .job_spec .job_name ,
211+ user = run .user ,
212+ )
213+ labels = merge_tags (
214+ base_tags = base_labels ,
215+ resource_tags = run .run_spec .configuration .tags ,
216+ )
217+ labels = filter_invalid_labels (labels )
218+
201219 registry_auth_secret_name : Optional [str ] = None
202220 with ExitStack () as exit_stack :
203221 if job .job_spec .registry_auth is not None :
204222 registry_auth_secret_name = _get_registry_auth_secret_name (pod_name )
205223 _create_registry_auth_secret (
206224 api = api ,
207225 namespace = namespace ,
226+ labels = labels ,
208227 secret_name = registry_auth_secret_name ,
209228 image_name = job .job_spec .image_name ,
210229 username = job .job_spec .registry_auth .username ,
@@ -224,6 +243,7 @@ def run_job(
224243 _create_job_pod (
225244 api = api ,
226245 namespace = namespace ,
246+ labels = labels ,
227247 pod_name = pod_name ,
228248 registry_auth_secret_name = registry_auth_secret_name ,
229249 run_spec = run .run_spec ,
@@ -264,10 +284,13 @@ def run_job(
264284 api .create_namespaced_service (
265285 namespace = namespace ,
266286 body = client .V1Service (
267- metadata = client .V1ObjectMeta (name = pod_service_name ),
287+ metadata = client .V1ObjectMeta (
288+ name = pod_service_name ,
289+ labels = labels ,
290+ ),
268291 spec = client .V1ServiceSpec (
269292 type = "ClusterIP" ,
270- selector = { "app.kubernetes.io/name" : pod_name } ,
293+ selector = _build_service_selector_from_labels ( base_labels ) ,
271294 ports = [client .V1ServicePort (port = DSTACK_RUNNER_SSH_PORT )],
272295 ),
273296 ),
@@ -444,14 +467,30 @@ def create_gateway(
444467 "The `kubernetes` backend does not support the `instance_type`"
445468 " gateway configuration property"
446469 )
447- instance_name = generate_unique_gateway_instance_name (configuration )
470+
471+ instance_name = generate_unique_gateway_instance_name (
472+ configuration , max_length = LABEL_VALUE_MAX_LENGTH
473+ )
474+
475+ base_labels = build_base_labels (
476+ component = "gateway" ,
477+ unique_name = instance_name ,
478+ project = configuration .project_name ,
479+ name = configuration .instance_name ,
480+ )
481+ labels = merge_tags (
482+ base_tags = base_labels ,
483+ resource_tags = configuration .tags ,
484+ )
485+ labels = filter_invalid_labels (labels )
486+
448487 commands = _get_gateway_commands (
449488 authorized_keys = [configuration .ssh_key_pub ], router = configuration .router
450489 )
451490 pod = client .V1Pod (
452491 metadata = client .V1ObjectMeta (
453492 name = instance_name ,
454- labels = { "app.kubernetes.io/name" : instance_name } ,
493+ labels = labels ,
455494 ),
456495 spec = client .V1PodSpec (
457496 containers = [
@@ -486,10 +525,11 @@ def create_gateway(
486525 service = client .V1Service (
487526 metadata = client .V1ObjectMeta (
488527 name = _get_pod_service_name (instance_name ),
528+ labels = labels ,
489529 ),
490530 spec = client .V1ServiceSpec (
491531 type = "LoadBalancer" ,
492- selector = { "app.kubernetes.io/name" : instance_name } ,
532+ selector = _build_service_selector_from_labels ( base_labels ) ,
493533 ports = [
494534 client .V1ServicePort (
495535 name = "ssh" ,
@@ -608,6 +648,7 @@ def register_volume(self, volume: Volume) -> VolumeProvisioningData:
608648
609649 def create_volume (self , volume : Volume ) -> VolumeProvisioningData :
610650 assert isinstance (volume .configuration , KubernetesVolumeConfiguration )
651+ assert volume .configuration .size is not None
611652
612653 region = volume .configuration .region
613654 cluster = self .region_cluster_map .get (region )
@@ -618,21 +659,21 @@ def create_volume(self, volume: Volume) -> VolumeProvisioningData:
618659 api = client .CoreV1Api (cluster .api_client )
619660 namespace = cluster .namespace
620661
621- labels = {
622- format_dstack_label_key ("owner" ): "dstack" ,
623- format_dstack_label_key ("project" ): volume .project_name ,
624- format_dstack_label_key ("name" ): volume .name ,
625- format_dstack_label_key ("user" ): volume .user ,
626- }
662+ pvc_name = generate_unique_volume_name (volume , max_length = LABEL_VALUE_MAX_LENGTH )
663+
664+ base_labels = build_base_labels (
665+ component = "volume" ,
666+ unique_name = pvc_name ,
667+ project = volume .project_name ,
668+ name = volume .name ,
669+ user = volume .user ,
670+ )
627671 labels = merge_tags (
628- base_tags = labels ,
672+ base_tags = base_labels ,
629673 resource_tags = volume .configuration .tags ,
630674 )
631675 labels = filter_invalid_labels (labels )
632676
633- assert volume .configuration .size is not None
634-
635- pvc_name = generate_unique_volume_name (volume , max_length = OBJECT_NAME_MAX_LENGTH )
636677 pvc = client .V1PersistentVolumeClaim (
637678 metadata = client .V1ObjectMeta (
638679 name = pvc_name ,
@@ -789,11 +830,19 @@ def _gpu_matches_gpu_spec(gpu: Gpu, gpu_spec: GPUSpec) -> bool:
789830def _create_jump_pod_service_if_not_exists (
790831 api : client .CoreV1Api ,
791832 namespace : str ,
833+ project_name : str ,
792834 jump_pod_name : str ,
793835 jump_pod_service_name : str ,
794836 jump_pod_port : Optional [int ],
795837 project_ssh_public_key : str ,
796838) -> None :
839+ base_labels = build_base_labels (
840+ component = "ssh-proxy" ,
841+ unique_name = jump_pod_name ,
842+ project = project_name ,
843+ )
844+ labels = filter_invalid_labels (base_labels )
845+
797846 service : Optional [client .V1Service ] = None
798847 pod : Optional [client .V1Pod ] = None
799848 _namespace = call_api_method (
@@ -805,7 +854,6 @@ def _create_jump_pod_service_if_not_exists(
805854 _namespace = client .V1Namespace (
806855 metadata = client .V1ObjectMeta (
807856 name = namespace ,
808- labels = {"app.kubernetes.io/name" : namespace },
809857 ),
810858 )
811859 api .create_namespace (body = _namespace )
@@ -867,7 +915,7 @@ def _create_jump_pod_service_if_not_exists(
867915 pod = client .V1Pod (
868916 metadata = client .V1ObjectMeta (
869917 name = jump_pod_name ,
870- labels = { "app.kubernetes.io/name" : jump_pod_name } ,
918+ labels = labels ,
871919 ),
872920 spec = client .V1PodSpec (
873921 containers = [
@@ -897,10 +945,13 @@ def _create_jump_pod_service_if_not_exists(
897945 name = jump_pod_service_name ,
898946 )
899947 service = client .V1Service (
900- metadata = client .V1ObjectMeta (name = jump_pod_service_name ),
948+ metadata = client .V1ObjectMeta (
949+ name = jump_pod_service_name ,
950+ labels = labels ,
951+ ),
901952 spec = client .V1ServiceSpec (
902953 type = "NodePort" ,
903- selector = { "app.kubernetes.io/name" : jump_pod_name } ,
954+ selector = _build_service_selector_from_labels ( base_labels ) ,
904955 ports = [
905956 client .V1ServicePort (
906957 port = JUMP_POD_SSH_PORT ,
@@ -1038,6 +1089,7 @@ def _get_jump_pod_commands(authorized_keys: list[str]) -> list[str]:
10381089def _create_registry_auth_secret (
10391090 api : client .CoreV1Api ,
10401091 namespace : str ,
1092+ labels : dict [str , str ],
10411093 secret_name : str ,
10421094 image_name : str ,
10431095 username : str ,
@@ -1049,7 +1101,10 @@ def _create_registry_auth_secret(
10491101 password = password ,
10501102 )
10511103 secret = client .V1Secret (
1052- metadata = client .V1ObjectMeta (name = secret_name ),
1104+ metadata = client .V1ObjectMeta (
1105+ name = secret_name ,
1106+ labels = labels ,
1107+ ),
10531108 type = "kubernetes.io/dockerconfigjson" ,
10541109 string_data = {".dockerconfigjson" : dockerconfigjson },
10551110 )
@@ -1062,6 +1117,7 @@ def _create_registry_auth_secret(
10621117def _create_job_pod (
10631118 api : client .CoreV1Api ,
10641119 namespace : str ,
1120+ labels : dict [str , str ],
10651121 pod_name : str ,
10661122 registry_auth_secret_name : Optional [str ],
10671123 run_spec : RunSpec ,
@@ -1186,7 +1242,7 @@ def _create_job_pod(
11861242 pod = client .V1Pod (
11871243 metadata = client .V1ObjectMeta (
11881244 name = pod_name ,
1189- labels = { "app.kubernetes.io/name" : pod_name } ,
1245+ labels = labels ,
11901246 ),
11911247 spec = client .V1PodSpec (
11921248 containers = [
@@ -1399,6 +1455,11 @@ def _run_ssh_command(
13991455 return proc .returncode , proc .stdout
14001456
14011457
1458+ def _build_service_selector_from_labels (labels : dict [str , str ]) -> dict [str , str ]:
1459+ label_key = "app.kubernetes.io/instance"
1460+ return {label_key : labels [label_key ]}
1461+
1462+
14021463def _get_pod_service_name (pod_name : str ) -> str :
14031464 return f"{ pod_name } -service"
14041465
0 commit comments