Skip to content

Commit de1ceac

Browse files
committed
refactor: adapt gpustack operator
Signed-off-by: thxCode <thxcode0824@gmail.com>
1 parent 917af32 commit de1ceac

1 file changed

Lines changed: 20 additions & 17 deletions

File tree

gpustack_runtime/deployer/kuberentes.py

Lines changed: 20 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1060,7 +1060,7 @@ def _create_pod(
10601060
# Request quantity of devices with Kueue admission.
10611061
if not envs.GPUSTACK_RUNTIME_KUBERNETES_KDP_NO_KUEUE_ADMISSION:
10621062
pod.metadata.labels["kueue.x-k8s.io/queue-name"] = (
1063-
self._find_kdp_devices_group(ren)
1063+
self._find_kueue_queue_name(ren)
10641064
)
10651065
continue
10661066
# Request device via visible devices env.
@@ -1088,7 +1088,7 @@ def _create_pod(
10881088
# Request quantity of devices with Kueue admission.
10891089
if not envs.GPUSTACK_RUNTIME_KUBERNETES_KDP_NO_KUEUE_ADMISSION:
10901090
pod.metadata.labels["kueue.x-k8s.io/queue-name"] = (
1091-
self._find_kdp_devices_group(ren)
1091+
self._find_kueue_queue_name(ren)
10921092
)
10931093
continue
10941094

@@ -1547,26 +1547,29 @@ def _find_self_pod(self) -> kubernetes.client.V1Pod | None:
15471547
namespace=self_pod_namespace,
15481548
)
15491549

1550-
def _find_kdp_devices_group(self, runtime_env: str) -> str:
1550+
def _find_kueue_queue_name(self, runtime_env: str) -> str:
15511551
manu = self.get_manufacturer(runtime_env)
1552-
crd_api = kubernetes.client.CustomObjectsApi(self._client)
1553-
1552+
core_api = kubernetes.client.CoreV1Api(self._client)
15541553
try:
1555-
devices = crd_api.get_cluster_custom_object(
1556-
group="worker.gpustack.ai",
1557-
version="v1",
1558-
plural="devices",
1559-
name=self._node_name,
1560-
)
1554+
# Iterate over the node's labels to find a key of the following form:
1555+
# "feature.gpustack.ai/${manu}-${device-group-id}.profile-queue: <profile>",
1556+
# then combine the manufacturer, device group ID, and profile to form the queue name for Kueue admission.
1557+
#
1558+
# For example, for `feature.gpustack.ai/nvidia-tesla-t4.profile-queue: 12c-46g-1d`,
1559+
# the node key is `nvidia-tesla-t4`,
1560+
# the queue name is `gpustack-nvidia-tesla-t4-12c-46g-1d`.
1561+
node = core_api.read_node(name=self._node_name)
1562+
prefix = "feature.gpustack.ai/"
1563+
suffix = ".profile-queue"
1564+
for k, v in node.metadata.labels.items():
1565+
if k.startswith(f"{prefix}{manu}-") and k.endswith(suffix):
1566+
node_key = k[len(prefix) : -len(suffix)]
1567+
return f"gpustack-{node_key}-{v}"
15611568
except kubernetes.client.exceptions.ApiException as e:
1562-
msg = f"Failed to get KDP devices of node {self._node_name} for runtime environment {runtime_env}{_detail_api_call_error(e)}"
1569+
msg = f"Failed to get Kueue queue name on node {self._node_name} for runtime environment {runtime_env}{_detail_api_call_error(e)}"
15631570
raise OperationError(msg) from e
15641571

1565-
for group in devices["spec"]["groups"] or []:
1566-
if group["manufacturer"] == manu:
1567-
return f"gpustack-{group['id']!s}"
1568-
1569-
msg = f"Failed to find KDP devices group for runtime environment {runtime_env} on node {self._node_name}"
1572+
msg = f"Failed to find Kueue queue name on node {self._node_name} for runtime environment {runtime_env}"
15701573
raise OperationError(msg)
15711574

15721575
@_supported

0 commit comments

Comments
 (0)