Skip to content

Commit 773ea85

Browse files
authored
Merge pull request opensandbox-group#752 from Pangjiping/feat/server/windows-k8s
feat(server): windows profile for batchsandbox
2 parents 56ba428 + 6b0efbd commit 773ea85

11 files changed

Lines changed: 481 additions & 60 deletions

server/opensandbox_server/services/k8s/agent_sandbox_provider.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
)
4040
from opensandbox_server.services.k8s.volume_helper import apply_volumes_to_pod_spec
4141
from opensandbox_server.services.k8s.workload_provider import WorkloadProvider
42+
from opensandbox_server.services.k8s.windows_profile import is_windows_profile
4243
from opensandbox_server.services.runtime_resolver import SecureRuntimeResolver
4344

4445
logger = logging.getLogger(__name__)
@@ -140,12 +141,11 @@ def create_workload(
140141
egress_mode: str = EGRESS_MODE_DNS,
141142
) -> Dict[str, Any]:
142143
"""Create an agent-sandbox Sandbox CRD workload."""
144+
if is_windows_profile(platform):
145+
raise ValueError("agent-sandbox does not support platform.os=windows.")
146+
143147
if self.runtime_class:
144-
logger.info(
145-
"Using Kubernetes RuntimeClass '%s' for sandbox %s",
146-
self.runtime_class,
147-
sandbox_id,
148-
)
148+
logger.info(f"Using Kubernetes RuntimeClass '{self.runtime_class}' for sandbox {sandbox_id}")
149149

150150
pod_spec = self._build_pod_spec(
151151
image_spec=image_spec,
@@ -260,7 +260,6 @@ def _build_pod_spec(
260260
entrypoint=entrypoint,
261261
env=env,
262262
resource_limits=resource_limits,
263-
include_execd_volume=True,
264263
has_network_policy=network_policy is not None,
265264
)
266265

@@ -363,7 +362,7 @@ def get_expiration(self, workload: Dict[str, Any]) -> Optional[datetime]:
363362
try:
364363
return datetime.fromisoformat(shutdown_time_str.replace("Z", "+00:00"))
365364
except (ValueError, TypeError) as e:
366-
logger.warning("Invalid shutdownTime format: %s, error: %s", shutdown_time_str, e)
365+
logger.warning(f"Invalid shutdownTime format: {shutdown_time_str}, error: {e}")
367366
return None
368367

369368
def get_status(self, workload: Dict[str, Any]) -> Dict[str, Any]:
@@ -515,7 +514,7 @@ def get_endpoint_info(self, workload: Dict[str, Any], port: int, sandbox_id: str
515514
if pod.status and pod.status.pod_ip and pod.status.phase == "Running":
516515
return Endpoint(endpoint=f"{pod.status.pod_ip}:{port}")
517516
except Exception as e:
518-
logger.warning("Failed to resolve pod endpoint: %s", e)
517+
logger.warning(f"Failed to resolve pod endpoint: {e}")
519518

520519
service_fqdn = status.get("serviceFQDN")
521520
if service_fqdn:

server/opensandbox_server/services/k8s/batchsandbox_provider.py

Lines changed: 40 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,12 @@
4646
_extract_platform_unschedulable_message_from_pod,
4747
_workload_platform_constraint_scope,
4848
)
49+
from opensandbox_server.services.k8s.windows_profile import (
50+
apply_windows_profile_arch_selector,
51+
apply_windows_profile_overrides,
52+
is_windows_profile,
53+
validate_windows_profile_resource_limits,
54+
)
4955
from opensandbox_server.services.k8s.volume_helper import apply_volumes_to_pod_spec
5056
from opensandbox_server.services.k8s.workload_provider import WorkloadProvider
5157
from opensandbox_server.services.runtime_resolver import SecureRuntimeResolver
@@ -67,7 +73,7 @@ def __init__(
6773
k8s_config = app_config.kubernetes if app_config else None
6874
template_file_path = k8s_config.batchsandbox_template_file if k8s_config else None
6975
if template_file_path:
70-
logger.info("Using BatchSandbox template file: %s", template_file_path)
76+
logger.info(f"Using BatchSandbox template file: {template_file_path}")
7177
self.execd_init_resources = k8s_config.execd_init_resources if k8s_config else None
7278

7379
self.resolver = SecureRuntimeResolver(app_config) if app_config else None
@@ -113,13 +119,10 @@ def create_workload(
113119
) -> Dict[str, Any]:
114120
"""Create a BatchSandbox in template mode or pool mode."""
115121
extensions = extensions or {}
122+
windows_profile = is_windows_profile(platform)
116123

117124
if self.runtime_class:
118-
logger.info(
119-
"Using Kubernetes RuntimeClass '%s' for sandbox %s",
120-
self.runtime_class,
121-
sandbox_id,
122-
)
125+
logger.info(f"Using Kubernetes RuntimeClass '{self.runtime_class}' for sandbox {sandbox_id}")
123126

124127
if extensions.get("poolRef"):
125128
if platform is not None:
@@ -145,6 +148,9 @@ def create_workload(
145148

146149
extra_volumes, extra_mounts = self._extract_template_pod_extras()
147150

151+
if windows_profile:
152+
validate_windows_profile_resource_limits(resource_limits)
153+
148154
disable_ipv6_for_egress = (
149155
network_policy is not None
150156
and egress_image is not None
@@ -161,13 +167,11 @@ def create_workload(
161167
entrypoint=entrypoint,
162168
env=env,
163169
resource_limits=resource_limits,
164-
include_execd_volume=True,
165170
has_network_policy=network_policy is not None,
166171
)
167172

168173
containers = [_container_to_dict(main_container)]
169-
170-
pod_spec: Dict[str, Any] = {
174+
pod_spec = {
171175
"initContainers": [_container_to_dict(init_container)],
172176
"containers": containers,
173177
"volumes": [
@@ -177,8 +181,29 @@ def create_workload(
177181
}
178182
],
179183
}
180-
self._apply_platform_node_selector(pod_spec, platform)
184+
if windows_profile:
185+
apply_windows_profile_overrides(
186+
pod_spec=pod_spec,
187+
entrypoint=entrypoint,
188+
env=env,
189+
resource_limits=resource_limits,
190+
disable_ipv6_for_egress=disable_ipv6_for_egress,
191+
)
192+
template = self.template_manager.get_base_template()
193+
template_spec = (
194+
template.get("spec", {})
195+
.get("template", {})
196+
.get("spec", {})
197+
)
198+
apply_windows_profile_arch_selector(
199+
pod_spec=pod_spec,
200+
template_spec=template_spec if isinstance(template_spec, dict) else {},
201+
platform=platform,
202+
)
203+
else:
204+
self._apply_platform_node_selector(pod_spec, platform)
181205

206+
containers = pod_spec.get("containers", [])
182207
if self.runtime_class:
183208
pod_spec["runtimeClassName"] = self.runtime_class
184209

@@ -222,7 +247,7 @@ def create_workload(
222247
else:
223248
batchsandbox["spec"]["expireTime"] = expires_at.isoformat()
224249
self._merge_pod_spec_extras(batchsandbox, extra_volumes, extra_mounts)
225-
if platform is not None:
250+
if platform is not None and not windows_profile:
226251
merged_pod_spec = batchsandbox.get("spec", {}).get("template", {}).get("spec", {})
227252
WorkloadProvider.ensure_platform_compatible_with_affinity(merged_pod_spec, platform)
228253

@@ -245,9 +270,9 @@ def create_workload(
245270
)
246271
try:
247272
self.k8s_client.create_secret(namespace=namespace, body=secret)
248-
logger.info("Created imagePullSecret for sandbox %s", sandbox_id)
273+
logger.info(f"Created imagePullSecret for sandbox {sandbox_id}")
249274
except Exception:
250-
logger.warning("Failed to create imagePullSecret for sandbox %s, rolling back BatchSandbox", sandbox_id)
275+
logger.warning(f"Failed to create imagePullSecret for sandbox {sandbox_id}, rolling back BatchSandbox")
251276
try:
252277
self.k8s_client.delete_custom_object(
253278
group=self.group,
@@ -258,7 +283,7 @@ def create_workload(
258283
grace_period_seconds=0,
259284
)
260285
except Exception as del_exc:
261-
logger.warning("Failed to rollback BatchSandbox %s: %s", sandbox_id, del_exc)
286+
logger.warning(f"Failed to rollback BatchSandbox {sandbox_id}: {del_exc}")
262287
raise
263288

264289
return {
@@ -502,7 +527,7 @@ def get_expiration(self, workload: Dict[str, Any]) -> Optional[datetime]:
502527
try:
503528
return datetime.fromisoformat(expire_time_str.replace('Z', '+00:00'))
504529
except (ValueError, TypeError) as e:
505-
logger.warning("Invalid expireTime format: %s, error: %s", expire_time_str, e)
530+
logger.warning(f"Invalid expireTime format: {expire_time_str}, error: {e}")
506531
return None
507532

508533
def _parse_pod_ip(self, workload: Dict[str, Any]) -> Optional[str]:

server/opensandbox_server/services/k8s/client.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ def _get_informer(self, group: str, version: str, plural: str, namespace: str) -
113113
try:
114114
informer.start()
115115
except Exception as exc: # pragma: no cover - defensive
116-
logger.warning("Failed to start informer for %s/%s: %s", plural, namespace, exc)
116+
logger.warning(f"Failed to start informer for {plural}/{namespace}: {exc}")
117117
self._informers.pop(key, None)
118118
return None
119119
return informer

server/opensandbox_server/services/k8s/informer.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -142,12 +142,12 @@ def _run(self) -> None:
142142
self._resource_version = None
143143
self._has_synced = False
144144
else:
145-
logger.warning("Informer watch error: %s", exc, exc_info=True)
145+
logger.warning(f"Informer watch error: {exc}", exc_info=True)
146146
self._has_synced = False
147147
self._stop_event.wait(min(backoff, 30.0))
148148
backoff = min(backoff * 2, 30.0)
149149
except Exception as exc: # pragma: no cover - defensive
150-
logger.warning("Unexpected informer error: %s", exc, exc_info=True)
150+
logger.warning(f"Unexpected informer error: {exc}", exc_info=True)
151151
self._has_synced = False
152152
self._stop_event.wait(min(backoff, 30.0))
153153
backoff = min(backoff * 2, 30.0)

server/opensandbox_server/services/k8s/kubernetes_service.py

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -316,14 +316,13 @@ def _ensure_pvc_volumes(self, volumes: list) -> None:
316316
except ApiException as e:
317317
if e.status == 403:
318318
logger.warning(
319-
"No RBAC permission to read PVC '%s', skipping auto-create. "
320-
"Grant 'get' and 'create' on 'persistentvolumeclaims' to enable.",
321-
claim_name,
319+
f"No RBAC permission to read PVC '{claim_name}', skipping auto-create. "
320+
"Grant 'get' and 'create' on 'persistentvolumeclaims' to enable."
322321
)
323322
return # Skip all remaining PVCs — same SA, same permissions
324323
raise
325324
if existing is not None:
326-
logger.debug("PVC '%s' already exists in namespace '%s'", claim_name, self.namespace)
325+
logger.debug(f"PVC '{claim_name}' already exists in namespace '{self.namespace}'")
327326
continue
328327

329328
storage = vol.pvc.storage or default_size
@@ -346,18 +345,17 @@ def _ensure_pvc_volumes(self, volumes: list) -> None:
346345
try:
347346
self.k8s_client.create_pvc(self.namespace, pvc_body)
348347
logger.info(
349-
"Auto-created PVC '%s' (size=%s, class=%s) in namespace '%s'",
350-
claim_name, storage, storage_class or "<default>", self.namespace,
348+
f"Auto-created PVC '{claim_name}' (size={storage}, class={storage_class or '<default>'}) "
349+
f"in namespace '{self.namespace}'"
351350
)
352351
except ApiException as e:
353352
if e.status == 409:
354353
# Race condition: another request created it between our check and create
355-
logger.info("PVC '%s' was created concurrently, proceeding", claim_name)
354+
logger.info(f"PVC '{claim_name}' was created concurrently, proceeding")
356355
elif e.status == 403:
357356
logger.warning(
358-
"No RBAC permission to create PVC '%s', skipping. "
359-
"The PVC must be pre-created or RBAC must be updated.",
360-
claim_name,
357+
f"No RBAC permission to create PVC '{claim_name}', skipping. "
358+
"The PVC must be pre-created or RBAC must be updated."
361359
)
362360
elif e.status in (400, 422):
363361
# Invalid PVC spec from user-provided hints
@@ -371,7 +369,7 @@ def _ensure_pvc_volumes(self, volumes: list) -> None:
371369
},
372370
) from e
373371
else:
374-
logger.error("Failed to create PVC '%s': %s", claim_name, e)
372+
logger.error(f"Failed to create PVC '{claim_name}': {e}")
375373
raise HTTPException(
376374
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
377375
detail={

server/opensandbox_server/services/k8s/pool_service.py

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ def create_pool(self, request: CreatePoolRequest) -> PoolResponse:
119119
plural=_PLURAL,
120120
body=manifest,
121121
)
122-
logger.info("Created pool: name=%s, namespace=%s", request.name, self._namespace)
122+
logger.info(f"Created pool: name={request.name}, namespace={self._namespace}")
123123
return self._pool_from_raw(created)
124124

125125
except ApiException as e:
@@ -131,7 +131,7 @@ def create_pool(self, request: CreatePoolRequest) -> PoolResponse:
131131
"message": f"Pool '{request.name}' already exists.",
132132
},
133133
) from e
134-
logger.error("Kubernetes API error creating pool %s: %s", request.name, e)
134+
logger.error(f"Kubernetes API error creating pool {request.name}: {e}")
135135
raise HTTPException(
136136
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
137137
detail={
@@ -140,7 +140,7 @@ def create_pool(self, request: CreatePoolRequest) -> PoolResponse:
140140
},
141141
) from e
142142
except Exception as e:
143-
logger.error("Unexpected error creating pool %s: %s", request.name, e)
143+
logger.error(f"Unexpected error creating pool {request.name}: {e}")
144144
raise HTTPException(
145145
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
146146
detail={
@@ -170,7 +170,7 @@ def get_pool(self, pool_name: str) -> PoolResponse:
170170
"message": f"Pool '{pool_name}' not found.",
171171
},
172172
) from e
173-
logger.error("Kubernetes API error getting pool %s: %s", pool_name, e)
173+
logger.error(f"Kubernetes API error getting pool {pool_name}: {e}")
174174
raise HTTPException(
175175
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
176176
detail={
@@ -181,7 +181,7 @@ def get_pool(self, pool_name: str) -> PoolResponse:
181181
except HTTPException:
182182
raise
183183
except Exception as e:
184-
logger.error("Unexpected error getting pool %s: %s", pool_name, e)
184+
logger.error(f"Unexpected error getting pool {pool_name}: {e}")
185185
raise HTTPException(
186186
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
187187
detail={
@@ -209,7 +209,7 @@ def list_pools(self) -> ListPoolsResponse:
209209
# CRD not installed — return empty list gracefully
210210
logger.warning("Pool CRD not found (404); returning empty list.")
211211
return ListPoolsResponse(items=[])
212-
logger.error("Kubernetes API error listing pools: %s", e)
212+
logger.error(f"Kubernetes API error listing pools: {e}")
213213
raise HTTPException(
214214
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
215215
detail={
@@ -218,7 +218,7 @@ def list_pools(self) -> ListPoolsResponse:
218218
},
219219
) from e
220220
except Exception as e:
221-
logger.error("Unexpected error listing pools: %s", e)
221+
logger.error(f"Unexpected error listing pools: {e}")
222222
raise HTTPException(
223223
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
224224
detail={
@@ -249,7 +249,7 @@ def update_pool(self, pool_name: str, request: UpdatePoolRequest) -> PoolRespons
249249
name=pool_name,
250250
body=patch_body,
251251
)
252-
logger.info("Updated pool capacity: name=%s", pool_name)
252+
logger.info(f"Updated pool capacity: name={pool_name}")
253253
return self._pool_from_raw(updated)
254254

255255
except ApiException as e:
@@ -261,7 +261,7 @@ def update_pool(self, pool_name: str, request: UpdatePoolRequest) -> PoolRespons
261261
"message": f"Pool '{pool_name}' not found.",
262262
},
263263
) from e
264-
logger.error("Kubernetes API error updating pool %s: %s", pool_name, e)
264+
logger.error(f"Kubernetes API error updating pool {pool_name}: {e}")
265265
raise HTTPException(
266266
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
267267
detail={
@@ -272,7 +272,7 @@ def update_pool(self, pool_name: str, request: UpdatePoolRequest) -> PoolRespons
272272
except HTTPException:
273273
raise
274274
except Exception as e:
275-
logger.error("Unexpected error updating pool %s: %s", pool_name, e)
275+
logger.error(f"Unexpected error updating pool {pool_name}: {e}")
276276
raise HTTPException(
277277
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
278278
detail={
@@ -292,7 +292,7 @@ def delete_pool(self, pool_name: str) -> None:
292292
name=pool_name,
293293
grace_period_seconds=0,
294294
)
295-
logger.info("Deleted pool: name=%s, namespace=%s", pool_name, self._namespace)
295+
logger.info(f"Deleted pool: name={pool_name}, namespace={self._namespace}")
296296

297297
except ApiException as e:
298298
if e.status == 404:
@@ -303,7 +303,7 @@ def delete_pool(self, pool_name: str) -> None:
303303
"message": f"Pool '{pool_name}' not found.",
304304
},
305305
) from e
306-
logger.error("Kubernetes API error deleting pool %s: %s", pool_name, e)
306+
logger.error(f"Kubernetes API error deleting pool {pool_name}: {e}")
307307
raise HTTPException(
308308
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
309309
detail={
@@ -314,7 +314,7 @@ def delete_pool(self, pool_name: str) -> None:
314314
except HTTPException:
315315
raise
316316
except Exception as e:
317-
logger.error("Unexpected error deleting pool %s: %s", pool_name, e)
317+
logger.error(f"Unexpected error deleting pool {pool_name}: {e}")
318318
raise HTTPException(
319319
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
320320
detail={

0 commit comments

Comments
 (0)