Skip to content

Commit 9b435c8

Browse files
authored
feat(networking): Net admin configurability (#39)
* . * k8s vals * Fix tests * Clean up
1 parent cae7c79 commit 9b435c8

9 files changed

Lines changed: 111 additions & 20 deletions

File tree

code-interpreter/app/app_configs.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,14 @@
2020
os.environ.get("KUBERNETES_EXECUTOR_IMAGE") or "onyxdotapp/python-executor-sci"
2121
)
2222
KUBERNETES_EXECUTOR_SERVICE_ACCOUNT = os.environ.get("KUBERNETES_EXECUTOR_SERVICE_ACCOUNT") or ""
23+
# When true, executor pods run a root init container (NET_ADMIN capability only) that uses
24+
# iptables to drop all outbound traffic before the executor container starts. This
25+
# avoids the race where a pod can reach the network before the CNI enforces a
26+
# NetworkPolicy. Environments whose CNI applies NetworkPolicies without that race
27+
# (or that disallow NET_ADMIN) can set this to false and rely on a NetworkPolicy.
28+
KUBERNETES_EXECUTOR_NET_ADMIN_LOCKDOWN = (
29+
os.environ.get("KUBERNETES_EXECUTOR_NET_ADMIN_LOCKDOWN") or "true"
30+
).lower() not in ("false", "0", "no")
2331

2432
# Execution limits
2533
MAX_EXEC_TIMEOUT_MS = int(os.environ.get("MAX_EXEC_TIMEOUT_MS") or 60_000)

code-interpreter/app/services/executor_kubernetes.py

Lines changed: 28 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from app.app_configs import (
2626
KUBERNETES_EXECUTOR_IMAGE,
2727
KUBERNETES_EXECUTOR_NAMESPACE,
28+
KUBERNETES_EXECUTOR_NET_ADMIN_LOCKDOWN,
2829
KUBERNETES_EXECUTOR_SERVICE_ACCOUNT,
2930
)
3031
from app.services.executor_base import (
@@ -95,6 +96,7 @@ def __init__(self) -> None:
9596
self.namespace = KUBERNETES_EXECUTOR_NAMESPACE
9697
self.image = KUBERNETES_EXECUTOR_IMAGE
9798
self.service_account = KUBERNETES_EXECUTOR_SERVICE_ACCOUNT
99+
self.net_admin_lockdown = KUBERNETES_EXECUTOR_NET_ADMIN_LOCKDOWN
98100

99101
def check_health(self) -> HealthCheck:
100102
"""Verify Kubernetes API is reachable and we can create pods in the namespace."""
@@ -196,26 +198,34 @@ def _create_pod_manifest(
196198
# executor container as well. This eliminates the race condition
197199
# where the pod can send network requests before the Kubernetes
198200
# NetworkPolicy is enforced by the CNI.
199-
iptables_script = "set -e && iptables -A OUTPUT -j DROP && ip6tables -A OUTPUT -j DROP"
200-
network_lockdown_container = V1Container(
201-
name="network-lockdown",
202-
image=self.image,
203-
command=["sh", "-c", iptables_script],
204-
security_context={
205-
"runAsUser": 0,
206-
"runAsNonRoot": False,
207-
"allowPrivilegeEscalation": False,
208-
"readOnlyRootFilesystem": True,
209-
"capabilities": {"drop": ["ALL"], "add": ["NET_ADMIN"]},
210-
},
211-
resources={
212-
"limits": {"cpu": "100m", "memory": "32Mi"},
213-
"requests": {"cpu": "10m", "memory": "16Mi"},
214-
},
215-
)
201+
#
202+
# This requires the NET_ADMIN capability. Environments whose CNI
203+
# enforces NetworkPolicies without that race (or that disallow
204+
# NET_ADMIN) can disable this and rely on a NetworkPolicy instead.
205+
init_containers: list[V1Container] = []
206+
if self.net_admin_lockdown:
207+
iptables_script = "set -e && iptables -A OUTPUT -j DROP && ip6tables -A OUTPUT -j DROP"
208+
init_containers.append(
209+
V1Container(
210+
name="network-lockdown",
211+
image=self.image,
212+
command=["sh", "-c", iptables_script],
213+
security_context={
214+
"runAsUser": 0,
215+
"runAsNonRoot": False,
216+
"allowPrivilegeEscalation": False,
217+
"readOnlyRootFilesystem": True,
218+
"capabilities": {"drop": ["ALL"], "add": ["NET_ADMIN"]},
219+
},
220+
resources={
221+
"limits": {"cpu": "100m", "memory": "32Mi"},
222+
"requests": {"cpu": "10m", "memory": "16Mi"},
223+
},
224+
)
225+
)
216226

217227
spec = V1PodSpec(
218-
init_containers=[network_lockdown_container],
228+
init_containers=init_containers or None,
219229
containers=[container],
220230
restart_policy="Never",
221231
active_deadline_seconds=active_deadline_seconds,

code-interpreter/tests/integration_tests/test_binary_file_integrity.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ def executor() -> KubernetesExecutor:
2525
inst.namespace = "test"
2626
inst.image = "test:latest"
2727
inst.service_account = ""
28+
inst.net_admin_lockdown = True
2829
return inst
2930

3031

code-interpreter/tests/integration_tests/test_kubernetes_streaming.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ def executor() -> KubernetesExecutor:
3232
inst.namespace = "test"
3333
inst.image = "test:latest"
3434
inst.service_account = ""
35+
inst.net_admin_lockdown = True
3536
pod_mock = MagicMock()
3637
pod_mock.status.phase = "Running"
3738

code-interpreter/tests/integration_tests/test_session_bash_kubernetes.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ def executor() -> KubernetesExecutor:
1818
inst.namespace = "test"
1919
inst.image = "test:latest"
2020
inst.service_account = ""
21+
inst.net_admin_lockdown = True
2122
return inst
2223

2324

code-interpreter/tests/integration_tests/test_sessions_kubernetes.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ def executor() -> KubernetesExecutor:
3232
inst.namespace = "test"
3333
inst.image = "test:latest"
3434
inst.service_account = ""
35+
inst.net_admin_lockdown = True
3536
pod_mock = MagicMock()
3637
pod_mock.status.phase = "Running"
3738
inst.v1.read_namespaced_pod.return_value = pod_mock
@@ -79,6 +80,32 @@ def test_create_session_sets_active_deadline(executor: KubernetesExecutor) -> No
7980
assert pod.spec.containers[0].command == ["sleep", "600"]
8081

8182

83+
def test_create_session_adds_net_admin_lockdown_init_container(
84+
executor: KubernetesExecutor,
85+
) -> None:
86+
"""With lockdown enabled, a NET_ADMIN init container drops outbound traffic."""
87+
executor.create_session(ttl_seconds=600)
88+
89+
pod = executor.v1.create_namespaced_pod.call_args.kwargs["body"]
90+
init_containers = pod.spec.init_containers
91+
assert init_containers is not None
92+
assert len(init_containers) == 1
93+
lockdown = init_containers[0]
94+
assert lockdown.name == "network-lockdown"
95+
assert lockdown.security_context["capabilities"]["add"] == ["NET_ADMIN"]
96+
97+
98+
def test_create_session_omits_lockdown_when_disabled(
99+
executor: KubernetesExecutor,
100+
) -> None:
101+
"""With lockdown disabled, no privileged init container is added (rely on NetworkPolicy)."""
102+
executor.net_admin_lockdown = False
103+
executor.create_session(ttl_seconds=600)
104+
105+
pod = executor.v1.create_namespaced_pod.call_args.kwargs["body"]
106+
assert pod.spec.init_containers is None
107+
108+
82109
def test_create_session_stages_files(executor: KubernetesExecutor) -> None:
83110
with patch.object(executor, "_upload_tar_to_pod") as upload:
84111
info = executor.create_session(

kubernetes/code-interpreter/templates/deployment.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,8 @@ spec:
6161
value: {{ include "code-interpreter.kubernetesNamespace" . | quote }}
6262
- name: KUBERNETES_EXECUTOR_IMAGE
6363
value: {{ .Values.codeInterpreter.kubernetesExecutor.image | quote }}
64+
- name: KUBERNETES_EXECUTOR_NET_ADMIN_LOCKDOWN
65+
value: {{ .Values.codeInterpreter.kubernetesExecutor.netAdminLockdown | quote }}
6466
{{- if .Values.codeInterpreter.kubernetesExecutor.serviceAccount }}
6567
- name: KUBERNETES_EXECUTOR_SERVICE_ACCOUNT
6668
value: {{ .Values.codeInterpreter.kubernetesExecutor.serviceAccount | quote }}

kubernetes/code-interpreter/templates/networkpolicy.yaml

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,4 +19,28 @@ spec:
1919
egress:
2020
{{- toYaml . | nindent 4 }}
2121
{{- end }}
22-
{{- end }}
22+
{{- end }}
23+
---
24+
# Deny-all-egress policy for the executor/session pods, which run untrusted user
25+
# code. Always applied (independent of networkPolicy.enabled, which only governs
26+
# the API-server policy above) and pinned to the executor namespace.
27+
apiVersion: networking.k8s.io/v1
28+
kind: NetworkPolicy
29+
metadata:
30+
name: {{ include "code-interpreter.fullname" . }}-executor
31+
namespace: {{ include "code-interpreter.kubernetesNamespace" . }}
32+
labels:
33+
{{- include "code-interpreter.labels" . | nindent 4 }}
34+
spec:
35+
podSelector:
36+
matchLabels:
37+
app: code-interpreter
38+
matchExpressions:
39+
- key: component
40+
operator: In
41+
values:
42+
- executor
43+
- session
44+
policyTypes:
45+
- Egress
46+
egress: [] # No egress rules => all outbound traffic is denied.

kubernetes/code-interpreter/values.yaml

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,17 @@ codeInterpreter:
111111
namespace: "" # Empty means use the same namespace as the service
112112
image: "" # Empty means use the default in code: `onyxdotapp/python-executor-sci`
113113
serviceAccount: "" # Empty means use default service account for pods
114+
# When true, executor pods run a root init container that uses the
115+
# NET_ADMIN capability to apply iptables rules dropping all outbound traffic
116+
# before the executor starts — an OS-level lockdown that avoids the race
117+
# where a pod can reach the network before the CNI enforces a NetworkPolicy.
118+
# Set to false in environments whose CNI applies NetworkPolicies without that
119+
# race (or that disallow the NET_ADMIN capability).
120+
#
121+
# When this is false, the always-applied executor/session NetworkPolicy
122+
# (templates/networkpolicy.yaml) becomes the sole network restriction on
123+
# executor pods, so it must be enforced by the cluster CNI to have effect.
124+
netAdminLockdown: true
114125
# Resource limits for execution pods
115126
podResources:
116127
limits:
@@ -162,7 +173,13 @@ readinessProbe:
162173
timeoutSeconds: 3
163174
failureThreshold: 3
164175

165-
# Network Policy
176+
# Network Policy for the API-server (deployment) pod, configured by the fields
177+
# below. Off by default; its defaults (deny-all ingress, DNS-only egress) are a
178+
# starting point that most installs will need to customize before enabling.
179+
#
180+
# NOTE: this flag does NOT govern the executor/session pods. Those pods run user
181+
# code and are ALWAYS given a deny-all-egress NetworkPolicy
182+
# (templates/networkpolicy.yaml), independent of this setting.
166183
networkPolicy:
167184
enabled: false
168185
policyTypes:

0 commit comments

Comments
 (0)