feat(networking): Net admin configurability (#39)

Danelegend · web-flow · commit 9b435c8a6c74 · 2026-05-27T18:01:34.000-07:00
* .

* k8s vals

* Fix tests

* Clean up
diff --git a/code-interpreter/app/app_configs.py b/code-interpreter/app/app_configs.py
@@ -20,6 +20,14 @@
     os.environ.get("KUBERNETES_EXECUTOR_IMAGE") or "onyxdotapp/python-executor-sci"
 )
 KUBERNETES_EXECUTOR_SERVICE_ACCOUNT = os.environ.get("KUBERNETES_EXECUTOR_SERVICE_ACCOUNT") or ""
+# When true, executor pods run a root init container granted only the NET_ADMIN
+# capability, which uses iptables to drop all outbound traffic before the executor
+# container starts. This avoids the race where a pod can reach the network before
+# the CNI enforces a NetworkPolicy. Set to "false", "0", or "no" where the CNI has
+# no such race (or NET_ADMIN is disallowed) and rely on a NetworkPolicy instead.
+KUBERNETES_EXECUTOR_NET_ADMIN_LOCKDOWN = (
+    os.environ.get("KUBERNETES_EXECUTOR_NET_ADMIN_LOCKDOWN") or "true"
+).lower() not in ("false", "0", "no")
 
 # Execution limits
 MAX_EXEC_TIMEOUT_MS = int(os.environ.get("MAX_EXEC_TIMEOUT_MS") or 60_000)
diff --git a/code-interpreter/app/services/executor_kubernetes.py b/code-interpreter/app/services/executor_kubernetes.py
@@ -25,6 +25,7 @@
 from app.app_configs import (
     KUBERNETES_EXECUTOR_IMAGE,
     KUBERNETES_EXECUTOR_NAMESPACE,
+    KUBERNETES_EXECUTOR_NET_ADMIN_LOCKDOWN,
     KUBERNETES_EXECUTOR_SERVICE_ACCOUNT,
 )
 from app.services.executor_base import (
@@ -95,6 +96,7 @@ def __init__(self) -> None:
         self.namespace = KUBERNETES_EXECUTOR_NAMESPACE
         self.image = KUBERNETES_EXECUTOR_IMAGE
         self.service_account = KUBERNETES_EXECUTOR_SERVICE_ACCOUNT
+        self.net_admin_lockdown = KUBERNETES_EXECUTOR_NET_ADMIN_LOCKDOWN
 
     def check_health(self) -> HealthCheck:
         """Verify Kubernetes API is reachable and we can create pods in the namespace."""
@@ -196,26 +198,34 @@ def _create_pod_manifest(
         # executor container as well. This eliminates the race condition
         # where the pod can send network requests before the Kubernetes
         # NetworkPolicy is enforced by the CNI.
-        iptables_script = "set -e && iptables -A OUTPUT -j DROP && ip6tables -A OUTPUT -j DROP"
-        network_lockdown_container = V1Container(
-            name="network-lockdown",
-            image=self.image,
-            command=["sh", "-c", iptables_script],
-            security_context={
-                "runAsUser": 0,
-                "runAsNonRoot": False,
-                "allowPrivilegeEscalation": False,
-                "readOnlyRootFilesystem": True,
-                "capabilities": {"drop": ["ALL"], "add": ["NET_ADMIN"]},
-            },
-            resources={
-                "limits": {"cpu": "100m", "memory": "32Mi"},
-                "requests": {"cpu": "10m", "memory": "16Mi"},
-            },
-        )
+        #
+        # This requires the NET_ADMIN capability. Environments whose CNI
+        # enforces NetworkPolicies without that race (or that disallow
+        # NET_ADMIN) can disable this and rely on a NetworkPolicy instead.
+        init_containers: list[V1Container] = []
+        if self.net_admin_lockdown:
+            iptables_script = "set -e && iptables -A OUTPUT -j DROP && ip6tables -A OUTPUT -j DROP"
+            init_containers.append(
+                V1Container(
+                    name="network-lockdown",
+                    image=self.image,
+                    command=["sh", "-c", iptables_script],
+                    security_context={
+                        "runAsUser": 0,
+                        "runAsNonRoot": False,
+                        "allowPrivilegeEscalation": False,
+                        "readOnlyRootFilesystem": True,
+                        "capabilities": {"drop": ["ALL"], "add": ["NET_ADMIN"]},
+                    },
+                    resources={
+                        "limits": {"cpu": "100m", "memory": "32Mi"},
+                        "requests": {"cpu": "10m", "memory": "16Mi"},
+                    },
+                )
+            )
 
         spec = V1PodSpec(
-            init_containers=[network_lockdown_container],
+            init_containers=init_containers or None,
             containers=[container],
             restart_policy="Never",
             active_deadline_seconds=active_deadline_seconds,
diff --git a/code-interpreter/tests/integration_tests/test_binary_file_integrity.py b/code-interpreter/tests/integration_tests/test_binary_file_integrity.py
@@ -25,6 +25,7 @@ def executor() -> KubernetesExecutor:
     inst.namespace = "test"
     inst.image = "test:latest"
     inst.service_account = ""
+    inst.net_admin_lockdown = True
     return inst
 
 
diff --git a/code-interpreter/tests/integration_tests/test_kubernetes_streaming.py b/code-interpreter/tests/integration_tests/test_kubernetes_streaming.py
@@ -32,6 +32,7 @@ def executor() -> KubernetesExecutor:
     inst.namespace = "test"
     inst.image = "test:latest"
     inst.service_account = ""
+    inst.net_admin_lockdown = True
     pod_mock = MagicMock()
     pod_mock.status.phase = "Running"
 
diff --git a/code-interpreter/tests/integration_tests/test_session_bash_kubernetes.py b/code-interpreter/tests/integration_tests/test_session_bash_kubernetes.py
@@ -18,6 +18,7 @@ def executor() -> KubernetesExecutor:
     inst.namespace = "test"
     inst.image = "test:latest"
     inst.service_account = ""
+    inst.net_admin_lockdown = True
     return inst
 
 
diff --git a/code-interpreter/tests/integration_tests/test_sessions_kubernetes.py b/code-interpreter/tests/integration_tests/test_sessions_kubernetes.py
@@ -32,6 +32,7 @@ def executor() -> KubernetesExecutor:
     inst.namespace = "test"
     inst.image = "test:latest"
     inst.service_account = ""
+    inst.net_admin_lockdown = True
     pod_mock = MagicMock()
     pod_mock.status.phase = "Running"
     inst.v1.read_namespaced_pod.return_value = pod_mock
@@ -79,6 +80,32 @@ def test_create_session_sets_active_deadline(executor: KubernetesExecutor) -> No
     assert pod.spec.containers[0].command == ["sleep", "600"]
 
 
+def test_create_session_adds_net_admin_lockdown_init_container(
+    executor: KubernetesExecutor,
+) -> None:
+    """With lockdown enabled, a NET_ADMIN init container drops outbound traffic."""
+    executor.create_session(ttl_seconds=600)
+
+    pod = executor.v1.create_namespaced_pod.call_args.kwargs["body"]
+    init_containers = pod.spec.init_containers
+    assert init_containers is not None
+    assert len(init_containers) == 1
+    lockdown = init_containers[0]
+    assert lockdown.name == "network-lockdown"
+    assert lockdown.security_context["capabilities"]["add"] == ["NET_ADMIN"]
+
+
+def test_create_session_omits_lockdown_when_disabled(
+    executor: KubernetesExecutor,
+) -> None:
+    """With lockdown disabled, no privileged init container is added (rely on NetworkPolicy)."""
+    executor.net_admin_lockdown = False
+    executor.create_session(ttl_seconds=600)
+
+    pod = executor.v1.create_namespaced_pod.call_args.kwargs["body"]
+    assert pod.spec.init_containers is None
+
+
 def test_create_session_stages_files(executor: KubernetesExecutor) -> None:
     with patch.object(executor, "_upload_tar_to_pod") as upload:
         info = executor.create_session(
diff --git a/kubernetes/code-interpreter/templates/deployment.yaml b/kubernetes/code-interpreter/templates/deployment.yaml
@@ -61,6 +61,8 @@ spec:
               value: {{ include "code-interpreter.kubernetesNamespace" . | quote }}
             - name: KUBERNETES_EXECUTOR_IMAGE
               value: {{ .Values.codeInterpreter.kubernetesExecutor.image | quote }}
+            - name: KUBERNETES_EXECUTOR_NET_ADMIN_LOCKDOWN
+              value: {{ .Values.codeInterpreter.kubernetesExecutor.netAdminLockdown | quote }}
             {{- if .Values.codeInterpreter.kubernetesExecutor.serviceAccount }}
             - name: KUBERNETES_EXECUTOR_SERVICE_ACCOUNT
               value: {{ .Values.codeInterpreter.kubernetesExecutor.serviceAccount | quote }}
diff --git a/kubernetes/code-interpreter/templates/networkpolicy.yaml b/kubernetes/code-interpreter/templates/networkpolicy.yaml
@@ -19,4 +19,28 @@ spec:
   egress:
     {{- toYaml . | nindent 4 }}
   {{- end }}
-{{- end }}
+{{- end }}
+---
+# Deny-all-egress policy for the executor/session pods, which run untrusted user
+# code. Always applied (independent of networkPolicy.enabled, which only governs
+# the API-server policy above) and pinned to the executor namespace.
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+  name: {{ include "code-interpreter.fullname" . }}-executor
+  namespace: {{ include "code-interpreter.kubernetesNamespace" . }}
+  labels:
+    {{- include "code-interpreter.labels" . | nindent 4 }}
+spec:
+  podSelector:
+    matchLabels:
+      app: code-interpreter
+    matchExpressions:
+      - key: component
+        operator: In
+        values:
+          - executor
+          - session
+  policyTypes:
+    - Egress
+  egress: []  # No egress rules => all outbound traffic is denied.
diff --git a/kubernetes/code-interpreter/values.yaml b/kubernetes/code-interpreter/values.yaml
@@ -111,6 +111,17 @@ codeInterpreter:
     namespace: ""  # Empty means use the same namespace as the service
     image: ""  # Empty means used the default in code: `
     serviceAccount: ""  # Empty means use default service account for pods
+    # When true, executor pods run a root init container that is granted only the
+    # NET_ADMIN capability to apply iptables rules dropping all outbound traffic
+    # before the executor starts — an OS-level lockdown that avoids the race
+    # where a pod can reach the network before the CNI enforces a NetworkPolicy.
+    # Set to false in environments whose CNI applies NetworkPolicies without that
+    # race (or that disallow the NET_ADMIN capability).
+    #
+    # When this is false, the always-applied executor/session NetworkPolicy
+    # (templates/networkpolicy.yaml) becomes the sole network restriction on
+    # executor pods, so it must be enforced by the cluster CNI to have effect.
+    netAdminLockdown: true
     # Resource limits for execution pods
     podResources:
       limits:
@@ -162,7 +173,13 @@ readinessProbe:
   timeoutSeconds: 3
   failureThreshold: 3
 
-# Network Policy
+# Network Policy for the API-server (deployment) pod, configured by the fields
+# below. Off by default; its defaults (deny-all ingress, DNS-only egress) are a
+# starting point that most installs will need to customize before enabling.
+#
+# NOTE: this flag does NOT govern the executor/session pods. Those pods run user
+# code and are ALWAYS given a deny-all-egress NetworkPolicy
+# (templates/networkpolicy.yaml), independent of this setting.
 networkPolicy:
   enabled: false
   policyTypes: