Skip to content

Commit 33a6146

Browse files
feat: sandbox runtime and capability policy (#1171)
* first pass at sandbox and capability policy Signed-off-by: AngeloDanducci <angelo.danducci.ii@ibm.com> * sandbox policy and tier cleanup Signed-off-by: AngeloDanducci <angelo.danducci.ii@ibm.com> * replace ad-hoc execution flags with executiontier and capabilitypolicy Signed-off-by: AngeloDanducci <angelo.danducci.ii@ibm.com> * meet docstring quality gate Signed-off-by: AngeloDanducci <angelo.danducci.ii@ibm.com> * address review feedback Signed-off-by: AngeloDanducci <angelo.danducci.ii@ibm.com> * update compatibility matrix tier test Signed-off-by: AngeloDanducci <angelo.danducci.ii@ibm.com> --------- Signed-off-by: AngeloDanducci <angelo.danducci.ii@ibm.com>
1 parent 8338f76 commit 33a6146

5 files changed

Lines changed: 1258 additions & 153 deletions

File tree

mellea/stdlib/requirements/python_reqs.py

Lines changed: 159 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,19 @@
11
"""Requirements for Python code generation validation."""
22

3+
import dataclasses
4+
import warnings
35
from collections.abc import Callable
6+
from typing import Literal
47

8+
from mellea.stdlib.tools.execution_policy import (
9+
DOCKER_POLICY,
10+
LOCAL_POLICY,
11+
CapabilityPolicy,
12+
ExecutionTier,
13+
)
514
from mellea.stdlib.tools.interpreter import (
615
ExecutionEnvironment,
7-
LLMSandboxEnvironment,
8-
StaticAnalysisEnvironment,
9-
UnsafeEnvironment,
16+
make_execution_environment,
1017
)
1118

1219
from ...core import Context, MelleaLogger, Requirement, ValidationResult
@@ -104,16 +111,12 @@ def _has_python_code_listing(ctx: Context) -> ValidationResult:
104111

105112

106113
def _python_executes_without_error(
107-
ctx: Context,
108-
timeout: int = 5,
109-
allow_unsafe: bool = False,
110-
allowed_imports: list[str] | None = None,
111-
use_sandbox: bool = False,
114+
ctx: Context, environment: ExecutionEnvironment
112115
) -> ValidationResult:
113116
"""Validate that Python code executes without raising exceptions.
114117
115118
First extracts the highest-scoring Python code block from the context,
116-
then validates/executes it based on the specified execution mode.
119+
then validates/executes it using the given environment.
117120
"""
118121
extraction_result = _has_python_code_listing(ctx)
119122
if not extraction_result.as_bool():
@@ -125,15 +128,7 @@ def _python_executes_without_error(
125128
code = extraction_result.reason
126129
assert code is not None
127130

128-
environment: ExecutionEnvironment
129-
if use_sandbox:
130-
environment = LLMSandboxEnvironment(allowed_imports=allowed_imports)
131-
elif allow_unsafe:
132-
environment = UnsafeEnvironment(allowed_imports=allowed_imports)
133-
else:
134-
environment = StaticAnalysisEnvironment(allowed_imports=allowed_imports)
135-
136-
result = environment.execute(code, timeout)
131+
result = environment.execute(code)
137132
return ValidationResult(
138133
result=result.success, reason=result.to_validationresult_reason()
139134
)
@@ -143,16 +138,28 @@ class PythonExecutionReq(Requirement):
143138
"""Verifies that Python code runs without raising exceptions.
144139
145140
Extracts the highest-scoring Python code block from the model's last output
146-
and validates or executes it according to the configured execution mode.
141+
and validates or executes it according to the configured execution tier.
142+
143+
Use ``execution_tier`` to select behavior by intent:
144+
145+
- ``"static"`` (default) — parse and import-check only, no execution.
146+
- ``"local_unsafe"`` — subprocess execution, no policy restrictions.
147+
- ``"local"`` — subprocess execution with a declared capability policy.
148+
- ``"docker_unsafe"`` — Docker-isolated execution, no policy restrictions.
149+
- ``"docker"`` — Docker-isolated execution with a declared capability policy.
147150
148151
Args:
149-
timeout (int): Maximum seconds to allow code to run. Defaults to `5`.
150-
allow_unsafe_execution (bool): If `True`, execute code directly with
151-
subprocess. Use only with trusted sources.
152+
execution_tier (str): One of ``"static"``, ``"local_unsafe"``, ``"local"``,
153+
``"docker_unsafe"``, or ``"docker"``. Defaults to ``"static"``.
154+
policy (CapabilityPolicy | None): Override the tier's default policy.
155+
Ignored for ``"static"`` and unsafe tiers unless explicitly provided.
152156
allowed_imports (list[str] | None): Allowlist of importable top-level
153-
modules. `None` allows any import.
154-
use_sandbox (bool): If `True`, use `llm-sandbox` for Docker-based
155-
isolated execution.
157+
modules. ``None`` allows any import.
158+
timeout (int | None): Deprecated. Pass ``policy=CapabilityPolicy(timeout=N)``
159+
instead. When provided, overrides the policy timeout.
160+
allow_unsafe_execution (bool): Deprecated. Use
161+
``execution_tier="local_unsafe"`` instead.
162+
use_sandbox (bool): Deprecated. Use ``execution_tier="docker"`` instead.
156163
157164
Attributes:
158165
validation_fn (Callable[[Context], ValidationResult]): The validation
@@ -161,46 +168,148 @@ class PythonExecutionReq(Requirement):
161168

162169
def __init__(
163170
self,
164-
timeout: int = 5,
165-
allow_unsafe_execution: bool = False,
171+
execution_tier: ExecutionTier = "static",
172+
*,
173+
policy: CapabilityPolicy | None = None,
166174
allowed_imports: list[str] | None = None,
175+
# Deprecated kwargs — kept for backward compatibility
176+
timeout: int | None = None,
177+
allow_unsafe_execution: bool = False,
167178
use_sandbox: bool = False,
168179
):
169-
"""Initialize PythonExecutionReq with execution mode, timeout, and import allowlist settings."""
170-
self._timeout = timeout
171-
self._allow_unsafe = allow_unsafe_execution
180+
"""Initialize PythonExecutionReq with an execution tier and optional policy."""
181+
# Legacy positional-integer shim: old signature was PythonExecutionReq(timeout: int).
182+
if isinstance(execution_tier, int):
183+
warnings.warn(
184+
"Passing an integer as the first argument to PythonExecutionReq() is "
185+
"deprecated. The first parameter is now execution_tier (a string). "
186+
"Use PythonExecutionReq(policy=CapabilityPolicy(timeout=N)) instead.",
187+
DeprecationWarning,
188+
stacklevel=2,
189+
)
190+
timeout = execution_tier # type: ignore[assignment]
191+
execution_tier = "static"
192+
193+
# --- Deprecation shims ---
194+
_local_tiers = ("local_unsafe", "local")
195+
_docker_tiers = ("docker_unsafe", "docker")
196+
197+
if allow_unsafe_execution:
198+
if execution_tier not in _local_tiers:
199+
if execution_tier in _docker_tiers:
200+
# Caller is already on a docker tier — warn but don't downgrade.
201+
warnings.warn(
202+
f"allow_unsafe_execution is deprecated and has no effect when "
203+
f"execution_tier='{execution_tier}' is already set. "
204+
"Remove the flag.",
205+
DeprecationWarning,
206+
stacklevel=2,
207+
)
208+
else:
209+
warnings.warn(
210+
"allow_unsafe_execution is deprecated. Use execution_tier='local_unsafe' instead.",
211+
DeprecationWarning,
212+
stacklevel=2,
213+
)
214+
if execution_tier == "static":
215+
# Promote to "local" when timeout is also set so the
216+
# timeout shim below can synthesise a policy — "local_unsafe"
217+
# has no policy and would silently discard the timeout value.
218+
execution_tier = (
219+
"local" if timeout is not None else "local_unsafe"
220+
)
221+
222+
if use_sandbox:
223+
if execution_tier not in _docker_tiers:
224+
# Only warn and promote when the flag actually changes something.
225+
warnings.warn(
226+
"use_sandbox is deprecated. Use execution_tier='docker' instead.",
227+
DeprecationWarning,
228+
stacklevel=2,
229+
)
230+
if execution_tier in ("static", "local_unsafe", "local"):
231+
execution_tier = "docker"
232+
elif execution_tier == "docker_unsafe":
233+
# Already in Docker but without a policy — nudge toward 'docker'.
234+
warnings.warn(
235+
"use_sandbox is deprecated. Use execution_tier='docker' (with policy) "
236+
"instead of 'docker_unsafe' for capability enforcement.",
237+
DeprecationWarning,
238+
stacklevel=2,
239+
)
240+
241+
if timeout is not None:
242+
if execution_tier == "static":
243+
warnings.warn(
244+
"timeout has no effect on the static tier (no code is executed).",
245+
DeprecationWarning,
246+
stacklevel=2,
247+
)
248+
elif execution_tier in ("local_unsafe", "docker_unsafe"):
249+
warnings.warn(
250+
f"timeout is ignored for the '{execution_tier}' tier (no policy is applied). "
251+
"Use execution_tier='local' or 'docker' with policy=CapabilityPolicy(timeout=N) "
252+
"to enforce a custom timeout.",
253+
DeprecationWarning,
254+
stacklevel=2,
255+
)
256+
else:
257+
warnings.warn(
258+
"timeout is deprecated. Pass policy=CapabilityPolicy(timeout=N) instead.",
259+
DeprecationWarning,
260+
stacklevel=2,
261+
)
262+
if policy is None:
263+
base = DOCKER_POLICY if execution_tier == "docker" else LOCAL_POLICY
264+
policy = dataclasses.replace(base, timeout=timeout)
265+
else:
266+
policy = dataclasses.replace(policy, timeout=timeout)
267+
268+
self._tier = execution_tier
269+
self._policy = policy
172270
self._allowed_imports = allowed_imports
173-
self._use_sandbox = use_sandbox
174271

175-
if allow_unsafe_execution and not use_sandbox:
272+
environment: ExecutionEnvironment = make_execution_environment(
273+
tier=execution_tier, policy=policy, allowed_imports=allowed_imports
274+
)
275+
276+
if execution_tier in ("local_unsafe", "local"):
176277
logger.warning(
177-
"⚠️ UNSAFE: Executing untrusted code directly. Only use with trusted sources!"
278+
"⚠️ UNSAFE: Executing untrusted code without container isolation. "
279+
"Only use with trusted sources!"
178280
)
179281

180-
if use_sandbox and allow_unsafe_execution:
181-
execution_mode = f"sandbox execution (timeout: {timeout}s)"
182-
elif allow_unsafe_execution:
183-
execution_mode = f"unsafe execution (timeout: {timeout}s)"
184-
elif use_sandbox:
185-
execution_mode = f"sandbox execution (timeout: {timeout}s)"
186-
else:
187-
execution_mode = "validation only"
282+
tier_label = _tier_label(execution_tier, policy)
188283

189284
super().__init__(
190-
description=f"The Python code should execute without errors ({execution_mode}).",
191-
validation_fn=lambda ctx: _python_executes_without_error(
192-
ctx,
193-
self._timeout,
194-
self._allow_unsafe,
195-
self._allowed_imports,
196-
self._use_sandbox,
197-
),
285+
description=f"The Python code should execute without errors ({tier_label}).",
286+
validation_fn=lambda ctx: _python_executes_without_error(ctx, environment),
198287
check_only=True,
199288
)
200289

201-
# Add type hint to validation_fn here. It's always set for this requirement.
202290
self.validation_fn: Callable[[Context], ValidationResult]
203291
assert self.validation_fn is not None
204292

205293

294+
def _tier_label(tier: str, policy: CapabilityPolicy | None) -> str:
295+
timeout = policy.timeout if policy is not None else None
296+
match tier:
297+
case "static":
298+
return "validation only"
299+
case "local_unsafe":
300+
effective = timeout if timeout is not None else LOCAL_POLICY.timeout
301+
return f"local execution, no policy (timeout: {effective}s)"
302+
case "local":
303+
effective = timeout if timeout is not None else LOCAL_POLICY.timeout
304+
return f"local execution with policy (timeout: {effective}s)"
305+
case "docker_unsafe":
306+
effective = timeout if timeout is not None else DOCKER_POLICY.timeout
307+
return f"docker execution, no policy (timeout: {effective}s)"
308+
case "docker":
309+
effective = timeout if timeout is not None else DOCKER_POLICY.timeout
310+
return f"docker execution with policy (timeout: {effective}s)"
311+
case _:
312+
return tier
313+
314+
206315
# endregion

mellea/stdlib/tools/__init__.py

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,37 @@
11
"""Implementations of tools."""
22

3-
from .interpreter import code_interpreter, local_code_interpreter
3+
from .execution_policy import (
4+
COMPATIBILITY_MATRIX,
5+
DOCKER_POLICY,
6+
LOCAL_POLICY,
7+
Artifact,
8+
CapabilityPolicy,
9+
ExecutionTier,
10+
)
11+
from .interpreter import (
12+
ExecutionEnvironment,
13+
ExecutionResult,
14+
LLMSandboxEnvironment,
15+
StaticAnalysisEnvironment,
16+
UnsafeEnvironment,
17+
code_interpreter,
18+
local_code_interpreter,
19+
make_execution_environment,
20+
)
421

5-
__all__ = ["code_interpreter", "local_code_interpreter"]
22+
__all__ = [
23+
"COMPATIBILITY_MATRIX",
24+
"DOCKER_POLICY",
25+
"LOCAL_POLICY",
26+
"Artifact",
27+
"CapabilityPolicy",
28+
"ExecutionEnvironment",
29+
"ExecutionResult",
30+
"ExecutionTier",
31+
"LLMSandboxEnvironment",
32+
"StaticAnalysisEnvironment",
33+
"UnsafeEnvironment",
34+
"code_interpreter",
35+
"local_code_interpreter",
36+
"make_execution_environment",
37+
]

0 commit comments

Comments
 (0)