11"""Requirements for Python code generation validation."""
22
3+ import dataclasses
4+ import warnings
35from collections .abc import Callable
6+ from typing import Literal
47
8+ from mellea .stdlib .tools .execution_policy import (
9+ DOCKER_POLICY ,
10+ LOCAL_POLICY ,
11+ CapabilityPolicy ,
12+ ExecutionTier ,
13+ )
514from mellea .stdlib .tools .interpreter import (
615 ExecutionEnvironment ,
7- LLMSandboxEnvironment ,
8- StaticAnalysisEnvironment ,
9- UnsafeEnvironment ,
16+ make_execution_environment ,
1017)
1118
1219from ...core import Context , MelleaLogger , Requirement , ValidationResult
@@ -104,16 +111,12 @@ def _has_python_code_listing(ctx: Context) -> ValidationResult:
104111
105112
106113def _python_executes_without_error (
107- ctx : Context ,
108- timeout : int = 5 ,
109- allow_unsafe : bool = False ,
110- allowed_imports : list [str ] | None = None ,
111- use_sandbox : bool = False ,
114+ ctx : Context , environment : ExecutionEnvironment
112115) -> ValidationResult :
113116 """Validate that Python code executes without raising exceptions.
114117
115118 First extracts the highest-scoring Python code block from the context,
116- then validates/executes it based on the specified execution mode .
119+ then validates/executes it using the given environment .
117120 """
118121 extraction_result = _has_python_code_listing (ctx )
119122 if not extraction_result .as_bool ():
@@ -125,15 +128,7 @@ def _python_executes_without_error(
125128 code = extraction_result .reason
126129 assert code is not None
127130
128- environment : ExecutionEnvironment
129- if use_sandbox :
130- environment = LLMSandboxEnvironment (allowed_imports = allowed_imports )
131- elif allow_unsafe :
132- environment = UnsafeEnvironment (allowed_imports = allowed_imports )
133- else :
134- environment = StaticAnalysisEnvironment (allowed_imports = allowed_imports )
135-
136- result = environment .execute (code , timeout )
131+ result = environment .execute (code )
137132 return ValidationResult (
138133 result = result .success , reason = result .to_validationresult_reason ()
139134 )
@@ -143,16 +138,28 @@ class PythonExecutionReq(Requirement):
143138 """Verifies that Python code runs without raising exceptions.
144139
145140 Extracts the highest-scoring Python code block from the model's last output
146- and validates or executes it according to the configured execution mode.
141+ and validates or executes it according to the configured execution tier.
142+
143+ Use ``execution_tier`` to select behavior by intent:
144+
145+ - ``"static"`` (default) — parse and import-check only, no execution.
146+ - ``"local_unsafe"`` — subprocess execution, no policy restrictions.
147+ - ``"local"`` — subprocess execution with a declared capability policy.
148+ - ``"docker_unsafe"`` — Docker-isolated execution, no policy restrictions.
149+ - ``"docker"`` — Docker-isolated execution with a declared capability policy.
147150
148151 Args:
149- timeout (int): Maximum seconds to allow code to run. Defaults to `5`.
150- allow_unsafe_execution (bool): If `True`, execute code directly with
151- subprocess. Use only with trusted sources.
152+ execution_tier (str): One of ``"static"``, ``"local_unsafe"``, ``"local"``,
153+ ``"docker_unsafe"``, or ``"docker"``. Defaults to ``"static"``.
154+ policy (CapabilityPolicy | None): Override the tier's default policy.
155+ Ignored for ``"static"`` and unsafe tiers unless explicitly provided.
152156 allowed_imports (list[str] | None): Allowlist of importable top-level
153- modules. `None` allows any import.
154- use_sandbox (bool): If `True`, use `llm-sandbox` for Docker-based
155- isolated execution.
157+ modules. ``None`` allows any import.
158+ timeout (int | None): Deprecated. Pass ``policy=CapabilityPolicy(timeout=N)``
159+ instead. When provided, overrides the policy timeout.
160+ allow_unsafe_execution (bool): Deprecated. Use
161+ ``execution_tier="local_unsafe"`` instead.
162+ use_sandbox (bool): Deprecated. Use ``execution_tier="docker"`` instead.
156163
157164 Attributes:
158165 validation_fn (Callable[[Context], ValidationResult]): The validation
@@ -161,46 +168,148 @@ class PythonExecutionReq(Requirement):
161168
162169 def __init__ (
163170 self ,
164- timeout : int = 5 ,
165- allow_unsafe_execution : bool = False ,
171+ execution_tier : ExecutionTier = "static" ,
172+ * ,
173+ policy : CapabilityPolicy | None = None ,
166174 allowed_imports : list [str ] | None = None ,
175+ # Deprecated kwargs — kept for backward compatibility
176+ timeout : int | None = None ,
177+ allow_unsafe_execution : bool = False ,
167178 use_sandbox : bool = False ,
168179 ):
169- """Initialize PythonExecutionReq with execution mode, timeout, and import allowlist settings."""
170- self ._timeout = timeout
171- self ._allow_unsafe = allow_unsafe_execution
180+ """Initialize PythonExecutionReq with an execution tier and optional policy."""
181+ # Legacy positional-integer shim: old signature was PythonExecutionReq(timeout: int).
182+ if isinstance (execution_tier , int ):
183+ warnings .warn (
184+ "Passing an integer as the first argument to PythonExecutionReq() is "
185+ "deprecated. The first parameter is now execution_tier (a string). "
186+ "Use PythonExecutionReq(policy=CapabilityPolicy(timeout=N)) instead." ,
187+ DeprecationWarning ,
188+ stacklevel = 2 ,
189+ )
190+ timeout = execution_tier # type: ignore[assignment]
191+ execution_tier = "static"
192+
193+ # --- Deprecation shims ---
194+ _local_tiers = ("local_unsafe" , "local" )
195+ _docker_tiers = ("docker_unsafe" , "docker" )
196+
197+ if allow_unsafe_execution :
198+ if execution_tier not in _local_tiers :
199+ if execution_tier in _docker_tiers :
200+ # Caller is already on a docker tier — warn but don't downgrade.
201+ warnings .warn (
202+ f"allow_unsafe_execution is deprecated and has no effect when "
203+ f"execution_tier='{ execution_tier } ' is already set. "
204+ "Remove the flag." ,
205+ DeprecationWarning ,
206+ stacklevel = 2 ,
207+ )
208+ else :
209+ warnings .warn (
210+ "allow_unsafe_execution is deprecated. Use execution_tier='local_unsafe' instead." ,
211+ DeprecationWarning ,
212+ stacklevel = 2 ,
213+ )
214+ if execution_tier == "static" :
215+ # Promote to "local" when timeout is also set so the
216+ # timeout shim below can synthesise a policy — "local_unsafe"
217+ # has no policy and would silently discard the timeout value.
218+ execution_tier = (
219+ "local" if timeout is not None else "local_unsafe"
220+ )
221+
222+ if use_sandbox :
223+ if execution_tier not in _docker_tiers :
224+ # Only warn and promote when the flag actually changes something.
225+ warnings .warn (
226+ "use_sandbox is deprecated. Use execution_tier='docker' instead." ,
227+ DeprecationWarning ,
228+ stacklevel = 2 ,
229+ )
230+ if execution_tier in ("static" , "local_unsafe" , "local" ):
231+ execution_tier = "docker"
232+ elif execution_tier == "docker_unsafe" :
233+ # Already in Docker but without a policy — nudge toward 'docker'.
234+ warnings .warn (
235+ "use_sandbox is deprecated. Use execution_tier='docker' (with policy) "
236+ "instead of 'docker_unsafe' for capability enforcement." ,
237+ DeprecationWarning ,
238+ stacklevel = 2 ,
239+ )
240+
241+ if timeout is not None :
242+ if execution_tier == "static" :
243+ warnings .warn (
244+ "timeout has no effect on the static tier (no code is executed)." ,
245+ DeprecationWarning ,
246+ stacklevel = 2 ,
247+ )
248+ elif execution_tier in ("local_unsafe" , "docker_unsafe" ):
249+ warnings .warn (
250+ f"timeout is ignored for the '{ execution_tier } ' tier (no policy is applied). "
251+ "Use execution_tier='local' or 'docker' with policy=CapabilityPolicy(timeout=N) "
252+ "to enforce a custom timeout." ,
253+ DeprecationWarning ,
254+ stacklevel = 2 ,
255+ )
256+ else :
257+ warnings .warn (
258+ "timeout is deprecated. Pass policy=CapabilityPolicy(timeout=N) instead." ,
259+ DeprecationWarning ,
260+ stacklevel = 2 ,
261+ )
262+ if policy is None :
263+ base = DOCKER_POLICY if execution_tier == "docker" else LOCAL_POLICY
264+ policy = dataclasses .replace (base , timeout = timeout )
265+ else :
266+ policy = dataclasses .replace (policy , timeout = timeout )
267+
268+ self ._tier = execution_tier
269+ self ._policy = policy
172270 self ._allowed_imports = allowed_imports
173- self ._use_sandbox = use_sandbox
174271
175- if allow_unsafe_execution and not use_sandbox :
272+ environment : ExecutionEnvironment = make_execution_environment (
273+ tier = execution_tier , policy = policy , allowed_imports = allowed_imports
274+ )
275+
276+ if execution_tier in ("local_unsafe" , "local" ):
176277 logger .warning (
177- "⚠️ UNSAFE: Executing untrusted code directly. Only use with trusted sources!"
278+ "⚠️ UNSAFE: Executing untrusted code without container isolation. "
279+ "Only use with trusted sources!"
178280 )
179281
180- if use_sandbox and allow_unsafe_execution :
181- execution_mode = f"sandbox execution (timeout: { timeout } s)"
182- elif allow_unsafe_execution :
183- execution_mode = f"unsafe execution (timeout: { timeout } s)"
184- elif use_sandbox :
185- execution_mode = f"sandbox execution (timeout: { timeout } s)"
186- else :
187- execution_mode = "validation only"
282+ tier_label = _tier_label (execution_tier , policy )
188283
189284 super ().__init__ (
190- description = f"The Python code should execute without errors ({ execution_mode } )." ,
191- validation_fn = lambda ctx : _python_executes_without_error (
192- ctx ,
193- self ._timeout ,
194- self ._allow_unsafe ,
195- self ._allowed_imports ,
196- self ._use_sandbox ,
197- ),
285+ description = f"The Python code should execute without errors ({ tier_label } )." ,
286+ validation_fn = lambda ctx : _python_executes_without_error (ctx , environment ),
198287 check_only = True ,
199288 )
200289
201- # Add type hint to validation_fn here. It's always set for this requirement.
202290 self .validation_fn : Callable [[Context ], ValidationResult ]
203291 assert self .validation_fn is not None
204292
205293
def _tier_label(tier: str, policy: CapabilityPolicy | None) -> str:
    """Return the short human-readable summary of an execution tier.

    The result is embedded in the requirement description, e.g.
    ``"local execution with policy (timeout: 5s)"``.

    Args:
        tier: Execution tier name. Recognised values are ``"static"``,
            ``"local_unsafe"``, ``"local"``, ``"docker_unsafe"`` and
            ``"docker"``.
        policy: Policy whose ``timeout`` is shown in the label. When it is
            ``None`` (or its ``timeout`` is ``None``) the timeout of
            ``LOCAL_POLICY`` / ``DOCKER_POLICY`` is shown instead, depending
            on the tier's backend.

    Returns:
        ``"validation only"`` for the static tier, a
        ``"<backend> execution ... (timeout: Ns)"`` string for the four
        execution tiers, and ``tier`` itself for any unrecognised value.
    """
    # Read the explicit timeout up front, before any tier dispatch.
    explicit_timeout = None if policy is None else policy.timeout

    if tier == "static":
        return "validation only"

    if tier in ("local_unsafe", "local"):
        backend = "local"
    elif tier in ("docker_unsafe", "docker"):
        backend = "docker"
    else:
        # Unknown tier: echo it back unchanged rather than guessing a label.
        return tier

    # The "*_unsafe" tiers are the ones described as running without a policy.
    if tier.endswith("_unsafe"):
        policy_phrase = "execution, no policy"
    else:
        policy_phrase = "execution with policy"

    # Only consult the default policies when no explicit timeout is available.
    if explicit_timeout is not None:
        shown_timeout = explicit_timeout
    elif backend == "local":
        shown_timeout = LOCAL_POLICY.timeout
    else:
        shown_timeout = DOCKER_POLICY.timeout

    return f"{backend} {policy_phrase} (timeout: {shown_timeout}s)"
313+
314+
206315# endregion
0 commit comments