diff --git a/README.md b/README.md index 0b9804402..96156a6e3 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ We now ask: **What if our agent was 100x simpler, and still worked nearly as wel - **Minimal**: Just some 100 lines of python for the [agent class](https://github.com/SWE-agent/mini-swe-agent/blob/main/src/minisweagent/agents/default.py) (and a bit more for the [environment](https://github.com/SWE-agent/mini-swe-agent/blob/main/src/minisweagent/environments/local.py), [model](https://github.com/SWE-agent/mini-swe-agent/blob/main/src/minisweagent/models/litellm_model.py), and [run script](https://github.com/SWE-agent/mini-swe-agent/blob/main/src/minisweagent/run/hello_world.py)) — no fancy dependencies! - **Performant:** Scores >74% on the [SWE-bench verified benchmark](https://www.swebench.com/); starts much faster than Claude Code -- **Deployable:** Supports **local environments**, **docker/podman**, **singularity/apptainer**, **bublewrap**, **contree**, and more +- **Deployable:** Supports **local environments**, **docker/podman**, **singularity/apptainer**, **bubblewrap**, **contree**, **[E2B](https://e2b.dev)** (no local Docker required), and more - **Compatible:** Supports all models via **litellm**, **openrouter**, **portkey**, and more. Support for `/completion` and `/response` endpoints, interleaved thinking etc. 
- Built by the Princeton & Stanford team behind [SWE-bench](https://swebench.com), [SWE-agent](https://swe-agent.com), and more - **Tested:** [![Codecov](https://img.shields.io/codecov/c/github/swe-agent/mini-swe-agent?style=flat-square)](https://codecov.io/gh/SWE-agent/mini-swe-agent) diff --git a/docs/advanced/environments.md b/docs/advanced/environments.md index 7d672755d..3e30d3959 100644 --- a/docs/advanced/environments.md +++ b/docs/advanced/environments.md @@ -30,3 +30,5 @@ On top, there are a few more specialized environment classes that you can use: * **`contree`** ([`ContreeEnvironment`](../reference/environments/contree.md)) - Uses [ConTree](https://contree.dev/) for safe code execution sandboxing. Platform that built for agents and supports Git-like execution. +* **`e2b`** ([`E2BEnvironment`](../reference/environments/e2b.md)) - [E2B](https://e2b.dev) cloud sandbox execution. Converts Docker images into persistent E2B templates so **no local Docker daemon is required**. Suitable for large-scale, fully-remote SWE-bench evaluations. + diff --git a/docs/reference/environments/e2b.md b/docs/reference/environments/e2b.md new file mode 100644 index 000000000..25085afd1 --- /dev/null +++ b/docs/reference/environments/e2b.md @@ -0,0 +1,78 @@ +# E2B + +!!! note "E2B Environment class" + + - [Read on GitHub](https://github.com/swe-agent/mini-swe-agent/blob/main/src/minisweagent/environments/extra/e2b.py) + - Requires an [E2B](https://e2b.dev) account and API key + + ??? note "Full source code" + + ```python + --8<-- "src/minisweagent/environments/extra/e2b.py" + ``` + +::: minisweagent.environments.extra.e2b + +This environment executes commands in [E2B](https://e2b.dev) cloud sandboxes. +E2B converts Docker images into persistent sandbox templates, so **no local Docker daemon is required** — everything runs in the cloud. 
+ +This makes it well-suited for: + +- Large-scale, fully-remote SWE-bench evaluations +- Environments where Docker is unavailable (CI, serverless) +- Parallel agent runs without managing local container infrastructure + +## How it works + +The first time a Docker image is used, `E2BEnvironment` builds a persistent E2B template from that image (via `Template.build`). Subsequent runs reuse the cached template, so the build cost is paid only once per unique image. + +## Setup + +1. Install the E2B extra: + ```bash + pip install "mini-swe-agent[e2b]" + ``` + +2. Set your E2B API key: + ```bash + export E2B_API_KEY="your-e2b-api-key" + ``` + +## Usage + +Evaluate on SWE-bench using E2B as the sandbox backend: +```bash +mini-extra swebench \ + --subset verified \ + --split test \ + --workers 50 \ + --environment-class e2b +``` + +Or specify it in your YAML config: +```yaml +environment: + environment_class: e2b + sandbox_timeout: 3600 # seconds the sandbox stays alive + cpu_count: 2 + memory_mb: 2048 +``` + +## Configuration reference + +| Field | Default | Description | +|-------|---------|-------------| +| `image` | *(required)* | Docker Hub image to use as the sandbox base | +| `cwd` | `/` | Default working directory for commands | +| `timeout` | `30` | Per-command timeout in seconds | +| `env` | `{}` | Environment variables set in every command | +| `sandbox_timeout` | `3600` | How long the sandbox stays alive (seconds) | +| `cpu_count` | `2` | vCPUs allocated to the sandbox | +| `memory_mb` | `2048` | Memory allocated to the sandbox (MiB) | +| `build_timeout` | `1800` | Max seconds to wait for a template build | +| `skip_cache` | `False` | Force-rebuild the template even if it exists | +| `api_key` | `None` | E2B API key (falls back to `E2B_API_KEY` env var) | +| `registry_username` | `None` | Username for private Docker registry auth | +| `registry_password` | `None` | Password for private Docker registry auth | + +{% include-markdown "../../_footer.md" %} diff 
--git a/pyproject.toml b/pyproject.toml index 808e9e10b..fdfe58008 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -81,6 +81,10 @@ contree = [ "contree-sdk>=0.2.0", ] +e2b = [ + "e2b>=1.0.0", +] + [project.urls] Documentation = "https://mini-swe-agent.com/latest/" Repository = "https://github.com/SWE-agent/mini-swe-agent" diff --git a/src/minisweagent/environments/__init__.py b/src/minisweagent/environments/__init__.py index 08ae2c65c..8104b6fd7 100644 --- a/src/minisweagent/environments/__init__.py +++ b/src/minisweagent/environments/__init__.py @@ -13,6 +13,7 @@ "swerex_modal": "minisweagent.environments.extra.swerex_modal.SwerexModalEnvironment", "bubblewrap": "minisweagent.environments.extra.bubblewrap.BubblewrapEnvironment", "contree": "minisweagent.environments.extra.contree.ContreeEnvironment", + "e2b": "minisweagent.environments.extra.e2b.E2BEnvironment", } diff --git a/src/minisweagent/environments/extra/e2b.py b/src/minisweagent/environments/extra/e2b.py new file mode 100644 index 000000000..144380b0c --- /dev/null +++ b/src/minisweagent/environments/extra/e2b.py @@ -0,0 +1,306 @@ +"""E2B cloud sandbox environment implementation.""" + +from __future__ import annotations + +import atexit +import concurrent.futures +import hashlib +import logging +import re +from typing import Any + +from pydantic import BaseModel, Field + +# Module-level registry of live sandboxes for best-effort cleanup on exit +# (covers Ctrl+C and unhandled exceptions where __del__ may not be called). +_active_sandboxes: set[E2BEnvironment] = set() + + +def _cleanup_all_sandboxes() -> None: + """Kill all sandboxes that are still alive at interpreter shutdown.""" + for env in list(_active_sandboxes): + env.cleanup() + + +atexit.register(_cleanup_all_sandboxes) + + +class E2BEnvironmentConfig(BaseModel): + image: str + """Docker Hub image name to use as the E2B template base. 
+ Example: ``'swebench/sweb.eval.x86_64.django__django-11099:latest'`` + """ + cwd: str = "/" + """Working directory in which to execute commands.""" + timeout: int = 30 + """Timeout for executing commands in the sandbox.""" + env: dict[str, str] = Field(default_factory=dict) + """Environment variables to set when executing commands.""" + sandbox_timeout: int = 3600 + """How long (in seconds) the sandbox is allowed to stay alive.""" + + # Template build options (passed to Template.build()) + cpu_count: int = 2 + """Number of vCPUs allocated to the sandbox.""" + memory_mb: int = 2048 + """Memory allocated to the sandbox in MiB. Default is higher than E2B's 1024 MiB default + to accommodate larger SWE-bench images.""" + skip_cache: bool = False + """If True, force-rebuild the template even if it already exists.""" + tags: list[str] = Field(default_factory=list) + """Optional tags to attach to the template.""" + build_timeout: int = 1800 + """Timeout for template builds in seconds (default 30 min to handle large images).""" + + # E2B authentication (can also be set via the E2B_API_KEY env var) + api_key: str | None = None + """E2B API key. Falls back to the E2B_API_KEY environment variable.""" + + # Private registry credentials (passed to Template().from_image()) + registry_username: str | None = None + """Username for authenticating against a private Docker registry.""" + registry_password: str | None = None + """Password for authenticating against a private Docker registry.""" + + +class E2BTemplateManager: + """Converts Docker images to E2B templates and manages their lifecycle. + + Can be used independently of :class:`E2BEnvironment` for pre-building + templates in batch scripts. 
+ """ + + def __init__(self, config: E2BEnvironmentConfig) -> None: + self.config = config + self.logger = logging.getLogger("minisweagent.environment.e2b") + + @staticmethod + def _image_to_template_name(docker_image: str) -> str: + """Deterministically map a Docker image name to a valid E2B template name. + + A sha256 8-character suffix is appended to avoid collisions between + images that produce the same sanitized prefix. The result is at most + 63 characters and contains only lower-case alphanumerics and hyphens. + + Example:: + + 'swebench/sweb.eval.x86_64.django__django-11099:latest' + → 'swebench-sweb-eval-x86-64-django--django-11099-l-a1b2c3d4' + """ + hash_suffix = hashlib.sha256(docker_image.encode()).hexdigest()[:8] + name = re.sub(r"[^a-zA-Z0-9-]", "-", docker_image) + name = re.sub(r"-{3,}", "--", name) + name = name.lower() + # Reserve 9 characters for "-" + 8-char hash suffix → prefix max 54 chars + prefix = name[:54].strip("-") + if not prefix: + return hash_suffix + return f"{prefix}-{hash_suffix}" + + def get_or_build(self, docker_image: str) -> str: + """Return the E2B template name for *docker_image*, building it if needed.""" + from e2b import Template + + template_name = self._image_to_template_name(docker_image) + if not Template.exists(template_name, api_key=self.config.api_key) or self.config.skip_cache: + self.logger.info( + "E2B template %s not found. 
Starting build (up to %d seconds)...", + template_name, + self.config.build_timeout, + ) + self._build_template(docker_image, template_name) + self.logger.info("E2B template %s built successfully.", template_name) + else: + self.logger.debug("E2B template %s already exists.", template_name) + return template_name + + def rebuild(self, docker_image: str) -> str: + """Force-rebuild the E2B template for *docker_image*.""" + template_name = self._image_to_template_name(docker_image) + self.logger.info("Rebuilding E2B template %s...", template_name) + self._build_template(docker_image, template_name) + self.logger.info("E2B template %s rebuilt successfully.", template_name) + return template_name + + def _build_template(self, docker_image: str, template_name: str) -> None: + """Build an E2B template from *docker_image*. + + Uses :class:`concurrent.futures.ThreadPoolExecutor` for timeout + enforcement because ``signal.alarm`` only works on the main thread + and this method may be called from worker threads. 
+ """ + from e2b import Template + + template = Template().from_image( + docker_image, + username=self.config.registry_username, + password=self.config.registry_password, + ) + + def _do_build() -> None: + Template.build( + template, + template_name, + cpu_count=self.config.cpu_count, + memory_mb=self.config.memory_mb, + skip_cache=self.config.skip_cache, + tags=self.config.tags or None, + api_key=self.config.api_key, + ) + + executor = concurrent.futures.ThreadPoolExecutor(max_workers=1) + future = executor.submit(_do_build) + try: + future.result(timeout=self.config.build_timeout) + except concurrent.futures.TimeoutError as e: + executor.shutdown(wait=False, cancel_futures=True) + msg = f"E2B template build timed out after {self.config.build_timeout}s: {template_name}" + raise TimeoutError(msg) from e + except Exception: + executor.shutdown(wait=False, cancel_futures=True) + raise + else: + executor.shutdown(wait=True) + + +class E2BEnvironment: + """Executes bash commands inside an E2B cloud sandbox. + + `E2B `_ provides isolated cloud sandboxes that can run + arbitrary Docker images without requiring a local Docker daemon. This + makes it suitable for large-scale, fully-remote SWE-bench evaluations. + + The first time a Docker image is used it is converted into a persistent + E2B template; subsequent runs reuse the cached template. + + See :class:`E2BEnvironmentConfig` for keyword arguments. + """ + + #: Config fields that must never leak into prompts or saved trajectories. + _SECRET_FIELDS = {"api_key", "registry_password", "registry_username"} + + @staticmethod + def _is_stale_template_error(e: Exception) -> bool: + """Return True if *e* is a 'template not found' (HTTP 404) error. + + e2b surfaces a missing template as a ``SandboxException`` whose message is + formatted as ``"{status_code}: {message}"`` (see ``e2b.api.handle_api_exception``). 
+ Match the leading 404 status code rather than a bare ``"404"`` substring, + which could appear inside a sandbox id or path and trigger a costly, + unnecessary template rebuild. + """ + match = re.match(r"\s*(\d{3})\b", str(e)) + return match is not None and match.group(1) == "404" + + def __init__(self, **kwargs: Any) -> None: + from e2b import Sandbox + from e2b.exceptions import SandboxException + + self.logger = logging.getLogger("minisweagent.environment.e2b") + self.config = E2BEnvironmentConfig(**kwargs) + manager = E2BTemplateManager(self.config) + template_name = manager.get_or_build(self.config.image) + self.logger.info("Creating E2B sandbox (template: %s)...", template_name) + try: + self.sandbox = Sandbox.create( + template=template_name, + timeout=self.config.sandbox_timeout, + api_key=self.config.api_key, + ) + except SandboxException as e: + if not self._is_stale_template_error(e): + raise + self.logger.warning("Template %s not found (stale cache). Rebuilding...", template_name) + manager.rebuild(self.config.image) + self.sandbox = Sandbox.create( + template=template_name, + timeout=self.config.sandbox_timeout, + api_key=self.config.api_key, + ) + self.logger.info("E2B sandbox ready (id: %s)", self.sandbox.sandbox_id) + _active_sandboxes.add(self) + + def execute(self, action: dict, cwd: str = "", *, timeout: int | None = None) -> dict[str, Any]: + """Execute a command in the sandbox and return the output.""" + command = action.get("command", "") if isinstance(action, dict) else action + try: + result = self.sandbox.commands.run( + command, + user="root", + cwd=cwd or self.config.cwd, + timeout=timeout or self.config.timeout, + envs=self.config.env or None, + ) + output: dict[str, Any] = { + "output": result.stdout + result.stderr, + "returncode": result.exit_code, + "exception_info": "", + } + except Exception as e: + # e2b raises ``CommandExitException`` (carrying stdout/stderr/exit_code) + # for any non-zero exit. 
That is a normal command result, not an + # infrastructure error, so surface the real output and exit code + # instead of masking it as a generic failure. + if (exit_code := getattr(e, "exit_code", None)) is not None: + output = { + "output": getattr(e, "stdout", "") + getattr(e, "stderr", ""), + "returncode": exit_code, + "exception_info": "", + } + else: + output = { + "output": "", + "returncode": -1, + "exception_info": f"An error occurred while executing the command: {e}", + "extra": {"exception_type": type(e).__name__, "exception": str(e)}, + } + self._check_finished(output) + return output + + def _check_finished(self, output: dict) -> None: + """Raise :class:`~minisweagent.exceptions.Submitted` when the task-submission marker is detected.""" + from minisweagent.exceptions import Submitted + + lines = output.get("output", "").lstrip().splitlines(keepends=True) + if lines and lines[0].strip() == "COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT" and output["returncode"] == 0: + submission = "".join(lines[1:]) + raise Submitted( + { + "role": "exit", + "content": submission, + "extra": {"exit_status": "Submitted", "submission": submission}, + } + ) + + def get_template_vars(self, **kwargs: Any) -> dict[str, Any]: + import platform + + from minisweagent.utils.serialize import recursive_merge + + config = self.config.model_dump(exclude=self._SECRET_FIELDS) + return recursive_merge(config, platform.uname()._asdict(), kwargs) + + def serialize(self) -> dict: + return { + "info": { + "config": { + "environment": self.config.model_dump( + mode="json", + exclude=self._SECRET_FIELDS, + ), + "environment_type": f"{self.__class__.__module__}.{self.__class__.__name__}", + } + } + } + + def cleanup(self) -> None: + _active_sandboxes.discard(self) + sandbox = getattr(self, "sandbox", None) + if sandbox is not None: + try: + sandbox.kill() + except Exception: + pass + + def __del__(self) -> None: + self.cleanup() diff --git a/src/minisweagent/run/benchmarks/swebench.py 
def _teardown_environment(env: "Environment | None") -> None:
    """Release the per-instance environment resource (container / cloud sandbox).

    Environments expose teardown as either ``cleanup()`` (docker, singularity,
    bubblewrap, e2b) or ``stop()`` (swerex_modal); the first one that exists
    is called, and only that one.

    Teardown is best effort: it runs from a ``finally`` block after the
    instance's result has been recorded, so a failing teardown is logged as a
    warning instead of being raised, which would otherwise replace the
    instance's real outcome with the teardown error.

    Args:
        env: The environment to release, or ``None`` if it was never created.
    """
    if env is None:
        return
    for teardown_name in ("cleanup", "stop"):
        teardown = getattr(env, teardown_name, None)
        if not callable(teardown):
            continue
        try:
            teardown()
        except Exception:
            import logging

            logging.getLogger(__name__).warning(
                "Environment teardown via %s() failed", teardown_name, exc_info=True
            )
        # Only ever invoke one teardown method, even if it failed.
        return
unittest.mock import MagicMock, patch + +import pytest + +from minisweagent.environments.extra.e2b import ( + E2BEnvironment, + E2BEnvironmentConfig, + E2BTemplateManager, +) +from minisweagent.exceptions import Submitted + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_mock_e2b() -> ModuleType: + """Return a minimal mock of the `e2b` module.""" + mock_e2b = MagicMock() + mock_e2b.Template = MagicMock() + mock_e2b.Sandbox = MagicMock() + return mock_e2b + + +def _make_env(**kwargs) -> E2BEnvironment: + """Create an E2BEnvironment without touching real E2B infrastructure.""" + with patch.object(E2BEnvironment, "__init__", lambda self, **kw: None): + env = E2BEnvironment() + env.config = E2BEnvironmentConfig(image="swebench/test-image:latest", **kwargs) + env.sandbox = MagicMock() + env.logger = MagicMock() + return env + + +# --------------------------------------------------------------------------- +# E2BEnvironmentConfig +# --------------------------------------------------------------------------- + + +class TestE2BEnvironmentConfig: + def test_defaults(self): + cfg = E2BEnvironmentConfig(image="python:3.11") + assert cfg.cwd == "/" + assert cfg.timeout == 30 + assert cfg.sandbox_timeout == 3600 + assert cfg.cpu_count == 2 + assert cfg.memory_mb == 2048 + assert cfg.skip_cache is False + assert cfg.tags == [] + assert cfg.build_timeout == 1800 + assert cfg.api_key is None + assert cfg.registry_username is None + assert cfg.registry_password is None + + def test_custom_values(self): + cfg = E2BEnvironmentConfig(image="my-image:tag", sandbox_timeout=7200, cpu_count=4) + assert cfg.sandbox_timeout == 7200 + assert cfg.cpu_count == 4 + + +# --------------------------------------------------------------------------- +# E2BTemplateManager._image_to_template_name +# 
class TestImageToTemplateName:
    """Checks for ``E2BTemplateManager._image_to_template_name``."""

    def test_basic_sanitization(self):
        # Imported locally so the test does not depend on a module-level
        # import that appears after this class in the file.
        import re

        name = E2BTemplateManager._image_to_template_name("python:3.11")
        # fullmatch: the whole name must consist of allowed characters.
        assert re.fullmatch(r"[a-z0-9-]+", name), f"Invalid chars in: {name}"
        assert name.startswith("python-3-11-")

    def test_length_limit(self):
        long_image = "a" * 100 + ":latest"
        name = E2BTemplateManager._image_to_template_name(long_image)
        # 54-char prefix + "-" + 8-char hash
        assert len(name) == 63

    def test_deterministic(self):
        image = "swebench/sweb.eval.x86_64.django__django-11099:latest"
        assert E2BTemplateManager._image_to_template_name(image) == E2BTemplateManager._image_to_template_name(image)

    def test_different_images_different_names(self):
        a = E2BTemplateManager._image_to_template_name("image-a:latest")
        b = E2BTemplateManager._image_to_template_name("image-b:latest")
        assert a != b

    def test_same_sanitized_prefix_is_disambiguated_by_hash(self):
        # "/" and "." both become "-", so only the hash suffix tells these apart.
        a = E2BTemplateManager._image_to_template_name("org/app:latest")
        b = E2BTemplateManager._image_to_template_name("org.app:latest")
        assert a.rsplit("-", 1)[0] == b.rsplit("-", 1)[0] == "org-app-latest"
        assert a != b

    def test_no_triple_hyphens(self):
        # Runs of three or more hyphens are collapsed to "--".
        name = E2BTemplateManager._image_to_template_name("a___b:latest")
        assert "---" not in name
        assert name.startswith("a--b-latest-")

    def test_empty_prefix_falls_back_to_hash(self):
        # An image that sanitizes to only hyphens should return just the hash
        name = E2BTemplateManager._image_to_template_name("---")
        assert len(name) == 8  # just the 8-char sha256 prefix
        int(name, 16)  # raises ValueError if the fallback is not pure hex
+ exc = Exception("404: template foo not found") + assert E2BEnvironment._is_stale_template_error(exc) is True + + def test_other_status_does_not_match(self): + assert E2BEnvironment._is_stale_template_error(Exception("500: internal error")) is False + assert E2BEnvironment._is_stale_template_error(Exception("429: rate limited")) is False + + def test_incidental_404_substring_does_not_match(self): + # "404" appearing inside an id/path must not trigger a costly rebuild. + assert E2BEnvironment._is_stale_template_error(Exception("Sandbox abc404def failed")) is False + assert E2BEnvironment._is_stale_template_error(Exception("error in /path/404/x")) is False + + +# --------------------------------------------------------------------------- +# E2BEnvironment.execute +# --------------------------------------------------------------------------- + + +class TestE2BEnvironmentExecute: + def test_execute_dict_action(self): + env = _make_env() + mock_result = MagicMock() + mock_result.stdout = "hello\n" + mock_result.stderr = "" + mock_result.exit_code = 0 + env.sandbox.commands.run.return_value = mock_result + + output = env.execute({"command": "echo hello"}) + + assert output["output"] == "hello\n" + assert output["returncode"] == 0 + assert output["exception_info"] == "" + + def test_execute_string_action(self): + env = _make_env() + mock_result = MagicMock() + mock_result.stdout = "ok\n" + mock_result.stderr = "" + mock_result.exit_code = 0 + env.sandbox.commands.run.return_value = mock_result + + output = env.execute("echo ok") + + assert output["output"] == "ok\n" + + def test_execute_nonzero_exit(self): + # e2b's commands.run() RAISES CommandExitException (carrying stdout/stderr/ + # exit_code) on any non-zero exit. A failing command is a normal result, + # not an infrastructure error: its real output and exit code must survive. 
+ env = _make_env() + exc = Exception("Command exited with code 1") + exc.stdout = "partial stdout\n" + exc.stderr = "boom\n" + exc.exit_code = 1 + env.sandbox.commands.run.side_effect = exc + + output = env.execute({"command": "false"}) + + assert output["returncode"] == 1 + assert "boom" in output["output"] + assert "partial stdout" in output["output"] + assert output["exception_info"] == "" + + def test_execute_exception(self): + env = _make_env() + env.sandbox.commands.run.side_effect = RuntimeError("connection lost") + + output = env.execute({"command": "ls"}) + + assert output["returncode"] == -1 + assert "connection lost" in output["exception_info"] + + def test_execute_raises_submitted(self): + env = _make_env() + mock_result = MagicMock() + mock_result.stdout = "COMPLETE_TASK_AND_SUBMIT_FINAL_OUTPUT\ndiff --git a/f.py b/f.py\n" + mock_result.stderr = "" + mock_result.exit_code = 0 + env.sandbox.commands.run.return_value = mock_result + + with pytest.raises(Submitted) as exc_info: + env.execute({"command": "submit"}) + + msg = exc_info.value.messages[0] + assert msg["extra"]["exit_status"] == "Submitted" + assert "diff --git" in msg["extra"]["submission"] + + +# --------------------------------------------------------------------------- +# E2BEnvironment.get_template_vars +# --------------------------------------------------------------------------- + + +class TestE2BEnvironmentTemplateVars: + def test_includes_platform_uname(self): + # Default configs (mini/default) render {{system}}/{{machine}}/... under + # Jinja StrictUndefined, so these keys must be present like docker/local. + env = _make_env() + result = env.get_template_vars() + for key in ("system", "release", "version", "machine", "node", "processor"): + assert key in result + + def test_excludes_credentials(self): + # Template vars feed the Jinja prompt context; secrets must not leak there. 
class TestE2BEnvironmentSerialize:
    """Checks for ``E2BEnvironment.serialize``."""

    def test_serialize_structure(self):
        env = _make_env()
        result = env.serialize()

        assert "info" in result
        assert "config" in result["info"]
        config = result["info"]["config"]
        assert "environment" in config
        assert "environment_type" in config
        assert config["environment_type"].endswith("E2BEnvironment")
        # Non-secret settings must survive serialization.
        assert config["environment"]["image"] == env.config.image

    def test_serialize_excludes_credentials(self):
        env = _make_env()
        env.config.api_key = "secret-key"
        env.config.registry_password = "secret-pass"
        env.config.registry_username = "secret-user"

        result = env.serialize()
        env_cfg = result["info"]["config"]["environment"]

        # Every field listed in E2BEnvironment._SECRET_FIELDS must be dropped.
        assert "api_key" not in env_cfg
        assert "registry_password" not in env_cfg
        assert "registry_username" not in env_cfg
        # The values must not leak under any other key either.
        dumped = str(result)
        for secret in ("secret-key", "secret-pass", "secret-user"):
            assert secret not in dumped
raise + + +# --------------------------------------------------------------------------- +# atexit cleanup registry +# --------------------------------------------------------------------------- + + +class TestAtexitCleanup: + def test_cleanup_removes_from_active_sandboxes(self): + from minisweagent.environments.extra import e2b as e2b_mod + + env = _make_env() + e2b_mod._active_sandboxes.add(env) + assert env in e2b_mod._active_sandboxes + + env.cleanup() + assert env not in e2b_mod._active_sandboxes + + def test_cleanup_all_sandboxes_kills_all(self): + from minisweagent.environments.extra import e2b as e2b_mod + + env1 = _make_env() + env2 = _make_env() + e2b_mod._active_sandboxes.update([env1, env2]) + + e2b_mod._cleanup_all_sandboxes() + + env1.sandbox.kill.assert_called_once() + env2.sandbox.kill.assert_called_once() + assert env1 not in e2b_mod._active_sandboxes + assert env2 not in e2b_mod._active_sandboxes diff --git a/tests/run/test_swebench.py b/tests/run/test_swebench.py index cab09c0ae..244a326db 100644 --- a/tests/run/test_swebench.py +++ b/tests/run/test_swebench.py @@ -1,6 +1,6 @@ import json import re -from unittest.mock import patch +from unittest.mock import MagicMock, patch import pytest from pydantic import BaseModel @@ -8,6 +8,7 @@ from minisweagent import package_dir from minisweagent.models.test_models import DeterministicModel, make_output from minisweagent.run.benchmarks.swebench import ( + _teardown_environment, filter_instances, get_swebench_docker_image_name, main, @@ -16,6 +17,25 @@ ) +class TestTeardownEnvironment: + def test_prefers_cleanup_over_stop(self): + env = MagicMock(spec=["cleanup", "stop"]) + _teardown_environment(env) + env.cleanup.assert_called_once() + env.stop.assert_not_called() + + def test_falls_back_to_stop(self): + env = MagicMock(spec=["stop"]) + _teardown_environment(env) + env.stop.assert_called_once() + + def test_tolerates_env_without_teardown(self): + _teardown_environment(MagicMock(spec=[])) # should not 
raise + + def test_tolerates_none(self): + _teardown_environment(None) # should not raise + + def _make_model_from_fixture(text_outputs: list[str], cost_per_call: float = 1.0, **kwargs) -> DeterministicModel: """Create a DeterministicModel from trajectory fixture data (raw text outputs)."""