Skip to content

Commit d709063

Browse files
runpod-Henrikclaudedeanq
authored
feat(e2e): add CPU E2E test suite with provisioner and rolling release tests (#326)
* feat(e2e): add CPU E2E test suite with provisioner and rolling release tests

  Adds the full E2E test infrastructure built and validated during v1.14.0 QA:
  - provisioner.py: session-scoped endpoint pool with parallel provisioning
  - test_cpu_smoke.py: updated deploy → invoke → undeploy smoke test
  - test_cpu_suite.py: QB function (smoke, empty string, unicode, concurrent), deps (numpy/pandas), class, and LB endpoint tests (9 pass, 1 xfail AE-2744)
  - test_rolling_release.py: no-spurious-release and config-change-triggers-drift
  - test_redeploy.py: scale-to-zero and multi-worker (scale-to-zero + always-on) recycle tests; single-slot always-on failures split to test_redeploy_always_on.py
  - e2e.yml: enable push/PR CI triggers; inject FLASH_SDK_GIT_REF

  All 15 CPU tests confirmed passing locally (v1.14.0). GPU smoke included; may time out in CI when GPU inventory is constrained.

  Excluded from this PR (tracked separately):
  - test_redeploy_always_on.py: single-slot always-on recycle (AE-2940/2941/2942)
  - test_source_fingerprint.py: needs assertion update
  - test_concurrency_modifier.py: inconclusive — needs redesign

  Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* chore(ci): remove scheduled e2e trigger

  Keep workflow_dispatch-only trigger; schedule can be added back once the E2E account quota and test batching are sorted out.

  Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* fix(e2e): address PR #326 review comments in test_rolling_release

  - Correct TestRollingReleaseNoSpuriousRelease docstring: remove false claim about 'cached' in output; describe actual worker_id comparison
  - Make LOG_LEVEL=INFO explicit in _deploy_env so the "Updating endpoint" log.info assertion is reliable regardless of caller environment

  Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* style: ruff format test_rolling_release

  Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
Co-authored-by: Dean Quiñanola <deanq@users.noreply.github.com>
1 parent cd30c77 commit d709063

9 files changed

Lines changed: 1058 additions & 25 deletions

File tree

.github/workflows/e2e.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ jobs:
9292
- name: Run E2E tests
9393
env:
9494
RUNPOD_API_KEY: ${{ secrets.RUNPOD_API_KEY }}
95+
FLASH_SDK_GIT_REF: ${{ github.sha }}
9596
run: |
9697
uv run pytest e2e/ \
9798
${{ inputs.tests != '' && format('-k "{0}"', inputs.tests) || '' }} \

e2e/conftest.py

Lines changed: 44 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,21 @@
77
import asyncio
88
import os
99
import pickle
10+
import sys
1011
from pathlib import Path
1112

12-
import pytest
13+
# Ensure the e2e/ directory is on sys.path so test files can import local
14+
# modules (provisioner, etc.) regardless of how pytest resolves the rootdir.
15+
_E2E_DIR = str(Path(__file__).parent)
16+
if _E2E_DIR not in sys.path:
17+
sys.path.insert(0, _E2E_DIR)
18+
19+
import pytest # noqa: E402
1320

1421
try:
15-
import tomllib
22+
import tomllib # noqa: E402
1623
except ImportError:
17-
import tomli as tomllib # type: ignore[no-redef]
24+
import tomli as tomllib # type: ignore[no-redef] # noqa: E402
1825

1926

2027
def _api_key_from_config() -> str | None:
@@ -25,7 +32,8 @@ def _api_key_from_config() -> str | None:
2532
try:
2633
data = tomllib.loads(config_file.read_text())
2734
return data.get("default", {}).get("api_key")
28-
except Exception:
35+
except Exception as exc:
36+
print(f"Warning: could not parse ~/.runpod/config.toml: {exc}")
2937
return None
3038

3139

@@ -38,29 +46,38 @@ def endpoint_id_from_state(project_dir: Path) -> str:
3846
3947
The state file is a (resources_dict, config_hashes_dict) tuple.
4048
resources_dict keys are "ResourceType:name", values are resource objects with .id.
49+
50+
Raises FileNotFoundError if the state file is missing (deploy did not complete).
51+
Raises ValueError if the file exists but contains no endpoint ID (format may have changed).
4152
"""
4253
state_file = project_dir / ".flash" / "resources.pkl"
4354
if not state_file.exists():
4455
raise FileNotFoundError(f"State file not found: {state_file}")
45-
with open(state_file, "rb") as f:
46-
data = pickle.load(f)
56+
try:
57+
with open(state_file, "rb") as f:
58+
data = pickle.load(f)
59+
except Exception as exc:
60+
raise ValueError(
61+
f"Failed to deserialize state file {state_file} — "
62+
f"the .flash/resources.pkl format may have changed: {exc}"
63+
) from exc
4764
resources = data[0] if isinstance(data, tuple) else data
4865
for _key, resource in resources.items():
4966
endpoint_id = getattr(resource, "id", None)
5067
if endpoint_id:
5168
return endpoint_id
52-
raise ValueError(f"No endpoint ID found in state file. Keys: {list(resources)}")
69+
raise ValueError(
70+
f"No endpoint ID found in state file {state_file}. "
71+
f"Keys present: {list(resources)}. "
72+
f"Check that the resource object has an 'id' attribute."
73+
)
5374

5475

55-
def sweep_endpoints(api_key: str) -> None:
56-
"""Delete all endpoints on the account.
76+
def sweep_endpoints(api_key: str, *, prefix: str = "flash-qa-") -> None:
77+
"""Delete endpoints whose names start with prefix.
5778
58-
The e2e RUNPOD_API_KEY is dedicated to testing. Call this in every test's
59-
finally block to ensure quota is fully released regardless of whether the
60-
graceful undeploy succeeded.
61-
62-
To restrict cleanup to smoke-test endpoints only, swap the list comprehension:
63-
endpoints = [ep for ep in endpoints if ep.get("name", "").startswith("flash-qa-smoke-")]
79+
Defaults to "flash-qa-" so only test-created endpoints are removed.
80+
Pass prefix="" to delete all endpoints on the account (use with caution).
6481
"""
6582
from runpod_flash.core.api.runpod import RunpodGraphQLClient
6683

@@ -69,7 +86,12 @@ async def _run(key: str) -> None:
6986
result = await client._execute_graphql(
7087
"query { myself { endpoints { id name } } }"
7188
)
72-
endpoints = result.get("myself", {}).get("endpoints", [])
89+
all_endpoints = result.get("myself", {}).get("endpoints", [])
90+
endpoints = [
91+
ep
92+
for ep in all_endpoints
93+
if not prefix or ep.get("name", "").startswith(prefix)
94+
]
7395
for ep in endpoints:
7496
eid, ename = ep["id"], ep.get("name", ep["id"])
7597
try:
@@ -95,3 +117,9 @@ def restore_real_credentials(monkeypatch: pytest.MonkeyPatch) -> None:
95117
)
96118
else:
97119
pytest.skip("No credentials available — skipping E2E test")
120+
121+
122+
@pytest.fixture
def api_key() -> str:
    """Return the RunPod API key for tests that need to pass it explicitly.

    The key is read from the module-level ``_REAL_API_KEY``. If it is empty
    or ``None`` the test is skipped with the same message used elsewhere in
    this module, so a missing credential can never reach a test as ``None``.
    """
    # NOTE(review): the autouse restore_real_credentials fixture is expected
    # to skip first when no credentials exist; this guard makes the fixture
    # safe on its own and lets the type checker narrow ``str | None`` to
    # ``str`` without a ``type: ignore``.
    if not _REAL_API_KEY:
        pytest.skip("No credentials available — skipping E2E test")
    return _REAL_API_KEY

e2e/provisioner.py

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
"""Endpoint provisioner for E2E session-scoped fixtures.

provision() deploys a Flash worker and returns (endpoint_id, project_dir).
All shared endpoints are deployed in parallel at session start.

Git ref injection
-----------------
Set FLASH_SDK_GIT_REF to a commit SHA or branch name to install that exact
version of runpod-flash inside the worker container instead of the latest
PyPI release. In CI, set this to github.sha so workers run the branch under
test rather than the last published release.

    FLASH_SDK_GIT_REF=${{ github.sha }}  # in CI workflow

Set FLASH_SDK_LOCAL_PATH to install runpod-flash from a local checkout
instead. When both variables are set, the local path takes precedence.
"""
15+
16+
import os
17+
import shutil
18+
import subprocess
19+
import tempfile
20+
from pathlib import Path
21+
22+
from conftest import endpoint_id_from_state
23+
24+
# ---------------------------------------------------------------------------
25+
# Git ref injection
26+
# ---------------------------------------------------------------------------
27+
28+
# Both values are read once at import time; an empty string means "not set".
FLASH_GIT_REF: str = os.environ.get("FLASH_SDK_GIT_REF", "")
FLASH_LOCAL_PATH: str = os.environ.get("FLASH_SDK_LOCAL_PATH", "")
_FLASH_REPO = "https://github.com/runpod/runpod-flash"


def flash_dep() -> str:
    """Return the runpod-flash pip requirement string for worker pyproject.toml.

    Sources are checked in this order; the first one that is set wins:

    1. ``FLASH_SDK_LOCAL_PATH`` — install from a local checkout. Useful when
       the fix is not yet on PyPI and the git repo is private.
    2. ``FLASH_SDK_GIT_REF`` — install the exact commit or branch under test
       (this is what CI sets).
    3. Neither set — install the latest PyPI release.

    Returns:
        A requirement string: a direct reference (``name @ url``) for the
        first two cases, or the bare package name.
    """
    if FLASH_LOCAL_PATH:
        # Build a real file URI: "file://" + raw path is invalid for relative
        # paths (the first segment is parsed as a host), for "~", for paths
        # containing spaces, and for Windows drive paths. as_uri() requires an
        # absolute path, hence expanduser().resolve() first.
        local_uri = Path(FLASH_LOCAL_PATH).expanduser().resolve().as_uri()
        return f"runpod-flash @ {local_uri}"
    if FLASH_GIT_REF:
        return f"runpod-flash @ git+{_FLASH_REPO}@{FLASH_GIT_REF}"
    return "runpod-flash"
46+
47+
48+
# ---------------------------------------------------------------------------
49+
# Provisioner
50+
# ---------------------------------------------------------------------------
51+
52+
_PYPROJECT_TMPL = """\
53+
[project]
54+
name = "{name}"
55+
version = "0.1.0"
56+
requires-python = ">=3.11,<3.13"
57+
dependencies = [{deps}]
58+
"""
59+
60+
61+
def provision(
    worker_code: str,
    *,
    name: str,
    api_key: str,
    extra_deps: list[str] | None = None,
    deploy_timeout: int = 600,
) -> tuple[str, Path]:
    """Deploy a Flash worker and return (endpoint_id, project_dir).

    A fresh temporary project directory is created containing ``worker.py``
    and a generated ``pyproject.toml``, then ``uv run flash deploy`` is run
    inside it. The endpoint ID is read back from the ``.flash`` state that the
    deploy leaves in that directory.

    On success the returned project_dir owns the .flash state and the caller
    is responsible for cleanup — call shutil.rmtree() on project_dir when the
    endpoint is no longer needed. On any failure the directory is removed
    before the exception propagates, since the caller never receives its path.

    Args:
        worker_code: Python source of the worker file.
        name: Endpoint name (must be unique per CI run).
        api_key: RunPod API key passed explicitly to the subprocess env.
        extra_deps: Additional pip requirements (beyond runpod-flash).
        deploy_timeout: Seconds before subprocess.run times out.

    Returns:
        (endpoint_id, project_dir)

    Raises:
        RuntimeError: If flash deploy exits non-zero.
        subprocess.TimeoutExpired: If the deploy exceeds deploy_timeout.
        FileNotFoundError: If the deploy left no state file.
        ValueError: If the state file has no endpoint ID or cannot be read.
    """
    deps = [flash_dep(), *(extra_deps or [])]
    deps_quoted = ", ".join(f'"{d}"' for d in deps)
    pyproject = _PYPROJECT_TMPL.format(name=name, deps=deps_quoted)

    env = os.environ.copy()
    env["RUNPOD_API_KEY"] = api_key  # explicit — does not depend on autouse fixture

    tmp_dir = Path(tempfile.mkdtemp(prefix=f"flash-e2e-{name}-"))
    try:
        # Explicit UTF-8 so worker source with non-ASCII text is written the
        # same way regardless of the host locale.
        (tmp_dir / "worker.py").write_text(worker_code, encoding="utf-8")
        (tmp_dir / "pyproject.toml").write_text(pyproject, encoding="utf-8")

        result = subprocess.run(
            ["uv", "run", "flash", "deploy"],
            cwd=tmp_dir,
            env=env,
            capture_output=True,
            text=True,
            timeout=deploy_timeout,
        )
        if result.returncode != 0:
            raise RuntimeError(
                f"flash deploy failed for '{name}' (exit {result.returncode}):\n"
                f"stdout: {result.stdout}\nstderr: {result.stderr}"
            )

        endpoint_id = endpoint_id_from_state(tmp_dir)
    except BaseException:
        # One cleanup path for every failure mode (file writes, timeout,
        # non-zero exit, unreadable state, Ctrl-C) so no temp dir is leaked.
        shutil.rmtree(tmp_dir, ignore_errors=True)
        raise

    return endpoint_id, tmp_dir

e2e/test_cpu_smoke.py

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import runpod
1212

1313
from conftest import endpoint_id_from_state, sweep_endpoints
14+
from provisioner import flash_dep
1415

1516
WORKER_NAME = f"flash-qa-smoke-{uuid.uuid4().hex[:8]}"
1617

@@ -28,22 +29,20 @@ async def echo(msg: str = "") -> dict:
2829
name = "{WORKER_NAME}"
2930
version = "0.1.0"
3031
requires-python = ">=3.11,<3.13"
31-
dependencies = ["runpod-flash"]
32+
dependencies = ["{flash_dep()}"]
3233
'''
3334

3435

3536
class TestCpuSmoke:
3637
"""CPU smoke: deploy → invoke → undeploy."""
3738

3839
def test_deploy_invoke_undeploy(self, tmp_path: Path) -> None:
39-
"""Deploy a minimal CPU worker, invoke it, verify output, undeploy."""
4040
env = os.environ.copy()
4141

4242
(tmp_path / "worker.py").write_text(WORKER_CODE)
4343
(tmp_path / "pyproject.toml").write_text(PYPROJECT_TOML)
4444

4545
try:
46-
# Deploy
4746
result = subprocess.run(
4847
["uv", "run", "flash", "deploy"],
4948
cwd=tmp_path,
@@ -59,8 +58,7 @@ def test_deploy_invoke_undeploy(self, tmp_path: Path) -> None:
5958

6059
endpoint_id = endpoint_id_from_state(tmp_path)
6160

62-
# Invoke
63-
runpod.api_key = env.get("RUNPOD_API_KEY")
61+
runpod.api_key = env["RUNPOD_API_KEY"]
6462
output = runpod.Endpoint(endpoint_id).run_sync(
6563
{"msg": "smoke"}, timeout=180
6664
)
@@ -70,7 +68,7 @@ def test_deploy_invoke_undeploy(self, tmp_path: Path) -> None:
7068
assert output.get("status") == "ok", f"Unexpected status: {output}"
7169

7270
finally:
73-
# Attempt graceful undeploy first
71+
# Exercise the undeploy CLI path; sweep catches any quota leak if this fails.
7472
try:
7573
undeploy = subprocess.run(
7674
["uv", "run", "flash", "undeploy", WORKER_NAME, "--force"],
@@ -88,6 +86,5 @@ def test_deploy_invoke_undeploy(self, tmp_path: Path) -> None:
8886
except subprocess.TimeoutExpired:
8987
print("WARNING: undeploy timed out after 60s")
9088

91-
# Always sweep all endpoints — dedicated e2e account, stale
92-
# endpoints hit the worker quota on subsequent runs.
89+
# Sweep flash-qa-* endpoints — stale endpoints exhaust worker quota.
9390
sweep_endpoints(env["RUNPOD_API_KEY"])

0 commit comments

Comments
 (0)