diff --git a/.gitignore b/.gitignore index 1e1c6fe077..6f0c9cb603 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,5 @@ /.idea /*git_ignore* .DS_Store +.adk +tmp/ diff --git a/perfkitbenchmarker/data/docker/agentic/adk-agent/.dockerignore b/perfkitbenchmarker/data/docker/agentic/adk-agent/.dockerignore new file mode 100644 index 0000000000..78cf8c8595 --- /dev/null +++ b/perfkitbenchmarker/data/docker/agentic/adk-agent/.dockerignore @@ -0,0 +1,165 @@ + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*,cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# dotenv +.env + +# virtualenv +.venv/ +venv/ +ENV/ + +# Spyder project settings +.spyderproject + +# Rope project settings +.ropeproject + + +### Linux ### +*~ + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are created when an open file is removed but is still being accessed +.nfs* + + +### OSX ### +*.DS_Store 
+.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon +# Thumbnails +._* +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + + +### Windows ### +# Windows image file caches +Thumbs.db +ehthumbs.db + +# Folder config file +Desktop.ini + +# Recycle Bin used on file shares +$RECYCLE.BIN/ + +# Windows Installer files +*.cab +*.msi +*.msm +*.msp + +# Windows shortcuts +*.lnk + + +### Vagrant ### +.vagrant/ +### Local rules, see .gitignore.tail to override! ### +shippable +.git + +tmp/ +sessions.db +.adk/ diff --git a/perfkitbenchmarker/data/docker/agentic/adk-agent/.gcloudignore b/perfkitbenchmarker/data/docker/agentic/adk-agent/.gcloudignore new file mode 100644 index 0000000000..fb34b7833c --- /dev/null +++ b/perfkitbenchmarker/data/docker/agentic/adk-agent/.gcloudignore @@ -0,0 +1,25 @@ +# This file tells gcloud builds submit which files to exclude from the upload. +# Without it, gcloud ignores .dockerignore and uploads everything (including .venv). 
+ +.git +.venv/ +venv/ +ENV/ +__pycache__/ +*.py[cod] +*$py.class +*.so +*.egg-info/ +*.egg +dist/ +build/ +.tox/ +.cache/ +.coverage +htmlcov/ +*.log +.env +.adk/ +sessions.db +tmp/ +.DS_Store diff --git a/perfkitbenchmarker/data/docker/agentic/adk-agent/Dockerfile b/perfkitbenchmarker/data/docker/agentic/adk-agent/Dockerfile new file mode 100644 index 0000000000..417ad58946 --- /dev/null +++ b/perfkitbenchmarker/data/docker/agentic/adk-agent/Dockerfile @@ -0,0 +1,29 @@ +FROM python:3.13-slim +WORKDIR /app + +# Install kubectl (required by k8s-agent-sandbox for port-forwarding to sandbox pods) +# Uses `dpkg --print-architecture` (the image's own architecture) to download the correct binary for amd64 or arm64 +RUN apt-get update && \ + apt-get install -y --no-install-recommends curl ca-certificates && \ + ARCH=$(dpkg --print-architecture) && \ + curl -LO "https://dl.k8s.io/release/$(curl -sL https://dl.k8s.io/release/stable.txt)/bin/linux/${ARCH}/kubectl" && \ + install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl && \ + rm kubectl && \ + apt-get purge -y curl && \ + apt-get autoremove -y && \ + rm -rf /var/lib/apt/lists/* + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +RUN adduser --disabled-password --gecos "" myuser && \ + chown -R myuser:myuser /app + +COPY . . 
+ +USER myuser + +ENV PATH="/home/myuser/.local/bin:$PATH" + +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8080"] + diff --git a/perfkitbenchmarker/data/docker/agentic/adk-agent/__init__.py b/perfkitbenchmarker/data/docker/agentic/adk-agent/__init__.py new file mode 100644 index 0000000000..5271a8ef60 --- /dev/null +++ b/perfkitbenchmarker/data/docker/agentic/adk-agent/__init__.py @@ -0,0 +1 @@ +# ADK Agent package diff --git a/perfkitbenchmarker/data/docker/agentic/adk-agent/cloudbuild-arm64.yaml b/perfkitbenchmarker/data/docker/agentic/adk-agent/cloudbuild-arm64.yaml new file mode 100644 index 0000000000..653f07fcf8 --- /dev/null +++ b/perfkitbenchmarker/data/docker/agentic/adk-agent/cloudbuild-arm64.yaml @@ -0,0 +1,20 @@ +# Cloud Build config for cross-compiling to ARM64. +# Used by PKB when --container_remote_build_config points to this file. +# The _IMAGE substitution is passed by PKB RemoteBuild() automatically. +steps: + - name: 'gcr.io/cloud-builders/docker' + args: ['run', '--privileged', 'multiarch/qemu-user-static', '--reset', '-p', 'yes'] + id: 'qemu-setup' + - name: 'gcr.io/cloud-builders/docker' + args: ['buildx', 'create', '--use', '--name', 'multiarch-builder'] + id: 'create-builder' + waitFor: ['qemu-setup'] + - name: 'gcr.io/cloud-builders/docker' + args: ['buildx', 'build', '--platform', 'linux/arm64', '-t', '${_IMAGE}', '--push', '.'] + id: 'build-and-push' + waitFor: ['create-builder'] +options: + logging: CLOUD_LOGGING_ONLY + machineType: E2_HIGHCPU_32 +substitutions: + _IMAGE: '' diff --git a/perfkitbenchmarker/data/docker/agentic/adk-agent/gke_performance_agent/__init__.py b/perfkitbenchmarker/data/docker/agentic/adk-agent/gke_performance_agent/__init__.py new file mode 100644 index 0000000000..c6df9a7a2a --- /dev/null +++ b/perfkitbenchmarker/data/docker/agentic/adk-agent/gke_performance_agent/__init__.py @@ -0,0 +1,2 @@ +# GKE Performance Agent package +from . 
import agent \ No newline at end of file diff --git a/perfkitbenchmarker/data/docker/agentic/adk-agent/gke_performance_agent/agent.py b/perfkitbenchmarker/data/docker/agentic/adk-agent/gke_performance_agent/agent.py new file mode 100644 index 0000000000..6561942960 --- /dev/null +++ b/perfkitbenchmarker/data/docker/agentic/adk-agent/gke_performance_agent/agent.py @@ -0,0 +1,276 @@ +"""GKE Performance Agent -- ADK agent definition. + +This file runs INSIDE the GKE cluster as part of the adk-agent Deployment +(see gke_deploy_utils.py for the K8s manifest). It is NOT run from the +machine executing PKB. The ADK agent pod serves a FastAPI app (main.py) +that PKB calls via HTTP through a kubectl port-forward tunnel. + +Execution flow: + PKB (your laptop/CI) -> kubectl port-forward -> adk-agent pod -> this file + -> GkeCodeExecutor -> SandboxClient -> gVisor sandbox pod +""" + +"""GKE Performance Agent — ADK agent definition for sandbox benchmarking. + +EXECUTION CONTEXT: + This file runs INSIDE the GKE cluster, NOT on the PKB orchestrator machine. + It is packaged into a container image (see ../Dockerfile) and deployed as + the 'adk-agent' Deployment in the benchmark namespace. + + Execution flow: + PKB machine GKE Cluster + ---------- ----------- + benchmark.Run() + -> CallAgentApi("/benchmark/...") -> main.py (FastAPI) + -> Runner(agent=root_agent) + -> MockLlm yields code + -> V3GkeCodeExecutor._execute_in_sandbox() + -> SandboxClient.create_sandbox() + -> sandbox.files.write("script.py", code) + -> sandbox.commands.run("python3 script.py") + -> SandboxClient.delete_sandbox() + + The PKB machine communicates with this agent via HTTP (port-forwarded + through kubectl or via a LoadBalancer/ClusterIP service). 
+""" + +from google.adk.agents import LlmAgent +from google.adk.code_executors import GkeCodeExecutor +from google.adk.code_executors.code_execution_utils import CodeExecutionResult +from google.adk.models.base_llm import BaseLlm +from google.adk.models.llm_response import LlmResponse +from google.genai import types +from concurrent.futures import ThreadPoolExecutor +from dotenv import load_dotenv +from google.adk.apps import App +import logging +import os + +# --- Configure Logging --- +logging.basicConfig(level=logging.INFO) + +# ========================================================================= +# 1. Environment and Configuration +# ========================================================================= + +basedir = os.path.abspath(os.path.dirname(__file__)) +agent_dir = os.path.join(basedir, "..") + +# Load generated.env (rendered by gke_image_build_utils._GenerateEnvFile from PKB flags). +# In GKE, K8s manifest env vars take precedence. +load_dotenv(os.path.join(agent_dir, "generated.env")) + +# ========================================================================= +# 2. 
# Mock LLM Definition (Inheriting from BaseLlm for Pydantic)
# =========================================================================


def _read_benchmark_script(path: str, fallback: str) -> str:
    """Return the text of the script at *path*, or *fallback* if it cannot be read.

    The fallback keeps this module importable when the script file is
    missing. The substitution is logged because running the fallback
    instead of the real benchmark script changes what the sandbox executes.

    Args:
        path: Location of the benchmark script on disk.
        fallback: Source code to use when the file cannot be read.

    Returns:
        The script source, or *fallback* on any read error.
    """
    try:
        with open(path, "r") as f:
            return f.read()
    except Exception:
        logging.warning(
            "Could not read benchmark script %s; using built-in fallback",
            path,
            exc_info=True,
        )
        return fallback


# Load the benchmark scripts
density_script_path = os.path.join(
    basedir, "../sandboxed_apps/python_test_app/benchmark_density.py"
)
density_benchmark_code = _read_benchmark_script(
    density_script_path, "import os; print(os.uname())"
)

payload_script_path = os.path.join(
    basedir, "../sandboxed_apps/python_test_app/benchmark_payload.py"
)
payload_benchmark_code = _read_benchmark_script(
    payload_script_path, "import os; print(os.uname())"
)

qps_script_path = os.path.join(
    basedir, "../sandboxed_apps/python_test_app/benchmark_qps.py"
)
qps_benchmark_code = _read_benchmark_script(
    qps_script_path,
    "import json; print(json.dumps({'sandbox_status': 'ok'}))",
)

# Keys that main.py sets in os.environ per-request. We inject them into
# the script so they reach the sandbox pod. If unset, the benchmark scripts
# use their own built-in defaults.
_DENSITY_ENV_KEYS = ["SAMPLE_COUNT", "SAMPLE_WARMUP"]
_PAYLOAD_ENV_KEYS = ["PAYLOAD_SIZE_MB", "PAYLOAD_ITERATIONS"]
_QPS_ENV_KEYS: list[str] = []  # QPS script needs no env config


def _build_benchmark_code() -> str:
    """Build the benchmark script with current env values injected.

    Selects the script based on the BENCHMARK_MODE env var:
      - 'payload' -> benchmark_payload.py
      - 'qps'     -> benchmark_qps.py
      - anything else (default 'density') -> benchmark_density.py

    Returns:
        Python source consisting of an ``import os`` line, one
        ``os.environ[...] = ...`` assignment per configured key that is
        currently set, a blank line, and then the selected script.
    """
    mode = os.getenv("BENCHMARK_MODE", "density")

    if mode == "payload":
        env_keys = _PAYLOAD_ENV_KEYS
        script = payload_benchmark_code
    elif mode == "qps":
        env_keys = _QPS_ENV_KEYS
        script = qps_benchmark_code
    else:
        env_keys = _DENSITY_ENV_KEYS
        script = density_benchmark_code

    lines = ["import os"]
    for k in env_keys:
        v = os.getenv(k)
        if v is not None:
            # repr() emits a correctly escaped string literal, so a value
            # containing quotes, backslashes or newlines cannot break (or
            # inject code into) the generated script. For plain values the
            # output is identical to the former hand-quoted form.
            lines.append(f"os.environ[{k!r}] = {v!r}")
    return "\n".join(lines) + "\n\n" + script


class MockLlm(BaseLlm):
    """Stand-in model that emits the benchmark script instead of calling an LLM."""

    model: str = "mock-model"

    async def generate_content_async(self, llm_request, stream=False):
        """Mock the ADK response loop.

        BaseLlm.generate_content_async is an AsyncGenerator — it must YIELD
        LlmResponse objects, never return them.

        Yields:
            Exactly one final LlmResponse: executable code on the first
            turn, plain text once the request carries more than one
            content entry.
        """
        # ADK appends the code execution result to the conversation
        # history before calling the LLM again. If the history has
        # grown beyond the initial user prompt, code has already
        # executed — return plain text to stop the loop.
        has_execution_result = len(llm_request.contents) > 1

        if has_execution_result:
            part = types.Part(text="Execution Complete")
        else:
            # Create an ADK-compliant result with executable code.
            # Build at request time so SAMPLE_COUNT/SAMPLE_WARMUP reflect
            # the current os.environ values set by main.py per-request.
            part = types.Part(
                executable_code=types.ExecutableCode(
                    language="PYTHON", code=_build_benchmark_code()
                )
            )

        content = types.Content(role="model", parts=[part])
        response = LlmResponse(content=content, partial=False)

        # Yield exactly one final response (both streaming and non-streaming)
        yield response


# =========================================================================
# 3.
Agent Initialization +# ========================================================================= + + +# Module-level thread pool for sandbox I/O operations. +# Initialized once at import time to avoid thread-safety issues +# with lazy initialization inside _execute_in_sandbox(). +_SANDBOX_POOL = ThreadPoolExecutor(max_workers=16) + + +class V3GkeCodeExecutor(GkeCodeExecutor): + def _execute_in_sandbox(self, code: str) -> CodeExecutionResult: + """Executes code using the v0.4.6 compatible SandboxClient.""" + from k8s_agent_sandbox.sandbox_client import SandboxClient + from k8s_agent_sandbox.models import SandboxDirectConnectionConfig + import logging + import time + + logging.info("Executing via V3 SandboxClient (v0.4.6 compatible).") + + # _SANDBOX_POOL is initialized at module level (thread-safe). + + # Use DirectConnection when SANDBOX_ROUTER_URL is set (in-cluster), + # otherwise fall back to kubectl port-forward (dev mode). + router_url = os.getenv("SANDBOX_ROUTER_URL") + if router_url: + client = SandboxClient( + connection_config=SandboxDirectConnectionConfig(api_url=router_url) + ) + else: + client = SandboxClient() + # v0.4.6 create_sandbox uses 'template' and 'namespace' arguments + create_ms = upload_ms = run_ms = delete_ms = 0.0 + sandbox = None + # Time sandbox creation + t0 = time.time() + create_future = _SANDBOX_POOL.submit( + client.create_sandbox, + template=self.sandbox_template, + namespace=self.namespace, + ) + sandbox = create_future.result() + create_ms = (time.time() - t0) * 1000.0 + try: + # v0.4.6 handles file I/O via the .files namespace + t0 = time.time() + upload_future = _SANDBOX_POOL.submit(sandbox.files.write, "script.py", code) + upload_future.result() + upload_ms = (time.time() - t0) * 1000.0 + + # SANDBOX_EXEC_TIMEOUT_S is set per-request by main.py. + # Default 60 s keeps density/snapshot runs tight; payload + # sweeps raise it for large blobs. 
+ run_timeout = int(os.getenv("SANDBOX_EXEC_TIMEOUT_S", "60")) + + t0 = time.time() + run_future = _SANDBOX_POOL.submit( + sandbox.commands.run, "python3 script.py", timeout=run_timeout + ) + result = run_future.result() + run_ms = (time.time() - t0) * 1000.0 + + # ADK's build_code_execution_result_part discards stdout when + # stderr is non-empty (OUTCOME_FAILED path). Sandbox scripts + # produce benign stderr (C-extension reimport noise, gVisor + # warnings) that would cause all sandbox_* metrics to vanish. + # Log stderr for debugging, then clear it so ADK passes + # stdout through. + if result.stderr: + logging.warning("Sandbox stderr (ignored): %s", result.stderr[:500]) + + logging.info( + "SANDBOX_TIMINGS: create_ms=%.3f upload_ms=%.3f run_ms=%.3f", + create_ms, + upload_ms, + run_ms, + ) + return CodeExecutionResult(stdout=result.stdout, stderr="") + finally: + # Always cleanup the claim + t0 = time.time() + if sandbox is not None: + delete_future = _SANDBOX_POOL.submit( + client.delete_sandbox, sandbox.claim_name, namespace=self.namespace + ) + delete_future.result() + delete_ms = (time.time() - t0) * 1000.0 + logging.info("SANDBOX_TIMINGS_DELETE: delete_ms=%.3f", delete_ms) + + +gke_executor = V3GkeCodeExecutor( + cluster_name=os.getenv("CLUSTER_NAME"), + location=os.getenv("GOOGLE_CLOUD_LOCATION"), + namespace=os.getenv("AGENTIC_NAMESPACE"), + executor_type="sandbox", + sandbox_template="python-sandbox-template", +) + +gke_performance_agent = LlmAgent( + name="gke_performance_agent", # Must be a valid identifier (no dashes) + model=MockLlm(model="mock-model"), + code_executor=gke_executor, +) + +root_agent = gke_performance_agent + +app = App( + name=root_agent.name, + root_agent=root_agent, + # enable_tracing=True, +) diff --git a/perfkitbenchmarker/data/docker/agentic/adk-agent/main.py b/perfkitbenchmarker/data/docker/agentic/adk-agent/main.py new file mode 100644 index 0000000000..473c2072c2 --- /dev/null +++ 
b/perfkitbenchmarker/data/docker/agentic/adk-agent/main.py @@ -0,0 +1,1107 @@ +"""FastAPI service fronting the GKE Performance Agent. + +Exposes REST endpoints that PKB calls to trigger benchmarks. The agent +runs *inside* the GKE cluster so it can reach the Sandbox Controller and +create gVisor sandboxes natively. + +Endpoints: + GET /healthz → liveness probe + POST /benchmark/python/density → run the Python density benchmark (UC-B) + POST /benchmark/python/payload → run the payload transfer benchmark (UC-D) + POST /benchmark/python/qps → run the QPS saturation benchmark (UC-F) + POST /benchmark/chromium/density → run the Chromium density benchmark (UC-C) + POST /run → raw ADK agent interaction + +POST /benchmark/python/density — Request: + { + "sample_count": int — iterations per sandbox session (default: 100) + "sample_warmup": int — warmup iterations excluded from stats (default: 5) + "concurrent_sessions": int — parallel sandbox sessions (default: 1) + "sandbox_exec_timeout_s": int — sandbox command execution timeout in seconds (default: 60) + } + +POST /benchmark/python/density — Response: + { + "concurrent_sessions": int — requested session count + "successful_sessions": int — sessions completed without error + "failed_sessions": int — sessions that returned an error + "aggregate": { + --- Orchestrator-side (timed in _run_single_session, stats in benchmark_density) --- + "orchestrator_cel_mean_ms": mean round-trip across sessions + "orchestrator_cel_p50_ms": P50 round-trip + "orchestrator_cel_p99_ms": P99 round-trip + "orchestrator_cel_min_ms": min round-trip + "orchestrator_cel_max_ms": max round-trip + + --- Sandbox-side overall (from benchmark_density.py, mean across sessions) --- + "sandbox_ttfe_ms": Time To First Execution + "sandbox_total_cel_mean_ms": mean total CEL per iteration (sum of all task types) + "sandbox_total_cel_p50_ms": P50 total CEL per iteration + "sandbox_total_cel_p99_ms": P99 total CEL per iteration + "sandbox_total_cel_min_ms": min 
total CEL per iteration + "sandbox_total_cel_max_ms": max total CEL per iteration + + --- Sandbox RSS (from benchmark_density.py, mean across sessions) --- + "sandbox_rss_start_mb": RSS at benchmark start + "sandbox_rss_end_mb": RSS at benchmark end + "sandbox_rss_growth_mb": RSS growth during benchmark + + --- Per-type CEL breakdown (from benchmark_density.py, mean across sessions) --- + "sandbox_compute_cel_{mean,p50,p99,min,max}_ms": CPU-bound (math.factorial) + "sandbox_syscall_cel_{mean,p50,p99,min,max}_ms": gVisor Sentry (os.stat/listdir) + "sandbox_import_cel_{mean,p50,p99,min,max}_ms": Gofer FS I/O (importlib) + } + "sessions": [ per-session detail array + { + "session_id": int — zero-based session index + "orchestrator_total_ms": float — full round-trip for this session + "raw_output": str — raw code execution stdout + "sandbox_ttfe_ms": float — TTFE for this session + "sandbox_total_cel_mean_ms": float — total CEL mean for this session + ... all other sandbox_* metrics for this session + } + ] + } + +Data Flow: + benchmark_density.py (inside gVisor) → all sandbox_* metrics per session + main.py (this file) → orchestrator_* timing + cross-session aggregation +""" + +import json +import logging +import os +import re +import time +import asyncio +from typing import Optional +from concurrent.futures import ThreadPoolExecutor + +import uvicorn +from contextlib import asynccontextmanager +from fastapi import FastAPI, HTTPException +from pydantic import BaseModel, Field +from google.genai import types +from google.adk.sessions import InMemorySessionService +from google.adk.artifacts import InMemoryArtifactService +from google.adk.runners import Runner + +from dotenv import load_dotenv + +basedir = os.path.abspath(os.path.dirname(__file__)) + +# Load generated.env (rendered by gke_image_build_utils._GenerateEnvFile from PKB flags). +# In GKE, K8s manifest env vars take precedence. 
+load_dotenv(os.path.join(basedir, "generated.env")) + +from gke_performance_agent import agent + + +# ── SandboxClient factory (DirectConnection vs Dev-mode tunnel) ────────── +def _make_sandbox_client(): + """Create a SandboxClient with the optimal connection strategy. + + When SANDBOX_ROUTER_URL is set (in-cluster), uses DirectConnectionConfig + to bypass kubectl port-forward SPDY tunnels — enabling true N-way + parallelism. Without it, falls back to LocalTunnelConnectionConfig + (dev mode, serialized through a single SPDY stream). + """ + from k8s_agent_sandbox.sandbox_client import SandboxClient + + router_url = os.getenv("SANDBOX_ROUTER_URL") + if router_url: + from k8s_agent_sandbox.models import SandboxDirectConnectionConfig + + return SandboxClient( + connection_config=SandboxDirectConnectionConfig(api_url=router_url) + ) + return SandboxClient() + + +# --- Constants --- +APP_NAME = "gke_performance_agent_app" +USER_ID = "benchmark_user" + +# --- Configure Logging --- +try: + import google.cloud.logging as gcl + + gcl.Client().setup_logging() +except Exception: + logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +# ========================================================================= +# FastAPI Application +# ========================================================================= +# --- Adaptive ThreadPool based on Agent CPU --- +def _compute_thread_count() -> int: + """Compute a recommended max worker count for ThreadPoolExecutor. + + Heuristic: use ~2x the detected CPU count to provide overlap for blocking + I/O (port-forward, file upload) while avoiding CPU oversubscription. + Cap between 2 and 64 workers. + """ + cpu = os.cpu_count() or 1 + return max(2, min(64, cpu * 2)) + + +@asynccontextmanager +async def lifespan(app: FastAPI): + """Lifespan handler: configure a tuned ThreadPoolExecutor for asyncio. 
+ + Sets the default executor so `asyncio.to_thread` uses our tuned pool, + and shuts it down on application exit. + """ + workers = _compute_thread_count() + executor = ThreadPoolExecutor(max_workers=workers) + loop = asyncio.get_running_loop() + loop.set_default_executor(executor) + logging.info( + "Default ThreadPoolExecutor set to %d workers (cpu=%s)", workers, os.cpu_count() + ) + try: + yield + finally: + try: + executor.shutdown(wait=False) + logging.info("ThreadPoolExecutor shut down") + except Exception: + logging.exception("Error shutting down ThreadPoolExecutor") + + +app = FastAPI(title="GKE Benchmark Agent", version="0.2.0", lifespan=lifespan) + +# Serialise benchmark requests so concurrent POSTs cannot clobber the +# shared env vars (BENCHMARK_MODE, SAMPLE_COUNT, …) that agent.py reads. +_benchmark_lock = asyncio.Lock() + + +def _percentile_stats(sorted_values: list, prefix: str) -> dict: + """Compute mean/p50/p95/p99/min/max from a pre-sorted list of numbers.""" + n = len(sorted_values) + if n == 0: + return {} + return { + f"{prefix}_mean_ms": round(sum(sorted_values) / n, 6), + f"{prefix}_p50_ms": round(sorted_values[n // 2], 6), + f"{prefix}_p95_ms": round(sorted_values[min(int(n * 0.95), n - 1)], 6), + f"{prefix}_p99_ms": round(sorted_values[min(int(n * 0.99), n - 1)], 6), + f"{prefix}_min_ms": round(sorted_values[0], 6), + f"{prefix}_max_ms": round(sorted_values[-1], 6), + } + + +# --- Request / Response Models --- +class BenchmarkRequest(BaseModel): + sample_count: int = Field( + default=100, ge=1, description="Sample count per sandbox session" + ) + sample_warmup: int = Field( + default=5, ge=0, description="Warmup iterations per sandbox session" + ) + concurrent_sessions: int = Field( + default=1, ge=1, description="Number of parallel sandbox sessions" + ) + sandbox_exec_timeout_s: int = Field( + default=60, ge=10, description="Sandbox command execution timeout in seconds" + ) + + +class RunRequest(BaseModel): + prompt: str = "Please start 
the GKE performance benchmark workflow." + + +class PayloadBenchmarkRequest(BaseModel): + payload_size_mb: float = Field(default=1, gt=0, description="Payload size in MB") + payload_iterations: int = Field( + default=20, ge=1, description="Number of transfer iterations" + ) + concurrent_sessions: int = Field( + default=1, ge=1, description="Number of parallel sandbox sessions" + ) + sandbox_exec_timeout_s: int = Field( + default=60, ge=10, description="Sandbox command execution timeout in seconds" + ) + + +class QpsBenchmarkRequest(BaseModel): + target_qps: float = Field( + default=10.0, ge=0.1, description="Target requests per second" + ) + duration_s: float = Field( + default=60.0, ge=5.0, description="Duration of the QPS burst in seconds" + ) + sandbox_exec_timeout_s: int = Field( + default=30, ge=10, description="Sandbox command execution timeout in seconds" + ) + + +class ChromiumBenchmarkRequest(BaseModel): + task_count: int = Field( + default=10, ge=1, description="Iterations per Chromium session" + ) + warmup_tasks: int = Field( + default=2, ge=0, description="Warmup iterations excluded from stats" + ) + concurrent_sessions: int = Field( + default=1, ge=1, description="Number of parallel Chromium sessions" + ) + sandbox_exec_timeout_s: int = Field( + default=120, ge=10, description="Sandbox command execution timeout in seconds" + ) + + +# --- JSON extraction helper --- +_JSON_RE = re.compile(r"\{[^{}]*\}", re.DOTALL) + + +def _parse_sandbox_json(raw_output: str) -> Optional[dict]: + """Extract the sandbox JSON summary from code execution output. + + The sandbox script prints a JSON blob to stdout among other log lines. + We find the last valid JSON object that contains sandbox_ keys. 
+ """ + matches = _JSON_RE.findall(raw_output) + for candidate in reversed(matches): + try: + obj = json.loads(candidate) + if any(k.startswith("sandbox_") for k in obj): + return obj + except json.JSONDecodeError: + continue + return None + + +# --- Agent helper --- +async def _run_agent(prompt: str) -> str: + """Create a fresh session, run the agent, return the final text output.""" + session_service = InMemorySessionService() + artifact_service = InMemoryArtifactService() + session = await session_service.create_session( + app_name=APP_NAME, + user_id=USER_ID, + state={}, + ) + + runner = Runner( + agent=agent.root_agent, + app_name=APP_NAME, + session_service=session_service, + artifact_service=artifact_service, + ) + + content = types.Content( + role="user", + parts=[types.Part(text=prompt)], + ) + + final_response = "" + code_execution_output = "" + async with runner: + async for event in runner.run_async( + user_id=USER_ID, + session_id=session.id, + new_message=content, + ): + if event.content and event.content.parts: + for part in event.content.parts: + cer = getattr(part, "code_execution_result", None) or getattr( + part, "codeExecutionResult", None + ) + if cer: + code_execution_output = getattr(cer, "output", "") or "" + if event.is_final_response() and event.content and event.content.parts: + final_response = event.content.parts[0].text + + await session_service.delete_session( + app_name=APP_NAME, + user_id=USER_ID, + session_id=session.id, + ) + return code_execution_output if code_execution_output else final_response + + +async def _run_single_session(session_id: int, prompt: str) -> dict: + """Run one agent session and return orchestrator + sandbox metrics.""" + orchestrator_start = time.perf_counter() + logging.info("SESSION_START: session_id=%d start_ts=%.3f", session_id, time.time()) + + try: + raw_output = await _run_agent(prompt) + except Exception as e: + return { + "session_id": session_id, + "error": str(e), + } + + orchestrator_elapsed_ms 
= round( + (time.perf_counter() - orchestrator_start) * 1000, 6 + ) + logging.info( + "SESSION_END: session_id=%d elapsed_ms=%.3f", + session_id, + orchestrator_elapsed_ms, + ) + + # Parse sandbox-side metrics from the code execution output + sandbox_metrics = _parse_sandbox_json(raw_output) or {} + + return { + "session_id": session_id, + "orchestrator_total_ms": orchestrator_elapsed_ms, + "raw_output": raw_output, + **sandbox_metrics, + } + + +# --- Endpoints --- +@app.get("/healthz") +async def healthz(): + return {"status": "ok"} + + +@app.post("/benchmark/python/density") +async def benchmark_python_density(req: BenchmarkRequest): + """Trigger the Python density benchmark (Use Case B). + + Fires `concurrent_sessions` parallel agent sessions. Each session + claims its own sandbox, runs the benchmark script with the given + iteration/warmup counts, and returns both orchestrator-side and + sandbox-side metrics. + """ + async with _benchmark_lock: + os.environ["BENCHMARK_MODE"] = "density" + os.environ["SAMPLE_COUNT"] = str(req.sample_count) + os.environ["SAMPLE_WARMUP"] = str(req.sample_warmup) + os.environ["SANDBOX_EXEC_TIMEOUT_S"] = str(req.sandbox_exec_timeout_s) + + logger.info( + "Starting Python benchmark: sample_count=%d sample_warmup=%d concurrent_sessions=%d", + req.sample_count, + req.sample_warmup, + req.concurrent_sessions, + ) + + prompt = "Please start the GKE performance benchmark workflow." + + # Fire concurrent sessions. + # DESIGN NOTE: Each session runs in its own thread via asyncio.to_thread() + # with a nested asyncio.run() to create a per-thread event loop. This is + # intentional -- the ADK Runner performs blocking I/O (sandbox lifecycle + # via kubectl/HTTP) that would starve a shared event loop and serialize + # session starts. The per-thread event loop overhead (~0.1ms) is negligible + # compared to sandbox round-trip times (~200ms+). 
+ thread_tasks = [ + asyncio.create_task( + asyncio.to_thread( + lambda sid=i: asyncio.run(_run_single_session(sid, prompt)) + ) + ) + for i in range(req.concurrent_sessions) + ] + session_results = await asyncio.gather(*thread_tasks) + + # Separate successful vs failed sessions + successful = [r for r in session_results if "error" not in r] + failed = [r for r in session_results if "error" in r] + + # Aggregate orchestrator-side metrics across all successful sessions + aggregate = {} + if successful: + orch_times = sorted(r["orchestrator_total_ms"] for r in successful) + aggregate.update(_percentile_stats(orch_times, "orchestrator_cel")) + + # Aggregate sandbox-side metrics across sessions + sandbox_keys = [k for k in successful[0] if k.startswith("sandbox_")] + for key in sandbox_keys: + sample_val = successful[0].get(key) + if isinstance(sample_val, list): + # Pool raw latency arrays across sandboxes → true cross-sandbox stats + pooled = sorted( + v + for r in successful + for v in (r.get(key) or []) + if isinstance(r.get(key), list) + ) + if pooled: + base = key[:-3] if key.endswith("_ms") else key + aggregate.update(_percentile_stats(pooled, base)) + elif isinstance(sample_val, (int, float)): + vals = [ + r[key] + for r in successful + if key in r and isinstance(r[key], (int, float)) + ] + if vals: + if key.endswith("_cel_ms"): + # Latency scalars (e.g. import_cel_ms): compute + # cross-sandbox percentile stats, like array metrics. + base = key[:-3] + aggregate.update(_percentile_stats(sorted(vals), base)) + else: + # Non-latency scalars (e.g. 
@app.post("/benchmark/python/payload")
async def benchmark_python_payload(req: PayloadBenchmarkRequest):
    """Run the payload transfer benchmark (Use Case D) and summarise it.

    Each session produces a payload of `payload_size_mb` MB inside a gVisor
    sandbox, base64-encodes it, and sends it back to the orchestrator; the
    sandbox reports a latency breakdown for those steps.

    The request fields are handed to the sandbox script through process
    environment variables, so the whole run is serialised behind
    `_benchmark_lock`.

    Returns a dict echoing the request parameters plus:
      - successful_sessions / failed_sessions: session counts
      - aggregate: orchestrator percentile stats and the mean of every
        numeric `sandbox_*` metric across successful sessions
      - sessions: the raw per-session result dicts
    """
    prompt = "Please start the GKE performance benchmark workflow."

    def _session_in_own_loop(session_id):
        # Executed in a worker thread: asyncio.run() gives the session a
        # private event loop.
        return asyncio.run(_run_single_session(session_id, prompt))

    async with _benchmark_lock:
        os.environ.update(
            {
                "BENCHMARK_MODE": "payload",
                "PAYLOAD_SIZE_MB": str(req.payload_size_mb),
                "PAYLOAD_ITERATIONS": str(req.payload_iterations),
                "SANDBOX_EXEC_TIMEOUT_S": str(req.sandbox_exec_timeout_s),
            }
        )

        logger.info(
            "Starting Payload benchmark: payload_size_mb=%s iterations=%d concurrent_sessions=%d",
            req.payload_size_mb,
            req.payload_iterations,
            req.concurrent_sessions,
        )

        # DESIGN NOTE: one thread (and one nested event loop) per session is
        # deliberate. The ADK Runner does blocking I/O for the sandbox
        # lifecycle (kubectl/HTTP); on a shared loop that would starve the
        # other sessions and serialise their starts. The cost of a
        # per-thread loop (~0.1ms) is negligible next to sandbox round trips
        # (~200ms+).
        launched = []
        for session_id in range(req.concurrent_sessions):
            launched.append(
                asyncio.create_task(
                    asyncio.to_thread(_session_in_own_loop, session_id)
                )
            )
        session_results = await asyncio.gather(*launched)

        # A session result carrying an "error" key is a failure.
        successful, failed = [], []
        for outcome in session_results:
            bucket = failed if "error" in outcome else successful
            bucket.append(outcome)

        aggregate = {}
        if successful:
            # Orchestrator-side wall time, as percentile statistics.
            aggregate.update(
                _percentile_stats(
                    sorted(r["orchestrator_total_ms"] for r in successful),
                    "orchestrator_transfer",
                )
            )

            # Sandbox-side metrics: plain mean over the sessions that
            # reported a numeric value. The key set is taken from the first
            # successful session.
            for key in (k for k in list(successful[0]) if k.startswith("sandbox_")):
                numeric = [
                    r[key]
                    for r in successful
                    if isinstance(r.get(key), (int, float))
                ]
                if numeric:
                    aggregate[key] = round(sum(numeric) / len(numeric), 6)

        return {
            "payload_size_mb": req.payload_size_mb,
            "payload_iterations": req.payload_iterations,
            "concurrent_sessions": req.concurrent_sessions,
            "successful_sessions": len(successful),
            "failed_sessions": len(failed),
            "aggregate": aggregate,
            "sessions": session_results,
        }
@app.post("/benchmark/python/qps")
async def benchmark_python_qps(req: QpsBenchmarkRequest):
    """Trigger the QPS saturation benchmark (Use Case F).

    Fires sandbox claim requests at a controlled rate (`target_qps`) for
    `duration_s` seconds. Each request claims a sandbox, uploads and runs a
    trivial script, and releases the sandbox. Per-request results carry
    `ttfe_ms` (time from request start until the script finished) plus the
    claim / upload / exec breakdown.

    Uses a lightweight path that calls the sandbox client directly and
    bypasses the full ADK Runner/MockLLM pipeline, so the numbers reflect
    sandbox lifecycle latency rather than agent overhead.

    When the warm pool drains faster than it refills, TTFE spikes from
    ~200ms to seconds, which identifies the QPS saturation point.

    Returns:
        dict with `target_qps`, `actual_qps` (requests issued divided by
        total wall time, including drain and cleanup), `duration_s` (that
        wall time), request counts, `aggregate` percentile stats for TTFE
        and claim latency, and `sessions` (per-request results in dispatch
        order).
    """
    # Load the QPS script once; fall back to an inline one-liner so the
    # benchmark can still run if the file is missing or unreadable.
    qps_script_path = os.path.join(
        basedir, "sandboxed_apps/python_test_app/benchmark_qps.py"
    )
    try:
        with open(qps_script_path, "r", encoding="utf-8") as f:
            qps_code = f.read()
    except (OSError, UnicodeError):
        logger.warning(
            "Could not read %s; using inline fallback script",
            qps_script_path,
            exc_info=True,
        )
        qps_code = "import json; print(json.dumps({'sandbox_status': 'ok'}))"

    sandbox_template = os.getenv("SANDBOX_TEMPLATE", "python-sandbox-template")
    sandbox_namespace = os.getenv("AGENTIC_NAMESPACE", "agentic")
    exec_timeout = req.sandbox_exec_timeout_s
    qps_claim_label = {"created-by": "pkb-qps-benchmark"}
    # Derive the kubectl selector from the label dict so they cannot drift.
    qps_claim_selector = ",".join(f"{k}={v}" for k, v in qps_claim_label.items())

    def _run_qps_request(request_id: int) -> dict:
        """Run one claim -> upload -> execute -> release cycle (blocking)."""
        t_total = time.perf_counter()
        client = _make_sandbox_client()
        sandbox = None
        try:
            # Claim
            t0 = time.perf_counter()
            sandbox = client.create_sandbox(
                template=sandbox_template,
                namespace=sandbox_namespace,
                labels=qps_claim_label,
            )
            claim_ms = (time.perf_counter() - t0) * 1000

            # Upload
            t0 = time.perf_counter()
            sandbox.files.write("script.py", qps_code)
            upload_ms = (time.perf_counter() - t0) * 1000

            # Execute. The command result is not inspected here.
            # NOTE(review): a script that fails without raising would still
            # be counted as a success - confirm against the client API.
            t0 = time.perf_counter()
            sandbox.commands.run("python3 script.py", timeout=exec_timeout)
            exec_ms = (time.perf_counter() - t0) * 1000

            ttfe_ms = (time.perf_counter() - t_total) * 1000

            return {
                "request_id": request_id,
                "ttfe_ms": round(ttfe_ms, 3),
                "claim_ms": round(claim_ms, 3),
                "upload_ms": round(upload_ms, 3),
                "exec_ms": round(exec_ms, 3),
            }
        except Exception as e:
            ttfe_ms = (time.perf_counter() - t_total) * 1000
            return {
                "request_id": request_id,
                "ttfe_ms": round(ttfe_ms, 3),
                "error": f"{type(e).__name__}: {e}",
            }
        finally:
            # Release is best-effort (the bulk cleanup below catches
            # leftovers), but failures are logged rather than hidden.
            if sandbox is not None:
                try:
                    client.delete_sandbox(
                        sandbox.claim_name, namespace=sandbox_namespace
                    )
                except Exception:
                    logger.warning(
                        "Failed to delete sandbox for QPS request %d",
                        request_id,
                        exc_info=True,
                    )

    def _delete_lingering_claims() -> None:
        """Bulk-delete SandboxClaims carrying the QPS benchmark label.

        Only claims matching `qps_claim_selector` are targeted, so claims
        created by other workloads are never touched. Blocking; run it off
        the event loop.
        """
        import subprocess

        listing = subprocess.run(
            [
                "kubectl",
                "get",
                "sandboxclaim",
                "-n",
                sandbox_namespace,
                "-l",
                qps_claim_selector,
                "-o",
                "jsonpath={.items[*].metadata.name}",
            ],
            capture_output=True,
            text=True,
        )
        claim_names = listing.stdout.split()
        if not claim_names:
            return
        logger.info("Cleaning up %d lingering pkb-qps claims", len(claim_names))
        subprocess.run(
            [
                "kubectl",
                "delete",
                "sandboxclaim",
                "-l",
                qps_claim_selector,
                "-n",
                sandbox_namespace,
                "--wait=false",
            ],
            capture_output=True,
            text=True,
        )

    async with _benchmark_lock:
        logger.info(
            "Starting QPS benchmark: target_qps=%.1f duration_s=%.1f",
            req.target_qps,
            req.duration_s,
        )

        interval = 1.0 / req.target_qps

        # Scoped executor sized to the expected number of requests, so the
        # thread pool itself is not the bottleneck; only real sandbox
        # contention should limit throughput.
        expected_requests = int(req.target_qps * req.duration_s)
        qps_workers = max(16, min(512, expected_requests))
        qps_executor = ThreadPoolExecutor(max_workers=qps_workers)
        loop = asyncio.get_running_loop()
        logger.info(
            "QPS executor: %d workers for ~%d expected requests",
            qps_workers,
            expected_requests,
        )

        # future -> request id, in dispatch order (dicts preserve insertion
        # order), so failures and timeouts keep their real id.
        in_flight: dict[asyncio.Future, int] = {}
        drain_timeout = max(60.0, req.duration_s)

        # Monotonic clock: pacing must not be disturbed by wall-clock
        # adjustments during the run.
        t_start = time.monotonic()
        next_fire = t_start
        request_id = 0

        try:
            # Schedule requests at the target QPS rate.
            while True:
                now = time.monotonic()
                if now - t_start >= req.duration_s:
                    break
                if now >= next_fire:
                    fut = loop.run_in_executor(
                        qps_executor, _run_qps_request, request_id
                    )
                    in_flight[fut] = request_id
                    request_id += 1
                    next_fire += interval
                else:
                    await asyncio.sleep(min(0.001, next_fire - now))

            # Wait for in-flight requests with a drain timeout.
            # asyncio.wait() rejects an empty set, so guard that case.
            if in_flight:
                done, pending = await asyncio.wait(
                    in_flight.keys(), timeout=drain_timeout
                )
            else:
                done, pending = set(), set()
        finally:
            # Always release the pool, even if dispatch/drain raised or the
            # request was cancelled. Work that has not started is dropped.
            qps_executor.shutdown(wait=False, cancel_futures=True)

        # Cancel whatever is still queued/running from the loop's view.
        for fut in pending:
            fut.cancel()
        if pending:
            logger.warning(
                "QPS drain timeout: %d/%d requests still pending after %.0fs",
                len(pending),
                len(in_flight),
                drain_timeout,
            )

        # Collect results in dispatch order.
        session_results = []
        for fut, rid in in_flight.items():
            if fut in pending:
                session_results.append({"request_id": rid, "error": "drain_timeout"})
                continue
            try:
                session_results.append(fut.result())
            except Exception as exc:
                # _run_qps_request catches its own errors; this only fires
                # if client construction itself failed.
                session_results.append({"request_id": rid, "error": str(exc)})

        # Bulk-delete claims left behind by cancelled or failed requests.
        # Run in a thread: kubectl is blocking and must not stall the loop.
        try:
            await asyncio.to_thread(_delete_lingering_claims)
        except Exception:
            logger.warning("Failed to clean up lingering claims", exc_info=True)

        wall_time = time.monotonic() - t_start

        # Separate successful vs failed
        successful = [r for r in session_results if "error" not in r]
        failed = [r for r in session_results if "error" in r]

        aggregate = {}
        if successful:
            # TTFE stats
            aggregate.update(
                _percentile_stats(sorted(r["ttfe_ms"] for r in successful), "ttfe")
            )
            # Claim latency stats (the warm-pool-sensitive metric)
            claim_values = sorted(
                r["claim_ms"] for r in successful if "claim_ms" in r
            )
            if claim_values:
                aggregate.update(_percentile_stats(claim_values, "claim"))

        return {
            "target_qps": req.target_qps,
            "actual_qps": round(request_id / wall_time, 2) if wall_time > 0 else 0,
            "duration_s": round(wall_time, 2),
            "total_requests": request_id,
            "successful_requests": len(successful),
            "failed_requests": len(failed),
            "aggregate": aggregate,
            "sessions": session_results,
        }
+ """ + from playwright.async_api import async_playwright + from kubernetes import client as k8s_client, config as k8s_config + + async with _benchmark_lock: + + sandbox_namespace = os.getenv("AGENTIC_NAMESPACE", "agentic") + sandbox_template = "chromium-sandbox-template" + + logger.info( + "Starting Chromium density benchmark (CDP): concurrent_sessions=%d " + "task_count=%d warmup_tasks=%d", + req.concurrent_sessions, + req.task_count, + req.warmup_tasks, + ) + + # Initialize K8s client for pod IP lookup + try: + k8s_config.load_incluster_config() + except k8s_config.ConfigException: + k8s_config.load_kube_config() + core_v1 = k8s_client.CoreV1Api() + + # Inline HTML test page (data: URL avoids network dependencies) + test_page = """data:text/html, + + +PKB Chromium Benchmark + +

Hello Sandbox

+ + +
+ + +""" + + # Limit concurrent K8s Metrics API calls to avoid overwhelming metrics-server + _metrics_semaphore = asyncio.Semaphore(5) + + async def _run_chromium_session_cdp(session_id: int) -> dict: + """Run one Chromium benchmark session via CDP.""" + sb_client = _make_sandbox_client() + sandbox = None + t_start = time.time() + claim_ms = 0.0 + cold_start_ms = 0.0 + try: + # 1. Claim sandbox from warm pool + t0 = time.time() + sandbox = sb_client.create_sandbox( + template=sandbox_template, + namespace=sandbox_namespace, + ) + claim_ms = (time.time() - t0) * 1000.0 + + # 2. Resolve pod IP + pod_name = sandbox.get_pod_name() + pod = core_v1.read_namespaced_pod(pod_name, sandbox_namespace) + pod_ip = pod.status.pod_ip + if not pod_ip: + raise RuntimeError(f"Pod {pod_name} has no IP assigned") + + cdp_url = f"http://{pod_ip}:9223" + + # 3. Connect Playwright via CDP + async with async_playwright() as pw: + # Wait for Chrome to be ready (retry connection) + browser = None + for attempt in range(20): + try: + browser = await pw.chromium.connect_over_cdp(cdp_url) + break + except Exception: + if attempt >= 19: + raise + await asyncio.sleep(0.5) + + # Cold start = claim + CDP connect (time until browser ready) + cold_start_ms = (time.time() - t_start) * 1000.0 + + context = await browser.new_context() + page = await context.new_page() + + # Navigate once before measurement loop + await page.goto(test_page, wait_until="domcontentloaded") + + # Latency arrays (filled during measured runs only) + navigate_ms = [] + screenshot_ms = [] + evaluate_ms = [] + click_ms = [] + fill_ms = [] + interaction_ms = [] + + total_runs = req.warmup_tasks + req.task_count + for run_idx in range(total_runs): + measuring = run_idx >= req.warmup_tasks + + # 1. Navigate (reload page) + t0 = time.time() + await page.goto(test_page, wait_until="domcontentloaded") + elapsed = (time.time() - t0) * 1000.0 + if measuring: + navigate_ms.append(elapsed) + interaction_ms.append(elapsed) + + # 2. 
DOM evaluate — read heading text + t0 = time.time() + await page.evaluate( + "() => document.getElementById('heading').textContent" + ) + elapsed = (time.time() - t0) * 1000.0 + if measuring: + evaluate_ms.append(elapsed) + interaction_ms.append(elapsed) + + # 3. Fill input + t0 = time.time() + await page.fill("#search", f"query-{run_idx}") + elapsed = (time.time() - t0) * 1000.0 + if measuring: + fill_ms.append(elapsed) + interaction_ms.append(elapsed) + + # 4. Click button + t0 = time.time() + await page.click("#btn") + elapsed = (time.time() - t0) * 1000.0 + if measuring: + click_ms.append(elapsed) + interaction_ms.append(elapsed) + + # 5. Verify click effect (DOM mutation) + t0 = time.time() + await page.evaluate( + "() => document.getElementById('output').textContent" + ) + elapsed = (time.time() - t0) * 1000.0 + if measuring: + evaluate_ms.append(elapsed) + interaction_ms.append(elapsed) + + # 6. Screenshot + t0 = time.time() + await page.screenshot() + elapsed = (time.time() - t0) * 1000.0 + if measuring: + screenshot_ms.append(elapsed) + interaction_ms.append(elapsed) + + # Read pod memory usage from K8s Metrics API + rss_mb = None + try: + async with _metrics_semaphore: + custom_api = k8s_client.CustomObjectsApi() + pod_metrics = await asyncio.to_thread( + custom_api.get_namespaced_custom_object, + group="metrics.k8s.io", + version="v1beta1", + namespace=sandbox_namespace, + plural="pods", + name=pod_name, + ) + for c in pod_metrics.get("containers", []): + usage = c.get("usage", {}).get("memory", "") + if usage.endswith("Ki"): + rss_mb = round(int(usage[:-2]) / 1024, 1) + elif usage.endswith("Mi"): + rss_mb = round(float(usage[:-2]), 1) + elif usage.endswith("Gi"): + rss_mb = round(float(usage[:-2]) * 1024, 1) + break + except Exception: + logger.warning( + "Failed to read pod metrics for %s", + pod_name, + exc_info=True, + ) + + await browser.close() + + total_ms = (time.time() - t_start) * 1000.0 + + # Compute stats helper + def _compute_stats(arr): + 
if not arr: + return None + s = sorted(arr) + n = len(s) + return { + "mean_ms": round(sum(s) / n, 3), + "p50_ms": round(s[min(int(n * 0.50), n - 1)], 3), + "p95_ms": round(s[min(int(n * 0.95), n - 1)], 3), + "p99_ms": round(s[min(int(n * 0.99), n - 1)], 3), + "min_ms": round(s[0], 3), + "max_ms": round(s[-1], 3), + } + + return { + "session_id": session_id, + "sandbox_status": "ok", + "orchestrator_total_ms": round(total_ms, 3), + "claim_ms": round(claim_ms, 3), + "cold_start_ms": round(cold_start_ms, 3), + "rss_mb": rss_mb, + "navigate": _compute_stats(navigate_ms), + "evaluate": _compute_stats(evaluate_ms), + "fill": _compute_stats(fill_ms), + "click": _compute_stats(click_ms), + "screenshot": _compute_stats(screenshot_ms), + "interaction": _compute_stats(interaction_ms), + } + + except Exception as e: + total_ms = (time.time() - t_start) * 1000.0 + logger.exception("Chromium CDP session %d failed", session_id) + return { + "session_id": session_id, + "orchestrator_total_ms": round(total_ms, 3), + "claim_ms": round(claim_ms, 3), + "error": f"{type(e).__name__}: {e}", + } + finally: + if sandbox is not None: + try: + sb_client.delete_sandbox( + sandbox.claim_name, namespace=sandbox_namespace + ) + except Exception: + logger.warning( + "Failed to delete sandbox for session %d", + session_id, + exc_info=True, + ) + + # Fire concurrent sessions + tasks = [_run_chromium_session_cdp(i) for i in range(req.concurrent_sessions)] + session_results = await asyncio.gather(*tasks) + + # Separate successful vs failed + successful = [r for r in session_results if "error" not in r] + failed = [r for r in session_results if "error" in r] + + # Aggregate metrics + aggregate = {} + if successful: + orch_times = sorted(r["orchestrator_total_ms"] for r in successful) + aggregate.update(_percentile_stats(orch_times, "orchestrator_total")) + + claim_times = sorted(r["claim_ms"] for r in successful if "claim_ms" in r) + if claim_times: + aggregate.update(_percentile_stats(claim_times, 
"claim")) + + # Aggregate cold start and RSS + cold_starts = sorted( + r["cold_start_ms"] for r in successful if "cold_start_ms" in r + ) + if cold_starts: + aggregate["cold_start_mean_ms"] = round( + sum(cold_starts) / len(cold_starts), 3 + ) + aggregate["cold_start_p95_ms"] = round( + cold_starts[min(int(len(cold_starts) * 0.95), len(cold_starts) - 1)], 3 + ) + + rss_vals = sorted( + r["rss_mb"] for r in successful if r.get("rss_mb") is not None + ) + if rss_vals: + aggregate["rss_end_mb"] = round(sum(rss_vals) / len(rss_vals), 1) + + # Aggregate per-task-type interaction stats + for metric_key in ( + "interaction", + "navigate", + "evaluate", + "click", + "fill", + "screenshot", + ): + means = sorted( + r[metric_key]["mean_ms"] + for r in successful + if isinstance(r.get(metric_key), dict) and "mean_ms" in r[metric_key] + ) + p95s = sorted( + r[metric_key]["p95_ms"] + for r in successful + if isinstance(r.get(metric_key), dict) and "p95_ms" in r[metric_key] + ) + if means: + aggregate[f"{metric_key}_mean_ms"] = round(sum(means) / len(means), 3) + if p95s: + aggregate[f"{metric_key}_p95_ms"] = round( + p95s[min(int(len(p95s) * 0.95), len(p95s) - 1)], 3 + ) + + return { + "concurrent_sessions": req.concurrent_sessions, + "successful_sessions": len(successful), + "failed_sessions": len(failed), + "aggregate": aggregate, + "sessions": session_results, + } + + +@app.post("/run") +async def run_agent(req: RunRequest): + """Raw agent interaction — send any prompt, get back the agent text.""" + try: + output = await _run_agent(req.prompt) + return {"response": output} + except Exception as e: + logger.exception("Agent run failed") + raise HTTPException(status_code=500, detail=str(e)) + + +# ========================================================================= +# Entry point +# ========================================================================= +if __name__ == "__main__": + uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", 8080))) diff --git 
a/perfkitbenchmarker/data/docker/agentic/adk-agent/requirements.txt b/perfkitbenchmarker/data/docker/agentic/adk-agent/requirements.txt new file mode 100644 index 0000000000..4ca072323c --- /dev/null +++ b/perfkitbenchmarker/data/docker/agentic/adk-agent/requirements.txt @@ -0,0 +1,11 @@ +# Requirements for GKE Performance Agent +google-adk[gke,extensions]==1.34.1 +k8s-agent-sandbox==0.4.6 +kubernetes>=36.0.1 # Fix: v36.0.0 has auth key mismatch bug (PR #2585) +google-cloud-aiplatform[adk]==1.153.1 +google-cloud-logging==3.15.0 +fastapi==0.135.3 +uvicorn[standard]==0.44.0 +python-dotenv==1.0.1 +playwright==1.59.0 + diff --git a/perfkitbenchmarker/data/docker/agentic/adk-agent/sandboxed_apps/python_test_app/benchmark_density.py b/perfkitbenchmarker/data/docker/agentic/adk-agent/sandboxed_apps/python_test_app/benchmark_density.py new file mode 100644 index 0000000000..c1d20ecbfb --- /dev/null +++ b/perfkitbenchmarker/data/docker/agentic/adk-agent/sandboxed_apps/python_test_app/benchmark_density.py @@ -0,0 +1,196 @@ +#!/usr/bin/env python3 +""" +Agentic Python Sandbox Benchmark +Measures: TTFE (Time to First Execution), CEL (Command Execution Latency), RSS Memory + +Three task categories: + - compute: CPU-bound (matrix multiply, sorting large lists) + - syscall: gVisor Sentry stress (large file I/O, many stat calls) + - import: Gofer FS I/O + memory (import heavy stdlib, build data) + +Metrics: all sandbox_* keys. 
import json
import math
import os
import random
import resource
import sys
import time
import warnings

warnings.filterwarnings("ignore")

SAMPLE_COUNT = int(os.environ.get("SAMPLE_COUNT") or "20")
SAMPLE_WARMUP = int(os.environ.get("SAMPLE_WARMUP") or "0")

print(f"SAMPLE_COUNT: {SAMPLE_COUNT}")
print(f"SAMPLE_WARMUP: {SAMPLE_WARMUP}")

# Persistent allocations, retained across iterations so RSS grows.
# 20 x 1MB baseline that stays referenced for the life of the process.
_RESIDENT_DATA = [bytearray(1024 * 1024) for _ in range(20)]


def get_rss_mb():
    """Return the peak resident set size of this process, in MB.

    Reads `ru_maxrss`, which is a high-water mark rather than the current
    RSS, so values never decrease between calls. The division by 1024
    assumes the field is reported in kilobytes (the Linux convention).
    """
    return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024


def get_static_tasks():
    """Return the deterministic task list used to measure execution latency.

    Each task is a dict with `id`, `type` and `code` (source text run via
    `exec`). The three categories allow CEL degradation to be decomposed:
      - compute: sort a 100k-element list, then a 40,000-element
        multiply-and-sum
      - syscall: write/read/unlink a 1MB temp file, then 1000 stat and
        1000 listdir calls
      - import: 20 rounds of evicting and re-importing 13 stdlib modules,
        then build a 10,000-entry dict of lists
    """
    return [
        {
            "id": 1,
            "type": "compute",
            "code": (
                "import math, random\n"
                "random.seed(42)\n"
                "data = [random.random() for _ in range(100_000)]\n"
                "data.sort()\n"
                "# Matrix-like multiply (flattened 200×200)\n"
                "a = list(range(40_000))\n"
                "b = [x * 0.001 for x in a]\n"
                "_ = sum(x * y for x, y in zip(a, b))\n"
            ),
        },
        {
            "id": 2,
            "type": "syscall",
            "code": (
                "import os, tempfile\n"
                "d = tempfile.gettempdir()\n"
                "# Write + read 1MB file through gVisor Gofer\n"
                "path = os.path.join(d, 'bench_heavy.bin')\n"
                "data = b'x' * (1024 * 1024)\n"
                "with open(path, 'wb') as f:\n"
                "    f.write(data)\n"
                "with open(path, 'rb') as f:\n"
                "    _ = f.read()\n"
                "os.unlink(path)\n"
                "# Heavy stat/listdir\n"
                "[os.stat(d) for _ in range(1000)]\n"
                "[os.listdir(d) for _ in range(1000)]\n"
            ),
        },
        {
            "id": 3,
            "type": "import",
            "code": (
                "import importlib, sys\n"
                "mods = [\n"
                "    'json', 'csv', 'html', 'email', 'unittest', 'logging',\n"
                "    'xml.etree.ElementTree', 'http.client', 'urllib.request',\n"
                "    'argparse', 'pprint', 'textwrap', 'difflib',\n"
                "]\n"
                "for _ in range(20):\n"
                "    for m in mods:\n"
                "        try:\n"
                "            sys.modules.pop(m, None)\n"
                "            importlib.import_module(m)\n"
                "        except Exception:\n"
                "            pass\n"
                "# Build a large dict to add memory pressure\n"
                "_ = {str(i): list(range(100)) for i in range(10_000)}\n"
            ),
        },
    ]


def _percentile(sorted_vals, pct):
    """Return the value at fraction `pct` (0..1) of a pre-sorted list.

    Returns 0.0 for an empty list instead of raising IndexError.
    """
    if not sorted_vals:
        return 0.0
    idx = int(len(sorted_vals) * pct)
    return sorted_vals[min(idx, len(sorted_vals) - 1)]


def _iteration_totals(records, sample_count):
    """Sum `latency_ms` per iteration index in a single pass over `records`.

    Returns a list of length `sample_count`; entry i is the rounded total of
    all records whose `iteration` is i (0 if there are none).
    """
    totals = [0] * sample_count
    for rec in records:
        idx = rec["iteration"]
        if 0 <= idx < sample_count:
            totals[idx] += rec["latency_ms"]
    return [round(total, 6) for total in totals]


def run_benchmark():
    """Run the density benchmark, print a JSON summary, and return it.

    Side effects: executes the task snippets in this module's globals,
    grows `_RESIDENT_DATA` by ~100KB per iteration, prints the summary as
    one JSON line on stdout, and writes the detailed per-task records to
    /tmp/benchmark_results.json.
    """
    results = {"ttfe_ms": None, "cel_ms": [], "rss_mb_start": None, "rss_mb_end": None}

    # TTFE: time to execute a first, trivial statement.
    ttfe_start = time.perf_counter()
    exec("x = 1 + 1", globals())
    results["ttfe_ms"] = round((time.perf_counter() - ttfe_start) * 1000, 6)

    results["rss_mb_start"] = get_rss_mb()

    tasks = get_static_tasks()
    sampled_tasks = [t for t in tasks if t["type"] != "import"]
    import_task = next((t for t in tasks if t["type"] == "import"), None)

    # Warmup covers the sampled (compute + syscall) tasks only; the import
    # task is run exactly once, after the measured loop.
    for _ in range(SAMPLE_WARMUP):
        for task in sampled_tasks:
            exec(task["code"], globals())

    # Measured iterations: compute + syscall only.
    for i in range(SAMPLE_COUNT):
        # Grow resident memory slightly each iteration (~100KB).
        _RESIDENT_DATA.append(bytearray(100 * 1024))

        for task in sampled_tasks:
            start = time.perf_counter()
            exec(task["code"], globals())
            elapsed_ms = round((time.perf_counter() - start) * 1000, 6)
            results["cel_ms"].append({
                "iteration": i,
                "task_id": task["id"],
                "type": task["type"],
                "latency_ms": elapsed_ms,
            })

    # Import task: a single timed run.
    import_elapsed_ms = 0.0
    if import_task:
        import_start = time.perf_counter()
        exec(import_task["code"], globals())
        import_elapsed_ms = round((time.perf_counter() - import_start) * 1000, 6)

    results["rss_mb_end"] = get_rss_mb()

    # Raw per-iteration totals (compute + syscall).
    iteration_totals = _iteration_totals(results["cel_ms"], SAMPLE_COUNT)

    # Raw per-type latencies, grouped in one pass.
    per_type_raw = {}
    for rec in results["cel_ms"]:
        per_type_raw.setdefault(rec["type"], []).append(round(rec["latency_ms"], 6))
    types_seen = sorted(per_type_raw)

    # Only raw arrays are emitted here; no percentile statistics are
    # computed in this script.
    summary = {
        "hostname": os.environ.get("HOSTNAME", "unknown"),
        "sandbox_ttfe_ms": results["ttfe_ms"],
        "sandbox_total_cel_ms": iteration_totals,
        "sandbox_import_cel_ms": import_elapsed_ms,
        "sandbox_rss_start_mb": results["rss_mb_start"],
        "sandbox_rss_end_mb": results["rss_mb_end"],
        "sandbox_rss_growth_mb": round(results["rss_mb_end"] - results["rss_mb_start"], 6),
        "sample_count": SAMPLE_COUNT,
        "sample_warmup": SAMPLE_WARMUP,
        "total_iterations": len(iteration_totals),
        "task_types": len(types_seen) + (1 if import_task else 0),
    }

    for t in types_seen:
        summary[f"sandbox_{t}_cel_ms"] = per_type_raw[t]

    print(json.dumps(summary))

    with open("/tmp/benchmark_results.json", "w") as f:
        json.dump(results, f)

    return summary


if __name__ == "__main__":
    run_benchmark()
+ +Measures the cost of returning large "Observation" payloads from a gVisor +sandbox back to the Orchestrator via the real data path: + stdout → code_execution_result.output → orchestrator HTTP response. + +For a given PAYLOAD_SIZE_MB, the script: + 1. Generates a payload of that size (os.urandom + base64) + 2. Measures generation, serialization, and stdout-write times separately + 3. Repeats for PAYLOAD_ITERATIONS to compute stable percentiles + 4. On the final iteration, writes the actual payload to stdout (measuring + real end-to-end transfer); other iterations write to /dev/null to + measure write-syscall cost without flooding the return channel. + 5. Emits a JSON summary to stderr (parsed by main.py) + +Metrics are split so that pass/fail thresholds can exclude generation +time (os.urandom), which is not part of data transfer. + +Environment variables (injected by the agent): + PAYLOAD_SIZE_MB — target payload size in megabytes (default: 1) + PAYLOAD_ITERATIONS — number of transfer iterations (default: 20) +""" + +import base64 +import json +import os +import resource +import sys +import time + +PAYLOAD_SIZE_MB = float(os.environ.get("PAYLOAD_SIZE_MB") or "1") +PAYLOAD_ITERATIONS = int(os.environ.get("PAYLOAD_ITERATIONS") or "20") + + +# Use stderr for all diagnostic/metric output so stdout is reserved for +# the actual payload transfer (the measured data path). 
+def _log(msg): + print(msg, file=sys.stderr, flush=True) + + +_log(f"PAYLOAD_SIZE_MB: {PAYLOAD_SIZE_MB}") +_log(f"PAYLOAD_ITERATIONS: {PAYLOAD_ITERATIONS}") + + +def get_rss_mb(): + """Get current RSS memory in MB.""" + return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024 + + +def _percentile(sorted_vals, pct): + """Return the value at the given percentile from a pre-sorted list.""" + if not sorted_vals: + return 0.0 + idx = int(len(sorted_vals) * pct) + return sorted_vals[min(idx, len(sorted_vals) - 1)] + + +def _stats_for(latencies): + """Compute mean/p50/p95/p99/min/max for a list of latencies (ms).""" + latencies.sort() + return { + "mean": round(sum(latencies) / len(latencies), 6), + "p50": round(latencies[len(latencies) // 2], 6), + "p95": round(_percentile(latencies, 0.95), 6), + "p99": round(_percentile(latencies, 0.99), 6), + "min": round(latencies[0], 6), + "max": round(latencies[-1], 6), + } + + +def run_benchmark(): + """Execute the payload transfer benchmark and print JSON results.""" + target_bytes = int(PAYLOAD_SIZE_MB * 1024 * 1024) + rss_start = get_rss_mb() + + generation_times = [] + serialization_times = [] + stdout_times = [] # stdout write syscall time + transfer_times = [] # serialize + stdout write (the threshold metric) + throughputs = [] # MB/s based on stdout write time + + # --- Warmup (2 iterations, not recorded) --- + for _ in range(2): + raw = os.urandom(target_bytes) + _ = base64.b64encode(raw).decode("ascii") + + # --- Measured iterations --- + for i in range(PAYLOAD_ITERATIONS): + # 1. Generate payload (os.urandom — NOT data transfer) + t0 = time.perf_counter() + raw = os.urandom(target_bytes) + t_gen = time.perf_counter() + + # 2. Serialize (base64 encode — mirrors real observation encoding) + encoded = base64.b64encode(raw).decode("ascii") + t_ser = time.perf_counter() + + # 3. Transfer — write payload to stdout (the real sandbox→orchestrator path). 
+ # Only the final iteration writes to actual stdout to measure real + # end-to-end transfer without flooding the return channel. + # Other iterations write to /dev/null (same gVisor write-syscall path, + # data discarded by host kernel). + t_xfer_start = time.perf_counter() + if i == PAYLOAD_ITERATIONS - 1: + sys.stdout.write(encoded) + sys.stdout.flush() + else: + with open("/dev/null", "w") as devnull: + devnull.write(encoded) + t_xfer = time.perf_counter() + + gen_ms = (t_gen - t0) * 1000 + ser_ms = (t_ser - t_gen) * 1000 + stdout_ms = (t_xfer - t_xfer_start) * 1000 + transfer_ms = ser_ms + stdout_ms # excludes generation + + generation_times.append(gen_ms) + serialization_times.append(ser_ms) + stdout_times.append(stdout_ms) + transfer_times.append(transfer_ms) + + # Throughput in MB/s (based on encoded size and stdout write time) + encoded_size_mb = len(encoded) / (1024 * 1024) + if stdout_ms > 0: + throughputs.append(encoded_size_mb / (stdout_ms / 1000)) + + rss_end = get_rss_mb() + + # Compute stats + gen_stats = _stats_for(generation_times) + ser_stats = _stats_for(serialization_times) + stdout_stats = _stats_for(stdout_times) + transfer_stats = _stats_for(transfer_times) + throughput_stats = _stats_for(throughputs) if throughputs else {} + + # Payload metadata + encoded_size_bytes = len(base64.b64encode(os.urandom(target_bytes))) + + summary = { + "hostname": os.environ.get("HOSTNAME", "unknown"), + # Payload config + "sandbox_payload_size_bytes": target_bytes, + "sandbox_payload_encoded_size_bytes": encoded_size_bytes, + "sandbox_payload_iterations": PAYLOAD_ITERATIONS, + # Generation time (os.urandom — NOT data transfer, excluded from threshold) + "sandbox_generation_time_mean_ms": gen_stats["mean"], + "sandbox_generation_time_p50_ms": gen_stats["p50"], + "sandbox_generation_time_p95_ms": gen_stats["p95"], + "sandbox_generation_time_p99_ms": gen_stats["p99"], + "sandbox_generation_time_min_ms": gen_stats["min"], + "sandbox_generation_time_max_ms": 
gen_stats["max"], + # Serialization time (base64 encode — CPU bound) + "sandbox_serialization_time_mean_ms": ser_stats["mean"], + "sandbox_serialization_time_p50_ms": ser_stats["p50"], + "sandbox_serialization_time_p95_ms": ser_stats["p95"], + "sandbox_serialization_time_p99_ms": ser_stats["p99"], + "sandbox_serialization_time_min_ms": ser_stats["min"], + "sandbox_serialization_time_max_ms": ser_stats["max"], + # Stdout write time (the raw write-syscall through gVisor) + "sandbox_stdout_time_mean_ms": stdout_stats["mean"], + "sandbox_stdout_time_p50_ms": stdout_stats["p50"], + "sandbox_stdout_time_p95_ms": stdout_stats["p95"], + "sandbox_stdout_time_p99_ms": stdout_stats["p99"], + "sandbox_stdout_time_min_ms": stdout_stats["min"], + "sandbox_stdout_time_max_ms": stdout_stats["max"], + # Transfer time (serialization + stdout write — the threshold metric) + "sandbox_transfer_time_mean_ms": transfer_stats["mean"], + "sandbox_transfer_time_p50_ms": transfer_stats["p50"], + "sandbox_transfer_time_p95_ms": transfer_stats["p95"], + "sandbox_transfer_time_p99_ms": transfer_stats["p99"], + "sandbox_transfer_time_min_ms": transfer_stats["min"], + "sandbox_transfer_time_max_ms": transfer_stats["max"], + # Throughput (MB/s based on transfer write time) + "sandbox_throughput_mean_mbps": throughput_stats.get("mean"), + "sandbox_throughput_p50_mbps": throughput_stats.get("p50"), + "sandbox_throughput_min_mbps": throughput_stats.get("min"), + # RSS + "sandbox_rss_start_mb": rss_start, + "sandbox_rss_end_mb": rss_end, + "sandbox_rss_growth_mb": rss_end - rss_start, + } + + # Emit JSON summary to stderr for diagnostics. + _log("---BENCHMARK_RESULT_JSON---") + _log(json.dumps(summary, indent=2)) + + # Also emit to stdout (after the payload data) so that + # _parse_sandbox_json() can find it in code_execution_result.output. + # ADK only captures stdout, not stderr. 
#!/usr/bin/env python3
"""Trivial in-sandbox probe for the UC-F (Scheduling Throughput) QPS benchmark.

Executed inside the GKE Agent Sandbox to validate claim readiness: it runs
one cheap computation and prints a single JSON status line. The primary TTFE
measurement is the orchestrator-side timing (orchestrator_total_ms), which
spikes once the warm pool drains because fresh pods must be cold-started;
the time reported here covers only the computation below.
"""
import json
import time

start = time.perf_counter()

# A cheap, deterministic computation: enough to prove the sandbox executes code.
result = sum(range(10_000))

elapsed_ms = 1000 * (time.perf_counter() - start)

# Assemble the report first so the emitted fields are easy to scan.
report = {
    "sandbox_status": "ok",
    "sandbox_qps_exec_ms": round(elapsed_ms, 3),
    "sandbox_compute_result": result,
}
print(json.dumps(report))
+# =========================================================================== + +_shared_flags: &shared_flags + # --- Cluster creation flags --- + gke_additional_flags: + - "--enable-pod-snapshots" + - "--enable-dataplane-v2" + - "--enable-private-nodes" + - "--enable-ip-alias" + - "--master-ipv4-cidr=172.16.0.0/28" + gke_additional_nodepool_flags: + - "--max-pods-per-node=250" + container_cluster_version: "1.35.5-gke.1057002" + gke_enable_shielded_nodes: false + gce_subnet_region: "us-central1" + + # --- Agentic workload flags --- + k8s_namespace: "agentic" + agent_sandbox_version: "v0.4.6" + k8s_gvisor: true + k8s_agent_api_url: "http://localhost:8080" + +_shared_cluster: &shared_cluster + cloud: GCP + type: Kubernetes + vm_count: 1 + vm_spec: + GCP: + machine_type: c4-standard-8 + zone: us-central1-a + boot_disk_type: hyperdisk-balanced + boot_disk_size: 50 + nodepools: + sandbox: + vm_count: 1 + vm_spec: + GCP: + machine_type: c4-standard-8 + zone: us-central1-a + boot_disk_type: hyperdisk-balanced + boot_disk_size: 100 + sandbox_config: + type: gvisor + +_shared_registry: &shared_registry + cloud: GCP + spec: + GCP: + zone: us-central1-a + + +_shared_container_specs: &shared_container_specs + adk_agent: + image: agentic/adk-agent + +# =========================================================================== +# Benchmark definitions (each references the shared anchors above) +# =========================================================================== + +k8s_python_density: + flags: + <<: *shared_flags + container_registry: + <<: *shared_registry + container_specs: + <<: *shared_container_specs + container_cluster: + <<: *shared_cluster + +k8s_chromium_density: + flags: + <<: *shared_flags + container_registry: + <<: *shared_registry + container_specs: + <<: *shared_container_specs + container_cluster: + <<: *shared_cluster + +k8s_payload: + flags: + <<: *shared_flags + container_registry: + <<: *shared_registry + container_specs: + <<: 
*shared_container_specs + container_cluster: + <<: *shared_cluster + +k8s_qps: + flags: + <<: *shared_flags + container_registry: + <<: *shared_registry + container_specs: + <<: *shared_container_specs + container_cluster: + <<: *shared_cluster + +k8s_snapshot: + flags: + <<: *shared_flags + container_registry: + <<: *shared_registry + container_specs: + <<: *shared_container_specs + container_cluster: + <<: *shared_cluster + +k8s_warmpool: + flags: + <<: *shared_flags + container_registry: + <<: *shared_registry + container_specs: + <<: *shared_container_specs + container_cluster: + <<: *shared_cluster + +k8s_deletion: + flags: + <<: *shared_flags + container_registry: + <<: *shared_registry + container_specs: + <<: *shared_container_specs + container_cluster: + <<: *shared_cluster diff --git a/perfkitbenchmarker/data/k8s_agents/manifests/adk-agent.yaml.j2 b/perfkitbenchmarker/data/k8s_agents/manifests/adk-agent.yaml.j2 new file mode 100644 index 0000000000..068b50be11 --- /dev/null +++ b/perfkitbenchmarker/data/k8s_agents/manifests/adk-agent.yaml.j2 @@ -0,0 +1,118 @@ +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: adk-agent-sa + namespace: {{ ns }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: adk-agent-sandbox-role +rules: + - apiGroups: ["agents.x-k8s.io"] + resources: ["sandboxes"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] + - apiGroups: ["agents.x-k8s.io"] + resources: ["sandboxwarmpool", "sandboxwarmpools"] + verbs: ["get", "list", "watch"] + - apiGroups: ["extensions.agents.x-k8s.io"] + resources: ["sandboxclaims"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] + - apiGroups: [""] + resources: ["pods", "pods/log", "pods/exec", "services", "configmaps"] + verbs: ["get", "list", "watch"] + - apiGroups: [""] + resources: ["pods/portforward"] + verbs: ["create"] + - apiGroups: ["metrics.k8s.io"] + resources: ["pods"] + verbs: ["get", "list"] +--- 
+apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: adk-agent-sandbox-binding + namespace: {{ ns }} +subjects: + - kind: ServiceAccount + name: adk-agent-sa + namespace: {{ ns }} +roleRef: + kind: ClusterRole + name: adk-agent-sandbox-role + apiGroup: rbac.authorization.k8s.io +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: adk-agent + namespace: {{ ns }} +spec: + replicas: 1 + selector: + matchLabels: + app: adk-agent + template: + metadata: + labels: + app: adk-agent + spec: + serviceAccountName: adk-agent-sa + containers: + - name: adk-agent + imagePullPolicy: Always + image: {{ adk_image }} + resources: + limits: + memory: "16384Mi" + cpu: "6000m" + requests: + memory: "512Mi" + cpu: "1000m" + ports: + - containerPort: 8080 + livenessProbe: + httpGet: + path: /healthz + port: 8080 + initialDelaySeconds: 15 + periodSeconds: 30 + timeoutSeconds: 10 + failureThreshold: 6 + readinessProbe: + httpGet: + path: /healthz + port: 8080 + initialDelaySeconds: 5 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 + env: + - name: PORT + value: "8080" + - name: GOOGLE_CLOUD_PROJECT + value: "{{ project }}" + - name: GOOGLE_CLOUD_LOCATION + value: "{{ region }}" + - name: GOOGLE_GENAI_USE_VERTEXAI + value: "true" + - name: CLUSTER_NAME + value: "{{ cluster }}" + - name: AGENTIC_NAMESPACE + value: "{{ ns }}" + - name: SANDBOX_ROUTER_URL + value: "http://sandbox-router-svc.{{ ns }}.svc.cluster.local:8080" +--- +apiVersion: v1 +kind: Service +metadata: + name: adk-agent + namespace: {{ ns }} +spec: + type: ClusterIP + ports: + - port: 80 + targetPort: 8080 + selector: + app: adk-agent diff --git a/perfkitbenchmarker/data/k8s_agents/manifests/psi-reader.yaml.j2 b/perfkitbenchmarker/data/k8s_agents/manifests/psi-reader.yaml.j2 new file mode 100644 index 0000000000..d76f851e95 --- /dev/null +++ b/perfkitbenchmarker/data/k8s_agents/manifests/psi-reader.yaml.j2 @@ -0,0 +1,56 @@ +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: 
+ name: psi-reader + namespace: {{ ns }} + labels: + app: psi-reader +spec: + selector: + matchLabels: + app: psi-reader + template: + metadata: + labels: + app: psi-reader + spec: + nodeSelector: + pkb_nodepool: sandbox + tolerations: + - key: "sandbox.gke.io/runtime" + operator: "Equal" + value: "gvisor" + effect: "NoSchedule" + - key: "dedicated" + operator: "Equal" + value: "agentic-sandbox" + effect: "NoSchedule" + hostPID: true + containers: + - name: reader + image: busybox:1.36 + command: ["sleep", "infinity"] + securityContext: + privileged: true + volumeMounts: + - name: cgroup + mountPath: /host/sys/fs/cgroup + readOnly: true + - name: proc + mountPath: /host/proc + readOnly: true + resources: + requests: + cpu: "10m" + memory: "16Mi" + limits: + cpu: "50m" + memory: "32Mi" + volumes: + - name: cgroup + hostPath: + path: /sys/fs/cgroup + - name: proc + hostPath: + path: /proc diff --git a/perfkitbenchmarker/data/k8s_agents/manifests/sandbox-router.yaml.j2 b/perfkitbenchmarker/data/k8s_agents/manifests/sandbox-router.yaml.j2 new file mode 100644 index 0000000000..0d0541cfe7 --- /dev/null +++ b/perfkitbenchmarker/data/k8s_agents/manifests/sandbox-router.yaml.j2 @@ -0,0 +1,69 @@ +--- +apiVersion: v1 +kind: Service +metadata: + name: sandbox-router-svc + namespace: {{ ns }} +spec: + type: ClusterIP + selector: + app: sandbox-router + ports: + - name: http + protocol: TCP + port: 8080 + targetPort: 8080 +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: sandbox-router-deployment + namespace: {{ ns }} +spec: + replicas: 2 + selector: + matchLabels: + app: sandbox-router + template: + metadata: + labels: + app: sandbox-router + spec: + serviceAccountName: adk-agent-sa + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: topology.kubernetes.io/zone + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app: sandbox-router + containers: + - name: router + image: {{ router_image }} + ports: + - containerPort: 8080 + env: + - name: 
ALLOW_UNAUTHENTICATED_ROUTER + value: "true" + readinessProbe: + httpGet: + path: /healthz + port: 8080 + initialDelaySeconds: 5 + periodSeconds: 5 + livenessProbe: + httpGet: + path: /healthz + port: 8080 + initialDelaySeconds: 10 + periodSeconds: 10 + resources: + requests: + cpu: "250m" + memory: "512Mi" + limits: + cpu: "1000m" + memory: "1Gi" + securityContext: + runAsUser: 1000 + runAsGroup: 1000 diff --git a/perfkitbenchmarker/data/k8s_agents/manifests/sandbox-templates.yaml.j2 b/perfkitbenchmarker/data/k8s_agents/manifests/sandbox-templates.yaml.j2 new file mode 100644 index 0000000000..e9af43332d --- /dev/null +++ b/perfkitbenchmarker/data/k8s_agents/manifests/sandbox-templates.yaml.j2 @@ -0,0 +1,103 @@ +--- +apiVersion: extensions.agents.x-k8s.io/v1alpha1 +kind: SandboxTemplate +metadata: + name: python-sandbox-template + namespace: {{ ns }} +spec: + podTemplate: + metadata: + labels: + sandbox: python-sandbox-example + spec: + runtimeClassName: gvisor + containers: + - name: python-runtime + image: {{ python_image }} + nodeSelector: + pkb_nodepool: sandbox + tolerations: + - key: "sandbox.gke.io/runtime" + operator: "Equal" + value: "gvisor" + effect: "NoSchedule" + - key: "dedicated" + operator: "Equal" + value: "agentic-sandbox" + effect: "NoSchedule" + restartPolicy: "OnFailure" +--- +apiVersion: extensions.agents.x-k8s.io/v1alpha1 +kind: SandboxWarmPool +metadata: + name: python-sandbox-warmpool + namespace: {{ ns }} +spec: + replicas: {{ warmpool_replicas }} + sandboxTemplateRef: + name: python-sandbox-template +--- +apiVersion: extensions.agents.x-k8s.io/v1alpha1 +kind: SandboxTemplate +metadata: + name: chromium-sandbox-template + namespace: {{ ns }} +spec: + podTemplate: + metadata: + labels: + sandbox: chromium-sandbox-example + spec: + runtimeClassName: gvisor + containers: + - name: chromium-runtime + image: {{ chromium_image }} + command: ["/bin/sh", "-c"] + args: + - | + socat TCP-LISTEN:9223,fork,reuseaddr TCP:127.0.0.1:9222 & + exec 
chromium --headless --no-sandbox --disable-gpu --disable-dev-shm-usage --remote-debugging-port=9222 --no-first-run --disable-field-trial-config --user-data-dir=/tmp/chrome-data about:blank + ports: + - containerPort: 9223 + nodeSelector: + pkb_nodepool: sandbox + tolerations: + - key: "sandbox.gke.io/runtime" + operator: "Equal" + value: "gvisor" + effect: "NoSchedule" + - key: "dedicated" + operator: "Equal" + value: "agentic-sandbox" + effect: "NoSchedule" + restartPolicy: "OnFailure" +--- +apiVersion: extensions.agents.x-k8s.io/v1alpha1 +kind: SandboxWarmPool +metadata: + name: chromium-sandbox-warmpool + namespace: {{ ns }} +spec: + replicas: {{ chromium_replicas }} + sandboxTemplateRef: + name: chromium-sandbox-template +--- +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: allow-orchestrator-to-chromium + namespace: {{ ns }} +spec: + podSelector: + matchLabels: + sandbox: chromium-sandbox-example + policyTypes: + - Ingress + ingress: + - from: + - podSelector: + matchLabels: + app: adk-agent + ports: + - protocol: TCP + port: 9223 diff --git a/perfkitbenchmarker/data/k8s_agents/manifests/snapshot-crds.yaml.j2 b/perfkitbenchmarker/data/k8s_agents/manifests/snapshot-crds.yaml.j2 new file mode 100644 index 0000000000..afc4e0ee4c --- /dev/null +++ b/perfkitbenchmarker/data/k8s_agents/manifests/snapshot-crds.yaml.j2 @@ -0,0 +1,24 @@ +--- +apiVersion: podsnapshot.gke.io/v1 +kind: PodSnapshotStorageConfig +metadata: + name: benchmark-pssc-gcs +spec: + snapshotStorageConfig: + gcs: + bucket: "{{ bucket_name }}" + path: "{{ snapshot_folder }}" +--- +apiVersion: podsnapshot.gke.io/v1 +kind: PodSnapshotPolicy +metadata: + name: benchmark-psp + namespace: {{ ns }} +spec: + storageConfigName: benchmark-pssc-gcs + selector: + matchLabels: + app: snapshot-benchmark-workload + triggerConfig: + type: manual + postCheckpoint: resume diff --git a/perfkitbenchmarker/data/k8s_agents/manifests/snapshot-sandbox-template.yaml.j2 
b/perfkitbenchmarker/data/k8s_agents/manifests/snapshot-sandbox-template.yaml.j2 new file mode 100644 index 0000000000..11850eb444 --- /dev/null +++ b/perfkitbenchmarker/data/k8s_agents/manifests/snapshot-sandbox-template.yaml.j2 @@ -0,0 +1,46 @@ +--- +apiVersion: extensions.agents.x-k8s.io/v1alpha1 +kind: SandboxTemplate +metadata: + name: {{ template_name }} + namespace: {{ namespace }} +spec: + podTemplate: + metadata: + labels: + app: snapshot-benchmark-workload + spec: + serviceAccountName: {{ ksa_name }} + runtimeClassName: gvisor + containers: + - name: preloader + image: python:3.11-slim + command: ["python3", "-c"] + args: + - | + import time, os + preload_mb = int(os.environ.get("PRELOAD_MB", "10")) + print(f"Preloading {preload_mb} MB of memory...", flush=True) + _ballast = bytearray(preload_mb * 1024 * 1024) + print(f"Preload complete. Starting counter.", flush=True) + i = 0 + while True: + print(f"Count: {i}", flush=True) + i += 1 + time.sleep(1) + env: + - name: PRELOAD_MB + value: "{{ preload_mb }}" + resources: + requests: + cpu: "250m" + memory: "{{ memory_mi }}Mi" + ephemeral-storage: "512Mi" + nodeSelector: + pkb_nodepool: sandbox + tolerations: + - key: "sandbox.gke.io/runtime" + operator: "Equal" + value: "gvisor" + effect: "NoSchedule" + restartPolicy: "OnFailure" diff --git a/perfkitbenchmarker/data/k8s_agents/workloads/vibe_coding/README.md b/perfkitbenchmarker/data/k8s_agents/workloads/vibe_coding/README.md new file mode 100644 index 0000000000..86b33c8486 --- /dev/null +++ b/perfkitbenchmarker/data/k8s_agents/workloads/vibe_coding/README.md @@ -0,0 +1,64 @@ +# Vibe Coding Startup Scripts + +Pluggable startup scripts for the UC-A snapshot saturation harness (`sweeps/snapshot_saturation_search.py`). Each script simulates a realistic "vibe coding" sandbox cold-start — the kind of environment setup that happens when an AI coding agent provisions a new sandbox for a user. 
+ +## How It Works + +When `--preload_mode=script:<path>` is passed to the sweep harness: + +1. The script is read from disk and embedded into the pod's container entrypoint +2. The pod runs the script to completion (installs packages, starts services, etc.) +3. After the script exits 0, the harness prints `SCRIPT_READY` and starts a counter loop +4. **TTFE** is measured as the total time from SandboxClaim creation to `SCRIPT_READY` + +This lets you compare cold-start TTFE (full script execution) against snapshot/restore TTFE (resuming from a pre-snapshotted state where the script already ran). + +## Scripts + +### startup_pip_fastapi.sh + +**Lightweight Python variant.** Runs natively in the `python:3.11-slim` base image. + +Steps: `pip install fastapi uvicorn` → create app → start uvicorn → wait for first HTTP response. + +Typical cold-start: ~5–8s on GKE with fast network. + +```bash +# Cold-start only +python sweeps/snapshot_saturation_search.py \ + --skip_snapshot \ + --preload_mode=script:workloads/vibe_coding/startup_pip_fastapi.sh \ + --burst_size=3 --search_mode=binary --search_min=10 --search_max=30 \ + --ttfe_threshold_s=20 + +# With snapshot/restore (shows restore speedup vs cold-start) +python sweeps/snapshot_saturation_search.py \ + --preload_mode=script:workloads/vibe_coding/startup_pip_fastapi.sh \ + --burst_size=3 --search_mode=binary --search_min=10 --search_max=30 \ + --ttfe_threshold_s=20 --restore_threshold_s=10 +``` + +### startup_npm_vite.sh + +**Heavier Node.js variant.** Installs Node.js + npm from apt, then npm-installs Vite and starts a dev server. + +Steps: `apt-get install nodejs npm` → `npm install vite` → start Vite dev server → wait for first page served. + +Typical cold-start: ~30–60s (apt + npm on cold cache).
#!/bin/bash
# Vibe Coding Startup Script — npm + Vite dev server
#
# Simulates a typical agentic sandbox "vibe coding" cold-start:
#   1. Install Node.js + npm (apt) and the project's npm dependencies
#   2. Start a Vite dev server
#   3. Wait for the server to be ready (first page served)
#
# This script is designed to run inside the sandbox container (python:3.11-slim).
# It installs Node.js + npm + dependencies from scratch to measure realistic
# cold-start latency including package installation.
#
# Usage (cold-start only):
#   python sweeps/snapshot_saturation_search.py \
#     --skip_snapshot \
#     --preload_mode=script:workloads/vibe_coding/startup_npm_vite.sh \
#     --burst_size=3 \
#     --search_mode=binary --search_min=10 --search_max=30 \
#     --ttfe_threshold_s=120
#
# Usage (with snapshot/restore):
#   python sweeps/snapshot_saturation_search.py \
#     --preload_mode=script:workloads/vibe_coding/startup_npm_vite.sh \
#     --burst_size=3 \
#     --search_mode=binary --search_min=10 --search_max=30 \
#     --ttfe_threshold_s=120 --restore_threshold_s=10
#
# NOTE: --search_min/--search_max control the PRELOAD_MB env var passed to
# the container; in script mode this is unused by the script itself but
# varies memory requests to test different resource pressure levels.

# -e stops at the first failing command. pipefail is needed as well: without
# it `npm install ... | tail -5` reports tail's exit status, so a failed
# install would go unnoticed until the server never comes up.
set -eo pipefail

echo "[vibe-coding] Installing Node.js..."
apt-get update -qq && apt-get install -y -qq nodejs npm > /dev/null 2>&1

echo "[vibe-coding] Creating project scaffold..."
mkdir -p /tmp/vibe-project && cd /tmp/vibe-project

# Create a minimal package.json with Vite
cat > package.json << 'EOF'
{
  "name": "vibe-sandbox",
  "private": true,
  "scripts": {
    "dev": "vite --host 0.0.0.0 --port 5173"
  },
  "dependencies": {
    "vite": "^5.0.0"
  }
}
EOF

# Create minimal index.html for Vite to serve
cat > index.html << 'EOF'
<!DOCTYPE html>
<html>
  <head>
    <title>Vibe</title>
  </head>
  <body>
    <div id="app">Ready</div>
  </body>
</html>
EOF

echo "[vibe-coding] Installing npm dependencies..."
npm install --prefer-offline 2>&1 | tail -5

echo "[vibe-coding] Starting Vite dev server..."
npx vite --host 0.0.0.0 --port 5173 &
VITE_PID=$!

echo "[vibe-coding] Waiting for server to be ready..."
SERVER_URL="http://localhost:5173/"
MAX_WAIT=60
ELAPSED=0
# Probe with Python's urllib instead of curl: curl is not among the packages
# this script installs and may be absent from the slim base image, whereas the
# Python interpreter is part of it (startup_pip_fastapi.sh probes the same way).
# The 2s timeout keeps a hung connection from stalling the loop.
while ! python -c "import urllib.request; urllib.request.urlopen('${SERVER_URL}', timeout=2)" 2>/dev/null; do
  # Fail fast if the server process already exited instead of polling until
  # the deadline.
  if ! kill -0 "$VITE_PID" 2>/dev/null; then
    echo "[vibe-coding] ERROR: Vite dev server exited before becoming ready"
    exit 1
  fi
  sleep 1
  ELAPSED=$((ELAPSED + 1))
  if [ "$ELAPSED" -ge "$MAX_WAIT" ]; then
    echo "[vibe-coding] ERROR: Server did not start within ${MAX_WAIT}s"
    # Do not leave the background server behind on the failure path.
    kill "$VITE_PID" 2>/dev/null || true
    exit 1
  fi
done

echo "[vibe-coding] First page served successfully (${ELAPSED}s)"

# Kill the vite server — we only needed to measure startup time
kill "$VITE_PID" 2>/dev/null || true
#
# Usage (cold-start only):
#   python sweeps/snapshot_saturation_search.py \
#     --skip_snapshot \
#     --preload_mode=script:workloads/vibe_coding/startup_pip_fastapi.sh \
#     --burst_size=3 \
#     --search_mode=binary --search_min=10 --search_max=30 \
#     --ttfe_threshold_s=20
#
# Usage (with snapshot/restore):
#   python sweeps/snapshot_saturation_search.py \
#     --preload_mode=script:workloads/vibe_coding/startup_pip_fastapi.sh \
#     --burst_size=3 \
#     --search_mode=binary --search_min=10 --search_max=30 \
#     --ttfe_threshold_s=20 --restore_threshold_s=10
#
# NOTE: --search_min/--search_max control the PRELOAD_MB env var passed to
# the container; in script mode this is unused by the script itself but
# varies memory requests to test different resource pressure levels.

# -e stops at the first failing command. pipefail is needed as well: without
# it `pip install ... | tail -3` reports tail's exit status, so a failed
# install would only surface later as a server that never starts.
set -eo pipefail

echo "[vibe-coding] Installing Python packages..."
pip install --quiet fastapi uvicorn 2>&1 | tail -3

echo "[vibe-coding] Creating app..."
cat > /tmp/app.py << 'EOF'
from fastapi import FastAPI
app = FastAPI()

@app.get("/")
def root():
    return {"status": "ready"}
EOF

echo "[vibe-coding] Starting uvicorn server..."
python -m uvicorn app:app --host 0.0.0.0 --port 8000 --app-dir /tmp &
SERVER_PID=$!

echo "[vibe-coding] Waiting for server to be ready..."
MAX_WAIT=30
ELAPSED=0
# The 2s timeout keeps a hung connection from stalling the polling loop.
while ! python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/', timeout=2)" 2>/dev/null; do
  # Fail fast if uvicorn already exited (e.g. import error) instead of
  # polling until the deadline.
  if ! kill -0 "$SERVER_PID" 2>/dev/null; then
    echo "[vibe-coding] ERROR: uvicorn exited before becoming ready"
    exit 1
  fi
  sleep 1
  ELAPSED=$((ELAPSED + 1))
  if [ "$ELAPSED" -ge "$MAX_WAIT" ]; then
    echo "[vibe-coding] ERROR: Server did not start within ${MAX_WAIT}s"
    # Do not leave the background server behind on the failure path.
    kill "$SERVER_PID" 2>/dev/null || true
    exit 1
  fi
done

echo "[vibe-coding] First request served successfully (${ELAPSED}s)"

# Kill the server — we only needed to measure startup time
kill "$SERVER_PID" 2>/dev/null || true
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/gke_deploy_utils.py b/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/gke_deploy_utils.py new file mode 100644 index 0000000000..b2d31e026b --- /dev/null +++ b/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/gke_deploy_utils.py @@ -0,0 +1,497 @@ +"""Shared workload deployment utilities for GKE Agent Sandbox benchmarks. + +Provides idempotent functions to deploy the Agent Sandbox ecosystem +(CRDs, templates, warm pools, router, ADK agent, PSI reader) onto a +pre-provisioned GKE cluster. Called by each benchmark's Prepare() stage. + +All functions are idempotent -- safe to call repeatedly without side effects. +""" + +import logging +import os + +from absl import flags +from jinja2 import Template +from perfkitbenchmarker import data +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.resources.container_service import kubectl + +FLAGS = flags.FLAGS + +# --------------------------------------------------------------------------- +# Flags (registered once; shared across all benchmarks) +# --------------------------------------------------------------------------- + +flags.DEFINE_string( + "agent_sandbox_version", + "v0.4.6", + "Agent Sandbox controller version (GitHub release tag).", +) + +flags.DEFINE_string( + "agent_sandbox_router_image", + "", + "Sandbox router container image. If empty, router deployment is skipped.", +) + +flags.DEFINE_string( + "k8s_agent_image", + "", + "ADK agent container image. 
def _LoadTemplate(template_name):
  """Read a manifest from the data directory and wrap it in a Jinja2 Template.

  Args:
    template_name: File name of the template inside _MANIFESTS_DIR.

  Returns:
    A jinja2 Template built from the file's contents.
  """
  manifests_root = data.ResourcePath(_MANIFESTS_DIR)
  with open(os.path.join(manifests_root, template_name), "r") as template_file:
    source = template_file.read()
  return Template(source)
def _DeriveImagePaths(project, region, arch):
  """Build the container image references used by the agentic workloads.

  Every image lives under the regional registry host
  "<region>-docker.pkg.dev/<project>" and is tagged with the architecture.

  Args:
    project: GCP project ID.
    region: GCP region (e.g. us-central1).
    arch: Docker platform architecture (amd64 or arm64); used as the tag.

  Returns:
    Dict with keys adk_agent, sandbox_router and chromium (in that order),
    each mapped to its full image reference.
  """
  registry = f"{region}-docker.pkg.dev/{project}"
  # (result key, repository/image path below the registry)
  image_locations = (
      ("adk_agent", "adk-repo/adk-agent"),
      ("sandbox_router", "agent-sandbox/sandbox-router"),
      ("chromium", "agent-sandbox/chrome-sandbox"),
  )
  return {
      key: f"{registry}/{location}:{arch}" for key, location in image_locations
  }
Wait for ADK Agent rollout + """ + ns = FLAGS.k8s_namespace + logging.info("=== DeployWorkloads: namespace=%s ===", ns) + + # Derive project, region, machine_type, cluster_name from benchmark_spec + project = "" + region = "" + machine_type = "" + cluster_name = "" + cluster = None + if benchmark_spec: + cluster = getattr(benchmark_spec, 'container_cluster', None) + if cluster: + project = getattr(cluster, 'project', '') or '' + zone = getattr(cluster, 'zone', '') or '' + region = zone[:-2] if zone else '' + cluster_name = getattr(cluster, 'name', '') or '' + # Prefer sandbox nodepool machine_type + nodepools = getattr(cluster, 'nodepools', None) + if nodepools and isinstance(nodepools, dict): + sandbox_pool = nodepools.get('sandbox') + if sandbox_pool and hasattr(sandbox_pool, 'vm_spec'): + machine_type = getattr(sandbox_pool.vm_spec, 'machine_type', '') or '' + if not machine_type and hasattr(cluster, 'vm_spec'): + machine_type = getattr(cluster.vm_spec, 'machine_type', '') or '' + # Fallback to global FLAGS if benchmark_spec not available + if not project: + project = getattr(FLAGS, 'project', '') or '' + if not region: + zone = getattr(FLAGS, 'zone', '') or '' + region = zone[:-2] if zone else '' + + # Derive image paths for template rendering. + # Chrome and Router images are built during prerequisites + # (gke_prerequisites.py), not during Prepare. + # ADK agent image is built by PKB container_specs during Provision. + from perfkitbenchmarker.linux_benchmarks.kubernetes.agentic import ( + gke_image_build_utils, + ) + arch = FLAGS.target_arch or "amd64" + global _derived_images + _derived_images = _DeriveImagePaths(project, region, arch) + logging.info( + "DeployWorkloads: project=%s region=%s arch=%s", + project, region, arch, + ) + logging.info("_derived_images: %s", _derived_images) + + _CreateNamespace(ns) + _InstallCRDs() + _DeploySandboxTemplates(ns) + _DeploySandboxRouter(ns) + # Prefer ADK image from PKB-native container_specs (built during Provision). 
def DeploySnapshots():
    """Deploy Pod Snapshot infrastructure.

    Safe to call repeatedly: steps 1-4 are issued with
    raise_on_failure=False, so "already exists" errors do not abort the
    sequence. Sequence:
      1. Create GCS bucket (hierarchical namespace)
      2. Create managed folder
      3. Create KSA for snapshots
      4. Bind IAM roles
      5. Deploy PodSnapshotStorageConfig + PodSnapshotPolicy

    Returns early (doing nothing) when --skip_deploy_snapshots is set or
    when no project is configured.
    """
    if FLAGS.skip_deploy_snapshots:
        logging.info("Skipping snapshot infrastructure (--skip_deploy_snapshots=True).")
        return

    ns = FLAGS.k8s_namespace
    project = getattr(FLAGS, 'project', '') or ''
    if not project:
        logging.warning("DeploySnapshots: FLAGS.project not set, skipping.")
        return

    zone = getattr(FLAGS, 'zone', '') or ''
    # NOTE(review): --zone may be registered as a list flag; accept both a
    # plain string and a list so slicing never operates on the list itself.
    if isinstance(zone, (list, tuple)):
        zone = zone[0] if zone else ''
    # Keep the first two dash-separated components: "us-central1-a" and a
    # bare "us-central1" both map to "us-central1" (the previous zone[:-2]
    # turned a bare region into "us-centra").
    zone_parts = zone.split('-')
    region = '-'.join(zone_parts[:2]) if len(zone_parts) >= 2 else ''

    bucket_name = "agent-sandbox-snapshots-{}".format(project)
    snapshot_folder = "benchmark-snapshots"
    ksa_name = FLAGS.k8s_snapshot_ksa_name

    logging.info("=== DeploySnapshots: bucket=%s ===", bucket_name)

    # 1. Create GCS bucket
    create_bucket_cmd = [
        "gcloud", "storage", "buckets", "create",
        "gs://{}".format(bucket_name),
        "--uniform-bucket-level-access",
        "--enable-hierarchical-namespace",
        "--soft-delete-duration=0d",
    ]
    if region:
        create_bucket_cmd.append("--location={}".format(region))
    else:
        # An empty "--location=" is rejected by gcloud; omit the flag and
        # make the fallback visible instead of failing silently.
        logging.warning(
            "DeploySnapshots: could not derive a region from --zone; "
            "creating bucket %s without --location.",
            bucket_name,
        )
    create_bucket_cmd.append("--project={}".format(project))
    vm_util.IssueCommand(create_bucket_cmd, raise_on_failure=False)

    # 2. Create managed folder
    vm_util.IssueCommand(
        [
            "gcloud", "storage", "managed-folders", "create",
            "gs://{}/{}/".format(bucket_name, snapshot_folder),
            "--project={}".format(project),
        ],
        raise_on_failure=False,
    )

    # 3. Create KSA
    kubectl.RunKubectlCommand(
        ["create", "serviceaccount", ksa_name, "--namespace", ns],
        raise_on_failure=False,
    )

    # 4. IAM bindings (need the numeric project number for the principals)
    project_number = _GetProjectNumber(project)
    if project_number:
        _BindSnapshotIAM(bucket_name, project, project_number, ns, ksa_name)
    else:
        logging.warning(
            "DeploySnapshots: could not resolve project number for %s; "
            "skipping snapshot IAM bindings.",
            project,
        )

    # 5. Deploy PSSC + PSP
    _RenderAndApply(
        "snapshot-crds.yaml.j2",
        ns=ns,
        bucket_name=bucket_name,
        snapshot_folder=snapshot_folder,
    )

    logging.info("DeploySnapshots complete.")
def _IsRegistryImageRef(image):
    """Return True if *image* names a registry host in its first component.

    Follows the Docker reference convention: the part before the first "/"
    is a registry host only if it contains "." or ":" or is "localhost".
    """
    host, sep, _ = image.partition("/")
    if not sep:
        return False
    return "." in host or ":" in host or host == "localhost"


def _DeployADKAgent(ns, project="", region="", cluster_name="", adk_image_override=""):
    """Deploy ADK Agent: SA, ClusterRole, RoleBinding, Deployment, Service.

    Image precedence: adk_image_override, then FLAGS.k8s_agent_image, then
    the derived "adk_agent" image path. Skips deployment when none is set.

    Args:
      ns: Kubernetes namespace to render the manifests into.
      project: Project ID passed to the template (None is treated as "").
      region: Region passed to the template (None is treated as "").
      cluster_name: Cluster name passed to the template as `cluster`.
      adk_image_override: Image to use in preference to flags/derived paths.
    """
    derived = _derived_images.get("adk_agent", "")
    adk_image = adk_image_override or FLAGS.k8s_agent_image or derived

    # Validate the image looks like a registry path, not a Dockerfile path.
    # When Prepare runs separately from Provision, container_specs may not
    # have the built image path. The config YAML default (agentic/adk-agent)
    # is the Dockerfile lookup path, not a valid registry reference.
    # Any registry host is accepted (not only docker.pkg.dev) so an explicit
    # image from another registry is not silently replaced.
    if adk_image and not _IsRegistryImageRef(adk_image):
        if derived:
            logging.warning(
                "ADK image %s is not a registry path. Using derived: %s",
                adk_image, derived,
            )
            adk_image = derived
        else:
            logging.warning(
                "ADK image %s is not a registry path and no derived image "
                "is available; deploying it as-is.",
                adk_image,
            )

    if not adk_image:
        logging.info("ADK agent image not set, skipping agent deployment.")
        return

    logging.info("Using ADK image: %s", adk_image)

    _RenderAndApply(
        "adk-agent.yaml.j2",
        ns=ns,
        adk_image=adk_image,
        project=project or "",
        region=region or "",
        cluster=cluster_name or "",
    )
def _BindSnapshotIAM(bucket_name, project, project_number, ns, ksa_name):
    """Add the three bucket-level IAM bindings used for pod snapshots.

    Bindings are applied in order: bucketViewer for the namespace principal
    set, objectAdmin for the snapshot KSA, and objectUser for the GKE
    service agent. Each gcloud call uses raise_on_failure=False.
    """
    bucket_uri = "gs://{}".format(bucket_name)
    member_and_role = (
        # bucketViewer to namespace
        (
            "--member=principalSet://iam.googleapis.com/projects/{}"
            "/locations/global/workloadIdentityPools/{}.svc.id.goog"
            "/namespace/{}".format(project_number, project, ns),
            "--role=roles/storage.bucketViewer",
        ),
        # objectAdmin to KSA
        (
            "--member=principal://iam.googleapis.com/projects/{}"
            "/locations/global/workloadIdentityPools/{}.svc.id.goog"
            "/subject/ns/{}/sa/{}".format(project_number, project, ns, ksa_name),
            "--role=roles/storage.objectAdmin",
        ),
        # objectUser to GKE snapshot controller
        (
            "--member=serviceAccount:service-{}"
            "@container-engine-robot.iam.gserviceaccount.com".format(project_number),
            "--role=roles/storage.objectUser",
        ),
    )
    for member_flag, role_flag in member_and_role:
        vm_util.IssueCommand(
            [
                "gcloud", "storage", "buckets", "add-iam-policy-binding",
                bucket_uri,
                member_flag,
                role_flag,
                "--quiet",
            ],
            raise_on_failure=False,
        )
def _DetectArchitecture(machine_type, zone, project):
    """Detect CPU architecture for a GCP machine type.

    An explicit, valid --target_arch wins. Otherwise gcloud is queried for
    the machine type's architecture and the GCP name (X86_64/ARM64) is
    mapped to the Docker platform name (amd64/arm64) via _ARCH_MAP.

    Args:
      machine_type: GCP machine type name to describe.
      zone: Zone passed to gcloud as --zone.
      project: Project passed to gcloud as --project.

    Returns:
      "amd64" or "arm64". Every failure path logs a warning and falls back
      to "amd64".
    """
    # Quick exit if user provided arch explicitly
    if FLAGS.target_arch:
        arch = FLAGS.target_arch.strip().lower()
        if arch in ("amd64", "arm64"):
            logging.info("Using user-provided target_arch: %s", arch)
            return arch
        logging.warning(
            "Invalid --target_arch='%s'. Must be amd64 or arm64. "
            "Proceeding with gcloud detection.",
            FLAGS.target_arch,
        )

    if not machine_type:
        # Nothing to describe; do not spend a gcloud call on an empty name.
        logging.warning(
            "No machine type available for architecture detection. "
            "Falling back to amd64."
        )
        return "amd64"

    try:
        stdout, stderr, retcode = vm_util.IssueCommand(
            [
                "gcloud",
                "compute",
                "machine-types",
                "describe",
                machine_type,
                f"--zone={zone}",
                f"--project={project}",
                "--format=value(architecture)",
            ],
            raise_on_failure=False,
            timeout=30,
        )
    except Exception as e:  # Best-effort detection: never fail the caller.
        logging.warning(
            "gcloud machine-type describe failed: %s. Falling back to amd64.", e
        )
        return "amd64"

    gcp_arch = (stdout or "").strip().upper()
    if retcode != 0 or not gcp_arch:
        # Previously this path fell through to amd64 without any log line.
        logging.warning(
            "gcloud could not describe machine type %s (rc=%s): %s. "
            "Falling back to amd64.",
            machine_type,
            retcode,
            (stderr or "").strip()[:200],
        )
        return "amd64"

    docker_arch = _ARCH_MAP.get(gcp_arch)
    if docker_arch:
        logging.info(
            "Detected architecture for %s: %s -> %s",
            machine_type,
            gcp_arch,
            docker_arch,
        )
        return docker_arch

    logging.warning(
        "Unknown GCP architecture '%s' for %s. Falling back to amd64.",
        gcp_arch,
        machine_type,
    )
    return "amd64"
def _BuildChromeSandboxImage(project, region, target_arch, image_path):
    """Build and push the Chrome Sandbox image.

    Sparse-clones examples/chrome-sandbox from the agent-sandbox repo into a
    temporary directory, patches its Dockerfile to also install socat (CDP
    proxy), and submits the directory to Cloud Build. The temporary
    directory is always removed.

    Args:
      project: GCP project ID used for the Cloud Build submission.
      region: Unused by this function; kept for signature parity with
        _BuildSandboxRouterImage.
      target_arch: Docker architecture name forwarded to _SubmitCloudBuild.
      image_path: Full registry path (including tag) to build and push.

    Raises:
      RuntimeError: if the Dockerfile is missing from the checkout.
    """
    logger.info("Building Chrome Sandbox image: %s", image_path)

    apt_line = (
        "RUN apt-get update && apt-get install --yes --no-install-recommends chromium"
    )
    apt_line_with_socat = apt_line + " socat"

    tmp_dir = tempfile.mkdtemp(prefix="chrome-sandbox-")
    try:
        # Clone agent-sandbox repo (sparse checkout)
        logger.info("Cloning agent-sandbox chrome-sandbox source...")
        _RunCmd(
            [
                "git",
                "clone",
                "--depth",
                "1",
                "--filter=blob:none",
                "--sparse",
                "https://github.com/kubernetes-sigs/agent-sandbox.git",
                tmp_dir,
            ]
        )
        _RunCmd(
            ["git", "sparse-checkout", "set", "examples/chrome-sandbox"],
            cwd=tmp_dir,
        )

        build_dir = os.path.join(tmp_dir, "examples", "chrome-sandbox")
        dockerfile_path = os.path.join(build_dir, "Dockerfile")
        if not os.path.isfile(dockerfile_path):
            raise RuntimeError(f"chrome-sandbox Dockerfile not found at {build_dir}")

        # Patch Dockerfile: add socat for CDP proxy
        with open(dockerfile_path, "r", encoding="utf-8") as f:
            content = f.read()
        if apt_line_with_socat in content:
            logger.info("Dockerfile already installs socat; no patch needed.")
        elif apt_line in content:
            with open(dockerfile_path, "w", encoding="utf-8") as f:
                f.write(content.replace(apt_line, apt_line_with_socat))
        else:
            # The upstream install line changed; a plain str.replace would
            # silently do nothing and ship an image without socat.
            logger.warning(
                "Could not find the chromium apt-get line in %s; "
                "socat was NOT added to the image.",
                dockerfile_path,
            )

        # Submit Cloud Build (generates cloudbuild.yaml in temp dir)
        _SubmitCloudBuild(
            source_dir=build_dir,
            image_path=image_path,
            target_arch=target_arch,
            project=project,
        )

        logger.info("Chrome Sandbox image built successfully.")
    finally:
        shutil.rmtree(tmp_dir, ignore_errors=True)
found at {build_dir}") + + # Submit Cloud Build (generates cloudbuild.yaml in temp dir) + _SubmitCloudBuild( + source_dir=build_dir, + image_path=image_path, + target_arch=target_arch, + project=project, + ) + + logger.info("Sandbox Router image built successfully.") + finally: + shutil.rmtree(tmp_dir, ignore_errors=True) + + +def _SubmitCloudBuild(source_dir, image_path, target_arch, project): + """Generate a cloudbuild.yaml with substitutions and submit via Cloud Build. + + Used for Chrome and Router images (built in temp directories). + Uses the project's default Cloud Build SA. + + For cross-architecture builds (e.g. arm64 on amd64 workers), uses + QEMU emulation + Docker Buildx to produce the target-arch image. + A high-CPU machine type (E2_HIGHCPU_32) is used to offset the + overhead of QEMU instruction translation. + """ + if target_arch == "amd64": + # Native build — no emulation needed + cloudbuild_content = """steps: + - name: 'gcr.io/cloud-builders/docker' + args: ['build', '--platform', '${_PLATFORM}', '-t', '${_IMAGE_PATH}', '.'] + env: + - 'DOCKER_BUILDKIT=1' +images: + - '${_IMAGE_PATH}' +options: + logging: CLOUD_LOGGING_ONLY +substitutions: + _IMAGE_PATH: '' + _PLATFORM: 'linux/amd64' +""" + else: + # Cross-arch build — QEMU + Buildx required. + # Cloud Build workers are amd64; QEMU registers binfmt handlers + # so the kernel can execute arm64 binaries transparently. + # E2_HIGHCPU_32 provides 32 vCPUs to offset emulation overhead. + # Buildx --push handles the registry push directly, so no + # top-level 'images:' key is needed. 
def _RunCmd(cmd, cwd=None):
    """Run a command (argv list, no shell), raising on failure.

    The child inherits the caller's environment unchanged. TLS certificate
    validation for gcloud is no longer force-disabled here; export
    CLOUDSDK_AUTH_DISABLE_SSL_VALIDATION yourself if an environment truly
    requires it.

    Args:
      cmd: Command and arguments as a list of strings.
      cwd: Optional working directory for the child process.

    Returns:
      The command's captured stdout as text.

    Raises:
      RuntimeError: on non-zero exit status or when the 40-minute timeout
        expires.
    """
    log = logging.getLogger(__name__)
    printable = " ".join(cmd)
    log.info(" CMD: %s", printable)
    try:
        proc = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            cwd=cwd,
            timeout=2400,  # 40 min: allows for QEMU cross-arch builds
        )
    except subprocess.TimeoutExpired as e:
        # Surface timeouts through the same exception type as other failures.
        raise RuntimeError(
            f"Command timed out after {e.timeout}s: {printable}"
        ) from e

    if proc.returncode != 0:
        raise RuntimeError(
            f"Command failed (rc={proc.returncode}): {printable}\n"
            f"stderr: {proc.stderr[-500:]}"
        )
    return proc.stdout
def _run(cmd, check=False, timeout=300):
    """Run a cleanup command best-effort and return its CompletedProcess.

    Never raises for a failed, timed-out or unlaunchable command, so one
    failing step cannot abort the rest of the teardown. A timeout is
    reported as returncode 124 and a launch failure (e.g. gcloud not
    installed) as returncode 127, with the reason in `stderr`.

    Args:
      cmd: Command and arguments as a list of strings.
      check: If True, a non-zero exit is logged at WARNING; otherwise at
        INFO (teardown commands commonly fail on already-deleted resources).
      timeout: Seconds to wait before giving up on the command.

    Returns:
      subprocess.CompletedProcess with text stdout/stderr.
    """
    log = logging.getLogger(__name__)
    log.info("CMD: %s", " ".join(cmd))
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
    except subprocess.TimeoutExpired:
        result = subprocess.CompletedProcess(
            cmd, 124, stdout="", stderr=f"timed out after {timeout}s"
        )
    except OSError as e:
        result = subprocess.CompletedProcess(cmd, 127, stdout="", stderr=str(e))
    if result.returncode != 0:
        # Always leave a trace: previously failures were logged only when
        # check=True, which no caller passes.
        level = logging.WARNING if check else logging.INFO
        log.log(
            level,
            "Command failed (rc=%d): %s",
            result.returncode,
            (result.stderr or "")[-300:],
        )
    return result
def teardown_snapshot_bucket(project_id, region):
    """Delete the snapshot bucket `agent-sandbox-snapshots-<project_id>`.

    Objects are removed first, then the bucket itself. The outcome of the
    bucket deletion is logged from its exit status instead of assuming
    success.

    Args:
      project_id: GCP project ID; also determines the bucket name.
      region: Unused; kept so all teardown steps share a call shape.
    """
    logger.info("=== Deleting Snapshot Bucket ===")
    bucket_name = f"agent-sandbox-snapshots-{project_id}"
    # Emptying the bucket may fail harmlessly when it is already empty or
    # gone, so only the final delete decides the reported outcome.
    _run(["gcloud", "storage", "rm", f"gs://{bucket_name}/**",
          f"--project={project_id}", "--quiet"])
    result = _run(["gcloud", "storage", "buckets", "delete", f"gs://{bucket_name}",
                   f"--project={project_id}", "--quiet"])
    if result.returncode == 0:
        logger.info("Snapshot bucket deleted.")
    else:
        logger.warning(
            "Snapshot bucket gs://%s was not deleted (rc=%d): %s",
            bucket_name, result.returncode, (result.stderr or "").strip()[-300:],
        )
def main():
    """Parse command-line options and run the post-teardown steps in order.

    IAM revocation always runs; bucket and Artifact Registry cleanup can be
    skipped with --keep_bucket / --keep_images.
    """
    parser = argparse.ArgumentParser(description="GKE Agentic Benchmark Post-Teardown")
    parser.add_argument("--project_id", required=True, help="GCP project ID")
    parser.add_argument("--region", default="us-central1", help="GCP region")
    # The two opt-out switches share the same shape.
    for switch, description in (
        ("--keep_images", "Skip AR repo deletion"),
        ("--keep_bucket", "Skip snapshot bucket deletion"),
    ):
        parser.add_argument(switch, action="store_true", help=description)
    opts = parser.parse_args()

    revoke_cloudbuild_sa_permissions(opts.project_id)
    if opts.keep_bucket is False:
        teardown_snapshot_bucket(opts.project_id, opts.region)
    if opts.keep_images is False:
        teardown_images(opts.project_id, opts.region)
    print("\nPost-teardown complete!")
def _run(cmd, check=True, timeout=300):
    """Run a command (argv list, no shell) and return its CompletedProcess.

    Args:
      cmd: Command and arguments as a list of strings.
      check: If True (default), a non-zero exit or a timeout raises
        RuntimeError. If False, the failure is logged as a warning and the
        result is returned (a timeout is reported as returncode 124).
      timeout: Seconds to wait before giving up on the command.

    Returns:
      subprocess.CompletedProcess with text stdout/stderr.

    Raises:
      RuntimeError: when `check` is True and the command fails or times out.
    """
    log = logging.getLogger(__name__)
    printable = " ".join(cmd)
    log.info("CMD: %s", printable)
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
    except subprocess.TimeoutExpired as e:
        if check:
            log.error("Command timed out after %ss: %s", timeout, printable)
            raise RuntimeError(
                f"Command timed out after {timeout}s: {printable}"
            ) from e
        result = subprocess.CompletedProcess(
            cmd, 124, stdout="", stderr=f"timed out after {timeout}s"
        )

    if result.returncode != 0:
        stderr_tail = (result.stderr or "")[-500:]
        if check:
            log.error("Command failed (rc=%d): %s", result.returncode, stderr_tail)
            # Carry rc and stderr in the exception so callers that only see
            # the traceback still get the cause.
            raise RuntimeError(
                f"Command failed (rc={result.returncode}): {printable}\n"
                f"stderr: {stderr_tail}"
            )
        # check=False used to swallow the failure without a trace.
        log.warning("Command failed (rc=%d): %s", result.returncode, stderr_tail)
    return result
def grant_cloudbuild_sa_permissions(project_id):
    """Grant required IAM roles to the Cloud Build service account(s).

    Two service accounts are candidates:
      - Legacy projects: {number}@cloudbuild.gserviceaccount.com
      - Newer projects:  {number}-compute@developer.gserviceaccount.com

    Each candidate is probed with `gcloud iam service-accounts describe`;
    roles are granted only to the ones that are found. Individual grant
    failures are logged and counted rather than ignored, so the final log
    line reflects what actually happened.

    Args:
      project_id: GCP project ID whose Cloud Build SA(s) receive the roles.
    """
    logger.info("=== Granting permissions to Cloud Build SA(s) ===")
    result = _run(["gcloud", "projects", "describe", project_id,
                   "--format=value(projectNumber)"])
    project_number = result.stdout.strip()
    if not project_number:
        logger.error("Could not determine project number for %s", project_id)
        return

    # Both possible Cloud Build SAs
    candidates = [
        f"{project_number}@cloudbuild.gserviceaccount.com",
        f"{project_number}-compute@developer.gserviceaccount.com",
    ]

    # Detect which SA(s) exist
    sa_emails = []
    for sa in candidates:
        if _exists(["gcloud", "iam", "service-accounts", "describe",
                    sa, f"--project={project_id}"]):
            sa_emails.append(sa)
            logger.info("Found Cloud Build SA: %s", sa)
        else:
            logger.info("SA not found (skipping): %s", sa)

    if not sa_emails:
        logger.error("No Cloud Build SA found in project %s", project_id)
        return

    roles = [
        "roles/logging.logWriter",
        "roles/storage.objectViewer",
        "roles/artifactregistry.writer",
        "roles/serviceusage.serviceUsageConsumer",
    ]
    failed = []
    for sa_email in sa_emails:
        logger.info("Granting roles to %s", sa_email)
        for role in roles:
            # check=False: keep going so one rejected role does not block
            # the remaining grants; the outcome is inspected below.
            outcome = _run(["gcloud", "projects", "add-iam-policy-binding", project_id,
                            f"--member=serviceAccount:{sa_email}",
                            f"--role={role}", "--condition=None", "--quiet"], check=False)
            if outcome.returncode != 0:
                failed.append((sa_email, role))
                logger.warning(
                    "Failed to grant %s to %s (rc=%d): %s",
                    role, sa_email, outcome.returncode,
                    (outcome.stderr or "").strip()[-300:],
                )

    if failed:
        logger.warning(
            "Cloud Build SA permissions granted with %d failure(s).", len(failed)
        )
    else:
        logger.info("Cloud Build SA permissions granted.")
args.target_arch) + else: + logger.info("Skipping image builds (--skip_image_build)") + print("\nPrerequisite setup complete!") + + +if __name__ == "__main__": + main() + diff --git a/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_benchmark_utils.py b/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_benchmark_utils.py new file mode 100644 index 0000000000..e23aa32a6d --- /dev/null +++ b/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_benchmark_utils.py @@ -0,0 +1,506 @@ +"""Shared utilities for GKE Agent Sandbox benchmarks. + +Provides helpers for agent API interaction, kubectl commands, warm pool +management, and sample construction used by all GKE agent benchmark +definitions. +""" + +import json +import logging +import subprocess +import time +import urllib.request +import urllib.error + +from absl import flags +from perfkitbenchmarker import sample +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.resources.container_service import kubectl + +FLAGS = flags.FLAGS + +# Module-level benchmark_spec reference for metadata derivation. +# Set by each benchmark's Run() via set_benchmark_spec(). +_current_benchmark_spec = None + + +# --------------------------------------------------------------------------- +# Shared flags (registered once; importable by benchmark modules) +# --------------------------------------------------------------------------- + +flags.DEFINE_string( + "k8s_namespace", + "agentic", + "Kubernetes namespace where the agentic workloads are deployed.", +) + +flags.DEFINE_bool( + "k8s_gvisor", + True, + "Whether the sandbox node pool uses gVisor. 
def CheckAgentHealthz(api_url=None, required=True):
    """Verify the agent API is reachable via /healthz.

    Args:
      api_url: Base URL to check. Defaults to FLAGS.k8s_agent_api_url.
      required: If True (default), raise on failure. If False, log warning.

    Raises:
      RuntimeError: when `required` is True and the endpoint cannot be
        reached or answers with an HTTP error.
    """
    if api_url is None:
        api_url = GetAgentApiUrl()
    healthz_url = f"{api_url}/healthz"
    try:
        req = urllib.request.Request(healthz_url)
        with urllib.request.urlopen(req, timeout=15) as resp:
            logging.info(
                "Agent healthz: %s", resp.read().decode("utf-8", errors="replace")
            )
    except OSError as e:
        # URLError and HTTPError are OSError subclasses; catching OSError
        # also covers read timeouts and connection resets, which urllib
        # raises directly and which previously escaped even when
        # required=False.
        msg = (
            f"Agent API is not reachable at {healthz_url}: {e}\n"
            "Hint: ensure kubectl port-forward is running "
            "(kubectl port-forward svc/adk-agent -n <namespace> 8080:80)."
        )
        if required:
            raise RuntimeError(msg) from e
        logging.warning("Health check deferred (non-fatal): %s", msg)
def PatchWarmPool(namespace, warmpool_name, replicas, label, wait_timeout=180):
    """Patch SandboxWarmPool replicas and wait for the pods to be Running.

    Args:
      namespace: Kubernetes namespace of the warm pool.
      warmpool_name: Name of the SandboxWarmPool resource to patch.
      replicas: Desired replica count.
      label: Label selector used to count the warm pool's pods.
      wait_timeout: Max seconds to wait for ``replicas`` pods to be Running.

    Returns:
      True if ``replicas`` is 0 or enough pods reached Running before the
      timeout, False otherwise.
    """
    logging.info("Patching %s replicas -> %d", warmpool_name, replicas)
    patch_json = json.dumps({"spec": {"replicas": replicas}})
    RunKubectl([
        "patch", "sandboxwarmpool", warmpool_name,
        "-n", namespace, "--type=merge", f"-p={patch_json}",
    ])
    if replicas == 0:
        return True

    # Monotonic clock: the deadline must not move when the wall clock is
    # adjusted during a long wait.
    deadline = time.monotonic() + wait_timeout
    while True:
        # Poll before checking the deadline so that even a zero or negative
        # timeout observes the pool state once.
        running = CountPods(namespace, label, phase="Running")
        logging.info("%d/%d warm pool pods Running", running, replicas)
        if running >= replicas:
            return True
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        # Never sleep past the deadline.
        time.sleep(min(3, remaining))
    logging.warning("Timed out waiting for %d warm pool pods", replicas)
    return False
def BuildMetadata(namespace, extra=None):
    """Assemble the metadata dict shared by every emitted sample.

    Always contains "namespace" and "gvisor". "machine_type" is added when it
    can be read from the stored benchmark spec (the "sandbox" nodepool's
    vm_spec first, then the cluster's own vm_spec). "note" is added when
    --k8s_benchmark_note is set. Keys from ``extra`` are merged last and
    therefore override the standard ones.
    """

    def _MachineTypeOf(owner):
        # Read owner.vm_spec.machine_type; None when either link is missing.
        if not hasattr(owner, "vm_spec"):
            return None
        return getattr(owner.vm_spec, "machine_type", None)

    result = {"namespace": namespace, "gvisor": FLAGS.k8s_gvisor}

    spec = _current_benchmark_spec
    cluster = getattr(spec, "container_cluster", None) if spec else None
    resolved = None
    if cluster:
        pools = getattr(cluster, "nodepools", None)
        if pools and isinstance(pools, dict):
            sandbox = pools.get("sandbox")
            if sandbox:
                resolved = _MachineTypeOf(sandbox)
        if not resolved:
            # Fall back to the cluster's default pool.
            resolved = _MachineTypeOf(cluster)

    if resolved:
        result["machine_type"] = resolved
    note = FLAGS.k8s_benchmark_note
    if note:
        result["note"] = note
    if extra:
        result.update(extra)
    return result
--------------------------------------------------------------------------- +# Port-forward flags +# --------------------------------------------------------------------------- + +flags.DEFINE_bool( + "k8s_auto_portforward", + True, + "Automatically manage kubectl port-forward to the agent service.", +) + +flags.DEFINE_integer( + "k8s_portforward_local_port", + 8080, + "Local port for kubectl port-forward.", +) + +flags.DEFINE_integer( + "k8s_portforward_remote_port", + 80, + "Remote service port for kubectl port-forward.", +) + +flags.DEFINE_string( + "k8s_portforward_service", + "svc/adk-agent", + "Kubernetes service to port-forward to.", +) + +flags.DEFINE_float( + "k8s_portforward_reconnect_delay", + 1.0, + "Seconds to wait before reconnecting after port-forward drops.", +) + +flags.DEFINE_float( + "k8s_portforward_health_timeout", + 30.0, + "Seconds to wait for agent health check after starting port-forward.", +) + + +# --------------------------------------------------------------------------- +# Port-forward manager +# --------------------------------------------------------------------------- + +import atexit +import os as _os +import signal +import threading + + +_PID_FILE = "/tmp/pkb_portforward.pid" + + +class _PortForwardManager: + """Manages a kubectl port-forward subprocess with auto-reconnect. + + Mimics the shell pattern: + while true; do + kubectl port-forward svc/adk-agent -n agentic 8080:80 + echo "Reconnecting..." + sleep 1 + done + + Thread-safe. Idempotent start/stop. Cleans up orphans via PID file. 
+ """ + + def __init__(self): + self._proc = None + self._thread = None + self._stop_event = threading.Event() + self._lock = threading.Lock() + self._started = False + + @property + def is_running(self): + return self._started and not self._stop_event.is_set() + + def start(self): + """Start the port-forward loop (idempotent).""" + with self._lock: + if self._started and not self._stop_event.is_set(): + if self._proc and self._proc.poll() is None: + return + return + + self._kill_orphan() + self._stop_event.clear() + self._started = True + self._thread = threading.Thread( + target=self._loop, daemon=True, name="pkb-portforward" + ) + self._thread.start() + + def stop(self): + """Stop the port-forward loop and kill the subprocess.""" + with self._lock: + if not self._started: + return + self._stop_event.set() + self._kill_proc() + self._started = False + self._cleanup_pid_file() + + def _loop(self): + """Background reconnect loop.""" + ns = FLAGS.k8s_namespace + svc = FLAGS.k8s_portforward_service + local_port = FLAGS.k8s_portforward_local_port + remote_port = FLAGS.k8s_portforward_remote_port + delay = FLAGS.k8s_portforward_reconnect_delay + + cmd = ["kubectl"] + if FLAGS.kubeconfig: + cmd += ["--kubeconfig", FLAGS.kubeconfig] + cmd += [ + "port-forward", svc, + "-n", ns, + f"{local_port}:{remote_port}", + ] + + while not self._stop_event.is_set(): + logging.info("Starting port-forward: %s", " ".join(cmd)) + try: + self._proc = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + self._write_pid_file(self._proc.pid) + + while not self._stop_event.is_set(): + retcode = self._proc.poll() + if retcode is not None: + break + self._stop_event.wait(timeout=0.5) + + except Exception as e: + logging.warning("Port-forward error: %s", e) + + if not self._stop_event.is_set(): + logging.info( + "Port-forward disconnected. 
Reconnecting in %.1fs...", delay + ) + self._stop_event.wait(timeout=delay) + + def _kill_proc(self): + """Kill the current subprocess if alive.""" + if self._proc and self._proc.poll() is None: + try: + self._proc.terminate() + self._proc.wait(timeout=5) + except Exception: + try: + self._proc.kill() + except Exception: + pass + self._proc = None + + def _write_pid_file(self, pid): + """Write PID to file for orphan detection.""" + try: + with open(_PID_FILE, "w") as f: + f.write(str(pid)) + except Exception: + pass + + def _cleanup_pid_file(self): + """Remove PID file.""" + try: + _os.unlink(_PID_FILE) + except OSError: + pass + + def _kill_orphan(self): + """Kill a port-forward process left by a previous PKB run.""" + try: + if _os.path.exists(_PID_FILE): + with open(_PID_FILE, "r") as f: + pid = int(f.read().strip()) + logging.info("Killing orphan port-forward (PID %d)", pid) + _os.kill(pid, signal.SIGTERM) + import time as _time + _time.sleep(0.5) + try: + _os.kill(pid, signal.SIGKILL) + except OSError: + pass + self._cleanup_pid_file() + except (OSError, ValueError): + self._cleanup_pid_file() + + local_port = FLAGS.k8s_portforward_local_port + try: + result = subprocess.run( + ["lsof", "-ti", f":{local_port}"], + capture_output=True, text=True, timeout=5, + ) + if result.returncode == 0 and result.stdout.strip(): + for pid_str in result.stdout.strip().split(): + try: + pid = int(pid_str) + _os.kill(pid, signal.SIGTERM) + logging.info("Killed process %d on port %d", pid, local_port) + except (OSError, ValueError): + pass + except (FileNotFoundError, subprocess.TimeoutExpired): + pass + + +# Singleton instance +_port_forward_manager = _PortForwardManager() + +# Ensure cleanup on interpreter exit +atexit.register(_port_forward_manager.stop) + + +def EnsurePortForward(): + """Start port-forward if auto_portforward is enabled (idempotent). + + Blocks until the agent health check passes or timeout is reached. 
+ Safe to call multiple times - only starts one background loop. + """ + if not FLAGS.k8s_auto_portforward: + logging.info("Auto port-forward disabled (--k8s_auto_portforward=false)") + return + + _port_forward_manager.start() + + import time as _time + timeout = FLAGS.k8s_portforward_health_timeout + deadline = _time.time() + timeout + api_url = GetAgentApiUrl() + + while _time.time() < deadline: + try: + req = urllib.request.Request(f"{api_url}/healthz") + with urllib.request.urlopen(req, timeout=3) as resp: + logging.info("Port-forward healthy: %s", resp.read().decode()) + return + except Exception: + _time.sleep(1) + + logging.warning( + "Port-forward health check did not pass within %.0fs. " + "Continuing anyway (Run() will fail if agent is unreachable).", + timeout, + ) + + +def StopPortForward(): + """Stop the port-forward subprocess and clean up.""" + _port_forward_manager.stop() + logging.info("Port-forward stopped.") diff --git a/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_chromium_density_benchmark.py b/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_chromium_density_benchmark.py new file mode 100644 index 0000000000..bd9114877c --- /dev/null +++ b/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_chromium_density_benchmark.py @@ -0,0 +1,284 @@ +"""PKB Benchmark: GKE Agent Chromium Density Saturation . + +Atomic single-point measurement of Chromium browser sandbox density on a +pre-provisioned GKE cluster with gVisor isolation. Measures interaction +latency, screenshot generation time, cold start, navigation, evaluation, +fill, click latencies, and RSS memory at a given concurrent session count. + +This benchmark is designed to be invoked repeatedly by an external sweep +controller that varies the density parameter across iterations to find +the saturation point. 
+ +Usage: + python pkb.py --benchmarks=k8s_chromium_density \\ + --k8s_chromium_density_concurrent_sessions=4 \\ + --k8s_chromium_density_task_count=10 \\ + --k8s_chromium_density_warmup_tasks=5 \\ + --k8s_namespace=agentic \\ + --k8s_agent_api_url=http://localhost:8080 + +Samples emitted (per run): + - k8s_chromium_density_interaction_mean (ms) + - k8s_chromium_density_interaction_p95 (ms) + - k8s_chromium_density_navigate_mean (ms) + - k8s_chromium_density_navigate_p95 (ms) + - k8s_chromium_density_evaluate_mean (ms) + - k8s_chromium_density_evaluate_p95 (ms) + - k8s_chromium_density_fill_mean (ms) + - k8s_chromium_density_fill_p95 (ms) + - k8s_chromium_density_click_mean (ms) + - k8s_chromium_density_click_p95 (ms) + - k8s_chromium_density_screenshot_mean (ms) + - k8s_chromium_density_screenshot_p95 (ms) + - k8s_chromium_density_cold_start_mean (ms) + - k8s_chromium_density_cold_start_p95 (ms) + - k8s_chromium_density_rss_end (MB) + - k8s_chromium_density_rss_growth (MB) + - k8s_chromium_density_wall_time (seconds) +""" + +import logging +import time + +from absl import flags +from perfkitbenchmarker import configs +from perfkitbenchmarker.linux_benchmarks.kubernetes.agentic import ( + k8s_benchmark_utils as utils, +) +from perfkitbenchmarker.linux_benchmarks.kubernetes.agentic import ( + gke_deploy_utils as deploy_utils, +) + +FLAGS = flags.FLAGS + +BENCHMARK_NAME = "k8s_chromium_density" +BENCHMARK_CONFIG = """ +k8s_chromium_density: + description: > + Atomic single-point Chromium browser sandbox density measurement on a + pre-provisioned GKE cluster with gVisor isolation. 
def Run(benchmark_spec):
    """Execute a single Chromium density measurement and return samples.

    Optionally patches the warm pool to the requested density, POSTs the
    workload to the agent API, and turns the response's "aggregate" dict
    into samples. Aggregate keys missing from the response are skipped.

    Args:
      benchmark_spec: The benchmark specification; stored via
        utils.set_benchmark_spec() for metadata derivation.

    Returns:
      List of sample.Sample objects.
    """
    utils.set_benchmark_spec(benchmark_spec)

    ns = FLAGS.k8s_namespace
    density = FLAGS.k8s_chromium_density_concurrent_sessions

    logging.info("=== Run: chromium_density=%d ===", density)

    # Ensure port-forward is active (needed when sweeps skip Prepare)
    utils.EnsurePortForward()

    # Patch warm pool (moved from Prepare for sweep compatibility)
    warmpool_ready = None  # None means patching was disabled by flag
    if FLAGS.k8s_chromium_density_patch_warmpool:
        warmpool_ready = utils.PatchWarmPool(
            namespace=ns,
            warmpool_name=_WARMPOOL_NAME,
            replicas=density,
            label=_WARMPOOL_LABEL,
            wait_timeout=FLAGS.k8s_chromium_density_provision_timeout,
        )
        if not warmpool_ready:
            # The measurement still runs, but not at the requested density;
            # flag it so the result is not mistaken for a clean data point.
            logging.warning(
                "Warm pool did not reach %d Running pods; results for "
                "chromium_density=%d may be skewed.",
                density,
                density,
            )

    # POST to agent API
    payload = {
        "task_count": FLAGS.k8s_chromium_density_task_count,
        "warmup_tasks": FLAGS.k8s_chromium_density_warmup_tasks,
        "concurrent_sessions": density,
        "sandbox_exec_timeout_s": FLAGS.k8s_chromium_density_exec_timeout,
    }

    t0 = time.monotonic()  # monotonic: a duration, immune to clock changes
    result = utils.CallAgentApi("/benchmark/chromium/density", payload)
    wall_time = time.monotonic() - t0

    successful = result.get("successful_sessions", 0)
    failed = result.get("failed_sessions", 0)
    # "or {}" also covers an explicit JSON null, which .get()'s default
    # does not.
    agg = result.get("aggregate") or {}

    logging.info(
        "API response: %d successful, %d failed sessions (%.1fs)",
        successful,
        failed,
        wall_time,
    )

    # Build samples
    extra = {
        "density": density,
        "successful_sessions": successful,
        "failed_sessions": failed,
        "task_count": FLAGS.k8s_chromium_density_task_count,
        "warmup_tasks": FLAGS.k8s_chromium_density_warmup_tasks,
        "wall_time_s": round(wall_time, 2),
    }
    if warmpool_ready is not None:
        extra["warmpool_ready"] = warmpool_ready

    samples = []

    # Per-task-type latency: mean and P95 for each
    task_types = (
        "interaction",
        "navigate",
        "evaluate",
        "fill",
        "click",
        "screenshot",
        "cold_start",
    )
    for task in task_types:
        for stat in ("mean", "p95"):
            _emit(
                samples, agg, f"{task}_{stat}_ms", f"{task}_{stat}",
                "ms", ns, extra,
            )

    # RSS memory
    for rss_metric in ("rss_end", "rss_growth"):
        _emit(samples, agg, f"{rss_metric}_mb", rss_metric, "MB", ns, extra)

    # Wall time
    samples.append(
        utils.MakeSample(
            f"{BENCHMARK_NAME}_wall_time",
            round(wall_time, 2),
            "seconds",
            ns,
            extra,
        )
    )

    logging.info(
        "Emitted %d samples for chromium_density=%d.", len(samples), density
    )
    return samples
+ + Args: + samples: List to append the new sample.Sample to. + agg: Aggregate metrics dict returned by the agent API response. + agg_key: Key to look up in `agg` (e.g. "orchestrator_cel_mean_ms"). + metric_suffix: Suffix appended to BENCHMARK_NAME to form the metric + name (e.g. "orchestrator_cel_mean"). + unit: Unit string for the sample (e.g. "ms", "MB", "seconds"). + namespace: Kubernetes namespace (included in sample metadata). + extra: Dict of additional metadata key-value pairs attached to + every sample (density, session counts, wall time, etc.). + """ + value = agg.get(agg_key) + if value is not None: + samples.append( + utils.MakeSample( + f"{BENCHMARK_NAME}_{metric_suffix}", + value, + unit, + namespace, + extra, + ) + ) diff --git a/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_deletion_benchmark.py b/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_deletion_benchmark.py new file mode 100644 index 0000000000..418b5c1ed9 --- /dev/null +++ b/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_deletion_benchmark.py @@ -0,0 +1,481 @@ +"""PKB Benchmark: GKE Agent Deletion & Cleanup . + +Atomic single-point measurement of bulk deletion efficiency and IP +reclamation on a pre-provisioned GKE cluster with gVisor isolation. +Provisions N sandbox pods via SandboxWarmPool, then bulk-deletes them +and measures per-pod deletion latency, aggregate deletion stats, and +IP address reclamation timing. + +This benchmark is designed to be invoked repeatedly by an external sweep +controller that varies the batch_size parameter across iterations to find +the deletion saturation point. 
+ +Usage: + python pkb.py --benchmarks=k8s_deletion \\ + --k8s_deletion_batch_size=100 \\ + --k8s_deletion_warmpool_name=python-sandbox-warmpool \\ + --k8s_deletion_pod_label=sandbox=python-sandbox-example \\ + --k8s_deletion_poll_interval_s=1.0 \\ + --k8s_deletion_provision_timeout_s=120.0 \\ + --k8s_deletion_drain_timeout_s=300.0 \\ + --k8s_namespace=agentic \\ + --gke_machine_type=c4-standard-8 + +Samples emitted (per run): + - k8s_deletion_provision_time (seconds) + - k8s_deletion_total_drain_time (seconds) + - k8s_deletion_latency_p50 (seconds) + - k8s_deletion_latency_p95 (seconds) + - k8s_deletion_latency_p99 (seconds) + - k8s_deletion_latency_max (seconds) + - k8s_deletion_rate (pods/sec) + - k8s_deletion_ip_before (count) + - k8s_deletion_ip_after (count) + - k8s_deletion_ip_reclaim_time (seconds) + - k8s_deletion_final_running_count (count) + - k8s_deletion_wall_time (seconds) +""" + +import json +import logging +import time + +from absl import flags +from perfkitbenchmarker import configs +from perfkitbenchmarker.linux_benchmarks.kubernetes.agentic import ( + k8s_benchmark_utils as utils, +) +from perfkitbenchmarker.linux_benchmarks.kubernetes.agentic import ( + gke_deploy_utils as deploy_utils, +) + +FLAGS = flags.FLAGS + +BENCHMARK_NAME = "k8s_deletion" +BENCHMARK_CONFIG = """ +k8s_deletion: + description: > + Atomic single-point bulk deletion and IP reclamation measurement on a + pre-provisioned GKE cluster with gVisor isolation. 
def Run(benchmark_spec):
    """Provision N pods, bulk-delete, measure deletion latency and IP reclamation.

    Steps: drain the warm pool, scale it to the batch size and wait for
    Running, record pod names and IP count, scale to 0, then poll until
    every pod is gone or the drain timeout expires. Pods still present at
    the timeout are assigned the full drain time as their latency and are
    reported in the "stuck_pods" metadata field.

    Args:
      benchmark_spec: The benchmark specification; stored via
        utils.set_benchmark_spec() for metadata derivation.

    Returns:
      List of sample.Sample objects.

    Raises:
      RuntimeError: If fewer than batch_size pods reach Running within the
        provision timeout.
    """
    utils.set_benchmark_spec(benchmark_spec)

    ns = FLAGS.k8s_namespace
    batch_size = FLAGS.k8s_deletion_batch_size
    warmpool_name = FLAGS.k8s_deletion_warmpool_name
    label = FLAGS.k8s_deletion_pod_label
    poll_interval = FLAGS.k8s_deletion_poll_interval_s
    provision_timeout = FLAGS.k8s_deletion_provision_timeout_s
    drain_timeout = FLAGS.k8s_deletion_drain_timeout_s

    logging.info("=== Run: batch_size=%d ===", batch_size)

    # Drain to 0 for clean measurement (moved from Prepare for sweep
    # compatibility)
    utils.DrainWarmPool(ns, warmpool_name, label, timeout=int(drain_timeout))
    time.sleep(2)

    # All durations use the monotonic clock so wall-clock adjustments during
    # the run cannot distort the measured latencies.
    t_wall_start = time.monotonic()

    # 1. Provision N pods
    logging.info("Provisioning %d pods...", batch_size)
    provision_start = time.monotonic()
    _PatchReplicas(ns, warmpool_name, batch_size)

    deadline = time.monotonic() + provision_timeout
    while time.monotonic() < deadline:
        running = utils.CountPods(ns, label, phase="Running")
        pct = (running / batch_size * 100) if batch_size > 0 else 0
        logging.info("Provisioning... %d/%d (%.0f%%)", running, batch_size, pct)
        if running >= batch_size:
            break
        time.sleep(3)

    provision_time = time.monotonic() - provision_start
    final_running = utils.CountPods(ns, label, phase="Running")

    logging.info(
        "Provisioned %d/%d pods in %.1fs",
        final_running,
        batch_size,
        provision_time,
    )

    # If not all pods reached Running, this is a failure
    if final_running < batch_size:
        raise RuntimeError(
            f"Provisioning failed: only {final_running}/{batch_size} pods "
            f"reached Running within {provision_timeout}s"
        )

    # 2. Record pod names and IP count before deletion
    pod_names_before = set(_GetPodNames(ns, label))
    total_pods = len(pod_names_before)
    ip_before = _CountAllocatedIPs(ns, label)

    logging.info("Recorded %d pods, %d IPs allocated", total_pods, ip_before)

    # Brief settle
    time.sleep(1)

    # 3. Bulk delete: scale to 0
    logging.info("Scaling to 0 (bulk delete of %d pods)...", total_pods)
    _PatchReplicas(ns, warmpool_name, 0)

    # 4. Poll: track pod disappearance and IP reclamation
    t_delete = time.monotonic()
    deadline_drain = t_delete + drain_timeout
    pod_gone_times = {}  # pod_name -> elapsed_s when first absent
    ip_reclaim_time = None

    while time.monotonic() < deadline_drain:
        elapsed = time.monotonic() - t_delete

        # Current pod names still present
        current_pods = set(_GetPodNames(ns, label))
        remaining = len(current_pods)

        # Track which pods have disappeared; keep the first time seen absent
        for pn in pod_names_before - current_pods:
            pod_gone_times.setdefault(pn, elapsed)

        # IP count (scoped to warm pool label)
        ips = _CountAllocatedIPs(ns, label)
        if ip_reclaim_time is None and ips == 0:
            ip_reclaim_time = elapsed

        deleted = total_pods - remaining
        pct = (deleted / total_pods * 100) if pod_names_before else 0
        logging.info(
            "[%.1fs] Deleted: %d/%d (%.0f%%) IPs: %d",
            elapsed,
            deleted,
            total_pods,
            pct,
            ips,
        )

        if remaining == 0:
            break

        time.sleep(poll_interval)

    total_drain_time = time.monotonic() - t_delete

    # Pods we never saw disappear (stuck) get the full drain time. Surface
    # them explicitly: otherwise a timed-out drain is indistinguishable from
    # a slow-but-complete one in the latency percentiles.
    stuck_pods = [pn for pn in pod_names_before if pn not in pod_gone_times]
    if stuck_pods:
        logging.warning(
            "Drain timed out after %.1fs: %d/%d pods still present; their "
            "latency is recorded as the full drain time.",
            total_drain_time,
            len(stuck_pods),
            total_pods,
        )
    for pn in stuck_pods:
        pod_gone_times[pn] = total_drain_time

    # 5. Compute per-pod deletion latencies
    deletion_latencies = sorted(pod_gone_times.values())

    ip_after = _CountAllocatedIPs(ns, label)
    deletion_rate = (total_pods / total_drain_time) if total_drain_time > 0 else 0

    logging.info(
        "Drain complete: %.1fs, rate=%.1f pods/sec, IPs: %d->%d",
        total_drain_time,
        deletion_rate,
        ip_before,
        ip_after,
    )

    wall_time = time.monotonic() - t_wall_start

    # 6. Build samples
    extra = {
        "batch_size": batch_size,
        "final_running_count": final_running,
        "ip_before": ip_before,
        "ip_after": ip_after,
        "stuck_pods": len(stuck_pods),
        "wall_time_s": round(wall_time, 2),
    }

    samples = []

    def _Add(metric_suffix, value, unit):
        # Every sample shares the benchmark prefix, namespace and metadata.
        samples.append(
            utils.MakeSample(
                f"{BENCHMARK_NAME}_{metric_suffix}", value, unit, ns, extra
            )
        )

    _Add("provision_time", round(provision_time, 2), "seconds")
    _Add("total_drain_time", round(total_drain_time, 2), "seconds")

    if deletion_latencies:
        for pct_rank in (50, 95, 99):
            _Add(
                f"latency_p{pct_rank}",
                round(_Percentile(deletion_latencies, pct_rank), 3),
                "seconds",
            )
        _Add("latency_max", round(deletion_latencies[-1], 3), "seconds")

    _Add("rate", round(deletion_rate, 2), "pods/sec")
    _Add("ip_before", float(ip_before), "count")
    _Add("ip_after", float(ip_after), "count")

    # Only reported when the IP count was actually observed at zero.
    if ip_reclaim_time is not None:
        _Add("ip_reclaim_time", round(ip_reclaim_time, 2), "seconds")

    _Add("final_running_count", float(final_running), "count")
    _Add("wall_time", round(wall_time, 2), "seconds")

    logging.info(
        "Emitted %d samples for batch_size=%d.", len(samples), batch_size
    )
    return samples
+ """ + stdout, _, rc = utils.RunKubectl( + [ + "get", + "pods", + "-n", + namespace, + "-l", + label, + "-o", + "jsonpath={.items[*].status.podIP}", + ], + timeout=30, + raise_on_failure=False, + ) + if rc != 0 or not stdout: + return 0 + return len([ip for ip in stdout.split() if ip]) + + +def _Percentile(sorted_values, pct): + """Calculate percentile (0-100) with linear interpolation.""" + if not sorted_values: + return 0.0 + idx = (pct / 100) * (len(sorted_values) - 1) + lo = int(idx) + hi = min(lo + 1, len(sorted_values) - 1) + frac = idx - lo + return sorted_values[lo] * (1 - frac) + sorted_values[hi] * frac diff --git a/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_payload_benchmark.py b/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_payload_benchmark.py new file mode 100644 index 0000000000..109ab0efe6 --- /dev/null +++ b/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_payload_benchmark.py @@ -0,0 +1,617 @@ +"""PKB Benchmark: GKE Agent Payload Transfer Saturation . + +Atomic single-point measurement of payload transfer latency from a gVisor +sandbox back to the orchestrator on a pre-provisioned GKE cluster. Measures +generation time, serialization time, stdout write time, total transfer time, +throughput, and RSS at a given payload_size_mb and concurrent_sessions count. + +This benchmark is designed to be invoked repeatedly by an external sweep +controller that varies the payload_size_mb parameter across iterations to +find the saturation point. 
+ +Usage: + python pkb.py --benchmarks=gke_payload \ + --k8s_payload_size_mb=50 \ + --k8s_payload_iterations=20 \ + --k8s_payload_concurrent_sessions=5 \ + --k8s_namespace=agentic \ + --k8s_agent_api_url=http://localhost:8080 + +Samples emitted (per run): + - gke_payload_orchestrator_transfer_mean (ms) + - gke_payload_orchestrator_transfer_p50 (ms) + - gke_payload_orchestrator_transfer_p95 (ms) + - gke_payload_orchestrator_transfer_p99 (ms) + - gke_payload_orchestrator_transfer_min (ms) + - gke_payload_orchestrator_transfer_max (ms) + - gke_payload_sandbox_payload_size_bytes (bytes) + - gke_payload_sandbox_payload_encoded_size_bytes (bytes) + - gke_payload_sandbox_payload_iterations (count) + - gke_payload_sandbox_generation_time_mean (ms) + - gke_payload_sandbox_generation_time_p50 (ms) + - gke_payload_sandbox_generation_time_p95 (ms) + - gke_payload_sandbox_generation_time_p99 (ms) + - gke_payload_sandbox_generation_time_min (ms) + - gke_payload_sandbox_generation_time_max (ms) + - gke_payload_sandbox_serialization_time_mean (ms) + - gke_payload_sandbox_serialization_time_p50 (ms) + - gke_payload_sandbox_serialization_time_p95 (ms) + - gke_payload_sandbox_serialization_time_p99 (ms) + - gke_payload_sandbox_serialization_time_min (ms) + - gke_payload_sandbox_serialization_time_max (ms) + - gke_payload_sandbox_stdout_time_mean (ms) + - gke_payload_sandbox_stdout_time_p50 (ms) + - gke_payload_sandbox_stdout_time_p95 (ms) + - gke_payload_sandbox_stdout_time_p99 (ms) + - gke_payload_sandbox_stdout_time_min (ms) + - gke_payload_sandbox_stdout_time_max (ms) + - gke_payload_sandbox_transfer_time_mean (ms) + - gke_payload_sandbox_transfer_time_p50 (ms) + - gke_payload_sandbox_transfer_time_p95 (ms) + - gke_payload_sandbox_transfer_time_p99 (ms) + - gke_payload_sandbox_transfer_time_min (ms) + - gke_payload_sandbox_transfer_time_max (ms) + - gke_payload_sandbox_throughput_mean (MB/s) + - gke_payload_sandbox_throughput_p50 (MB/s) + - gke_payload_sandbox_throughput_min 
(MB/s) + - gke_payload_sandbox_rss_start (MB) + - gke_payload_sandbox_rss_end (MB) + - gke_payload_sandbox_rss_growth (MB) + - gke_payload_wall_time (seconds) +""" + +import logging +import time + +from absl import flags +from perfkitbenchmarker import configs +from perfkitbenchmarker.linux_benchmarks.kubernetes.agentic import ( + k8s_benchmark_utils as utils, +) +from perfkitbenchmarker.linux_benchmarks.kubernetes.agentic import ( + gke_deploy_utils as deploy_utils, +) + +FLAGS = flags.FLAGS + +BENCHMARK_NAME = "k8s_payload" +BENCHMARK_CONFIG = """ +k8s_payload: + description: > + Atomic single-point payload transfer saturation measurement on a + pre-provisioned GKE cluster with gVisor isolation. +""" + +_WARMPOOL_NAME = "python-sandbox-warmpool" +_WARMPOOL_LABEL = "sandbox=python-sandbox-example" + +# --------------------------------------------------------------------------- +# Benchmark-specific flags +# --------------------------------------------------------------------------- + +flags.DEFINE_float( + "k8s_payload_size_mb", + 1.0, + "Payload size in megabytes to transfer from the sandbox.", +) + +flags.DEFINE_integer( + "k8s_payload_iterations", + 20, + "Number of transfer iterations per sandbox session.", +) + +flags.DEFINE_integer( + "k8s_payload_concurrent_sessions", + 5, + "Number of parallel sandbox sessions.", +) + +flags.DEFINE_integer( + "k8s_payload_exec_timeout", + 300, + "Sandbox command execution timeout in seconds.", +) + +flags.DEFINE_bool( + "k8s_payload_patch_warmpool", + True, + "Patch SandboxWarmPool replicas to match concurrent_sessions before measurement.", +) + + +# --------------------------------------------------------------------------- +# Lifecycle +# --------------------------------------------------------------------------- + + +def GetConfig(user_config): + """Load and return benchmark config. + + No vm_groups — PKB skips Provision() and Teardown(). 
+ """ + return configs.LoadConfig(BENCHMARK_CONFIG, user_config, BENCHMARK_NAME) + + +def Prepare(benchmark_spec): + """Deploy workloads and verify agent API.""" + benchmark_spec.always_call_cleanup = True + logging.info("=== Prepare: deploying workloads ===") + deploy_utils.DeployWorkloads(benchmark_spec) + utils.CheckAgentHealthz(required=False) + utils.EnsurePortForward() + logging.info("Prepare complete.") + + +def Run(benchmark_spec): + """Execute a single payload transfer measurement and return samples. + + Returns: + List of sample.Sample objects. + """ + utils.set_benchmark_spec(benchmark_spec) + + ns = FLAGS.k8s_namespace + payload_size_mb = FLAGS.k8s_payload_size_mb + iterations = FLAGS.k8s_payload_iterations + concurrent = FLAGS.k8s_payload_concurrent_sessions + + logging.info( + "=== Run: payload_size_mb=%s, iterations=%d, concurrent=%d ===", + payload_size_mb, + iterations, + concurrent, + ) + + # Ensure port-forward is active (needed when sweeps skip Prepare) + utils.EnsurePortForward() + + # Patch warm pool (moved from Prepare for sweep compatibility) + if FLAGS.k8s_payload_patch_warmpool: + utils.PatchWarmPool( + namespace=ns, + warmpool_name=_WARMPOOL_NAME, + replicas=concurrent, + label=_WARMPOOL_LABEL, + ) + + # POST to agent API + payload = { + "payload_size_mb": payload_size_mb, + "payload_iterations": iterations, + "concurrent_sessions": concurrent, + "sandbox_exec_timeout_s": FLAGS.k8s_payload_exec_timeout, + } + + t0 = time.time() + result = utils.CallAgentApi("/benchmark/python/payload", payload) + wall_time = time.time() - t0 + + successful = result.get("successful_sessions", 0) + failed = result.get("failed_sessions", 0) + agg = result.get("aggregate", {}) + + logging.info( + "API response: %d successful, %d failed sessions (%.1fs)", + successful, + failed, + wall_time, + ) + + # Build samples + extra = { + "payload_size_mb": payload_size_mb, + "payload_iterations": iterations, + "concurrent_sessions": concurrent, + "successful_sessions": 
successful, + "failed_sessions": failed, + "wall_time_s": round(wall_time, 2), + } + + samples = [] + + # Orchestrator-side transfer latency + _emit( + samples, + agg, + "orchestrator_transfer_mean_ms", + "orchestrator_transfer_mean", + "ms", + ns, + extra, + ) + _emit( + samples, + agg, + "orchestrator_transfer_p50_ms", + "orchestrator_transfer_p50", + "ms", + ns, + extra, + ) + _emit( + samples, + agg, + "orchestrator_transfer_p95_ms", + "orchestrator_transfer_p95", + "ms", + ns, + extra, + ) + _emit( + samples, + agg, + "orchestrator_transfer_p99_ms", + "orchestrator_transfer_p99", + "ms", + ns, + extra, + ) + _emit( + samples, + agg, + "orchestrator_transfer_min_ms", + "orchestrator_transfer_min", + "ms", + ns, + extra, + ) + _emit( + samples, + agg, + "orchestrator_transfer_max_ms", + "orchestrator_transfer_max", + "ms", + ns, + extra, + ) + + # Payload metadata + _emit( + samples, + agg, + "sandbox_payload_size_bytes", + "sandbox_payload_size_bytes", + "bytes", + ns, + extra, + ) + _emit( + samples, + agg, + "sandbox_payload_encoded_size_bytes", + "sandbox_payload_encoded_size_bytes", + "bytes", + ns, + extra, + ) + _emit( + samples, + agg, + "sandbox_payload_iterations", + "sandbox_payload_iterations", + "count", + ns, + extra, + ) + + # Generation time (os.urandom) + _emit( + samples, + agg, + "sandbox_generation_time_mean_ms", + "sandbox_generation_time_mean", + "ms", + ns, + extra, + ) + _emit( + samples, + agg, + "sandbox_generation_time_p50_ms", + "sandbox_generation_time_p50", + "ms", + ns, + extra, + ) + _emit( + samples, + agg, + "sandbox_generation_time_p95_ms", + "sandbox_generation_time_p95", + "ms", + ns, + extra, + ) + _emit( + samples, + agg, + "sandbox_generation_time_p99_ms", + "sandbox_generation_time_p99", + "ms", + ns, + extra, + ) + _emit( + samples, + agg, + "sandbox_generation_time_min_ms", + "sandbox_generation_time_min", + "ms", + ns, + extra, + ) + _emit( + samples, + agg, + "sandbox_generation_time_max_ms", + 
"sandbox_generation_time_max", + "ms", + ns, + extra, + ) + + # Serialization time (base64 encode) + _emit( + samples, + agg, + "sandbox_serialization_time_mean_ms", + "sandbox_serialization_time_mean", + "ms", + ns, + extra, + ) + _emit( + samples, + agg, + "sandbox_serialization_time_p50_ms", + "sandbox_serialization_time_p50", + "ms", + ns, + extra, + ) + _emit( + samples, + agg, + "sandbox_serialization_time_p95_ms", + "sandbox_serialization_time_p95", + "ms", + ns, + extra, + ) + _emit( + samples, + agg, + "sandbox_serialization_time_p99_ms", + "sandbox_serialization_time_p99", + "ms", + ns, + extra, + ) + _emit( + samples, + agg, + "sandbox_serialization_time_min_ms", + "sandbox_serialization_time_min", + "ms", + ns, + extra, + ) + _emit( + samples, + agg, + "sandbox_serialization_time_max_ms", + "sandbox_serialization_time_max", + "ms", + ns, + extra, + ) + + # Stdout write time (gVisor Gofer write syscall) + _emit( + samples, + agg, + "sandbox_stdout_time_mean_ms", + "sandbox_stdout_time_mean", + "ms", + ns, + extra, + ) + _emit( + samples, + agg, + "sandbox_stdout_time_p50_ms", + "sandbox_stdout_time_p50", + "ms", + ns, + extra, + ) + _emit( + samples, + agg, + "sandbox_stdout_time_p95_ms", + "sandbox_stdout_time_p95", + "ms", + ns, + extra, + ) + _emit( + samples, + agg, + "sandbox_stdout_time_p99_ms", + "sandbox_stdout_time_p99", + "ms", + ns, + extra, + ) + _emit( + samples, + agg, + "sandbox_stdout_time_min_ms", + "sandbox_stdout_time_min", + "ms", + ns, + extra, + ) + _emit( + samples, + agg, + "sandbox_stdout_time_max_ms", + "sandbox_stdout_time_max", + "ms", + ns, + extra, + ) + + # Transfer time (serialization + stdout write — threshold metric) + _emit( + samples, + agg, + "sandbox_transfer_time_mean_ms", + "sandbox_transfer_time_mean", + "ms", + ns, + extra, + ) + _emit( + samples, + agg, + "sandbox_transfer_time_p50_ms", + "sandbox_transfer_time_p50", + "ms", + ns, + extra, + ) + _emit( + samples, + agg, + "sandbox_transfer_time_p95_ms", + 
"sandbox_transfer_time_p95", + "ms", + ns, + extra, + ) + _emit( + samples, + agg, + "sandbox_transfer_time_p99_ms", + "sandbox_transfer_time_p99", + "ms", + ns, + extra, + ) + _emit( + samples, + agg, + "sandbox_transfer_time_min_ms", + "sandbox_transfer_time_min", + "ms", + ns, + extra, + ) + _emit( + samples, + agg, + "sandbox_transfer_time_max_ms", + "sandbox_transfer_time_max", + "ms", + ns, + extra, + ) + + # Throughput + _emit( + samples, + agg, + "sandbox_throughput_mean_mbps", + "sandbox_throughput_mean", + "MB/s", + ns, + extra, + ) + _emit( + samples, + agg, + "sandbox_throughput_p50_mbps", + "sandbox_throughput_p50", + "MB/s", + ns, + extra, + ) + _emit( + samples, + agg, + "sandbox_throughput_min_mbps", + "sandbox_throughput_min", + "MB/s", + ns, + extra, + ) + + # RSS + _emit(samples, agg, "sandbox_rss_start_mb", "sandbox_rss_start", "MB", ns, extra) + _emit(samples, agg, "sandbox_rss_end_mb", "sandbox_rss_end", "MB", ns, extra) + _emit(samples, agg, "sandbox_rss_growth_mb", "sandbox_rss_growth", "MB", ns, extra) + + # Wall time + samples.append( + utils.MakeSample( + f"{BENCHMARK_NAME}_wall_time", + round(wall_time, 2), + "seconds", + ns, + extra, + ) + ) + + logging.info( + "Emitted %d samples for payload_size_mb=%s.", len(samples), payload_size_mb + ) + return samples + + +def Cleanup(benchmark_spec): + """Clean up after measurement. Scale warm pool to 0.""" + ns = FLAGS.k8s_namespace + logging.info("Cleanup: draining warm pool.") + + utils.DrainWarmPool( + namespace=ns, + warmpool_name=_WARMPOOL_NAME, + label=_WARMPOOL_LABEL, + ) + + utils.StopPortForward() + logging.info("Cleanup complete (cluster persists).") + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _emit(samples, agg, agg_key, metric_suffix, unit, namespace, extra): + """Emit a sample if the key exists in the aggregate dict. 
+ + Args: + samples: List to append the new sample.Sample to. + agg: Aggregate metrics dict returned by the agent API response. + agg_key: Key to look up in `agg` (e.g. "orchestrator_cel_mean_ms"). + metric_suffix: Suffix appended to BENCHMARK_NAME to form the metric + name (e.g. "orchestrator_cel_mean"). + unit: Unit string for the sample (e.g. "ms", "MB", "seconds"). + namespace: Kubernetes namespace (included in sample metadata). + extra: Dict of additional metadata key-value pairs attached to + every sample (density, session counts, wall time, etc.). + """ + value = agg.get(agg_key) + if value is not None: + samples.append( + utils.MakeSample( + f"{BENCHMARK_NAME}_{metric_suffix}", + value, + unit, + namespace, + extra, + ) + ) diff --git a/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_python_density_benchmark.py b/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_python_density_benchmark.py new file mode 100644 index 0000000000..7760f23ff7 --- /dev/null +++ b/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_python_density_benchmark.py @@ -0,0 +1,378 @@ +"""PKB Benchmark: GKE Agent Python Sandbox Density . + +Atomic single-point measurement of Python sandbox density on a +pre-provisioned GKE cluster with gVisor isolation. Measures Code Execution +Latency (CEL), Time To First Execution (TTFE), RSS memory growth, and +per-type latency breakdown (compute, syscall, import) at a given +concurrent session count. + +Workflow per session: + 1. Claim a pre-warmed sandbox pod from the SandboxWarmPool + 2. Upload and execute the benchmark script inside the gVisor sandbox + 3. Run `sample_warmup` iterations (results discarded - stabilizes caches) + 4. Run `sample_count` measured iterations (results recorded) + 5. Report TTFE, per-iteration CEL, RSS, and per-task-type breakdown + 6. 
Release the sandbox claim + +This benchmark is designed to be invoked repeatedly by an external sweep +controller that varies the density parameter across iterations to find +the saturation point. + +Usage: + python pkb.py --benchmarks=gke_python_density \\ + --k8s_python_density_concurrent_sandbox_count=16 \\ + --k8s_python_density_sample_count=20 \\ + --k8s_python_density_sample_warmup=0 \\ + --k8s_namespace=agentic \\ + --k8s_agent_api_url=http://localhost:8080 + +Samples emitted (per run): + - gke_python_density_orchestrator_cel_mean (ms) + - gke_python_density_orchestrator_cel_p50 (ms) + - gke_python_density_orchestrator_cel_p95 (ms) + - gke_python_density_orchestrator_cel_p99 (ms) + - gke_python_density_orchestrator_cel_min (ms) + - gke_python_density_orchestrator_cel_max (ms) + - gke_python_density_sandbox_total_cel_mean (ms) + - gke_python_density_sandbox_total_cel_p50 (ms) + - gke_python_density_sandbox_total_cel_p95 (ms) + - gke_python_density_sandbox_total_cel_p99 (ms) + - gke_python_density_sandbox_total_cel_min (ms) + - gke_python_density_sandbox_total_cel_max (ms) + - gke_python_density_sandbox_ttfe (ms) + - gke_python_density_sandbox_rss_start (MB) + - gke_python_density_sandbox_rss_end (MB) + - gke_python_density_sandbox_rss_growth (MB) + - gke_python_density_sandbox_compute_cel_mean (ms) + - gke_python_density_sandbox_syscall_cel_mean (ms) + - gke_python_density_sandbox_import_cel_mean (ms) + - gke_python_density_wall_time (seconds) +""" + +import logging +import time + +from absl import flags +from perfkitbenchmarker import configs +from perfkitbenchmarker.linux_benchmarks.kubernetes.agentic import ( + k8s_benchmark_utils as utils, +) +from perfkitbenchmarker.linux_benchmarks.kubernetes.agentic import ( + gke_deploy_utils as deploy_utils, +) + +FLAGS = flags.FLAGS + +BENCHMARK_NAME = "k8s_python_density" +BENCHMARK_CONFIG = """ +k8s_python_density: + description: > + Atomic single-point Python sandbox density measurement on a + pre-provisioned GKE 
cluster with gVisor isolation. +""" + +_WARMPOOL_NAME = "python-sandbox-warmpool" +_WARMPOOL_LABEL = "sandbox=python-sandbox-example" + +# --------------------------------------------------------------------------- +# Benchmark-specific flags +# --------------------------------------------------------------------------- + +flags.DEFINE_integer( + "k8s_python_density_concurrent_sandbox_count", + 1, + "Number of concurrent sandbox sessions to run.", +) + +flags.DEFINE_integer( + "k8s_python_density_sample_count", + 20, + "Number of sample iterations per sandbox session.", +) + +flags.DEFINE_integer( + "k8s_python_density_sample_warmup", + 0, + "Number of warmup iterations per session (excluded from stats). " + "Warmup iterations execute the same benchmark tasks as measured " + "iterations but their latency results are discarded. This allows " + "JIT compilation, caches, and gVisor page faults to stabilize " + "before measurement begins.", +) + +flags.DEFINE_bool( + "k8s_python_density_patch_warmpool", + True, + "Patch SandboxWarmPool replicas to match density before measurement.", +) + +flags.DEFINE_integer( + "k8s_python_density_exec_timeout", + 600, + "Timeout in seconds for the API call.", +) + + +# --------------------------------------------------------------------------- +# Lifecycle +# --------------------------------------------------------------------------- + + +def GetConfig(user_config): + """Load and return benchmark config. + + No vm_groups — PKB skips Provision() and Teardown(). 
+ """ + return configs.LoadConfig(BENCHMARK_CONFIG, user_config, BENCHMARK_NAME) + + +def Prepare(benchmark_spec): + """Deploy workloads and verify agent API.""" + benchmark_spec.always_call_cleanup = True + logging.info("=== Prepare: deploying workloads ===") + deploy_utils.DeployWorkloads(benchmark_spec) + utils.CheckAgentHealthz(required=False) + utils.EnsurePortForward() + logging.info("Prepare complete.") + + +def Run(benchmark_spec): + """Execute a single density measurement and return samples. + + Returns: + List of sample.Sample objects. + """ + utils.set_benchmark_spec(benchmark_spec) + + ns = FLAGS.k8s_namespace + density = FLAGS.k8s_python_density_concurrent_sandbox_count + + logging.info("=== Run: density=%d ===", density) + + # Ensure port-forward is active (needed when sweeps skip Prepare) + utils.EnsurePortForward() + + # Patch warm pool to match density (moved from Prepare for sweep compatibility) + if FLAGS.k8s_python_density_patch_warmpool: + utils.PatchWarmPool( + namespace=ns, + warmpool_name=_WARMPOOL_NAME, + replicas=density, + label=_WARMPOOL_LABEL, + ) + + # POST to agent API + payload = { + "sample_count": FLAGS.k8s_python_density_sample_count, + "sample_warmup": FLAGS.k8s_python_density_sample_warmup, + "concurrent_sessions": density, + "sandbox_exec_timeout_s": FLAGS.k8s_python_density_exec_timeout, + } + + t0 = time.time() + result = utils.CallAgentApi("/benchmark/python/density", payload) + wall_time = time.time() - t0 + + successful = result.get("successful_sessions", 0) + failed = result.get("failed_sessions", 0) + agg = result.get("aggregate", {}) + + logging.info( + "API response: %d successful, %d failed sessions (%.1fs)", + successful, + failed, + wall_time, + ) + + # Build samples + extra = { + "density": density, + "successful_sessions": successful, + "failed_sessions": failed, + "sample_count": FLAGS.k8s_python_density_sample_count, + "sample_warmup": FLAGS.k8s_python_density_sample_warmup, + "wall_time_s": round(wall_time, 2), 
+ } + + samples = [] + + # Orchestrator-side CEL + _emit( + samples, + agg, + "orchestrator_cel_mean_ms", + "orchestrator_cel_mean", + "ms", + ns, + extra, + ) + _emit( + samples, agg, "orchestrator_cel_p50_ms", "orchestrator_cel_p50", "ms", ns, extra + ) + _emit( + samples, agg, "orchestrator_cel_p95_ms", "orchestrator_cel_p95", "ms", ns, extra + ) + _emit( + samples, agg, "orchestrator_cel_p99_ms", "orchestrator_cel_p99", "ms", ns, extra + ) + _emit( + samples, agg, "orchestrator_cel_min_ms", "orchestrator_cel_min", "ms", ns, extra + ) + _emit( + samples, agg, "orchestrator_cel_max_ms", "orchestrator_cel_max", "ms", ns, extra + ) + + # Sandbox-side total CEL + _emit( + samples, + agg, + "sandbox_total_cel_mean_ms", + "sandbox_total_cel_mean", + "ms", + ns, + extra, + ) + _emit( + samples, + agg, + "sandbox_total_cel_p50_ms", + "sandbox_total_cel_p50", + "ms", + ns, + extra, + ) + _emit( + samples, + agg, + "sandbox_total_cel_p95_ms", + "sandbox_total_cel_p95", + "ms", + ns, + extra, + ) + _emit( + samples, + agg, + "sandbox_total_cel_p99_ms", + "sandbox_total_cel_p99", + "ms", + ns, + extra, + ) + _emit( + samples, + agg, + "sandbox_total_cel_min_ms", + "sandbox_total_cel_min", + "ms", + ns, + extra, + ) + _emit( + samples, + agg, + "sandbox_total_cel_max_ms", + "sandbox_total_cel_max", + "ms", + ns, + extra, + ) + + # TTFE + _emit(samples, agg, "sandbox_ttfe_ms", "sandbox_ttfe", "ms", ns, extra) + + # RSS + _emit(samples, agg, "sandbox_rss_start_mb", "sandbox_rss_start", "MB", ns, extra) + _emit(samples, agg, "sandbox_rss_end_mb", "sandbox_rss_end", "MB", ns, extra) + _emit(samples, agg, "sandbox_rss_growth_mb", "sandbox_rss_growth", "MB", ns, extra) + + # Per-type CEL breakdown + _emit( + samples, + agg, + "sandbox_compute_cel_mean_ms", + "sandbox_compute_cel_mean", + "ms", + ns, + extra, + ) + _emit( + samples, + agg, + "sandbox_syscall_cel_mean_ms", + "sandbox_syscall_cel_mean", + "ms", + ns, + extra, + ) + _emit( + samples, + agg, + 
"sandbox_import_cel_mean_ms", + "sandbox_import_cel_mean", + "ms", + ns, + extra, + ) + + # Wall time + samples.append( + utils.MakeSample( + f"{BENCHMARK_NAME}_wall_time", + round(wall_time, 2), + "seconds", + ns, + extra, + ) + ) + + logging.info("Emitted %d samples for density=%d.", len(samples), density) + return samples + + +def Cleanup(benchmark_spec): + """Clean up after measurement. Scale warm pool to 0.""" + ns = FLAGS.k8s_namespace + logging.info("Cleanup: draining warm pool.") + + if FLAGS.k8s_python_density_patch_warmpool: + utils.DrainWarmPool( + namespace=ns, + warmpool_name=_WARMPOOL_NAME, + label=_WARMPOOL_LABEL, + ) + + utils.StopPortForward() + logging.info("Cleanup complete (cluster persists).") + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _emit(samples, agg, agg_key, metric_suffix, unit, namespace, extra): + """Emit a sample if the key exists in the aggregate dict. + + Args: + samples: List to append the new sample.Sample to. + agg: Aggregate metrics dict returned by the agent API response. + agg_key: Key to look up in `agg` (e.g. "orchestrator_cel_mean_ms"). + metric_suffix: Suffix appended to BENCHMARK_NAME to form the metric + name (e.g. "orchestrator_cel_mean"). + unit: Unit string for the sample (e.g. "ms", "MB", "seconds"). + namespace: Kubernetes namespace (included in sample metadata). + extra: Dict of additional metadata key-value pairs attached to + every sample (density, session counts, wall time, etc.). 
+ """ + value = agg.get(agg_key) + if value is not None: + samples.append( + utils.MakeSample( + f"{BENCHMARK_NAME}_{metric_suffix}", + value, + unit, + namespace, + extra, + ) + ) diff --git a/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_qps_benchmark.py b/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_qps_benchmark.py new file mode 100644 index 0000000000..feb82c8614 --- /dev/null +++ b/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_qps_benchmark.py @@ -0,0 +1,805 @@ +"""PKB Benchmark: GKE Agent QPS Saturation . + +Atomic single-point measurement of scheduling throughput on a pre-provisioned +GKE cluster. Fires sandbox claim requests at a controlled QPS rate for a +fixed duration and measures per-request TTFE (Time To First Execution). + +Supports two operating modes: + - **agent**: POST to the orchestrator /benchmark/python/qps endpoint + - **raw_claim**: Bypass the agent, create SandboxClaims directly via kubectl + +This benchmark is designed to be invoked repeatedly by an external sweep +controller that varies the target_qps parameter across iterations to find +the QPS saturation point. 
+ +Usage: + # Agent mode + python pkb.py --benchmarks=gke_qps \\ + --k8s_qps_target_qps=5.0 \\ + --k8s_qps_pool_size=70 \\ + --k8s_qps_step_duration_s=30.0 \\ + --k8s_qps_mode=agent \\ + --k8s_namespace=agentic \\ + --k8s_agent_api_url=http://localhost:8080 + + # Raw claim mode + python pkb.py --benchmarks=gke_qps \\ + --k8s_qps_target_qps=5.0 \\ + --k8s_qps_pool_size=70 \\ + --k8s_qps_step_duration_s=30.0 \\ + --k8s_qps_mode=raw_claim \\ + --k8s_qps_claim_timeout_s=60.0 \\ + --k8s_namespace=agentic + +Samples emitted (per run): + - gke_qps_ttfe_mean (ms) + - gke_qps_ttfe_p50 (ms) + - gke_qps_ttfe_p95 (ms) + - gke_qps_ttfe_p99 (ms) + - gke_qps_ttfe_min (ms) + - gke_qps_ttfe_max (ms) + - gke_qps_claim_mean (ms) + - gke_qps_claim_p95 (ms) + - gke_qps_actual_qps (requests/sec) + - gke_qps_duration (seconds) + - gke_qps_total_requests (count) + - gke_qps_successful_requests (count) + - gke_qps_failed_requests (count) + - gke_qps_pool_before (count) + - gke_qps_pool_after (count) + - gke_qps_wall_time (seconds) +""" + +import json +import os +import logging +import threading +import time +import uuid + +from absl import flags +from perfkitbenchmarker import configs +from perfkitbenchmarker import data +from perfkitbenchmarker.resources.container_service import kubectl +from perfkitbenchmarker.linux_benchmarks.kubernetes.agentic import ( + k8s_benchmark_utils as utils, +) +from perfkitbenchmarker.linux_benchmarks.kubernetes.agentic import ( + gke_deploy_utils as deploy_utils, +) + +FLAGS = flags.FLAGS + +BENCHMARK_NAME = "k8s_qps" +BENCHMARK_CONFIG = """ +k8s_qps: + description: > + Atomic single-point QPS saturation measurement on a + pre-provisioned GKE cluster with gVisor isolation. 
+""" + +_WARMPOOL_NAME = "python-sandbox-warmpool" +_WARMPOOL_LABEL = "sandbox=python-sandbox-example" +_SANDBOX_TEMPLATE = "python-sandbox-template" +_QPS_CLAIM_LABEL = "created-by=pkb-qps-benchmark" + +# --------------------------------------------------------------------------- +# Benchmark-specific flags +# --------------------------------------------------------------------------- + +flags.DEFINE_float( + "k8s_qps_target_qps", + 5.0, + "Target requests per second (sandbox claims per second).", +) + +flags.DEFINE_integer( + "k8s_qps_pool_size", + 70, + "Warm pool size maintained during the measurement.", +) + +flags.DEFINE_float( + "k8s_qps_step_duration_s", + 30.0, + "Duration of the QPS burst in seconds.", +) + +flags.DEFINE_integer( + "k8s_qps_sandbox_exec_timeout_s", + 30, + "Sandbox command execution timeout in seconds.", +) + +flags.DEFINE_float( + "k8s_qps_provision_timeout_s", + 180.0, + "Max seconds to wait for pool pods to reach Running.", +) + +flags.DEFINE_string( + "k8s_qps_mode", + "agent", + "Operating mode: 'agent' (POST to orchestrator API) or " + "'raw_claim' (create SandboxClaims directly via kubectl).", +) + +flags.DEFINE_float( + "k8s_qps_claim_timeout_s", + 60.0, + "Max seconds to wait for a raw claim to bind " "(only used with mode=raw_claim).", +) + + +# --------------------------------------------------------------------------- +# Lifecycle +# --------------------------------------------------------------------------- + + +def GetConfig(user_config): + """Load and return benchmark config. + + No vm_groups — PKB skips Provision() and Teardown(). 
def _RunAgent(benchmark_spec):
    """Drive one QPS burst through the orchestrator HTTP API.

    Posts the target QPS and duration to ``/benchmark/python/qps``, then
    converts the JSON response, together with the Running warm-pool pod
    counts taken before and after the call, into samples.

    Args:
        benchmark_spec: The PKB benchmark spec (not read by this function).

    Returns:
        List of samples built with ``utils.MakeSample``.
    """
    namespace = FLAGS.k8s_namespace
    qps_goal = FLAGS.k8s_qps_target_qps
    warm_pool_size = FLAGS.k8s_qps_pool_size
    burst_seconds = FLAGS.k8s_qps_step_duration_s

    logging.info(
        "=== Run (agent): target_qps=%s, pool_size=%d, duration=%ss ===",
        qps_goal,
        warm_pool_size,
        burst_seconds,
    )

    # Sweeps may invoke Run without Prepare, so make sure the forward is up.
    utils.EnsurePortForward()

    running_before = utils.CountPods(namespace, _WARMPOOL_LABEL, phase="Running")

    request_body = {
        "target_qps": qps_goal,
        "duration_s": burst_seconds,
        "sandbox_exec_timeout_s": FLAGS.k8s_qps_sandbox_exec_timeout_s,
    }

    started = time.time()
    # Allow the API call the burst duration plus a 300 s margin.
    response = utils.CallAgentApi(
        "/benchmark/python/qps",
        request_body,
        timeout=int(burst_seconds + 300),
    )
    elapsed = time.time() - started

    running_after = utils.CountPods(namespace, _WARMPOOL_LABEL, phase="Running")

    # Pull the fields we report out of the response, defaulting when absent.
    aggregate = response.get("aggregate", {})
    ok_count = response.get("successful_requests", 0)
    failed_count = response.get("failed_requests", 0)
    request_count = response.get("total_requests", 0)
    measured_qps = response.get("actual_qps", 0)
    reported_duration = response.get("duration_s", 0)

    logging.info(
        "API response: actual_qps=%s, %d/%d requests ok (%.1fs)",
        measured_qps,
        ok_count,
        request_count,
        elapsed,
    )

    extra = {
        "target_qps": qps_goal,
        "pool_size": warm_pool_size,
        "step_duration_s": burst_seconds,
        "mode": "agent",
        "actual_qps": measured_qps,
        "total_requests": request_count,
        "successful_requests": ok_count,
        "failed_requests": failed_count,
        "pool_before": running_before,
        "pool_after": running_after,
        "wall_time_s": round(elapsed, 2),
    }

    samples = []

    # TTFE and claim latency stats; each is emitted only if the key is present.
    latency_stats = (
        "ttfe_mean",
        "ttfe_p50",
        "ttfe_p95",
        "ttfe_p99",
        "ttfe_min",
        "ttfe_max",
        "claim_mean",
        "claim_p95",
    )
    for stat in latency_stats:
        _emit(samples, aggregate, f"{stat}_ms", stat, "ms", namespace, extra)

    # Throughput, counts, pool state and wall time are always emitted.
    fixed_metrics = (
        ("actual_qps", measured_qps, "requests/sec"),
        ("duration", reported_duration, "seconds"),
        ("total_requests", float(request_count), "count"),
        ("successful_requests", float(ok_count), "count"),
        ("failed_requests", float(failed_count), "count"),
        ("pool_before", float(running_before), "count"),
        ("pool_after", float(running_after), "count"),
        ("wall_time", round(elapsed, 2), "seconds"),
    )
    for suffix, value, unit in fixed_metrics:
        samples.append(
            utils.MakeSample(
                f"{BENCHMARK_NAME}_{suffix}", value, unit, namespace, extra
            )
        )

    logging.info("Emitted %d samples for target_qps=%s.", len(samples), qps_goal)
    return samples
step_duration, + ) + + # Record pool state before burst + pool_before = utils.CountPods(ns, _WARMPOOL_LABEL, phase="Running") + + # Calculate total claims to fire + total_claims = max(1, int(target_qps * step_duration)) + interval = 1.0 / target_qps if target_qps > 0 else 1.0 + + logging.info( + "Firing %d raw SandboxClaims at %s req/s", + total_claims, + target_qps, + ) + + # Fire claims at target QPS in parallel threads + claim_results = [] + lock = threading.Lock() + + def _fire_and_wait(idx, fire_time): + claim_name = f"pkb-qps-0-{idx}-{uuid.uuid4().hex[:6]}" + result = {"request_id": idx, "fire_time_s": round(fire_time, 3)} + try: + t_create = _CreateClaim(ns, _SANDBOX_TEMPLATE, claim_name) + result["create_ts"] = t_create + t_bound = _WaitClaimBound(ns, claim_name, claim_timeout) + if t_bound is not None: + ttfe_ms = (t_bound - t_create) * 1000.0 + result["ttfe_ms"] = round(ttfe_ms, 3) + result["claim_ms"] = round(ttfe_ms, 3) + result["error"] = None + else: + result["ttfe_ms"] = None + result["error"] = "Timeout waiting for claim to bind" + except Exception as e: + result["ttfe_ms"] = None + result["error"] = f"{type(e).__name__}: {e}" + with lock: + claim_results.append(result) + + t0 = time.time() + threads = [] + for i in range(total_claims): + fire_time = time.time() - t0 + t = threading.Thread(target=_fire_and_wait, args=(i, fire_time), daemon=True) + threads.append(t) + t.start() + if i < total_claims - 1: + next_fire = t0 + (i + 1) * interval + sleep_time = next_fire - time.time() + if sleep_time > 0: + time.sleep(sleep_time) + + for t in threads: + t.join(timeout=claim_timeout + 30) + + wall_time = time.time() - t0 + actual_qps = round(total_claims / wall_time, 2) if wall_time > 0 else 0 + + # Record pool state after burst + pool_after = utils.CountPods(ns, _WARMPOOL_LABEL, phase="Running") + + # Aggregate results + successful = [r for r in claim_results if r.get("ttfe_ms") is not None] + failed = [r for r in claim_results if r.get("error")] + 
ttfe_values = sorted(r["ttfe_ms"] for r in successful) + + logging.info( + "Raw claim burst complete: %d/%d ok, actual_qps=%s (%.1fs)", + len(successful), + total_claims, + actual_qps, + wall_time, + ) + + # Build samples + extra = { + "target_qps": target_qps, + "pool_size": pool_size, + "step_duration_s": step_duration, + "mode": "raw_claim", + "actual_qps": actual_qps, + "total_requests": total_claims, + "successful_requests": len(successful), + "failed_requests": len(failed), + "pool_before": pool_before, + "pool_after": pool_after, + "wall_time_s": round(wall_time, 2), + } + + samples = [] + + # TTFE latency stats (computed from raw claim results) + if ttfe_values: + n = len(ttfe_values) + samples.append( + utils.MakeSample( + f"{BENCHMARK_NAME}_ttfe_mean", + round(sum(ttfe_values) / n, 3), + "ms", + ns, + extra, + ) + ) + samples.append( + utils.MakeSample( + f"{BENCHMARK_NAME}_ttfe_p50", + round(_percentile(ttfe_values, 50), 3), + "ms", + ns, + extra, + ) + ) + samples.append( + utils.MakeSample( + f"{BENCHMARK_NAME}_ttfe_p95", + round(_percentile(ttfe_values, 95), 3), + "ms", + ns, + extra, + ) + ) + samples.append( + utils.MakeSample( + f"{BENCHMARK_NAME}_ttfe_p99", + round(_percentile(ttfe_values, 99), 3), + "ms", + ns, + extra, + ) + ) + samples.append( + utils.MakeSample( + f"{BENCHMARK_NAME}_ttfe_min", + round(ttfe_values[0], 3), + "ms", + ns, + extra, + ) + ) + samples.append( + utils.MakeSample( + f"{BENCHMARK_NAME}_ttfe_max", + round(ttfe_values[-1], 3), + "ms", + ns, + extra, + ) + ) + + # Claim latency (same as TTFE in raw_claim mode) + samples.append( + utils.MakeSample( + f"{BENCHMARK_NAME}_claim_mean", + round(sum(ttfe_values) / n, 3), + "ms", + ns, + extra, + ) + ) + samples.append( + utils.MakeSample( + f"{BENCHMARK_NAME}_claim_p95", + round(_percentile(ttfe_values, 95), 3), + "ms", + ns, + extra, + ) + ) + + # Throughput and counts + samples.append( + utils.MakeSample( + f"{BENCHMARK_NAME}_actual_qps", + actual_qps, + "requests/sec", + ns, + 
def _CreateClaim(namespace, template, claim_name):
    """Apply one SandboxClaim manifest with kubectl.

    The manifest is written to a per-claim JSON file under the
    ``k8s_agents/manifests/tmp`` resource directory, applied, and the file
    is removed again whether or not the apply succeeded.

    Args:
        namespace: Namespace placed in the claim metadata.
        template: Name used for ``spec.sandboxTemplateRef``.
        claim_name: Name of the claim; also part of the temp file name.

    Returns:
        ``time.time()`` taken after kubectl returned and the temp file was
        removed.

    Raises:
        RuntimeError: If kubectl exits with a non-zero code.
    """
    claim = {
        "apiVersion": "extensions.agents.x-k8s.io/v1alpha1",
        "kind": "SandboxClaim",
        "metadata": {
            "name": claim_name,
            "namespace": namespace,
            "labels": {"created-by": "pkb-qps-benchmark"},
        },
        "spec": {
            "sandboxTemplateRef": {"name": template},
        },
    }
    payload = json.dumps(claim)

    scratch_dir = os.path.join(data.ResourcePath("k8s_agents/manifests"), "tmp")
    os.makedirs(scratch_dir, exist_ok=True)
    manifest_file = os.path.join(scratch_dir, f"qps-claim-{claim_name}.json")

    try:
        with open(manifest_file, "w") as handle:
            handle.write(payload)
        _, err_text, exit_code = kubectl.RunKubectlCommand(
            ["apply", "-f", manifest_file],
            timeout=30,
            raise_on_failure=False,
        )
    finally:
        # Never leave the per-claim manifest behind.
        if os.path.isfile(manifest_file):
            os.unlink(manifest_file)

    applied_at = time.time()
    if exit_code == 0:
        return applied_at
    raise RuntimeError(
        f"Failed to create claim {claim_name}: {err_text.strip()}"
    )
return count + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _percentile(sorted_values, pct): + """Calculate percentile (0-100) with linear interpolation.""" + if not sorted_values: + return 0.0 + idx = (pct / 100) * (len(sorted_values) - 1) + lo = int(idx) + hi = min(lo + 1, len(sorted_values) - 1) + frac = idx - lo + return sorted_values[lo] * (1 - frac) + sorted_values[hi] * frac + + +def _emit(samples, data, data_key, metric_suffix, unit, namespace, extra): + """Emit a sample if the key exists in the data dict.""" + value = data.get(data_key) + if value is not None: + samples.append( + utils.MakeSample( + f"{BENCHMARK_NAME}_{metric_suffix}", + value, + unit, + namespace, + extra, + ) + ) diff --git a/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_snapshot_benchmark.py b/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_snapshot_benchmark.py new file mode 100644 index 0000000000..8d78c6649b --- /dev/null +++ b/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_snapshot_benchmark.py @@ -0,0 +1,1037 @@ +"""PKB Benchmark: GKE Agent Pod Snapshot Saturation . + +Atomic single-point measurement of GKE Pod Snapshot create/restore latency +on a pre-provisioned GKE cluster with gVisor isolation. Measures snapshot +time, restore time, TTFE (Time To First Execution), and restore correctness +at a given preload_mb and burst_size. + +This benchmark is designed to be invoked repeatedly by an external sweep +controller that varies the preload_mb parameter across iterations to find +the saturation point. 
+ +Usage: + python pkb.py --benchmarks=k8s_snapshot \\ + --k8s_snapshot_preload_mb=50 \\ + --k8s_snapshot_burst_size=3 \\ + --k8s_namespace=agentic \\ + --k8s_snapshot_skip_snapshot=false + +Samples emitted (per run): + - k8s_snapshot_snapshot_p50 (seconds) + - k8s_snapshot_snapshot_p95 (seconds) + - k8s_snapshot_snapshot_max (seconds) + - k8s_snapshot_restore_p50 (seconds) + - k8s_snapshot_restore_p95 (seconds) + - k8s_snapshot_restore_max (seconds) + - k8s_snapshot_ttfe_p50 (seconds) + - k8s_snapshot_ttfe_p95 (seconds) + - k8s_snapshot_ttfe_max (seconds) + - k8s_snapshot_startup_time (seconds) + - k8s_snapshot_restore_correct_count (count) + - k8s_snapshot_wall_time (seconds) +""" + +import json +import logging +import os +import re +import time +from concurrent.futures import ThreadPoolExecutor + +from jinja2 import Template + +from absl import flags +from perfkitbenchmarker import configs +from perfkitbenchmarker import data +from perfkitbenchmarker.resources.container_service import kubectl +from perfkitbenchmarker import sample +from perfkitbenchmarker.linux_benchmarks.kubernetes.agentic import ( + k8s_benchmark_utils as utils, +) +from perfkitbenchmarker.linux_benchmarks.kubernetes.agentic import ( + gke_deploy_utils as deploy_utils, +) + +FLAGS = flags.FLAGS + +BENCHMARK_NAME = "k8s_snapshot" +BENCHMARK_CONFIG = """ +k8s_snapshot: + description: > + Atomic single-point Pod Snapshot saturation measurement on a + pre-provisioned GKE cluster with gVisor isolation. 
def Prepare(benchmark_spec):
    """Deploy workloads, snapshot infra, and validate readiness.

    Deploys the Agent Sandbox workloads, deploys the Pod Snapshot
    infrastructure when the spec carries a GCP container cluster and
    ``--skip_deploy_snapshots`` is not set, then checks that the objects the
    benchmark relies on are present before starting the port-forward.

    Args:
        benchmark_spec: The PKB benchmark spec. ``always_call_cleanup`` is
            set on it, and its optional ``container_cluster`` attribute
            decides whether snapshot infrastructure is deployed.

    Raises:
        RuntimeError: If the PodSnapshotStorageConfig query fails, the
            snapshot service account is missing, or the template file does
            not exist.
    """
    benchmark_spec.always_call_cleanup = True
    ns = FLAGS.k8s_namespace
    preload_mb = FLAGS.k8s_snapshot_preload_mb

    logging.info(
        "=== Prepare: preload_mb=%d, burst_size=%d ===",
        preload_mb,
        FLAGS.k8s_snapshot_burst_size,
    )

    # Deploy Agent Sandbox ecosystem (idempotent)
    deploy_utils.DeployWorkloads(benchmark_spec)

    # Deploy Pod Snapshot infrastructure (idempotent).
    # Pod Snapshots are GKE-specific, so deployment is only attempted on a
    # confirmed GCP cluster. On pre-existing clusters
    # benchmark_spec.container_cluster may be None; in that case nothing is
    # deployed and the checks below decide whether the run can proceed.
    cluster = getattr(benchmark_spec, "container_cluster", None)
    if not cluster:
        # The previous message claimed --skip_deploy_snapshots=False would
        # force deployment here; no flag value reaches DeploySnapshots()
        # without a cluster, so the hint was dropped.
        logging.info(
            "Pod Snapshot infrastructure deployment skipped (no "
            "container_cluster in benchmark_spec); it must already be "
            "installed on the cluster."
        )
    elif getattr(cluster, "cloud", None) != "GCP":
        logging.info(
            "Pod Snapshot infrastructure skipped (cloud=%s, GKE required).",
            getattr(cluster, "cloud", "unknown"),
        )
    elif FLAGS.skip_deploy_snapshots:
        # Previously this case was skipped without any log line.
        logging.info(
            "Pod Snapshot infrastructure deployment skipped "
            "(--skip_deploy_snapshots is set)."
        )
    else:
        deploy_utils.DeploySnapshots()

    # 1. Verify PodSnapshotStorageConfig exists (cluster-scoped).
    _, _, retcode = utils.RunKubectl(
        ["get", "podsnapshotstorageconfigs.podsnapshot.gke.io", "--no-headers"],
        timeout=30,
        raise_on_failure=False,
    )
    if retcode != 0:
        raise RuntimeError(
            "PodSnapshotStorageConfig CRD not found. "
            "Ensure pod snapshots are enabled on the cluster."
        )
    logging.info("PodSnapshotStorageConfig verified.")

    # 2. Verify PodSnapshotPolicy exists in the namespace (warning only).
    _, _, retcode = utils.RunKubectl(
        ["get", "podsnapshotpolicies.podsnapshot.gke.io", "-n", ns, "--no-headers"],
        timeout=30,
        raise_on_failure=False,
    )
    if retcode != 0:
        logging.warning("PodSnapshotPolicy not found in namespace %s.", ns)

    # 3. Verify the service account exists.
    ksa = FLAGS.k8s_snapshot_ksa_name
    _, _, retcode = utils.RunKubectl(
        ["get", "serviceaccount", ksa, "-n", ns],
        timeout=30,
        raise_on_failure=False,
    )
    if retcode != 0:
        raise RuntimeError(
            f"ServiceAccount {ksa} not found in namespace {ns}. "
            "Run setup_snapshot_gke.sh or ensure DeploySnapshots() succeeded."
        )
    logging.info("ServiceAccount %s verified.", ksa)

    # 4. Verify the template file exists.
    template_path = _GetTemplatePath()
    if not os.path.isfile(template_path):
        raise RuntimeError(f"Snapshot template not found: {template_path}")
    logging.info("Template file verified: %s", template_path)

    utils.EnsurePortForward()
    logging.info("Prepare complete.")
def _RunSnapshotCycle(
    namespace,
    preload_mb,
    burst_size,
    skip_snapshot,
    preload_mode,
    ksa_name,
    pod_timeout,
    template_path,
):
    """Execute one full snapshot/restore cycle and return a result dict.

    Steps: create a step-specific SandboxTemplate, create ``burst_size``
    source claims and wait for each pod to be Running and preloaded, trigger
    one snapshot per source, create the restore claims, measure restore time
    and TTFE, and aggregate. Step resources are cleaned up in a ``finally``
    block on every path.

    Exceptions raised during the cycle are not propagated: they are logged
    and their message is stored in ``result["error"]``.

    Args:
        namespace: Kubernetes namespace used for every resource.
        preload_mb: Preload size in MB; also part of the template name and
            of the memory figure that is logged.
        burst_size: Number of concurrent source/trigger/restore objects.
        skip_snapshot: When true, stop after the source phase and report
            cold-start TTFE only.
        preload_mode: Passed to template rendering and source measurement.
        ksa_name: Service account name passed to template rendering.
        pod_timeout: Timeout in seconds passed to the measurement helpers.
        template_path: Path of the SandboxTemplate file to render.

    Returns:
        Dict with the p50/p95/max values for snapshot, restore and TTFE,
        ``startup_time_s``, ``snapshot_counter``, ``restore_correct_count``,
        the per-pod ``burst_results`` and ``error`` (``None`` on success).
    """
    step_template = f"snap-bench-{preload_mb}mb"
    source_names = [f"snap-src-0-{i}" for i in range(burst_size)]
    restore_names = [f"snap-restore-0-{i}" for i in range(burst_size)]
    trigger_names = [f"snap-trigger-0-{i}" for i in range(burst_size)]

    result = {
        "preload_mb": preload_mb,
        "burst_size": burst_size,
        "snapshot_p50_s": None,
        "snapshot_p95_s": None,
        "snapshot_max_s": None,
        "restore_p50_s": None,
        "restore_p95_s": None,
        "restore_max_s": None,
        "ttfe_p50_s": None,
        "ttfe_p95_s": None,
        "ttfe_max_s": None,
        "startup_time_s": None,
        "snapshot_counter": None,
        "restore_correct_count": 0,
        "burst_results": [],
        "error": None,
    }

    def _store_stats(prefix, values):
        """Write p50/p95/max of ``values`` into ``result``; no-op if empty."""
        # NOTE(review): _Percentile is defined outside this block; values are
        # passed in the same (unsorted) order as before. Confirm that it
        # sorts its input.
        if values:
            result[f"{prefix}_p50_s"] = round(_Percentile(values, 50), 3)
            result[f"{prefix}_p95_s"] = round(_Percentile(values, 95), 3)
            result[f"{prefix}_max_s"] = round(max(values), 3)

    try:
        # 1. Create step-specific SandboxTemplate
        logging.info(
            "Creating SandboxTemplate '%s' (PRELOAD_MB=%d, memory=%dMi)",
            step_template,
            preload_mb,
            max(512, preload_mb + 256),
        )
        if not _RenderAndApplyTemplate(
            template_path,
            step_template,
            namespace,
            ksa_name,
            preload_mb,
            preload_mode,
        ):
            raise RuntimeError("Failed to create SandboxTemplate")

        time.sleep(2)

        # 2. Create source claims and wait for Running + preload
        logging.info("Creating %d source SandboxClaim(s)", burst_size)
        t0_sources = time.time()
        workers = min(burst_size, 50)
        with ThreadPoolExecutor(max_workers=workers) as pool:
            create_futs = [
                pool.submit(_ApplyClaim, sname, namespace, step_template)
                for sname in source_names
            ]
            # Retrieve every future so a failed kubectl apply surfaces now
            # with its real message, instead of being dropped and showing up
            # pod_timeout seconds later as "did not reach Running". This
            # matches how the restore claims are created below.
            for fut in create_futs:
                fut.result()

        logging.info("Waiting for %d source pod(s) Running + preload", burst_size)
        with ThreadPoolExecutor(max_workers=workers) as pool:
            source_futs = [
                pool.submit(
                    _MeasureSingleSource,
                    sname,
                    namespace,
                    t0_sources,
                    pod_timeout,
                    preload_mode,
                )
                for sname in source_names
            ]
            source_results = [f.result() for f in source_futs]

        src_failed = [r for r in source_results if r.get("error")]
        if src_failed:
            fail_msgs = "; ".join(f"{r['pod']}: {r['error']}" for r in src_failed)
            raise RuntimeError(
                f"{len(src_failed)}/{burst_size} source pod(s) failed: {fail_msgs}"
            )

        startup_times = [
            r["startup_time_s"]
            for r in source_results
            if r["startup_time_s"] is not None
        ]
        result["startup_time_s"] = (
            round(_Percentile(startup_times, 50), 3) if startup_times else None
        )

        # The smallest counter seen across sources is handed to the restore
        # measurement below.
        snapshot_counters = {r["pod"]: r["snapshot_counter"] for r in source_results}
        min_counter = min(
            (c for c in snapshot_counters.values() if c is not None), default=None
        )
        result["snapshot_counter"] = min_counter
        logging.info("%d source pod(s) ready. Min counter: %s", burst_size, min_counter)

        # --skip_snapshot: measure cold-start TTFE only
        if skip_snapshot:
            logging.info("skip_snapshot mode: measuring cold-start TTFE")
            ttfe_times = []
            burst_results = []
            for sname, src in zip(source_names, source_results):
                startup = src["startup_time_s"]
                counter = src["snapshot_counter"]
                preload_done = src.get("preload_complete_time_s")
                # Prefer the preload-complete time; fall back to startup.
                ttfe_s = preload_done if preload_done else startup
                ttfe_times.append(ttfe_s)
                burst_results.append(
                    {
                        "pod": sname,
                        "source_pod": sname,
                        "startup_time_s": startup,
                        "snapshot_counter": None,
                        "snapshot_time_s": None,
                        "restore_time_s": None,
                        "ttfe_s": ttfe_s,
                        "restore_counter": counter,
                        "restore_correct": True,
                        "error": None,
                    }
                )

            result["burst_results"] = burst_results
            result["restore_correct_count"] = burst_size
            _store_stats("ttfe", ttfe_times)

            # Skip to cleanup (the finally block still runs).
            return result

        # 3. Trigger snapshots concurrently
        logging.info("Triggering %d snapshot(s)", burst_size)
        t0_snap = time.time()
        with ThreadPoolExecutor(max_workers=workers) as pool:
            snap_futs = [
                pool.submit(
                    _TriggerAndWaitSnapshot,
                    tname,
                    sname,
                    namespace,
                    t0_snap,
                )
                for tname, sname in zip(trigger_names, source_names)
            ]
            snap_results = [f.result() for f in snap_futs]

        snap_failed = [r for r in snap_results if r.get("error")]
        snap_times = [
            r["snapshot_time_s"]
            for r in snap_results
            if r["snapshot_time_s"] is not None
        ]
        # Record the timings of the snapshots that did finish before failing
        # the step on the ones that did not.
        _store_stats("snapshot", snap_times)

        if snap_failed:
            fail_msgs = "; ".join(f"{r['trigger']}: {r['error']}" for r in snap_failed)
            raise RuntimeError(
                f"{len(snap_failed)}/{burst_size} snapshot(s) failed: {fail_msgs}"
            )

        # 4. Create restore claims concurrently
        logging.info("Creating %d restore SandboxClaim(s)", burst_size)
        t0_burst = time.time()
        with ThreadPoolExecutor(max_workers=workers) as pool:
            create_futs = [
                pool.submit(_ApplyClaim, rname, namespace, step_template)
                for rname in restore_names
            ]
            for f in create_futs:
                f.result()

        # 5. Poll restore pods for Running + TTFE
        logging.info("Measuring restore + TTFE across %d pod(s)", burst_size)
        with ThreadPoolExecutor(max_workers=workers) as pool:
            measure_futs = [
                pool.submit(
                    _MeasureSingleRestore,
                    rname,
                    namespace,
                    t0_burst,
                    min_counter,
                    pod_timeout,
                )
                for rname in restore_names
            ]
            burst_results = [f.result() for f in measure_futs]

        # Merge source + snapshot info into the per-restore records.
        for merged, sname, src, snap in zip(
            burst_results, source_names, source_results, snap_results
        ):
            merged["source_pod"] = sname
            merged["startup_time_s"] = src["startup_time_s"]
            merged["snapshot_counter"] = src["snapshot_counter"]
            merged["snapshot_time_s"] = snap["snapshot_time_s"]

        result["burst_results"] = burst_results

        # 6. Aggregate
        restore_times = [
            r["restore_time_s"]
            for r in burst_results
            if r["restore_time_s"] is not None
        ]
        ttfe_times = [r["ttfe_s"] for r in burst_results if r["ttfe_s"] is not None]
        correct_count = sum(1 for r in burst_results if r["restore_correct"])

        result["restore_correct_count"] = correct_count
        _store_stats("restore", restore_times)
        _store_stats("ttfe", ttfe_times)

        logging.info("Counter correct: %d/%d", correct_count, burst_size)

    except Exception as e:
        # Best-effort by design: report the failure in the result dict so the
        # caller can still emit samples for this step.
        result["error"] = str(e)
        logging.error("Snapshot cycle failed: %s", e)

    finally:
        # Cleanup
        logging.info("Cleaning up step resources")
        _CleanupStep(
            source_names,
            restore_names,
            trigger_names,
            step_template,
            namespace,
        )
        time.sleep(5)

    return result
def _ApplyClaim(name, namespace, template_name):
    """Create a SandboxClaim by applying a JSON manifest with kubectl.

    The manifest is written to a per-claim file under the
    ``k8s_agents/manifests/tmp`` resource directory and removed again after
    the apply, whether or not it succeeded.

    Args:
        name: Claim name; also part of the temp file name.
        namespace: Namespace placed in the claim metadata.
        template_name: Name used for ``spec.sandboxTemplateRef``.

    Raises:
        RuntimeError: If kubectl exits with a non-zero code.
    """
    claim = {
        "apiVersion": "extensions.agents.x-k8s.io/v1alpha1",
        "kind": "SandboxClaim",
        "metadata": {
            "name": name,
            "namespace": namespace,
            "labels": {"app": "snapshot-benchmark-workload"},
        },
        "spec": {"sandboxTemplateRef": {"name": template_name}},
    }
    payload = json.dumps(claim)

    scratch_dir = os.path.join(data.ResourcePath("k8s_agents/manifests"), "tmp")
    os.makedirs(scratch_dir, exist_ok=True)
    manifest_file = os.path.join(scratch_dir, f"snap-claim-{name}.json")

    try:
        with open(manifest_file, "w") as handle:
            handle.write(payload)
        _, err_text, exit_code = kubectl.RunKubectlCommand(
            ["apply", "-f", manifest_file],
            timeout=30,
            raise_on_failure=False,
        )
    finally:
        # Never leave the per-claim manifest behind.
        if os.path.isfile(manifest_file):
            os.unlink(manifest_file)

    if exit_code == 0:
        return
    raise RuntimeError(f"Failed to create SandboxClaim {name}: {err_text}")
raise_on_failure=False, + ) + finally: + if os.path.isfile(tmp_path): + os.unlink(tmp_path) + if retcode != 0: + logging.warning("kubectl apply stderr: %s", stderr) + return retcode == 0 + + +def _get_sandbox_node_selector(): + """Return the nodeSelector for sandbox pods.""" + return {"pkb_nodepool": "sandbox"} + + +def _get_sandbox_tolerations(): + """Return tolerations for sandbox pods.""" + return [ + { + "key": "sandbox.gke.io/runtime", + "operator": "Equal", + "value": "gvisor", + "effect": "NoSchedule", + }, + ] + + +def _RenderAndApplyScriptTemplate( + template_name, + namespace, + ksa_name, + preload_mb, + preload_mode, +): + """Render a SandboxTemplate that runs a user-provided startup script.""" + script_path = preload_mode.split(":", 1)[1] + if not os.path.isfile(script_path): + logging.error("Script not found: %s", script_path) + return False + + with open(script_path) as f: + user_script = f.read() + + memory_mi = max(512, preload_mb + 256) + + entrypoint = ( + "#!/bin/bash\n" + "set -e\n" + 'echo "Running startup script..."\n' + "# --- User script start ---\n" + f"{user_script}\n" + "# --- User script end ---\n" + 'echo "SCRIPT_READY"\n' + 'echo "Starting counter."\n' + "i=0\n" + "while true; do\n" + ' echo "Count: $i"\n' + " i=$((i + 1))\n" + " sleep 1\n" + "done\n" + ) + + manifest = json.dumps({ + "apiVersion": "extensions.agents.x-k8s.io/v1alpha1", + "kind": "SandboxTemplate", + "metadata": { + "name": template_name, + "namespace": namespace, + }, + "spec": { + "podTemplate": { + "metadata": { + "labels": {"app": "snapshot-benchmark-workload"}, + }, + "spec": { + "serviceAccountName": ksa_name, + "runtimeClassName": "gvisor", + "containers": [ + { + "name": "preloader", + "image": "python:3.11-slim", + "command": ["bash", "-c"], + "args": [entrypoint], + "env": [{"name": "PRELOAD_MB", "value": str(preload_mb)}], + "resources": { + "requests": { + "cpu": "250m", + "memory": f"{memory_mi}Mi", + "ephemeral-storage": "512Mi", + } + }, + } + ], + 
def _MeasureSingleSource(name, namespace, t0, pod_timeout, preload_mode):
    """Time a source pod's startup and preload, then sample its counter.

    Args:
        name: Pod name to poll.
        namespace: Namespace the pod lives in.
        t0: Epoch seconds all reported durations are measured from.
        pod_timeout: Seconds (from ``t0``) allowed for the pod to reach
            Running; the same budget is handed to ``_WaitForPreload``.
        preload_mode: Forwarded unchanged to ``_WaitForPreload``.

    Returns:
        Dict with keys ``pod``, ``startup_time_s``,
        ``preload_complete_time_s``, ``snapshot_counter`` and ``error``.
        ``error`` is set (and later fields stay ``None``) when a wait
        times out.
    """
    outcome = {
        "pod": name,
        "startup_time_s": None,
        "preload_complete_time_s": None,
        "snapshot_counter": None,
        "error": None,
    }

    # Phase 1: poll the pod phase once per second until it reports Running.
    running_deadline = t0 + pod_timeout
    reached_running = False
    while time.time() < running_deadline:
        phase, _, _ = utils.RunKubectl(
            ["get", "pod", name, "-n", namespace, "-o", "jsonpath={.status.phase}"],
            timeout=10,
            raise_on_failure=False,
        )
        if phase == "Running":
            outcome["startup_time_s"] = round(time.time() - t0, 3)
            reached_running = True
            break
        time.sleep(1)

    if not reached_running:
        outcome["error"] = f"Pod {name} did not reach Running within {pod_timeout}s"
        return outcome

    # Phase 2: wait for the workload to announce that preloading finished.
    preloaded = _WaitForPreload(name, namespace, pod_timeout, preload_mode)
    if not preloaded:
        outcome["error"] = f"Preload did not complete within {pod_timeout}s"
        return outcome

    outcome["preload_complete_time_s"] = round(time.time() - t0, 3)

    # Give the counter a few ticks before recording the reference value.
    time.sleep(3)
    outcome["snapshot_counter"] = _GetLastCounter(name, namespace)
    return outcome
def _GetLastCounter(name, namespace):
    """Return the most recent ``Count: N`` value found in a pod's logs.

    Only the last 10 log lines are fetched.

    Args:
        name: Pod name.
        namespace: Namespace the pod lives in.

    Returns:
        The last counter value as an int, or ``None`` when the kubectl call
        fails or no ``Count:`` line is present.
    """
    log_text, _, exit_code = utils.RunKubectl(
        ["logs", name, "-n", namespace, "--tail=10"],
        timeout=10,
        raise_on_failure=False,
    )
    if exit_code != 0:
        return None

    latest = None
    # Walk every match and keep the final one; matches come in log order.
    for hit in re.finditer(r"Count:\s*(\d+)", log_text):
        latest = hit.group(1)

    if latest is None:
        return None
    return int(latest)
def _MeasureSingleRestore(name, namespace, t0, snapshot_counter, pod_timeout):
    """Measure restore time and time-to-first-execution (TTFE) for one pod.

    Args:
        name: Name of the restored pod.
        namespace: Namespace the pod lives in.
        t0: Epoch seconds all reported durations are measured from.
        snapshot_counter: Counter value recorded on the source pod before
            the snapshot, or ``None`` when unknown.
        pod_timeout: Seconds (from ``t0``) allowed for both the Running wait
            and the first ``Count:`` line.

    Returns:
        Dict with keys ``pod``, ``restore_time_s``, ``ttfe_s``,
        ``restore_counter``, ``restore_correct`` and ``error``.
        ``restore_correct`` is True only when ``snapshot_counter`` is known
        and the first counter seen is at least that value.
    """
    outcome = {
        "pod": name,
        "restore_time_s": None,
        "ttfe_s": None,
        "restore_counter": None,
        "restore_correct": False,
        "error": None,
    }

    # Both waits share the same absolute cutoff, anchored at t0.
    cutoff = t0 + pod_timeout

    # Phase 1: wait for the pod phase to become Running.
    reached_running = False
    while time.time() < cutoff:
        phase, _, _ = utils.RunKubectl(
            ["get", "pod", name, "-n", namespace, "-o", "jsonpath={.status.phase}"],
            timeout=10,
            raise_on_failure=False,
        )
        if phase == "Running":
            outcome["restore_time_s"] = round(time.time() - t0, 3)
            reached_running = True
            break
        time.sleep(1)

    if not reached_running:
        outcome["error"] = f"Pod {name} did not reach Running within {pod_timeout}s"
        return outcome

    # Phase 2: wait for the first counter line to show up in the logs (TTFE).
    while time.time() < cutoff:
        log_text, _, exit_code = utils.RunKubectl(
            ["logs", name, "-n", namespace, "--tail=50"],
            timeout=10,
            raise_on_failure=False,
        )
        first_hit = None
        if exit_code == 0:
            # Earliest Count line within the fetched window.
            first_hit = re.search(r"Count:\s*(\d+)", log_text)
        if first_hit is not None:
            counter = int(first_hit.group(1))
            outcome["ttfe_s"] = round(time.time() - t0, 3)
            outcome["restore_counter"] = counter
            if snapshot_counter is not None and counter >= snapshot_counter:
                outcome["restore_correct"] = True
            return outcome
        time.sleep(1)

    outcome["error"] = f"Pod {name}: no Count output within timeout"
    return outcome
to_delete: + utils.RunKubectl( + ["delete", kind, name, "-n", namespace, "--ignore-not-found=true"], + timeout=60, + raise_on_failure=False, + ) + # Delete any PodSnapshot resources + utils.RunKubectl( + [ + "delete", + "podsnapshots.podsnapshot.gke.io", + "--all", + "-n", + namespace, + "--ignore-not-found=true", + ], + timeout=60, + raise_on_failure=False, + ) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _GetTemplatePath(): + """Return the absolute path to the snapshot SandboxTemplate template.""" + return os.path.join( + data.ResourcePath("k8s_agents/manifests"), + "snapshot-sandbox-template.yaml.j2", + ) + + +def _Percentile(values, pct): + """Calculate percentile (0-100) from a list of values.""" + if not values: + return 0.0 + s = sorted(values) + idx = (pct / 100) * (len(s) - 1) + lo = int(idx) + hi = min(lo + 1, len(s) - 1) + frac = idx - lo + return s[lo] * (1 - frac) + s[hi] * frac + + +def _emit(samples, data, data_key, metric_suffix, unit, namespace, extra): + """Emit a sample if the key exists in the data dict.""" + value = data.get(data_key) + if value is not None: + samples.append( + utils.MakeSample( + f"{BENCHMARK_NAME}_{metric_suffix}", + value, + unit, + namespace, + extra, + ) + ) diff --git a/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_warmpool_benchmark.py b/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_warmpool_benchmark.py new file mode 100644 index 0000000000..9024f9f28e --- /dev/null +++ b/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_warmpool_benchmark.py @@ -0,0 +1,426 @@ +"""PKB Benchmark: GKE Agent Warmpool Scale-Up (Use Case E). + +Atomic single-point measurement of warm pool provisioning speed on a +pre-provisioned GKE cluster. Measures how quickly N sandbox pods can be +provisioned from zero via the SandboxWarmPool controller. 
No agent API +is needed; this benchmark interacts directly with the Kubernetes API. + +This benchmark is designed to be invoked repeatedly by an external sweep +controller that varies the target_replicas parameter across iterations to +find the provisioning saturation point. + +Usage: + python pkb.py --benchmarks=gke_warmpool \ + --k8s_warmpool_target_replicas=100 \ + --k8s_warmpool_name=python-sandbox-warmpool \ + --k8s_warmpool_pod_label=sandbox=python-sandbox-example \ + --k8s_warmpool_ready_threshold_s=300 \ + --k8s_warmpool_poll_interval_s=2.0 \ + --k8s_warmpool_drain_timeout_s=300 \ + --k8s_namespace=agentic \ + --gke_machine_type=c4-standard-8 + +Samples emitted (per run): + - gke_warmpool_total_time_to_ready (seconds) + - gke_warmpool_refill_rate (pods/sec) + - gke_warmpool_drain_time (seconds) + - gke_warmpool_first_pod_running (seconds) + - gke_warmpool_final_running_count (count) + - gke_warmpool_final_pending_count (count) + - gke_warmpool_time_to_created_p50 (seconds) + - gke_warmpool_time_to_created_p95 (seconds) + - gke_warmpool_time_to_created_max (seconds) + - gke_warmpool_time_to_created_count (count) + - gke_warmpool_time_to_scheduled_p50 (seconds) + - gke_warmpool_time_to_scheduled_p95 (seconds) + - gke_warmpool_time_to_scheduled_max (seconds) + - gke_warmpool_time_to_scheduled_count (count) + - gke_warmpool_time_to_running_p50 (seconds) + - gke_warmpool_time_to_running_p95 (seconds) + - gke_warmpool_time_to_running_max (seconds) + - gke_warmpool_time_to_running_count (count) + - gke_warmpool_wall_time (seconds) +""" + +import json +import logging +import time + +from absl import flags +from datetime import datetime, timezone +from perfkitbenchmarker import configs +from perfkitbenchmarker.linux_benchmarks.kubernetes.agentic import ( + k8s_benchmark_utils as utils, +) +from perfkitbenchmarker.linux_benchmarks.kubernetes.agentic import ( + gke_deploy_utils as deploy_utils, +) + +FLAGS = flags.FLAGS + +BENCHMARK_NAME = "k8s_warmpool" 
def Run(benchmark_spec):
    """Drain the warm pool, scale it to the target size and time the ramp-up.

    Sequence: drain to zero (untimed), drain again while timing it, patch the
    SandboxWarmPool ``spec.replicas`` to the target, poll Running/Pending pod
    counts until the target is reached or the ready threshold expires, scrape
    per-pod lifecycle timestamps, then build the samples.

    Args:
        benchmark_spec: Benchmark specification; registered through
            ``utils.set_benchmark_spec`` before anything else runs.

    Returns:
        List of samples built with ``utils.MakeSample``.
    """
    utils.set_benchmark_spec(benchmark_spec)

    ns = FLAGS.k8s_namespace
    target = FLAGS.k8s_warmpool_target_replicas
    warmpool_name = FLAGS.k8s_warmpool_name
    label = FLAGS.k8s_warmpool_pod_label
    threshold_s = FLAGS.k8s_warmpool_ready_threshold_s
    poll_interval = FLAGS.k8s_warmpool_poll_interval_s
    drain_timeout = int(FLAGS.k8s_warmpool_drain_timeout_s)

    # Untimed drain so every invocation starts from an empty pool.
    utils.DrainWarmPool(ns, warmpool_name, label, timeout=drain_timeout)
    time.sleep(3)

    logging.info("=== Run: scaling %s to %d replicas ===", warmpool_name, target)

    wall_started = time.time()

    # Step 1: timed drain. The pool was just emptied above, so this mostly
    # confirms that nothing is left over.
    drain_started = time.time()
    utils.DrainWarmPool(ns, warmpool_name, label, timeout=drain_timeout)
    drain_time_s = round(time.time() - drain_started, 2)

    time.sleep(2)

    # Step 2: request the target replica count.
    logging.info("Patching %s replicas -> %d", warmpool_name, target)
    replicas_patch = json.dumps({"spec": {"replicas": target}})
    utils.RunKubectl(
        [
            "patch",
            "sandboxwarmpool",
            warmpool_name,
            "-n",
            ns,
            "--type=merge",
            f"-p={replicas_patch}",
        ]
    )

    # Step 3: poll until enough pods are Running or the threshold expires.
    scale_started = time.time()
    ready_deadline = scale_started + threshold_s
    first_pod_time = None

    while time.time() < ready_deadline:
        elapsed = time.time() - scale_started
        running = utils.CountPods(ns, label, "Running")
        pending = utils.CountPods(ns, label, "Pending")

        if first_pod_time is None and running > 0:
            first_pod_time = elapsed

        if target > 0:
            pct = running / target * 100
        else:
            pct = 0
        logging.info(
            "[%.1fs] Running: %d/%d (%.0f%%) Pending: %d",
            elapsed,
            running,
            target,
            pct,
            pending,
        )

        if running >= target:
            break

        time.sleep(poll_interval)

    total_time = round(time.time() - scale_started, 2)
    final_running = utils.CountPods(ns, label, "Running")
    final_pending = utils.CountPods(ns, label, "Pending")
    if total_time > 0:
        rate = round(final_running / total_time, 2)
    else:
        rate = 0

    logging.info(
        "Scale-up complete: %d/%d Running in %.1fs (%.1f pods/sec)",
        final_running,
        target,
        total_time,
        rate,
    )

    # Step 4: per-pod lifecycle timestamps relative to the scale request.
    lifecycle = _ScrapeLifecycle(ns, label, scale_started)

    wall_time = round(time.time() - wall_started, 2)

    # Step 5: assemble samples; every sample shares the same metadata dict.
    extra = {
        "target_replicas": target,
        "final_running_count": final_running,
        "final_pending_count": final_pending,
        "wall_time_s": wall_time,
    }

    def _metric(suffix, value, unit):
        return utils.MakeSample(
            f"{BENCHMARK_NAME}_{suffix}", value, unit, ns, extra
        )

    samples = [
        _metric("total_time_to_ready", total_time, "seconds"),
        _metric("refill_rate", rate, "pods/sec"),
        _metric("drain_time", drain_time_s, "seconds"),
    ]

    # Only reported when at least one pod was observed Running.
    if first_pod_time is not None:
        samples.append(
            _metric("first_pod_running", round(first_pod_time, 2), "seconds")
        )

    samples.append(_metric("final_running_count", float(final_running), "count"))
    samples.append(_metric("final_pending_count", float(final_pending), "count"))

    # Pod lifecycle percentiles (appended in place).
    _EmitLifecycleSamples(samples, lifecycle, ns, extra)

    samples.append(_metric("wall_time", wall_time, "seconds"))

    logging.info("Emitted %d samples for target_replicas=%d.", len(samples), target)
    return samples
def _EmitLifecycleSamples(samples, lifecycle, namespace, extra):
    """Append lifecycle percentile samples for created/scheduled/running.

    For each phase present in ``lifecycle`` this appends, in order, the
    ``p50``, ``p95`` and ``max`` statistics (unit ``seconds``) followed by
    the ``count`` (unit ``count``, converted to float). Statistics that are
    missing or ``None`` are skipped.

    Args:
        samples: List that new samples are appended to in place.
        lifecycle: Mapping of phase key (e.g. ``time_to_created_s``) to a
            dict of statistics; missing phases are treated as empty.
        namespace: Namespace forwarded to ``utils.MakeSample``.
        extra: Metadata dict forwarded to ``utils.MakeSample``.
    """
    phases = (
        ("time_to_created_s", "time_to_created"),
        ("time_to_scheduled_s", "time_to_scheduled"),
        ("time_to_running_s", "time_to_running"),
    )
    # (statistic key, unit, value converter), in emission order.
    stat_specs = (
        ("p50", "seconds", None),
        ("p95", "seconds", None),
        ("max", "seconds", None),
        ("count", "count", float),
    )

    for lifecycle_key, metric_base in phases:
        stats = lifecycle.get(lifecycle_key, {})
        for stat_key, unit, convert in stat_specs:
            raw = stats.get(stat_key)
            if raw is None:
                continue
            value = raw if convert is None else convert(raw)
            samples.append(
                utils.MakeSample(
                    f"{BENCHMARK_NAME}_{metric_base}_{stat_key}",
                    value,
                    unit,
                    namespace,
                    extra,
                )
            )
' + 'Example: --gke_additional_nodepool_flags=--max-pods-per-node=250', +) GCE_PERFORMANCE_MONITORING_UNIT = flags.DEFINE_enum( 'gce_performance_monitoring_unit', None, diff --git a/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py b/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py index f943a53ff1..06d4a295dc 100644 --- a/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py +++ b/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py @@ -102,14 +102,25 @@ def _Delete(self): ).Issue() def RemoteBuild(self, image: container.ContainerImage): - """Builds the image remotely.""" - if not gcp_flags.CONTAINER_REMOTE_BUILD_CONFIG.value: - full_tag = self.GetFullRegistryTag(image.name) + """Builds the image remotely. + + If --container_remote_build_config is set, uses it as the + --config argument to `gcloud builds submit` and passes the + image tag via --substitutions _IMAGE=. + Otherwise uses the simple --tag shorthand. + """ + full_tag = self.GetFullRegistryTag(image.name) + if gcp_flags.CONTAINER_REMOTE_BUILD_CONFIG.value: + build_cmd = util.GcloudCommand( + self, 'builds', 'submit', + '--config', gcp_flags.CONTAINER_REMOTE_BUILD_CONFIG.value, + '--substitutions', f'_IMAGE={full_tag}', + image.directory, + ) else: - full_tag = gcp_flags.CONTAINER_REMOTE_BUILD_CONFIG.value - build_cmd = util.GcloudCommand( - self, 'builds', 'submit', '--tag', full_tag, image.directory - ) + build_cmd = util.GcloudCommand( + self, 'builds', 'submit', '--tag', full_tag, image.directory, + ) build_cmd.Issue(timeout=None) @@ -417,6 +428,10 @@ def _Create(self): if self.enable_aam: cmd.args.append('--auto-monitoring-scope=ALL') + # --- PKB Extension: additional cluster create flags --- + for additional_flag in gcp_flags.GKE_ADDITIONAL_FLAGS.value: + cmd.args.append(additional_flag) + self._RunClusterCreateCommand(cmd) self._GetKubeconfig() self._CreateCustomComputeClass(self.default_nodepool) @@ -432,6 +447,10 @@ def _CreateNodePools(self): nodepool, cmd, 
) + # --- PKB Extension: additional node pool create flags --- + for additional_flag in gcp_flags.GKE_ADDITIONAL_NODEPOOL_FLAGS.value: + cmd.args.append(additional_flag) + self._IssueResourceCreationCommand(cmd) self._CreateCustomComputeClass(nodepool)