diff --git a/.gitignore b/.gitignore
index 1e1c6fe077..6f0c9cb603 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,5 @@
/.idea
/*git_ignore*
.DS_Store
+.adk
+tmp/
diff --git a/perfkitbenchmarker/data/docker/agentic/adk-agent/.dockerignore b/perfkitbenchmarker/data/docker/agentic/adk-agent/.dockerignore
new file mode 100644
index 0000000000..78cf8c8595
--- /dev/null
+++ b/perfkitbenchmarker/data/docker/agentic/adk-agent/.dockerignore
@@ -0,0 +1,165 @@
+
+### Python ###
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*,cover
+.hypothesis/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# dotenv
+.env
+
+# virtualenv
+.venv/
+venv/
+ENV/
+
+# Spyder project settings
+.spyderproject
+
+# Rope project settings
+.ropeproject
+
+
+### Linux ###
+*~
+
+# temporary files which can be created if a process still has a handle open of a deleted file
+.fuse_hidden*
+
+# KDE directory preferences
+.directory
+
+# Linux trash folder which might appear on any partition or disk
+.Trash-*
+
+# .nfs files are created when an open file is removed but is still being accessed
+.nfs*
+
+
+### OSX ###
+*.DS_Store
+.AppleDouble
+.LSOverride
+
+# Icon must end with two \r
+Icon
+# Thumbnails
+._*
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+.com.apple.timemachine.donotpresent
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+
+
+### Windows ###
+# Windows image file caches
+Thumbs.db
+ehthumbs.db
+
+# Folder config file
+Desktop.ini
+
+# Recycle Bin used on file shares
+$RECYCLE.BIN/
+
+# Windows Installer files
+*.cab
+*.msi
+*.msm
+*.msp
+
+# Windows shortcuts
+*.lnk
+
+
+### Vagrant ###
+.vagrant/
+### Local rules, see .gitignore.tail to override! ###
+shippable
+.git
+
+tmp/
+sessions.db
+.adk/
diff --git a/perfkitbenchmarker/data/docker/agentic/adk-agent/.gcloudignore b/perfkitbenchmarker/data/docker/agentic/adk-agent/.gcloudignore
new file mode 100644
index 0000000000..fb34b7833c
--- /dev/null
+++ b/perfkitbenchmarker/data/docker/agentic/adk-agent/.gcloudignore
@@ -0,0 +1,25 @@
+# This file tells gcloud builds submit which files to exclude from the upload.
+# Without it, gcloud ignores .dockerignore and uploads everything (including .venv).
+
+.git
+.venv/
+venv/
+ENV/
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+*.egg-info/
+*.egg
+dist/
+build/
+.tox/
+.cache/
+.coverage
+htmlcov/
+*.log
+.env
+.adk/
+sessions.db
+tmp/
+.DS_Store
diff --git a/perfkitbenchmarker/data/docker/agentic/adk-agent/Dockerfile b/perfkitbenchmarker/data/docker/agentic/adk-agent/Dockerfile
new file mode 100644
index 0000000000..417ad58946
--- /dev/null
+++ b/perfkitbenchmarker/data/docker/agentic/adk-agent/Dockerfile
@@ -0,0 +1,29 @@
+FROM python:3.13-slim
+WORKDIR /app
+
+# Install kubectl (required by k8s-agent-sandbox for port-forwarding to sandbox pods)
+# Uses `dpkg --print-architecture` to download the kubectl binary matching the image arch (amd64 or arm64)
+RUN apt-get update && \
+ apt-get install -y --no-install-recommends curl ca-certificates && \
+ ARCH=$(dpkg --print-architecture) && \
+ curl -LO "https://dl.k8s.io/release/$(curl -sL https://dl.k8s.io/release/stable.txt)/bin/linux/${ARCH}/kubectl" && \
+ install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl && \
+ rm kubectl && \
+ apt-get purge -y curl && \
+ apt-get autoremove -y && \
+ rm -rf /var/lib/apt/lists/*
+
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+RUN adduser --disabled-password --gecos "" myuser && \
+ chown -R myuser:myuser /app
+
+COPY . .
+
+USER myuser
+
+ENV PATH="/home/myuser/.local/bin:$PATH"
+
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8080"]
+
diff --git a/perfkitbenchmarker/data/docker/agentic/adk-agent/__init__.py b/perfkitbenchmarker/data/docker/agentic/adk-agent/__init__.py
new file mode 100644
index 0000000000..5271a8ef60
--- /dev/null
+++ b/perfkitbenchmarker/data/docker/agentic/adk-agent/__init__.py
@@ -0,0 +1 @@
+# ADK Agent package
diff --git a/perfkitbenchmarker/data/docker/agentic/adk-agent/cloudbuild-arm64.yaml b/perfkitbenchmarker/data/docker/agentic/adk-agent/cloudbuild-arm64.yaml
new file mode 100644
index 0000000000..653f07fcf8
--- /dev/null
+++ b/perfkitbenchmarker/data/docker/agentic/adk-agent/cloudbuild-arm64.yaml
@@ -0,0 +1,20 @@
+# Cloud Build config for cross-compiling to ARM64.
+# Used by PKB when --container_remote_build_config points to this file.
+# The _IMAGE substitution is passed by PKB RemoteBuild() automatically.
+steps:
+ - name: 'gcr.io/cloud-builders/docker'
+ args: ['run', '--privileged', 'multiarch/qemu-user-static', '--reset', '-p', 'yes']
+ id: 'qemu-setup'
+ - name: 'gcr.io/cloud-builders/docker'
+ args: ['buildx', 'create', '--use', '--name', 'multiarch-builder']
+ id: 'create-builder'
+ waitFor: ['qemu-setup']
+ - name: 'gcr.io/cloud-builders/docker'
+ args: ['buildx', 'build', '--platform', 'linux/arm64', '-t', '${_IMAGE}', '--push', '.']
+ id: 'build-and-push'
+ waitFor: ['create-builder']
+options:
+ logging: CLOUD_LOGGING_ONLY
+ machineType: E2_HIGHCPU_32
+substitutions:
+ _IMAGE: ''
diff --git a/perfkitbenchmarker/data/docker/agentic/adk-agent/gke_performance_agent/__init__.py b/perfkitbenchmarker/data/docker/agentic/adk-agent/gke_performance_agent/__init__.py
new file mode 100644
index 0000000000..c6df9a7a2a
--- /dev/null
+++ b/perfkitbenchmarker/data/docker/agentic/adk-agent/gke_performance_agent/__init__.py
@@ -0,0 +1,2 @@
+# GKE Performance Agent package
+from . import agent
\ No newline at end of file
diff --git a/perfkitbenchmarker/data/docker/agentic/adk-agent/gke_performance_agent/agent.py b/perfkitbenchmarker/data/docker/agentic/adk-agent/gke_performance_agent/agent.py
new file mode 100644
index 0000000000..6561942960
--- /dev/null
+++ b/perfkitbenchmarker/data/docker/agentic/adk-agent/gke_performance_agent/agent.py
@@ -0,0 +1,276 @@
+"""GKE Performance Agent -- ADK agent definition.
+
+This file runs INSIDE the GKE cluster as part of the adk-agent Deployment
+(see gke_deploy_utils.py for the K8s manifest). It is NOT run from the
+machine executing PKB. The ADK agent pod serves a FastAPI app (main.py)
+that PKB calls via HTTP through a kubectl port-forward tunnel.
+
+Execution flow:
+ PKB (your laptop/CI) -> kubectl port-forward -> adk-agent pod -> this file
+ -> GkeCodeExecutor -> SandboxClient -> gVisor sandbox pod
+"""
+
+"""GKE Performance Agent -- ADK agent definition for sandbox benchmarking.
+
+EXECUTION CONTEXT:
+ This file runs INSIDE the GKE cluster, NOT on the PKB orchestrator machine.
+ It is packaged into a container image (see ../Dockerfile) and deployed as
+ the 'adk-agent' Deployment in the benchmark namespace.
+
+ Execution flow:
+ PKB machine GKE Cluster
+ ---------- -----------
+ benchmark.Run()
+ -> CallAgentApi("/benchmark/...") -> main.py (FastAPI)
+ -> Runner(agent=root_agent)
+ -> MockLlm yields code
+ -> V3GkeCodeExecutor._execute_in_sandbox()
+ -> SandboxClient.create_sandbox()
+ -> sandbox.files.write("script.py", code)
+ -> sandbox.commands.run("python3 script.py")
+ -> SandboxClient.delete_sandbox()
+
+ The PKB machine communicates with this agent via HTTP (port-forwarded
+ through kubectl or via a LoadBalancer/ClusterIP service).
+"""
+
+from google.adk.agents import LlmAgent
+from google.adk.code_executors import GkeCodeExecutor
+from google.adk.code_executors.code_execution_utils import CodeExecutionResult
+from google.adk.models.base_llm import BaseLlm
+from google.adk.models.llm_response import LlmResponse
+from google.genai import types
+from concurrent.futures import ThreadPoolExecutor
+from dotenv import load_dotenv
+from google.adk.apps import App
+import logging
+import os
+
+# --- Configure Logging ---
+logging.basicConfig(level=logging.INFO)
+
+# =========================================================================
+# 1. Environment and Configuration
+# =========================================================================
+
+basedir = os.path.abspath(os.path.dirname(__file__))
+agent_dir = os.path.join(basedir, "..")
+
+# Load generated.env (rendered by gke_image_build_utils._GenerateEnvFile from PKB flags).
+# In GKE, K8s manifest env vars take precedence.
+load_dotenv(os.path.join(agent_dir, "generated.env"))
+
+# =========================================================================
+# 2. Mock LLM Definition (Inheriting from BaseLlm for Pydantic)
+# =========================================================================
+
+# Load the benchmark scripts
+density_script_path = os.path.join(
+ basedir, "../sandboxed_apps/python_test_app/benchmark_density.py"
+)
+try:
+ with open(density_script_path, "r") as f:
+ density_benchmark_code = f.read()
+except Exception:
+ density_benchmark_code = "import os; print(os.uname())"
+
+payload_script_path = os.path.join(
+ basedir, "../sandboxed_apps/python_test_app/benchmark_payload.py"
+)
+try:
+ with open(payload_script_path, "r") as f:
+ payload_benchmark_code = f.read()
+except Exception:
+ payload_benchmark_code = "import os; print(os.uname())"
+
+qps_script_path = os.path.join(
+ basedir, "../sandboxed_apps/python_test_app/benchmark_qps.py"
+)
+try:
+ with open(qps_script_path, "r") as f:
+ qps_benchmark_code = f.read()
+except Exception:
+ qps_benchmark_code = "import json; print(json.dumps({'sandbox_status': 'ok'}))"
+
+# Keys that main.py sets in os.environ per-request. We inject them into
+# the script so they reach the sandbox pod. If unset, the benchmark scripts
+# use their own built-in defaults.
+_DENSITY_ENV_KEYS = ["SAMPLE_COUNT", "SAMPLE_WARMUP"]
+_PAYLOAD_ENV_KEYS = ["PAYLOAD_SIZE_MB", "PAYLOAD_ITERATIONS"]
+_QPS_ENV_KEYS: list[str] = [] # QPS script needs no env config
+
+
+def _build_benchmark_code() -> str:
+    """Return the benchmark script prefixed with the current env overrides.
+
+    The script is chosen by the BENCHMARK_MODE env var (any other value → density):
+      - 'density' → benchmark_density.py
+      - 'payload' → benchmark_payload.py
+      - 'qps' → benchmark_qps.py
+    """
+    mode = os.getenv("BENCHMARK_MODE", "density")
+
+    if mode == "payload":
+        env_keys = _PAYLOAD_ENV_KEYS
+        script = payload_benchmark_code
+    elif mode == "qps":
+        env_keys = _QPS_ENV_KEYS
+        script = qps_benchmark_code
+    else:
+        env_keys = _DENSITY_ENV_KEYS
+        script = density_benchmark_code
+
+    lines = ["import os"]
+    for k in env_keys:
+        v = os.getenv(k)
+        if v is not None:
+            lines.append(f"os.environ[{k!r}] = {v!r}")  # repr() escapes quotes/backslashes
+    return "\n".join(lines) + "\n\n" + script
+
+
+class MockLlm(BaseLlm):
+ model: str = "mock-model"
+
+ async def generate_content_async(self, llm_request, stream=False):
+ """Mock the ADK response loop.
+
+ BaseLlm.generate_content_async is an AsyncGenerator — it must YIELD
+ LlmResponse objects, never return them.
+ """
+ # ADK appends the code execution result to the conversation
+ # history before calling the LLM again. If the history has
+ # grown beyond the initial user prompt, code has already
+ # executed — return plain text to stop the loop.
+ has_execution_result = len(llm_request.contents) > 1
+
+ if has_execution_result:
+ part = types.Part(text="Execution Complete")
+ else:
+ # Create an ADK-compliant result with executable code.
+ # Build at request time so SAMPLE_COUNT/SAMPLE_WARMUP reflect
+ # the current os.environ values set by main.py per-request.
+ part = types.Part(
+ executable_code=types.ExecutableCode(
+ language="PYTHON", code=_build_benchmark_code()
+ )
+ )
+
+ content = types.Content(role="model", parts=[part])
+ response = LlmResponse(content=content, partial=False)
+
+ # Yield exactly one final response (both streaming and non-streaming)
+ yield response
+
+
+# =========================================================================
+# 3. Agent Initialization
+# =========================================================================
+
+
+# Module-level thread pool for sandbox I/O operations.
+# Initialized once at import time to avoid thread-safety issues
+# with lazy initialization inside _execute_in_sandbox().
+_SANDBOX_POOL = ThreadPoolExecutor(max_workers=16)
+
+
+class V3GkeCodeExecutor(GkeCodeExecutor):
+    def _execute_in_sandbox(self, code: str) -> CodeExecutionResult:
+        """Run `code` in a new sandbox through the v0.4.6-compatible SandboxClient."""
+        from k8s_agent_sandbox.sandbox_client import SandboxClient
+        from k8s_agent_sandbox.models import SandboxDirectConnectionConfig
+        import logging
+        import time
+
+        logging.info("Executing via V3 SandboxClient (v0.4.6 compatible).")
+
+        # Every blocking client call below is submitted to the module-level _SANDBOX_POOL.
+
+        # Use DirectConnection when SANDBOX_ROUTER_URL is set (in-cluster),
+        # otherwise fall back to kubectl port-forward (dev mode).
+        router_url = os.getenv("SANDBOX_ROUTER_URL")
+        if router_url:
+            client = SandboxClient(
+                connection_config=SandboxDirectConnectionConfig(api_url=router_url)
+            )
+        else:
+            client = SandboxClient()
+        # v0.4.6 create_sandbox uses 'template' and 'namespace' arguments
+        create_ms = upload_ms = run_ms = delete_ms = 0.0
+        sandbox = None
+        # Time sandbox creation (perf_counter is monotonic, unlike the wall clock)
+        t0 = time.perf_counter()
+        create_future = _SANDBOX_POOL.submit(
+            client.create_sandbox,
+            template=self.sandbox_template,
+            namespace=self.namespace,
+        )
+        sandbox = create_future.result()
+        create_ms = (time.perf_counter() - t0) * 1000.0
+        try:
+            # v0.4.6 handles file I/O via the .files namespace
+            t0 = time.perf_counter()
+            upload_future = _SANDBOX_POOL.submit(sandbox.files.write, "script.py", code)
+            upload_future.result()
+            upload_ms = (time.perf_counter() - t0) * 1000.0
+
+            # SANDBOX_EXEC_TIMEOUT_S is set per-request by main.py.
+            # Default 60 s keeps density/snapshot runs tight; payload
+            # sweeps raise it for large blobs.
+            run_timeout = int(os.getenv("SANDBOX_EXEC_TIMEOUT_S", "60"))
+
+            t0 = time.perf_counter()
+            run_future = _SANDBOX_POOL.submit(
+                sandbox.commands.run, "python3 script.py", timeout=run_timeout
+            )
+            result = run_future.result()
+            run_ms = (time.perf_counter() - t0) * 1000.0
+
+            # ADK's build_code_execution_result_part discards stdout when
+            # stderr is non-empty (OUTCOME_FAILED path). Sandbox scripts
+            # produce benign stderr (C-extension reimport noise, gVisor
+            # warnings) that would cause all sandbox_* metrics to vanish.
+            # Log stderr for debugging, then clear it so ADK passes
+            # stdout through.
+            if result.stderr:
+                logging.warning("Sandbox stderr (ignored): %s", result.stderr[:500])
+
+            logging.info(
+                "SANDBOX_TIMINGS: create_ms=%.3f upload_ms=%.3f run_ms=%.3f",
+                create_ms,
+                upload_ms,
+                run_ms,
+            )
+            return CodeExecutionResult(stdout=result.stdout, stderr="")
+        finally:
+            # Always release the sandbox claim, even when upload or run raised.
+            t0 = time.perf_counter()
+            if sandbox is not None:
+                delete_future = _SANDBOX_POOL.submit(
+                    client.delete_sandbox, sandbox.claim_name, namespace=self.namespace
+                )
+                delete_future.result()
+                delete_ms = (time.perf_counter() - t0) * 1000.0
+            logging.info("SANDBOX_TIMINGS_DELETE: delete_ms=%.3f", delete_ms)
+
+
+gke_executor = V3GkeCodeExecutor(
+ cluster_name=os.getenv("CLUSTER_NAME"),
+ location=os.getenv("GOOGLE_CLOUD_LOCATION"),
+ namespace=os.getenv("AGENTIC_NAMESPACE"),
+ executor_type="sandbox",
+ sandbox_template="python-sandbox-template",
+)
+
+gke_performance_agent = LlmAgent(
+ name="gke_performance_agent", # Must be a valid identifier (no dashes)
+ model=MockLlm(model="mock-model"),
+ code_executor=gke_executor,
+)
+
+root_agent = gke_performance_agent
+
+app = App(
+ name=root_agent.name,
+ root_agent=root_agent,
+ # enable_tracing=True,
+)
diff --git a/perfkitbenchmarker/data/docker/agentic/adk-agent/main.py b/perfkitbenchmarker/data/docker/agentic/adk-agent/main.py
new file mode 100644
index 0000000000..473c2072c2
--- /dev/null
+++ b/perfkitbenchmarker/data/docker/agentic/adk-agent/main.py
@@ -0,0 +1,1107 @@
+"""FastAPI service fronting the GKE Performance Agent.
+
+Exposes REST endpoints that PKB calls to trigger benchmarks. The agent
+runs *inside* the GKE cluster so it can reach the Sandbox Controller and
+create gVisor sandboxes natively.
+
+Endpoints:
+ GET /healthz → liveness probe
+ POST /benchmark/python/density → run the Python density benchmark (UC-B)
+ POST /benchmark/python/payload → run the payload transfer benchmark (UC-D)
+ POST /benchmark/python/qps → run the QPS saturation benchmark (UC-F)
+ POST /benchmark/chromium/density → run the Chromium density benchmark (UC-C)
+ POST /run → raw ADK agent interaction
+
+POST /benchmark/python/density — Request:
+ {
+ "sample_count": int — iterations per sandbox session (default: 100)
+ "sample_warmup": int — warmup iterations excluded from stats (default: 5)
+ "concurrent_sessions": int — parallel sandbox sessions (default: 1)
+ "sandbox_exec_timeout_s": int — sandbox command execution timeout in seconds (default: 60)
+ }
+
+POST /benchmark/python/density — Response:
+ {
+ "concurrent_sessions": int — requested session count
+ "successful_sessions": int — sessions completed without error
+ "failed_sessions": int — sessions that returned an error
+ "aggregate": {
+ --- Orchestrator-side (timed in _run_single_session, stats in benchmark_python_density) ---
+ "orchestrator_cel_mean_ms": mean round-trip across sessions
+ "orchestrator_cel_p50_ms": P50 round-trip
+ "orchestrator_cel_{p95,p99}_ms": P95 / P99 round-trip
+ "orchestrator_cel_min_ms": min round-trip
+ "orchestrator_cel_max_ms": max round-trip
+
+ --- Sandbox-side overall (from benchmark_density.py, mean across sessions) ---
+ "sandbox_ttfe_ms": Time To First Execution
+ "sandbox_total_cel_mean_ms": mean total CEL per iteration (sum of all task types)
+ "sandbox_total_cel_p50_ms": P50 total CEL per iteration
+ "sandbox_total_cel_p99_ms": P99 total CEL per iteration
+ "sandbox_total_cel_min_ms": min total CEL per iteration
+ "sandbox_total_cel_max_ms": max total CEL per iteration
+
+ --- Sandbox RSS (from benchmark_density.py, mean across sessions) ---
+ "sandbox_rss_start_mb": RSS at benchmark start
+ "sandbox_rss_end_mb": RSS at benchmark end
+ "sandbox_rss_growth_mb": RSS growth during benchmark
+
+ --- Per-type CEL breakdown (from benchmark_density.py, mean across sessions) ---
+ "sandbox_compute_cel_{mean,p50,p99,min,max}_ms": CPU-bound (math.factorial)
+ "sandbox_syscall_cel_{mean,p50,p99,min,max}_ms": gVisor Sentry (os.stat/listdir)
+ "sandbox_import_cel_{mean,p50,p99,min,max}_ms": Gofer FS I/O (importlib)
+ }
+ "sessions": [ per-session detail array
+ {
+ "session_id": int — zero-based session index
+ "orchestrator_total_ms": float — full round-trip for this session
+ "raw_output": str — raw code execution stdout
+ "sandbox_ttfe_ms": float — TTFE for this session
+ "sandbox_total_cel_mean_ms": float — total CEL mean for this session
+ ... all other sandbox_* metrics for this session
+ }
+ ]
+ }
+
+Data Flow:
+ benchmark_density.py (inside gVisor) → all sandbox_* metrics per session
+ main.py (this file) → orchestrator_* timing + cross-session aggregation
+"""
+
+import json
+import logging
+import os
+import re
+import time
+import asyncio
+from typing import Optional
+from concurrent.futures import ThreadPoolExecutor
+
+import uvicorn
+from contextlib import asynccontextmanager
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel, Field
+from google.genai import types
+from google.adk.sessions import InMemorySessionService
+from google.adk.artifacts import InMemoryArtifactService
+from google.adk.runners import Runner
+
+from dotenv import load_dotenv
+
+basedir = os.path.abspath(os.path.dirname(__file__))
+
+# Load generated.env (rendered by gke_image_build_utils._GenerateEnvFile from PKB flags).
+# In GKE, K8s manifest env vars take precedence.
+load_dotenv(os.path.join(basedir, "generated.env"))
+
+from gke_performance_agent import agent
+
+
+# ── SandboxClient factory (DirectConnection vs Dev-mode tunnel) ──────────
+def _make_sandbox_client():
+ """Create a SandboxClient with the optimal connection strategy.
+
+ When SANDBOX_ROUTER_URL is set (in-cluster), uses DirectConnectionConfig
+ to bypass kubectl port-forward SPDY tunnels — enabling true N-way
+ parallelism. Without it, falls back to LocalTunnelConnectionConfig
+ (dev mode, serialized through a single SPDY stream).
+ """
+ from k8s_agent_sandbox.sandbox_client import SandboxClient
+
+ router_url = os.getenv("SANDBOX_ROUTER_URL")
+ if router_url:
+ from k8s_agent_sandbox.models import SandboxDirectConnectionConfig
+
+ return SandboxClient(
+ connection_config=SandboxDirectConnectionConfig(api_url=router_url)
+ )
+ return SandboxClient()
+
+
+# --- Constants ---
+APP_NAME = "gke_performance_agent_app"
+USER_ID = "benchmark_user"
+
+# --- Configure Logging ---
+try:
+ import google.cloud.logging as gcl
+
+ gcl.Client().setup_logging()
+except Exception:
+ logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+# =========================================================================
+# FastAPI Application
+# =========================================================================
+# --- Adaptive ThreadPool based on Agent CPU ---
+def _compute_thread_count() -> int:
+    """Pick the worker count for the default asyncio ThreadPoolExecutor.
+
+    Takes twice the detected CPU count (1 when it cannot be detected) so that
+    blocking I/O (port-forward, file upload) can overlap without heavy CPU
+    oversubscription, then clamps the result to the range [2, 64].
+    """
+    doubled = 2 * (os.cpu_count() or 1)
+    return min(max(doubled, 2), 64)
+
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+ """Lifespan handler: configure a tuned ThreadPoolExecutor for asyncio.
+
+ Sets the default executor so `asyncio.to_thread` uses our tuned pool,
+ and shuts it down on application exit.
+ """
+ workers = _compute_thread_count()
+ executor = ThreadPoolExecutor(max_workers=workers)
+ loop = asyncio.get_running_loop()
+ loop.set_default_executor(executor)
+ logging.info(
+ "Default ThreadPoolExecutor set to %d workers (cpu=%s)", workers, os.cpu_count()
+ )
+ try:
+ yield
+ finally:
+ try:
+ executor.shutdown(wait=False)
+ logging.info("ThreadPoolExecutor shut down")
+ except Exception:
+ logging.exception("Error shutting down ThreadPoolExecutor")
+
+
+app = FastAPI(title="GKE Benchmark Agent", version="0.2.0", lifespan=lifespan)
+
+# Serialise benchmark requests so concurrent POSTs cannot clobber the
+# shared env vars (BENCHMARK_MODE, SAMPLE_COUNT, …) that agent.py reads.
+_benchmark_lock = asyncio.Lock()
+
+
+def _percentile_stats(sorted_values: list, prefix: str) -> dict:
+    """Summarise an ascending list as {prefix}_{mean,p50,p95,p99,min,max}_ms."""
+    count = len(sorted_values)
+    if not count:
+        return {}
+    last = count - 1
+    picks = {"p50": count // 2, "p95": int(count * 0.95), "p99": int(count * 0.99)}
+    stats = {f"{prefix}_mean_ms": round(sum(sorted_values) / count, 6)}
+    for label, idx in picks.items():
+        stats[f"{prefix}_{label}_ms"] = round(sorted_values[min(idx, last)], 6)
+    stats[f"{prefix}_min_ms"] = round(sorted_values[0], 6)
+    stats[f"{prefix}_max_ms"] = round(sorted_values[-1], 6)
+    return stats
+
+
+# --- Request / Response Models ---
+class BenchmarkRequest(BaseModel):
+ sample_count: int = Field(
+ default=100, ge=1, description="Sample count per sandbox session"
+ )
+ sample_warmup: int = Field(
+ default=5, ge=0, description="Warmup iterations per sandbox session"
+ )
+ concurrent_sessions: int = Field(
+ default=1, ge=1, description="Number of parallel sandbox sessions"
+ )
+ sandbox_exec_timeout_s: int = Field(
+ default=60, ge=10, description="Sandbox command execution timeout in seconds"
+ )
+
+
+class RunRequest(BaseModel):
+ prompt: str = "Please start the GKE performance benchmark workflow."
+
+
+class PayloadBenchmarkRequest(BaseModel):
+ payload_size_mb: float = Field(default=1, gt=0, description="Payload size in MB")
+ payload_iterations: int = Field(
+ default=20, ge=1, description="Number of transfer iterations"
+ )
+ concurrent_sessions: int = Field(
+ default=1, ge=1, description="Number of parallel sandbox sessions"
+ )
+ sandbox_exec_timeout_s: int = Field(
+ default=60, ge=10, description="Sandbox command execution timeout in seconds"
+ )
+
+
+class QpsBenchmarkRequest(BaseModel):
+ target_qps: float = Field(
+ default=10.0, ge=0.1, description="Target requests per second"
+ )
+ duration_s: float = Field(
+ default=60.0, ge=5.0, description="Duration of the QPS burst in seconds"
+ )
+ sandbox_exec_timeout_s: int = Field(
+ default=30, ge=10, description="Sandbox command execution timeout in seconds"
+ )
+
+
+class ChromiumBenchmarkRequest(BaseModel):
+ task_count: int = Field(
+ default=10, ge=1, description="Iterations per Chromium session"
+ )
+ warmup_tasks: int = Field(
+ default=2, ge=0, description="Warmup iterations excluded from stats"
+ )
+ concurrent_sessions: int = Field(
+ default=1, ge=1, description="Number of parallel Chromium sessions"
+ )
+ sandbox_exec_timeout_s: int = Field(
+ default=120, ge=10, description="Sandbox command execution timeout in seconds"
+ )
+
+
+# --- JSON extraction helper ---
+_JSON_RE = re.compile(r"\{[^{}]*\}", re.DOTALL)
+
+
+def _parse_sandbox_json(raw_output: str) -> Optional[dict]:
+    """Return the last flat JSON object in `raw_output` that has a sandbox_* key.
+
+    Only brace spans without nested braces are candidates; invalid JSON is skipped.
+    """
+    candidates = _JSON_RE.findall(raw_output)
+    while candidates:
+        text = candidates.pop()  # walk from the end of the output backwards
+        try:
+            parsed = json.loads(text)
+        except json.JSONDecodeError:
+            continue
+        if any(name.startswith("sandbox_") for name in parsed):
+            return parsed
+    return None
+
+
+# --- Agent helper ---
+async def _run_agent(prompt: str) -> str:
+    """Run the agent in a fresh session; return the code-execution output, else the final text."""
+    session_service = InMemorySessionService()
+    artifact_service = InMemoryArtifactService()
+    session = await session_service.create_session(
+        app_name=APP_NAME,
+        user_id=USER_ID,
+        state={},
+    )
+
+    runner = Runner(
+        agent=agent.root_agent,
+        app_name=APP_NAME,
+        session_service=session_service,
+        artifact_service=artifact_service,
+    )
+
+    content = types.Content(
+        role="user",
+        parts=[types.Part(text=prompt)],
+    )
+
+    final_response = ""
+    code_execution_output = ""
+    async with runner:
+        async for event in runner.run_async(
+            user_id=USER_ID,
+            session_id=session.id,
+            new_message=content,
+        ):
+            if event.content and event.content.parts:
+                for part in event.content.parts:
+                    cer = getattr(part, "code_execution_result", None) or getattr(
+                        part, "codeExecutionResult", None
+                    )
+                    if cer:
+                        code_execution_output = getattr(cer, "output", "") or ""
+            if event.is_final_response() and event.content and event.content.parts:
+                final_response = event.content.parts[0].text or ""  # text may be None
+
+    await session_service.delete_session(
+        app_name=APP_NAME,
+        user_id=USER_ID,
+        session_id=session.id,
+    )
+    return code_execution_output if code_execution_output else final_response
+
+
+async def _run_single_session(session_id: int, prompt: str) -> dict:
+    """Run one agent session; return its metrics, or {session_id, error} on failure."""
+    orchestrator_start = time.perf_counter()
+    logging.info("SESSION_START: session_id=%d start_ts=%.3f", session_id, time.time())
+
+    try:
+        raw_output = await _run_agent(prompt)
+    except Exception as e:
+        # Record the traceback in the pod logs: the caller only receives
+        # str(e), which is not enough to debug a failed sandbox session.
+        logging.exception("SESSION_FAILED: session_id=%d", session_id)
+        return {"session_id": session_id, "error": str(e)}
+
+    orchestrator_elapsed_ms = round(
+        (time.perf_counter() - orchestrator_start) * 1000, 6
+    )
+    logging.info(
+        "SESSION_END: session_id=%d elapsed_ms=%.3f",
+        session_id,
+        orchestrator_elapsed_ms,
+    )
+
+    # Parse sandbox-side metrics from the code execution output
+    sandbox_metrics = _parse_sandbox_json(raw_output) or {}
+
+    return {
+        "session_id": session_id,
+        "orchestrator_total_ms": orchestrator_elapsed_ms,
+        "raw_output": raw_output,
+        **sandbox_metrics,
+    }
+
+
+# --- Endpoints ---
+@app.get("/healthz")
+async def healthz():
+ return {"status": "ok"}
+
+
+@app.post("/benchmark/python/density")
+async def benchmark_python_density(req: BenchmarkRequest):
+ """Trigger the Python density benchmark (Use Case B).
+
+ Fires `concurrent_sessions` parallel agent sessions. Each session
+ claims its own sandbox, runs the benchmark script with the given
+ iteration/warmup counts, and returns both orchestrator-side and
+ sandbox-side metrics.
+ """
+ async with _benchmark_lock:
+ os.environ["BENCHMARK_MODE"] = "density"
+ os.environ["SAMPLE_COUNT"] = str(req.sample_count)
+ os.environ["SAMPLE_WARMUP"] = str(req.sample_warmup)
+ os.environ["SANDBOX_EXEC_TIMEOUT_S"] = str(req.sandbox_exec_timeout_s)
+
+ logger.info(
+ "Starting Python benchmark: sample_count=%d sample_warmup=%d concurrent_sessions=%d",
+ req.sample_count,
+ req.sample_warmup,
+ req.concurrent_sessions,
+ )
+
+ prompt = "Please start the GKE performance benchmark workflow."
+
+ # Fire concurrent sessions.
+ # DESIGN NOTE: Each session runs in its own thread via asyncio.to_thread()
+ # with a nested asyncio.run() to create a per-thread event loop. This is
+ # intentional -- the ADK Runner performs blocking I/O (sandbox lifecycle
+ # via kubectl/HTTP) that would starve a shared event loop and serialize
+ # session starts. The per-thread event loop overhead (~0.1ms) is negligible
+ # compared to sandbox round-trip times (~200ms+).
+ thread_tasks = [
+ asyncio.create_task(
+ asyncio.to_thread(
+ lambda sid=i: asyncio.run(_run_single_session(sid, prompt))
+ )
+ )
+ for i in range(req.concurrent_sessions)
+ ]
+ session_results = await asyncio.gather(*thread_tasks)
+
+ # Separate successful vs failed sessions
+ successful = [r for r in session_results if "error" not in r]
+ failed = [r for r in session_results if "error" in r]
+
+ # Aggregate orchestrator-side metrics across all successful sessions
+ aggregate = {}
+ if successful:
+ orch_times = sorted(r["orchestrator_total_ms"] for r in successful)
+ aggregate.update(_percentile_stats(orch_times, "orchestrator_cel"))
+
+ # Aggregate sandbox-side metrics across sessions
+ sandbox_keys = [k for k in successful[0] if k.startswith("sandbox_")]
+ for key in sandbox_keys:
+ sample_val = successful[0].get(key)
+ if isinstance(sample_val, list):
+ # Pool raw latency arrays across sandboxes → true cross-sandbox stats
+ pooled = sorted(
+ v
+ for r in successful
+ for v in (r.get(key) or [])
+ if isinstance(r.get(key), list)
+ )
+ if pooled:
+ base = key[:-3] if key.endswith("_ms") else key
+ aggregate.update(_percentile_stats(pooled, base))
+ elif isinstance(sample_val, (int, float)):
+ vals = [
+ r[key]
+ for r in successful
+ if key in r and isinstance(r[key], (int, float))
+ ]
+ if vals:
+ if key.endswith("_cel_ms"):
+ # Latency scalars (e.g. import_cel_ms): compute
+ # cross-sandbox percentile stats, like array metrics.
+ base = key[:-3]
+ aggregate.update(_percentile_stats(sorted(vals), base))
+ else:
+ # Non-latency scalars (e.g. rss_mb, ttfe_ms): average
+ aggregate[key] = round(sum(vals) / len(vals), 6)
+
+ return {
+ "concurrent_sessions": req.concurrent_sessions,
+ "successful_sessions": len(successful),
+ "failed_sessions": len(failed),
+ "aggregate": aggregate,
+ "sessions": session_results,
+ }
+
+
+@app.post("/benchmark/python/payload")
+async def benchmark_python_payload(req: PayloadBenchmarkRequest):
+    """Trigger the payload transfer benchmark (Use Case D).
+
+    Measures the cost of returning large observation payloads from a
+    gVisor sandbox back to the orchestrator. Each session generates a
+    payload of `payload_size_mb` MB, encodes it (base64), writes it
+    through the gVisor Gofer path, and reports latency breakdowns.
+
+    Args:
+        req: Benchmark parameters. This handler reads `payload_size_mb`,
+            `payload_iterations`, `sandbox_exec_timeout_s` and
+            `concurrent_sessions`.
+
+    Returns:
+        A dict echoing the request parameters, the success/failure counts,
+        the raw per-session results ("sessions") and an "aggregate" dict
+        with orchestrator latency percentiles plus the mean of every
+        numeric `sandbox_*` metric reported by any successful session.
+    """
+    async with _benchmark_lock:
+        # Configuration is passed through process-wide environment
+        # variables, hence the lock around the whole run.
+        os.environ["BENCHMARK_MODE"] = "payload"
+        os.environ["PAYLOAD_SIZE_MB"] = str(req.payload_size_mb)
+        os.environ["PAYLOAD_ITERATIONS"] = str(req.payload_iterations)
+        os.environ["SANDBOX_EXEC_TIMEOUT_S"] = str(req.sandbox_exec_timeout_s)
+
+        logger.info(
+            "Starting Payload benchmark: payload_size_mb=%s iterations=%d concurrent_sessions=%d",
+            req.payload_size_mb,
+            req.payload_iterations,
+            req.concurrent_sessions,
+        )
+
+        prompt = "Please start the GKE performance benchmark workflow."
+
+        # Fire concurrent sessions.
+        # DESIGN NOTE: Each session runs in its own thread via asyncio.to_thread()
+        # with a nested asyncio.run() to create a per-thread event loop. This is
+        # intentional -- the ADK Runner performs blocking I/O (sandbox lifecycle
+        # via kubectl/HTTP) that would starve a shared event loop and serialize
+        # session starts. The per-thread event loop overhead (~0.1ms) is negligible
+        # compared to sandbox round-trip times (~200ms+).
+        thread_tasks = [
+            asyncio.create_task(
+                asyncio.to_thread(
+                    # Bind the loop variable as a default so each lambda
+                    # keeps its own session id.
+                    lambda sid=i: asyncio.run(_run_single_session(sid, prompt))
+                )
+            )
+            for i in range(req.concurrent_sessions)
+        ]
+        session_results = await asyncio.gather(*thread_tasks)
+
+        # Separate successful vs failed sessions
+        successful = [r for r in session_results if "error" not in r]
+        failed = [r for r in session_results if "error" in r]
+
+        # Aggregate orchestrator-side metrics across all successful sessions
+        aggregate = {}
+        if successful:
+            orch_times = sorted(r["orchestrator_total_ms"] for r in successful)
+            aggregate.update(_percentile_stats(orch_times, "orchestrator_transfer"))
+
+            # Aggregate sandbox-side metrics (mean across sessions, numeric
+            # only). Keys are the ordered union over ALL successful sessions:
+            # taking them from the first session alone would silently drop a
+            # metric whenever that one session happened not to report it.
+            sandbox_keys = list(
+                dict.fromkeys(
+                    k for r in successful for k in r if k.startswith("sandbox_")
+                )
+            )
+            for key in sandbox_keys:
+                vals = [
+                    r[key]
+                    for r in successful
+                    if key in r and isinstance(r[key], (int, float))
+                ]
+                if vals:
+                    aggregate[key] = round(sum(vals) / len(vals), 6)
+
+        return {
+            "payload_size_mb": req.payload_size_mb,
+            "payload_iterations": req.payload_iterations,
+            "concurrent_sessions": req.concurrent_sessions,
+            "successful_sessions": len(successful),
+            "failed_sessions": len(failed),
+            "aggregate": aggregate,
+            "sessions": session_results,
+        }
+
+
+@app.post("/benchmark/python/qps")
+async def benchmark_python_qps(req: QpsBenchmarkRequest):
+    """Trigger the QPS saturation benchmark (Use Case F).
+
+    Fires sandbox claim requests at a controlled rate (target_qps) for
+    duration_s seconds. Each request claims a sandbox from the warm pool,
+    runs a trivial script, and releases it. Returns per-request TTFE
+    (claim + upload + execute; the sandbox is released after the timer
+    has stopped) and aggregate latency stats.
+
+    Uses a lightweight path that calls SandboxClient directly — bypasses
+    the full ADK Runner/MockLLM pipeline to avoid per-request overhead
+    and accurately measure sandbox lifecycle latency at high QPS.
+
+    When the warm pool drains faster than it refills, TTFE spikes from
+    ~200ms to seconds — identifying the QPS saturation point.
+
+    Args:
+        req: Benchmark parameters. This handler reads `target_qps`,
+            `duration_s` and `sandbox_exec_timeout_s`.
+
+    Returns:
+        A dict with the target and achieved request rate, wall time,
+        request counts, "aggregate" TTFE/claim percentiles and the
+        per-request records under "sessions".
+    """
+
+    # Load the QPS script once
+    qps_script_path = os.path.join(
+        basedir, "sandboxed_apps/python_test_app/benchmark_qps.py"
+    )
+    try:
+        with open(qps_script_path, "r") as f:
+            qps_code = f.read()
+    except Exception:
+        # Best-effort: fall back to an inline script, but say so instead of
+        # silently benchmarking something other than the packaged file.
+        logger.warning(
+            "Could not read %s; using the built-in fallback QPS script",
+            qps_script_path,
+            exc_info=True,
+        )
+        qps_code = "import json; print(json.dumps({'sandbox_status': 'ok'}))"
+
+    sandbox_template = os.getenv("SANDBOX_TEMPLATE", "python-sandbox-template")
+    sandbox_namespace = os.getenv("AGENTIC_NAMESPACE", "agentic")
+    exec_timeout = req.sandbox_exec_timeout_s
+    qps_claim_label = {"created-by": "pkb-qps-benchmark"}
+
+    def _run_qps_request(request_id: int) -> dict:
+        """Lightweight sandbox claim→execute→release cycle.
+
+        Runs in a worker thread. Returns a record with `ttfe_ms` and, on
+        success, the claim/upload/exec breakdown; on failure an "error"
+        string replaces the breakdown.
+        """
+        t_total = time.perf_counter()
+        client = _make_sandbox_client()
+        sandbox = None
+        try:
+            # Claim
+            t0 = time.perf_counter()
+            sandbox = client.create_sandbox(
+                template=sandbox_template,
+                namespace=sandbox_namespace,
+                labels=qps_claim_label,
+            )
+            claim_ms = (time.perf_counter() - t0) * 1000
+
+            # Upload
+            t0 = time.perf_counter()
+            sandbox.files.write("script.py", qps_code)
+            upload_ms = (time.perf_counter() - t0) * 1000
+
+            # Execute (only the latency is of interest, not the output)
+            t0 = time.perf_counter()
+            sandbox.commands.run("python3 script.py", timeout=exec_timeout)
+            exec_ms = (time.perf_counter() - t0) * 1000
+
+            ttfe_ms = (time.perf_counter() - t_total) * 1000
+
+            return {
+                "request_id": request_id,
+                "ttfe_ms": round(ttfe_ms, 3),
+                "claim_ms": round(claim_ms, 3),
+                "upload_ms": round(upload_ms, 3),
+                "exec_ms": round(exec_ms, 3),
+            }
+        except Exception as e:
+            ttfe_ms = (time.perf_counter() - t_total) * 1000
+            return {
+                "request_id": request_id,
+                "ttfe_ms": round(ttfe_ms, 3),
+                "error": f"{type(e).__name__}: {e}",
+            }
+        finally:
+            if sandbox is not None:
+                try:
+                    client.delete_sandbox(
+                        sandbox.claim_name, namespace=sandbox_namespace
+                    )
+                except Exception:
+                    # Still best-effort (the bulk cleanup below catches
+                    # leftovers), but leave a trace of the failure.
+                    logger.warning(
+                        "Failed to delete sandbox for QPS request %d",
+                        request_id,
+                        exc_info=True,
+                    )
+
+    def _cleanup_lingering_claims() -> None:
+        """Bulk-delete SandboxClaims labelled created-by=pkb-qps-benchmark.
+
+        Blocking (shells out to kubectl); call it from a worker thread.
+        """
+        import subprocess as _sp
+
+        listed = _sp.run(
+            [
+                "kubectl",
+                "get",
+                "sandboxclaim",
+                "-n",
+                sandbox_namespace,
+                "-l",
+                "created-by=pkb-qps-benchmark",
+                "-o",
+                "jsonpath={.items[*].metadata.name}",
+            ],
+            capture_output=True,
+            text=True,
+        )
+        # str.split() with no argument never yields empty strings.
+        claim_names = listed.stdout.split()
+        if not claim_names:
+            return
+        logger.info("Cleaning up %d lingering pkb-qps claims", len(claim_names))
+        _sp.run(
+            [
+                "kubectl",
+                "delete",
+                "sandboxclaim",
+                "-l",
+                "created-by=pkb-qps-benchmark",
+                "-n",
+                sandbox_namespace,
+                "--wait=false",
+            ],
+            capture_output=True,
+            text=True,
+        )
+
+    async with _benchmark_lock:
+        logger.info(
+            "Starting QPS benchmark: target_qps=%.1f duration_s=%.1f",
+            req.target_qps,
+            req.duration_s,
+        )
+
+        # NOTE(review): assumes the request model rejects target_qps <= 0;
+        # otherwise this division fails — confirm against QpsBenchmarkRequest.
+        interval = 1.0 / req.target_qps
+
+        # Use a scoped executor sized to the expected concurrency.
+        # Each sandbox request takes ~0.5-5s depending on environment
+        # (in-cluster vs port-forward). We need enough workers so the
+        # thread pool itself is never the bottleneck — only real sandbox
+        # contention should limit throughput.
+        expected_requests = int(req.target_qps * req.duration_s)
+        qps_workers = max(16, min(512, expected_requests))
+        qps_executor = ThreadPoolExecutor(max_workers=qps_workers)
+        loop = asyncio.get_running_loop()
+        logger.info(
+            "QPS executor: %d workers for ~%d expected requests",
+            qps_workers,
+            expected_requests,
+        )
+
+        # Schedule requests at the target QPS rate. A monotonic clock keeps
+        # the pacing immune to wall-clock adjustments during the run.
+        tasks: list[asyncio.Future] = []
+        t_start = time.monotonic()
+        next_fire = t_start
+        request_id = 0
+
+        while True:
+            now = time.monotonic()
+            if now - t_start >= req.duration_s:
+                break
+            if now >= next_fire:
+                tasks.append(
+                    loop.run_in_executor(qps_executor, _run_qps_request, request_id)
+                )
+                request_id += 1
+                next_fire += interval
+            else:
+                await asyncio.sleep(min(0.001, next_fire - now))
+
+        # Wait for in-flight requests with a drain timeout.
+        # asyncio.wait() rejects an empty collection, so guard that case.
+        drain_timeout = max(60.0, req.duration_s)
+        if tasks:
+            done, pending = await asyncio.wait(tasks, timeout=drain_timeout)
+        else:
+            done, pending = set(), set()
+
+        # Clean up the scoped executor
+        qps_executor.shutdown(wait=False)
+
+        # Collect completed results (guard against individual task exceptions)
+        session_results = []
+        for t in done:
+            try:
+                session_results.append(t.result())
+            except Exception as exc:
+                session_results.append(
+                    {
+                        "request_id": -1,
+                        "error": str(exc),
+                    }
+                )
+
+        # Cancel tasks still queued/running and mark as timed out
+        for t in pending:
+            t.cancel()
+        if pending:
+            logger.warning(
+                "QPS drain timeout: %d/%d requests still pending after %.0fs",
+                len(pending),
+                len(tasks),
+                drain_timeout,
+            )
+            session_results.extend(
+                {"request_id": -1, "error": "drain_timeout"} for _ in pending
+            )
+
+        # Bulk-delete SandboxClaims left by cancelled tasks.
+        # Only targets claims labelled created-by=pkb-qps-benchmark so
+        # we never touch claims created by other workloads. kubectl is run
+        # in a worker thread so the event loop is not blocked meanwhile.
+        try:
+            await asyncio.to_thread(_cleanup_lingering_claims)
+        except Exception:
+            logger.warning("Failed to clean up lingering claims", exc_info=True)
+
+        # NOTE(review): wall_time includes the drain and cleanup phases, so
+        # "actual_qps" is lower than the dispatch rate — confirm intended.
+        wall_time = time.monotonic() - t_start
+
+        # Separate successful vs failed
+        successful = [r for r in session_results if "error" not in r]
+        failed = [r for r in session_results if "error" in r]
+
+        # Compute TTFE stats
+        aggregate = {}
+        if successful:
+            ttfe_values = sorted(r["ttfe_ms"] for r in successful)
+            if ttfe_values:
+                aggregate.update(_percentile_stats(ttfe_values, "ttfe"))
+
+            # Also compute claim latency stats (the warm-pool-sensitive metric)
+            claim_values = sorted(r["claim_ms"] for r in successful if "claim_ms" in r)
+            if claim_values:
+                aggregate.update(_percentile_stats(claim_values, "claim"))
+
+        return {
+            "target_qps": req.target_qps,
+            "actual_qps": round(request_id / wall_time, 2) if wall_time > 0 else 0,
+            "duration_s": round(wall_time, 2),
+            "total_requests": request_id,
+            "successful_requests": len(successful),
+            "failed_requests": len(failed),
+            "aggregate": aggregate,
+            "sessions": session_results,
+        }
+
+
+@app.post("/benchmark/chromium/density")
+async def benchmark_chromium_density(req: ChromiumBenchmarkRequest):
+    """Trigger the Chromium density benchmark (Use Case C).
+
+    Fires `concurrent_sessions` parallel Chromium sandbox sessions. Each
+    session claims its own sandbox from the chromium warm pool, connects to
+    the sandbox's Chrome instance via CDP (Chrome DevTools Protocol), and
+    drives the benchmark from the orchestrator using Playwright.
+
+    Architecture:
+      - Sandbox: runs headless Chromium (upstream chrome-sandbox image) with
+        --remote-debugging-port=9222 --remote-debugging-address=0.0.0.0
+      - Orchestrator: connects Playwright via connect_over_cdp() to the
+        sandbox pod IP and drives navigate/click/evaluate/screenshot.
+      - This isolates pure Chrome-under-gVisor overhead without Node.js or
+        a runtime server in the sandbox.
+
+    Args:
+        req: Benchmark parameters. This handler reads `concurrent_sessions`,
+            `task_count` and `warmup_tasks`.
+
+    Returns:
+        A dict with session counts, an "aggregate" dict of cross-session
+        statistics, and the per-session records under "sessions".
+    """
+    from playwright.async_api import async_playwright
+    from kubernetes import client as k8s_client, config as k8s_config
+
+    async with _benchmark_lock:
+
+        sandbox_namespace = os.getenv("AGENTIC_NAMESPACE", "agentic")
+        sandbox_template = "chromium-sandbox-template"
+
+        logger.info(
+            "Starting Chromium density benchmark (CDP): concurrent_sessions=%d "
+            "task_count=%d warmup_tasks=%d",
+            req.concurrent_sessions,
+            req.task_count,
+            req.warmup_tasks,
+        )
+
+        # Initialize K8s client for pod IP lookup
+        try:
+            k8s_config.load_incluster_config()
+        except k8s_config.ConfigException:
+            k8s_config.load_kube_config()
+        core_v1 = k8s_client.CoreV1Api()
+
+        # Inline HTML test page (data: URL avoids network dependencies)
+        # NOTE(review): the markup below appears to have lost its HTML tags
+        # in this copy of the patch. The session code needs elements with
+        # ids "heading", "search", "btn" and "output" — verify the literal
+        # against the real file before relying on it.
+        test_page = """data:text/html,
+
+
+
PKB Chromium Benchmark
+
+ Hello Sandbox
+
+
+
+
+
+"""
+
+        # Limit concurrent K8s Metrics API calls to avoid overwhelming metrics-server
+        _metrics_semaphore = asyncio.Semaphore(5)
+
+        def _compute_stats(arr):
+            """Return mean/p50/p95/p99/min/max (ms) of *arr*, or None if empty."""
+            if not arr:
+                return None
+            s = sorted(arr)
+            n = len(s)
+            return {
+                "mean_ms": round(sum(s) / n, 3),
+                "p50_ms": round(s[min(int(n * 0.50), n - 1)], 3),
+                "p95_ms": round(s[min(int(n * 0.95), n - 1)], 3),
+                "p99_ms": round(s[min(int(n * 0.99), n - 1)], 3),
+                "min_ms": round(s[0], 3),
+                "max_ms": round(s[-1], 3),
+            }
+
+        async def _run_chromium_session_cdp(session_id: int) -> dict:
+            """Run one Chromium benchmark session via CDP.
+
+            Blocking sandbox/Kubernetes client calls are pushed to worker
+            threads with asyncio.to_thread(): called inline they would block
+            the shared event loop, serializing the claims of the other
+            sessions and inflating their timed Playwright operations.
+
+            Returns a per-session record; on failure it carries an "error"
+            string instead of the latency statistics.
+            """
+            sb_client = _make_sandbox_client()
+            sandbox = None
+            t_start = time.time()
+            claim_ms = 0.0
+            cold_start_ms = 0.0
+            try:
+                # 1. Claim sandbox from warm pool
+                t0 = time.time()
+                sandbox = await asyncio.to_thread(
+                    sb_client.create_sandbox,
+                    template=sandbox_template,
+                    namespace=sandbox_namespace,
+                )
+                claim_ms = (time.time() - t0) * 1000.0
+
+                # 2. Resolve pod IP
+                pod_name = await asyncio.to_thread(sandbox.get_pod_name)
+                pod = await asyncio.to_thread(
+                    core_v1.read_namespaced_pod, pod_name, sandbox_namespace
+                )
+                pod_ip = pod.status.pod_ip
+                if not pod_ip:
+                    raise RuntimeError(f"Pod {pod_name} has no IP assigned")
+
+                # NOTE(review): the docstring names remote-debugging port
+                # 9222 while this connects to 9223 — presumably a proxy in
+                # the sandbox image; confirm against the sandbox template.
+                cdp_url = f"http://{pod_ip}:9223"
+
+                # 3. Connect Playwright via CDP
+                async with async_playwright() as pw:
+                    # Wait for Chrome to be ready (retry connection)
+                    browser = None
+                    for attempt in range(20):
+                        try:
+                            browser = await pw.chromium.connect_over_cdp(cdp_url)
+                            break
+                        except Exception:
+                            if attempt >= 19:
+                                raise
+                            await asyncio.sleep(0.5)
+
+                    # Cold start = claim + CDP connect (time until browser ready)
+                    cold_start_ms = (time.time() - t_start) * 1000.0
+
+                    context = await browser.new_context()
+                    page = await context.new_page()
+
+                    # Navigate once before measurement loop
+                    await page.goto(test_page, wait_until="domcontentloaded")
+
+                    # Latency arrays (filled during measured runs only)
+                    navigate_ms = []
+                    screenshot_ms = []
+                    evaluate_ms = []
+                    click_ms = []
+                    fill_ms = []
+                    interaction_ms = []
+
+                    total_runs = req.warmup_tasks + req.task_count
+                    for run_idx in range(total_runs):
+                        measuring = run_idx >= req.warmup_tasks
+
+                        # 1. Navigate (reload page)
+                        t0 = time.time()
+                        await page.goto(test_page, wait_until="domcontentloaded")
+                        elapsed = (time.time() - t0) * 1000.0
+                        if measuring:
+                            navigate_ms.append(elapsed)
+                            interaction_ms.append(elapsed)
+
+                        # 2. DOM evaluate — read heading text
+                        t0 = time.time()
+                        await page.evaluate(
+                            "() => document.getElementById('heading').textContent"
+                        )
+                        elapsed = (time.time() - t0) * 1000.0
+                        if measuring:
+                            evaluate_ms.append(elapsed)
+                            interaction_ms.append(elapsed)
+
+                        # 3. Fill input
+                        t0 = time.time()
+                        await page.fill("#search", f"query-{run_idx}")
+                        elapsed = (time.time() - t0) * 1000.0
+                        if measuring:
+                            fill_ms.append(elapsed)
+                            interaction_ms.append(elapsed)
+
+                        # 4. Click button
+                        t0 = time.time()
+                        await page.click("#btn")
+                        elapsed = (time.time() - t0) * 1000.0
+                        if measuring:
+                            click_ms.append(elapsed)
+                            interaction_ms.append(elapsed)
+
+                        # 5. Verify click effect (DOM mutation)
+                        t0 = time.time()
+                        await page.evaluate(
+                            "() => document.getElementById('output').textContent"
+                        )
+                        elapsed = (time.time() - t0) * 1000.0
+                        if measuring:
+                            evaluate_ms.append(elapsed)
+                            interaction_ms.append(elapsed)
+
+                        # 6. Screenshot
+                        t0 = time.time()
+                        await page.screenshot()
+                        elapsed = (time.time() - t0) * 1000.0
+                        if measuring:
+                            screenshot_ms.append(elapsed)
+                            interaction_ms.append(elapsed)
+
+                    # Read pod memory usage from K8s Metrics API
+                    rss_mb = None
+                    try:
+                        async with _metrics_semaphore:
+                            custom_api = k8s_client.CustomObjectsApi()
+                            pod_metrics = await asyncio.to_thread(
+                                custom_api.get_namespaced_custom_object,
+                                group="metrics.k8s.io",
+                                version="v1beta1",
+                                namespace=sandbox_namespace,
+                                plural="pods",
+                                name=pod_name,
+                            )
+                        # Only the first container's usage is inspected.
+                        for c in pod_metrics.get("containers", []):
+                            usage = c.get("usage", {}).get("memory", "")
+                            if usage.endswith("Ki"):
+                                rss_mb = round(int(usage[:-2]) / 1024, 1)
+                            elif usage.endswith("Mi"):
+                                rss_mb = round(float(usage[:-2]), 1)
+                            elif usage.endswith("Gi"):
+                                rss_mb = round(float(usage[:-2]) * 1024, 1)
+                            break
+                    except Exception:
+                        logger.warning(
+                            "Failed to read pod metrics for %s",
+                            pod_name,
+                            exc_info=True,
+                        )
+
+                    await browser.close()
+
+                total_ms = (time.time() - t_start) * 1000.0
+
+                return {
+                    "session_id": session_id,
+                    "sandbox_status": "ok",
+                    "orchestrator_total_ms": round(total_ms, 3),
+                    "claim_ms": round(claim_ms, 3),
+                    "cold_start_ms": round(cold_start_ms, 3),
+                    "rss_mb": rss_mb,
+                    "navigate": _compute_stats(navigate_ms),
+                    "evaluate": _compute_stats(evaluate_ms),
+                    "fill": _compute_stats(fill_ms),
+                    "click": _compute_stats(click_ms),
+                    "screenshot": _compute_stats(screenshot_ms),
+                    "interaction": _compute_stats(interaction_ms),
+                }
+
+            except Exception as e:
+                total_ms = (time.time() - t_start) * 1000.0
+                logger.exception("Chromium CDP session %d failed", session_id)
+                return {
+                    "session_id": session_id,
+                    "orchestrator_total_ms": round(total_ms, 3),
+                    "claim_ms": round(claim_ms, 3),
+                    "error": f"{type(e).__name__}: {e}",
+                }
+            finally:
+                if sandbox is not None:
+                    try:
+                        await asyncio.to_thread(
+                            sb_client.delete_sandbox,
+                            sandbox.claim_name,
+                            namespace=sandbox_namespace,
+                        )
+                    except Exception:
+                        logger.warning(
+                            "Failed to delete sandbox for session %d",
+                            session_id,
+                            exc_info=True,
+                        )
+
+        # Fire concurrent sessions
+        tasks = [_run_chromium_session_cdp(i) for i in range(req.concurrent_sessions)]
+        session_results = await asyncio.gather(*tasks)
+
+        # Separate successful vs failed
+        successful = [r for r in session_results if "error" not in r]
+        failed = [r for r in session_results if "error" in r]
+
+        # Aggregate metrics
+        aggregate = {}
+        if successful:
+            orch_times = sorted(r["orchestrator_total_ms"] for r in successful)
+            aggregate.update(_percentile_stats(orch_times, "orchestrator_total"))
+
+            claim_times = sorted(r["claim_ms"] for r in successful if "claim_ms" in r)
+            if claim_times:
+                aggregate.update(_percentile_stats(claim_times, "claim"))
+
+            # Aggregate cold start and RSS
+            cold_starts = sorted(
+                r["cold_start_ms"] for r in successful if "cold_start_ms" in r
+            )
+            if cold_starts:
+                aggregate["cold_start_mean_ms"] = round(
+                    sum(cold_starts) / len(cold_starts), 3
+                )
+                aggregate["cold_start_p95_ms"] = round(
+                    cold_starts[min(int(len(cold_starts) * 0.95), len(cold_starts) - 1)], 3
+                )
+
+            rss_vals = sorted(
+                r["rss_mb"] for r in successful if r.get("rss_mb") is not None
+            )
+            if rss_vals:
+                aggregate["rss_end_mb"] = round(sum(rss_vals) / len(rss_vals), 1)
+
+            # Aggregate per-task-type interaction stats: mean of the
+            # per-session means, and p95 of the per-session p95 values.
+            for metric_key in (
+                "interaction",
+                "navigate",
+                "evaluate",
+                "click",
+                "fill",
+                "screenshot",
+            ):
+                means = sorted(
+                    r[metric_key]["mean_ms"]
+                    for r in successful
+                    if isinstance(r.get(metric_key), dict) and "mean_ms" in r[metric_key]
+                )
+                p95s = sorted(
+                    r[metric_key]["p95_ms"]
+                    for r in successful
+                    if isinstance(r.get(metric_key), dict) and "p95_ms" in r[metric_key]
+                )
+                if means:
+                    aggregate[f"{metric_key}_mean_ms"] = round(sum(means) / len(means), 3)
+                if p95s:
+                    aggregate[f"{metric_key}_p95_ms"] = round(
+                        p95s[min(int(len(p95s) * 0.95), len(p95s) - 1)], 3
+                    )
+
+        return {
+            "concurrent_sessions": req.concurrent_sessions,
+            "successful_sessions": len(successful),
+            "failed_sessions": len(failed),
+            "aggregate": aggregate,
+            "sessions": session_results,
+        }
+
+
+@app.post("/run")
+async def run_agent(req: RunRequest):
+    """Raw agent interaction — send any prompt, get back the agent text.
+
+    Args:
+        req: Request whose `prompt` is forwarded to the agent.
+
+    Returns:
+        `{"response": <agent output>}`.
+
+    Raises:
+        HTTPException: status 500 with the error text if the agent run fails.
+    """
+    try:
+        output = await _run_agent(req.prompt)
+    except Exception as e:
+        logger.exception("Agent run failed")
+        # Chain the cause so the original traceback stays attached.
+        raise HTTPException(status_code=500, detail=str(e)) from e
+    return {"response": output}
+
+
+# =========================================================================
+# Entry point
+# =========================================================================
+# Serve the FastAPI app on all interfaces; the port comes from $PORT
+# (default 8080).
+if __name__ == "__main__":
+ uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", 8080)))
diff --git a/perfkitbenchmarker/data/docker/agentic/adk-agent/requirements.txt b/perfkitbenchmarker/data/docker/agentic/adk-agent/requirements.txt
new file mode 100644
index 0000000000..4ca072323c
--- /dev/null
+++ b/perfkitbenchmarker/data/docker/agentic/adk-agent/requirements.txt
@@ -0,0 +1,11 @@
+# Requirements for GKE Performance Agent
+google-adk[gke,extensions]==1.34.1
+k8s-agent-sandbox==0.4.6
+kubernetes>=36.0.1 # Fix: v36.0.0 has auth key mismatch bug (PR #2585)
+google-cloud-aiplatform[adk]==1.153.1
+google-cloud-logging==3.15.0
+fastapi==0.135.3
+uvicorn[standard]==0.44.0
+python-dotenv==1.0.1
+playwright==1.59.0
+
diff --git a/perfkitbenchmarker/data/docker/agentic/adk-agent/sandboxed_apps/python_test_app/benchmark_density.py b/perfkitbenchmarker/data/docker/agentic/adk-agent/sandboxed_apps/python_test_app/benchmark_density.py
new file mode 100644
index 0000000000..c1d20ecbfb
--- /dev/null
+++ b/perfkitbenchmarker/data/docker/agentic/adk-agent/sandboxed_apps/python_test_app/benchmark_density.py
@@ -0,0 +1,196 @@
+#!/usr/bin/env python3
+"""
+Agentic Python Sandbox Benchmark
+Measures: TTFE (Time to First Execution), CEL (Command Execution Latency), RSS Memory
+
+Three task categories:
+ - compute: CPU-bound (matrix multiply, sorting large lists)
+ - syscall: gVisor Sentry stress (large file I/O, many stat calls)
+ - import: Gofer FS I/O + memory (import heavy stdlib, build data)
+
+Metrics: all sandbox_* keys.
+"""
+import time
+import json
+import os
+import resource
+import sys
+import math
+import random
+import warnings
+
+# Silence all Python warnings so they do not mix into the script output.
+warnings.filterwarnings("ignore")
+
+# Number of measured / warmup iterations; an unset or empty variable falls
+# back to the default because of the `or`.
+SAMPLE_COUNT = int(os.environ.get("SAMPLE_COUNT") or "20")
+SAMPLE_WARMUP = int(os.environ.get("SAMPLE_WARMUP") or "0")
+
+print(f"SAMPLE_COUNT: {SAMPLE_COUNT}")
+print(f"SAMPLE_WARMUP: {SAMPLE_WARMUP}")
+
+# ── Persistent allocations (retained across iterations to grow RSS) ──
+# ~20MB baseline allocation that stays resident
+_RESIDENT_DATA = [bytearray(1024 * 1024) for _ in range(20)] # 20 × 1MB
+
+
+def get_rss_mb():
+ """Return the process's peak RSS (ru_maxrss high-water mark) in MB.
+
+ Note this is the maximum resident set size so far, not the current
+ one. The division by 1024 assumes ru_maxrss is reported in kilobytes,
+ as on Linux.
+ """
+ return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024
+
+
+def get_static_tasks():
+    """Return deterministic static tasks to measure execution latency.
+
+    Three task categories enable decomposition of CEL degradation:
+      - compute: sort a 100k-element list + a 40k-element multiply-sum
+      - syscall: write/read a 1MB temp file, 1000 stat + 1000 listdir calls
+      - import: 20 rounds of re-importing 13 stdlib modules + build a
+        large dict
+
+    Returns:
+        A list of dicts with keys "id", "type" and "code"; "code" is Python
+        source meant to be passed to exec().
+    """
+    return [
+        {
+            "id": 1,
+            "type": "compute",
+            "code": (
+                "import math, random\n"
+                "random.seed(42)\n"
+                "data = [random.random() for _ in range(100_000)]\n"
+                "data.sort()\n"
+                "# Matrix-like multiply (flattened 200×200)\n"
+                "a = list(range(40_000))\n"
+                "b = [x * 0.001 for x in a]\n"
+                "_ = sum(x * y for x, y in zip(a, b))\n"
+            ),
+        },
+        {
+            "id": 2,
+            "type": "syscall",
+            "code": (
+                "import os, tempfile\n"
+                "d = tempfile.gettempdir()\n"
+                "# Write + read 1MB file through gVisor Gofer\n"
+                "path = os.path.join(d, 'bench_heavy.bin')\n"
+                "data = b'x' * (1024 * 1024)\n"
+                "with open(path, 'wb') as f:\n"
+                "    f.write(data)\n"
+                "with open(path, 'rb') as f:\n"
+                "    _ = f.read()\n"
+                "os.unlink(path)\n"
+                "# Heavy stat/listdir\n"
+                "[os.stat(d) for _ in range(1000)]\n"
+                "[os.listdir(d) for _ in range(1000)]\n"
+            ),
+        },
+        {
+            "id": 3,
+            "type": "import",
+            # The nested blocks below must be indented inside the string:
+            # with flat indentation the exec() of this task fails with
+            # IndentationError.
+            "code": (
+                "import importlib, sys\n"
+                "mods = [\n"
+                "    'json', 'csv', 'html', 'email', 'unittest', 'logging',\n"
+                "    'xml.etree.ElementTree', 'http.client', 'urllib.request',\n"
+                "    'argparse', 'pprint', 'textwrap', 'difflib',\n"
+                "]\n"
+                "for _ in range(20):\n"
+                "    for m in mods:\n"
+                "        try:\n"
+                "            sys.modules.pop(m, None)\n"
+                "            importlib.import_module(m)\n"
+                "        except Exception:\n"
+                "            pass\n"
+                "# Build a large dict to add memory pressure\n"
+                "_ = {str(i): list(range(100)) for i in range(10_000)}\n"
+            ),
+        },
+    ]
+
+
+def _percentile(sorted_vals, pct):
+    """Return the value at the given percentile from a pre-sorted list.
+
+    Args:
+        sorted_vals: Values in ascending order.
+        pct: Percentile as a fraction (e.g. 0.95).
+
+    Returns:
+        The element at index int(len * pct), clamped to the last element;
+        0.0 for an empty list.
+    """
+    if not sorted_vals:
+        # Without this guard the clamp below indexes [-1] on an empty list.
+        return 0.0
+    idx = int(len(sorted_vals) * pct)
+    return sorted_vals[min(idx, len(sorted_vals) - 1)]
+
+
+def run_benchmark():
+    """Run the density benchmark and print a JSON summary to stdout.
+
+    Measures TTFE, then times the compute and syscall tasks for
+    SAMPLE_COUNT iterations (after SAMPLE_WARMUP untimed rounds), runs the
+    import task once, and records RSS before and after. The detailed
+    results are also written to /tmp/benchmark_results.json.
+
+    Returns:
+        The summary dict that was printed (raw latency arrays plus the
+        scalar `sandbox_*` metrics).
+    """
+    results = {"ttfe_ms": None, "cel_ms": [], "rss_mb_start": None, "rss_mb_end": None}
+
+    # Measure TTFE
+    ttfe_start = time.perf_counter()
+    exec("x = 1 + 1", globals())
+    results["ttfe_ms"] = round((time.perf_counter() - ttfe_start) * 1000, 6)
+
+    results["rss_mb_start"] = get_rss_mb()
+
+    tasks = get_static_tasks()
+    sampled_tasks = [t for t in tasks if t["type"] != "import"]
+    import_task = next((t for t in tasks if t["type"] == "import"), None)
+
+    # Warmup — sampled tasks only (import uses C-extension modules that
+    # error on repeated reimport, so it runs once outside the loop)
+    for _ in range(SAMPLE_WARMUP):
+        for task in sampled_tasks:
+            exec(task["code"], globals())
+
+    # Benchmark iterations — compute + syscall only. Per-iteration totals
+    # and per-type latencies are accumulated as the samples are taken,
+    # instead of rescanning the whole record list afterwards.
+    iteration_totals = []
+    per_type_raw = {}
+    for i in range(SAMPLE_COUNT):
+        # Grow resident memory slightly each iteration (~100KB)
+        _RESIDENT_DATA.append(bytearray(100 * 1024))
+
+        iteration_total = 0
+        for task in sampled_tasks:
+            start = time.perf_counter()
+            exec(task["code"], globals())
+            elapsed_ms = round((time.perf_counter() - start) * 1000, 6)
+            results["cel_ms"].append({
+                "iteration": i,
+                "task_id": task["id"],
+                "type": task["type"],
+                "latency_ms": elapsed_ms,
+            })
+            iteration_total += elapsed_ms
+            per_type_raw.setdefault(task["type"], []).append(elapsed_ms)
+        iteration_totals.append(round(iteration_total, 6))
+
+    # Import task — single run (C-extension modules break on repeated reimport)
+    import_elapsed_ms = 0.0
+    if import_task:
+        import_start = time.perf_counter()
+        exec(import_task["code"], globals())
+        import_elapsed_ms = round((time.perf_counter() - import_start) * 1000, 6)
+
+    results["rss_mb_end"] = get_rss_mb()
+
+    # Output raw arrays — cross-sandbox stats computed by main.py
+    summary = {
+        "hostname": os.environ.get("HOSTNAME", "unknown"),
+        "sandbox_ttfe_ms": results["ttfe_ms"],
+        "sandbox_total_cel_ms": iteration_totals,
+        "sandbox_import_cel_ms": import_elapsed_ms,
+        "sandbox_rss_start_mb": results["rss_mb_start"],
+        "sandbox_rss_end_mb": results["rss_mb_end"],
+        "sandbox_rss_growth_mb": round(results["rss_mb_end"] - results["rss_mb_start"], 6),
+        "sample_count": SAMPLE_COUNT,
+        "sample_warmup": SAMPLE_WARMUP,
+        "total_iterations": len(iteration_totals),
+        "task_types": len(per_type_raw) + (1 if import_task else 0),
+    }
+
+    # Emit per-type arrays in sorted type order.
+    for task_type in sorted(per_type_raw):
+        summary[f"sandbox_{task_type}_cel_ms"] = per_type_raw[task_type]
+
+    print(json.dumps(summary))
+
+    with open("/tmp/benchmark_results.json", "w") as f:
+        json.dump(results, f)
+
+    return summary
+
+# Run the benchmark only when executed as a script, not on import.
+if __name__ == "__main__":
+ run_benchmark()
diff --git a/perfkitbenchmarker/data/docker/agentic/adk-agent/sandboxed_apps/python_test_app/benchmark_payload.py b/perfkitbenchmarker/data/docker/agentic/adk-agent/sandboxed_apps/python_test_app/benchmark_payload.py
new file mode 100644
index 0000000000..f92a3e694d
--- /dev/null
+++ b/perfkitbenchmarker/data/docker/agentic/adk-agent/sandboxed_apps/python_test_app/benchmark_payload.py
@@ -0,0 +1,203 @@
+#!/usr/bin/env python3
+"""Agentic Payload Transfer Benchmark (Use Case D).
+
+Measures the cost of returning large "Observation" payloads from a gVisor
+sandbox back to the Orchestrator via the real data path:
+ stdout → code_execution_result.output → orchestrator HTTP response.
+
+For a given PAYLOAD_SIZE_MB, the script:
+ 1. Generates a payload of that size (os.urandom + base64)
+ 2. Measures generation, serialization, and stdout-write times separately
+ 3. Repeats for PAYLOAD_ITERATIONS to compute stable percentiles
+ 4. On the final iteration, writes the actual payload to stdout (measuring
+ real end-to-end transfer); other iterations write to /dev/null to
+ measure write-syscall cost without flooding the return channel.
+ 5. Emits a JSON summary to stderr for diagnostics and again to stdout,
+ after the payload, where main.py parses it
+
+Metrics are split so that pass/fail thresholds can exclude generation
+time (os.urandom), which is not part of data transfer.
+
+Environment variables (injected by the agent):
+ PAYLOAD_SIZE_MB — target payload size in megabytes (default: 1)
+ PAYLOAD_ITERATIONS — number of transfer iterations (default: 20)
+"""
+
+import base64
+import json
+import os
+import resource
+import sys
+import time
+
+# Benchmark parameters; an unset or empty variable falls back to the default.
+PAYLOAD_SIZE_MB = float(os.environ.get("PAYLOAD_SIZE_MB") or "1")
+PAYLOAD_ITERATIONS = int(os.environ.get("PAYLOAD_ITERATIONS") or "20")
+
+
+def _log(msg):
+    """Write *msg* plus a newline to stderr and flush.
+
+    All diagnostic/metric output goes to stderr so that stdout is reserved
+    for the actual payload transfer (the measured data path).
+    """
+    sys.stderr.write(f"{msg}\n")
+    sys.stderr.flush()
+
+
+_log(f"PAYLOAD_SIZE_MB: {PAYLOAD_SIZE_MB}")
+_log(f"PAYLOAD_ITERATIONS: {PAYLOAD_ITERATIONS}")
+
+
+def get_rss_mb():
+ """Return the process's peak RSS (ru_maxrss high-water mark) in MB.
+
+ Note this is the maximum resident set size so far, not the current
+ one. The division by 1024 assumes ru_maxrss is reported in kilobytes,
+ as on Linux.
+ """
+ return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024
+
+
+def _percentile(sorted_vals, pct):
+    """Return the value at the given percentile from a pre-sorted list.
+
+    The index is int(len * pct), clamped to the last element. An empty
+    list yields 0.0.
+    """
+    if not sorted_vals:
+        return 0.0
+    last = len(sorted_vals) - 1
+    return sorted_vals[min(int(len(sorted_vals) * pct), last)]
+
+
+def _stats_for(latencies):
+    """Compute mean/p50/p95/p99/min/max for a list of latencies (ms).
+
+    Args:
+        latencies: Samples in any order; the list is not modified.
+
+    Returns:
+        A dict with keys "mean", "p50", "p95", "p99", "min" and "max", each
+        rounded to 6 decimals. All values are 0.0 when there are no samples.
+    """
+    # Sort a copy so the caller's list keeps its original order.
+    ordered = sorted(latencies)
+    if not ordered:
+        # Avoid dividing by zero / indexing an empty list.
+        return dict.fromkeys(("mean", "p50", "p95", "p99", "min", "max"), 0.0)
+    return {
+        "mean": round(sum(ordered) / len(ordered), 6),
+        "p50": round(ordered[len(ordered) // 2], 6),
+        "p95": round(_percentile(ordered, 0.95), 6),
+        "p99": round(_percentile(ordered, 0.99), 6),
+        "min": round(ordered[0], 6),
+        "max": round(ordered[-1], 6),
+    }
+
+
+def run_benchmark():
+    """Execute the payload transfer benchmark and print JSON results.
+
+    For PAYLOAD_ITERATIONS iterations, generates `PAYLOAD_SIZE_MB` of random
+    bytes, base64-encodes them and writes the encoded text out, timing
+    each phase separately. Only the final iteration writes to the real
+    stdout; the others write to /dev/null.
+
+    Returns:
+        The summary dict of `sandbox_*` metrics, which is also logged to
+        stderr and printed to stdout after a marker line.
+    """
+    target_bytes = int(PAYLOAD_SIZE_MB * 1024 * 1024)
+    rss_start = get_rss_mb()
+
+    generation_times = []
+    serialization_times = []
+    stdout_times = []  # stdout write syscall time
+    transfer_times = []  # serialize + stdout write (the threshold metric)
+    throughputs = []  # MB/s based on stdout write time
+
+    # --- Warmup (2 iterations, not recorded) ---
+    for _ in range(2):
+        raw = os.urandom(target_bytes)
+        _ = base64.b64encode(raw).decode("ascii")
+
+    # --- Measured iterations ---
+    for i in range(PAYLOAD_ITERATIONS):
+        # 1. Generate payload (os.urandom — NOT data transfer)
+        t0 = time.perf_counter()
+        raw = os.urandom(target_bytes)
+        t_gen = time.perf_counter()
+
+        # 2. Serialize (base64 encode — mirrors real observation encoding)
+        encoded = base64.b64encode(raw).decode("ascii")
+        t_ser = time.perf_counter()
+
+        # 3. Transfer — write payload to stdout (the real sandbox→orchestrator path).
+        # Only the final iteration writes to actual stdout to measure real
+        # end-to-end transfer without flooding the return channel.
+        # Other iterations write to /dev/null (same gVisor write-syscall path,
+        # data discarded by host kernel).
+        t_xfer_start = time.perf_counter()
+        if i == PAYLOAD_ITERATIONS - 1:
+            sys.stdout.write(encoded)
+            sys.stdout.flush()
+        else:
+            with open("/dev/null", "w") as devnull:
+                devnull.write(encoded)
+        t_xfer = time.perf_counter()
+
+        gen_ms = (t_gen - t0) * 1000
+        ser_ms = (t_ser - t_gen) * 1000
+        stdout_ms = (t_xfer - t_xfer_start) * 1000
+        transfer_ms = ser_ms + stdout_ms  # excludes generation
+
+        generation_times.append(gen_ms)
+        serialization_times.append(ser_ms)
+        stdout_times.append(stdout_ms)
+        transfer_times.append(transfer_ms)
+
+        # Throughput in MB/s (based on encoded size and stdout write time)
+        encoded_size_mb = len(encoded) / (1024 * 1024)
+        if stdout_ms > 0:
+            throughputs.append(encoded_size_mb / (stdout_ms / 1000))
+
+    rss_end = get_rss_mb()
+
+    # Compute stats, in the order they appear in the summary.
+    timing_stats = (
+        # Generation time (os.urandom — NOT data transfer, excluded from threshold)
+        ("generation", _stats_for(generation_times)),
+        # Serialization time (base64 encode — CPU bound)
+        ("serialization", _stats_for(serialization_times)),
+        # Stdout write time (the raw write-syscall through gVisor)
+        ("stdout", _stats_for(stdout_times)),
+        # Transfer time (serialization + stdout write — the threshold metric)
+        ("transfer", _stats_for(transfer_times)),
+    )
+    throughput_stats = _stats_for(throughputs) if throughputs else {}
+
+    # Payload metadata. Padded base64 emits 4 characters per 3 input bytes,
+    # so the encoded size follows arithmetically; generating and encoding
+    # another full payload just to measure its length is unnecessary.
+    encoded_size_bytes = 4 * ((target_bytes + 2) // 3)
+
+    summary = {
+        "hostname": os.environ.get("HOSTNAME", "unknown"),
+        # Payload config
+        "sandbox_payload_size_bytes": target_bytes,
+        "sandbox_payload_encoded_size_bytes": encoded_size_bytes,
+        "sandbox_payload_iterations": PAYLOAD_ITERATIONS,
+    }
+    # Keys look like sandbox_<phase>_time_<stat>_ms.
+    for phase, stats in timing_stats:
+        for stat in ("mean", "p50", "p95", "p99", "min", "max"):
+            summary[f"sandbox_{phase}_time_{stat}_ms"] = stats[stat]
+    summary.update(
+        {
+            # Throughput (MB/s based on transfer write time)
+            "sandbox_throughput_mean_mbps": throughput_stats.get("mean"),
+            "sandbox_throughput_p50_mbps": throughput_stats.get("p50"),
+            "sandbox_throughput_min_mbps": throughput_stats.get("min"),
+            # RSS
+            "sandbox_rss_start_mb": rss_start,
+            "sandbox_rss_end_mb": rss_end,
+            "sandbox_rss_growth_mb": round(rss_end - rss_start, 6),
+        }
+    )
+
+    # Emit JSON summary to stderr for diagnostics.
+    _log("---BENCHMARK_RESULT_JSON---")
+    _log(json.dumps(summary, indent=2))
+
+    # Also emit to stdout (after the payload data) so that
+    # _parse_sandbox_json() can find it in code_execution_result.output.
+    # ADK only captures stdout, not stderr.
+    print("\n---BENCHMARK_RESULT_JSON---", flush=True)
+    print(json.dumps(summary), flush=True)
+
+    return summary
+
+
+# Script entry point: run the benchmark and print a traceback on failure.
+# NOTE(review): after a failure the process still exits with status 0 and
+# the traceback goes to stderr only — confirm the orchestrator detects this.
+if __name__ == "__main__":
+ try:
+ run_benchmark()
+ except Exception as e:
+ import traceback
+
+ traceback.print_exc()
diff --git a/perfkitbenchmarker/data/docker/agentic/adk-agent/sandboxed_apps/python_test_app/benchmark_qps.py b/perfkitbenchmarker/data/docker/agentic/adk-agent/sandboxed_apps/python_test_app/benchmark_qps.py
new file mode 100644
index 0000000000..07ef6309db
--- /dev/null
+++ b/perfkitbenchmarker/data/docker/agentic/adk-agent/sandboxed_apps/python_test_app/benchmark_qps.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python3
+"""Minimal QPS benchmark script for UC-F (Scheduling Throughput).
+
+Runs inside the GKE Agent Sandbox to validate claim readiness.
+Executes a trivial operation and reports status. The orchestrator-side
+timing (orchestrator_total_ms) serves as the primary TTFE measurement —
+when the warm pool drains, that metric spikes because fresh pods must be
+cold-started.
+"""
+import json
+import time
+
+t0 = time.perf_counter()
+
+# Trivial computation to prove the sandbox is functional
+result = sum(range(10_000))
+
+elapsed_ms = (time.perf_counter() - t0) * 1000
+
+print(json.dumps({
+ "sandbox_status": "ok",
+ "sandbox_qps_exec_ms": round(elapsed_ms, 3),
+ "sandbox_compute_result": result,
+}))
diff --git a/perfkitbenchmarker/data/k8s_agents/config/agentic_benchmark_config.yaml b/perfkitbenchmarker/data/k8s_agents/config/agentic_benchmark_config.yaml
new file mode 100644
index 0000000000..69922efdb0
--- /dev/null
+++ b/perfkitbenchmarker/data/k8s_agents/config/agentic_benchmark_config.yaml
@@ -0,0 +1,150 @@
+# Agentic Benchmark Configuration for GKE
+# Used with: --benchmark_config_file=perfkitbenchmarker/data/k8s_agents/config/agentic_benchmark_config.yaml
+#
+# User/environment-specific flags that MUST be passed on CLI:
+#   --project=<PROJECT_ID>
+#   --owner=<OWNER>
+#   --gce_network_name=<PREFIX>-agentic-vpc
+#   --gke_additional_flags="--workload-pool=<PROJECT_ID>.svc.id.goog,--subnetwork=<PREFIX>-agentic-subnet,--enable-master-authorized-networks,--master-authorized-networks=$(curl -s ifconfig.me)/32"
+#
+# Per-run flags:
+#   --run_stage=provision|prepare|run,cleanup|teardown
+#   --run_uri=<RUN_URI>
+#   --temp_dir=<TEMP_DIR>
+#
+# Benchmark-specific sweep parameters (vary per run):
+# --k8s_python_density_concurrent_sandbox_count=N
+# --k8s_snapshot_preload_mb=N
+# etc.
+
+# ===========================================================================
+# Shared configuration (defined once, referenced by all benchmarks via YAML
+# anchors). PKB ignores top-level keys that don't match a benchmark name.
+# ===========================================================================
+
+_shared_flags: &shared_flags
+ # --- Cluster creation flags ---
+ gke_additional_flags:
+ - "--enable-pod-snapshots"
+ - "--enable-dataplane-v2"
+ - "--enable-private-nodes"
+ - "--enable-ip-alias"
+ - "--master-ipv4-cidr=172.16.0.0/28"
+ gke_additional_nodepool_flags:
+ - "--max-pods-per-node=250"
+ container_cluster_version: "1.35.5-gke.1057002"
+ gke_enable_shielded_nodes: false
+ gce_subnet_region: "us-central1"
+
+ # --- Agentic workload flags ---
+ k8s_namespace: "agentic"
+ agent_sandbox_version: "v0.4.6"
+ k8s_gvisor: true
+ k8s_agent_api_url: "http://localhost:8080"
+
+_shared_cluster: &shared_cluster
+ cloud: GCP
+ type: Kubernetes
+ vm_count: 1
+ vm_spec:
+ GCP:
+ machine_type: c4-standard-8
+ zone: us-central1-a
+ boot_disk_type: hyperdisk-balanced
+ boot_disk_size: 50
+ nodepools:
+ sandbox:
+ vm_count: 1
+ vm_spec:
+ GCP:
+ machine_type: c4-standard-8
+ zone: us-central1-a
+ boot_disk_type: hyperdisk-balanced
+ boot_disk_size: 100
+ sandbox_config:
+ type: gvisor
+
+_shared_registry: &shared_registry
+ cloud: GCP
+ spec:
+ GCP:
+ zone: us-central1-a
+
+
+_shared_container_specs: &shared_container_specs
+ adk_agent:
+ image: agentic/adk-agent
+
+# ===========================================================================
+# Benchmark definitions (each references the shared anchors above)
+# ===========================================================================
+
+k8s_python_density:
+ flags:
+ <<: *shared_flags
+ container_registry:
+ <<: *shared_registry
+ container_specs:
+ <<: *shared_container_specs
+ container_cluster:
+ <<: *shared_cluster
+
+k8s_chromium_density:
+ flags:
+ <<: *shared_flags
+ container_registry:
+ <<: *shared_registry
+ container_specs:
+ <<: *shared_container_specs
+ container_cluster:
+ <<: *shared_cluster
+
+k8s_payload:
+ flags:
+ <<: *shared_flags
+ container_registry:
+ <<: *shared_registry
+ container_specs:
+ <<: *shared_container_specs
+ container_cluster:
+ <<: *shared_cluster
+
+k8s_qps:
+ flags:
+ <<: *shared_flags
+ container_registry:
+ <<: *shared_registry
+ container_specs:
+ <<: *shared_container_specs
+ container_cluster:
+ <<: *shared_cluster
+
+k8s_snapshot:
+ flags:
+ <<: *shared_flags
+ container_registry:
+ <<: *shared_registry
+ container_specs:
+ <<: *shared_container_specs
+ container_cluster:
+ <<: *shared_cluster
+
+k8s_warmpool:
+ flags:
+ <<: *shared_flags
+ container_registry:
+ <<: *shared_registry
+ container_specs:
+ <<: *shared_container_specs
+ container_cluster:
+ <<: *shared_cluster
+
+k8s_deletion:
+ flags:
+ <<: *shared_flags
+ container_registry:
+ <<: *shared_registry
+ container_specs:
+ <<: *shared_container_specs
+ container_cluster:
+ <<: *shared_cluster
diff --git a/perfkitbenchmarker/data/k8s_agents/manifests/adk-agent.yaml.j2 b/perfkitbenchmarker/data/k8s_agents/manifests/adk-agent.yaml.j2
new file mode 100644
index 0000000000..068b50be11
--- /dev/null
+++ b/perfkitbenchmarker/data/k8s_agents/manifests/adk-agent.yaml.j2
@@ -0,0 +1,118 @@
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+ name: adk-agent-sa
+ namespace: {{ ns }}
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+ name: adk-agent-sandbox-role
+rules:
+ - apiGroups: ["agents.x-k8s.io"]
+ resources: ["sandboxes"]
+ verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
+ - apiGroups: ["agents.x-k8s.io"]
+ resources: ["sandboxwarmpool", "sandboxwarmpools"]
+ verbs: ["get", "list", "watch"]
+ - apiGroups: ["extensions.agents.x-k8s.io"]
+ resources: ["sandboxclaims"]
+ verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
+ - apiGroups: [""]
+ resources: ["pods", "pods/log", "pods/exec", "services", "configmaps"]
+ verbs: ["get", "list", "watch"]
+ - apiGroups: [""]
+ resources: ["pods/portforward"]
+ verbs: ["create"]
+ - apiGroups: ["metrics.k8s.io"]
+ resources: ["pods"]
+ verbs: ["get", "list"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+ name: adk-agent-sandbox-binding
+ namespace: {{ ns }}
+subjects:
+ - kind: ServiceAccount
+ name: adk-agent-sa
+ namespace: {{ ns }}
+roleRef:
+ kind: ClusterRole
+ name: adk-agent-sandbox-role
+ apiGroup: rbac.authorization.k8s.io
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: adk-agent
+ namespace: {{ ns }}
+spec:
+ replicas: 1
+ selector:
+ matchLabels:
+ app: adk-agent
+ template:
+ metadata:
+ labels:
+ app: adk-agent
+ spec:
+ serviceAccountName: adk-agent-sa
+ containers:
+ - name: adk-agent
+ imagePullPolicy: Always
+ image: {{ adk_image }}
+ resources:
+ limits:
+ memory: "16384Mi"
+ cpu: "6000m"
+ requests:
+ memory: "512Mi"
+ cpu: "1000m"
+ ports:
+ - containerPort: 8080
+ livenessProbe:
+ httpGet:
+ path: /healthz
+ port: 8080
+ initialDelaySeconds: 15
+ periodSeconds: 30
+ timeoutSeconds: 10
+ failureThreshold: 6
+ readinessProbe:
+ httpGet:
+ path: /healthz
+ port: 8080
+ initialDelaySeconds: 5
+ periodSeconds: 10
+ timeoutSeconds: 5
+ failureThreshold: 3
+ env:
+ - name: PORT
+ value: "8080"
+ - name: GOOGLE_CLOUD_PROJECT
+ value: "{{ project }}"
+ - name: GOOGLE_CLOUD_LOCATION
+ value: "{{ region }}"
+ - name: GOOGLE_GENAI_USE_VERTEXAI
+ value: "true"
+ - name: CLUSTER_NAME
+ value: "{{ cluster }}"
+ - name: AGENTIC_NAMESPACE
+ value: "{{ ns }}"
+ - name: SANDBOX_ROUTER_URL
+ value: "http://sandbox-router-svc.{{ ns }}.svc.cluster.local:8080"
+---
+apiVersion: v1
+kind: Service
+metadata:
+ name: adk-agent
+ namespace: {{ ns }}
+spec:
+ type: ClusterIP
+ ports:
+ - port: 80
+ targetPort: 8080
+ selector:
+ app: adk-agent
diff --git a/perfkitbenchmarker/data/k8s_agents/manifests/psi-reader.yaml.j2 b/perfkitbenchmarker/data/k8s_agents/manifests/psi-reader.yaml.j2
new file mode 100644
index 0000000000..d76f851e95
--- /dev/null
+++ b/perfkitbenchmarker/data/k8s_agents/manifests/psi-reader.yaml.j2
@@ -0,0 +1,56 @@
+---
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+ name: psi-reader
+ namespace: {{ ns }}
+ labels:
+ app: psi-reader
+spec:
+ selector:
+ matchLabels:
+ app: psi-reader
+ template:
+ metadata:
+ labels:
+ app: psi-reader
+ spec:
+ nodeSelector:
+ pkb_nodepool: sandbox
+ tolerations:
+ - key: "sandbox.gke.io/runtime"
+ operator: "Equal"
+ value: "gvisor"
+ effect: "NoSchedule"
+ - key: "dedicated"
+ operator: "Equal"
+ value: "agentic-sandbox"
+ effect: "NoSchedule"
+ hostPID: true
+ containers:
+ - name: reader
+ image: busybox:1.36
+ command: ["sleep", "infinity"]
+ securityContext:
+ privileged: true
+ volumeMounts:
+ - name: cgroup
+ mountPath: /host/sys/fs/cgroup
+ readOnly: true
+ - name: proc
+ mountPath: /host/proc
+ readOnly: true
+ resources:
+ requests:
+ cpu: "10m"
+ memory: "16Mi"
+ limits:
+ cpu: "50m"
+ memory: "32Mi"
+ volumes:
+ - name: cgroup
+ hostPath:
+ path: /sys/fs/cgroup
+ - name: proc
+ hostPath:
+ path: /proc
diff --git a/perfkitbenchmarker/data/k8s_agents/manifests/sandbox-router.yaml.j2 b/perfkitbenchmarker/data/k8s_agents/manifests/sandbox-router.yaml.j2
new file mode 100644
index 0000000000..0d0541cfe7
--- /dev/null
+++ b/perfkitbenchmarker/data/k8s_agents/manifests/sandbox-router.yaml.j2
@@ -0,0 +1,69 @@
+---
+apiVersion: v1
+kind: Service
+metadata:
+ name: sandbox-router-svc
+ namespace: {{ ns }}
+spec:
+ type: ClusterIP
+ selector:
+ app: sandbox-router
+ ports:
+ - name: http
+ protocol: TCP
+ port: 8080
+ targetPort: 8080
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: sandbox-router-deployment
+ namespace: {{ ns }}
+spec:
+ replicas: 2
+ selector:
+ matchLabels:
+ app: sandbox-router
+ template:
+ metadata:
+ labels:
+ app: sandbox-router
+ spec:
+ serviceAccountName: adk-agent-sa
+ topologySpreadConstraints:
+ - maxSkew: 1
+ topologyKey: topology.kubernetes.io/zone
+ whenUnsatisfiable: ScheduleAnyway
+ labelSelector:
+ matchLabels:
+ app: sandbox-router
+ containers:
+ - name: router
+ image: {{ router_image }}
+ ports:
+ - containerPort: 8080
+ env:
+ - name: ALLOW_UNAUTHENTICATED_ROUTER
+ value: "true"
+ readinessProbe:
+ httpGet:
+ path: /healthz
+ port: 8080
+ initialDelaySeconds: 5
+ periodSeconds: 5
+ livenessProbe:
+ httpGet:
+ path: /healthz
+ port: 8080
+ initialDelaySeconds: 10
+ periodSeconds: 10
+ resources:
+ requests:
+ cpu: "250m"
+ memory: "512Mi"
+ limits:
+ cpu: "1000m"
+ memory: "1Gi"
+ securityContext:
+ runAsUser: 1000
+ runAsGroup: 1000
diff --git a/perfkitbenchmarker/data/k8s_agents/manifests/sandbox-templates.yaml.j2 b/perfkitbenchmarker/data/k8s_agents/manifests/sandbox-templates.yaml.j2
new file mode 100644
index 0000000000..e9af43332d
--- /dev/null
+++ b/perfkitbenchmarker/data/k8s_agents/manifests/sandbox-templates.yaml.j2
@@ -0,0 +1,103 @@
+---
+apiVersion: extensions.agents.x-k8s.io/v1alpha1
+kind: SandboxTemplate
+metadata:
+ name: python-sandbox-template
+ namespace: {{ ns }}
+spec:
+ podTemplate:
+ metadata:
+ labels:
+ sandbox: python-sandbox-example
+ spec:
+ runtimeClassName: gvisor
+ containers:
+ - name: python-runtime
+ image: {{ python_image }}
+ nodeSelector:
+ pkb_nodepool: sandbox
+ tolerations:
+ - key: "sandbox.gke.io/runtime"
+ operator: "Equal"
+ value: "gvisor"
+ effect: "NoSchedule"
+ - key: "dedicated"
+ operator: "Equal"
+ value: "agentic-sandbox"
+ effect: "NoSchedule"
+ restartPolicy: "OnFailure"
+---
+apiVersion: extensions.agents.x-k8s.io/v1alpha1
+kind: SandboxWarmPool
+metadata:
+ name: python-sandbox-warmpool
+ namespace: {{ ns }}
+spec:
+ replicas: {{ warmpool_replicas }}
+ sandboxTemplateRef:
+ name: python-sandbox-template
+---
+apiVersion: extensions.agents.x-k8s.io/v1alpha1
+kind: SandboxTemplate
+metadata:
+ name: chromium-sandbox-template
+ namespace: {{ ns }}
+spec:
+ podTemplate:
+ metadata:
+ labels:
+ sandbox: chromium-sandbox-example
+ spec:
+ runtimeClassName: gvisor
+ containers:
+ - name: chromium-runtime
+ image: {{ chromium_image }}
+ command: ["/bin/sh", "-c"]
+ args:
+ - |
+ socat TCP-LISTEN:9223,fork,reuseaddr TCP:127.0.0.1:9222 &
+ exec chromium --headless --no-sandbox --disable-gpu --disable-dev-shm-usage --remote-debugging-port=9222 --no-first-run --disable-field-trial-config --user-data-dir=/tmp/chrome-data about:blank
+ ports:
+ - containerPort: 9223
+ nodeSelector:
+ pkb_nodepool: sandbox
+ tolerations:
+ - key: "sandbox.gke.io/runtime"
+ operator: "Equal"
+ value: "gvisor"
+ effect: "NoSchedule"
+ - key: "dedicated"
+ operator: "Equal"
+ value: "agentic-sandbox"
+ effect: "NoSchedule"
+ restartPolicy: "OnFailure"
+---
+apiVersion: extensions.agents.x-k8s.io/v1alpha1
+kind: SandboxWarmPool
+metadata:
+ name: chromium-sandbox-warmpool
+ namespace: {{ ns }}
+spec:
+ replicas: {{ chromium_replicas }}
+ sandboxTemplateRef:
+ name: chromium-sandbox-template
+---
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+ name: allow-orchestrator-to-chromium
+ namespace: {{ ns }}
+spec:
+ podSelector:
+ matchLabels:
+ sandbox: chromium-sandbox-example
+ policyTypes:
+ - Ingress
+ ingress:
+ - from:
+ - podSelector:
+ matchLabels:
+ app: adk-agent
+ ports:
+ - protocol: TCP
+ port: 9223
diff --git a/perfkitbenchmarker/data/k8s_agents/manifests/snapshot-crds.yaml.j2 b/perfkitbenchmarker/data/k8s_agents/manifests/snapshot-crds.yaml.j2
new file mode 100644
index 0000000000..afc4e0ee4c
--- /dev/null
+++ b/perfkitbenchmarker/data/k8s_agents/manifests/snapshot-crds.yaml.j2
@@ -0,0 +1,24 @@
+---
+apiVersion: podsnapshot.gke.io/v1
+kind: PodSnapshotStorageConfig
+metadata:
+ name: benchmark-pssc-gcs
+spec:
+ snapshotStorageConfig:
+ gcs:
+ bucket: "{{ bucket_name }}"
+ path: "{{ snapshot_folder }}"
+---
+apiVersion: podsnapshot.gke.io/v1
+kind: PodSnapshotPolicy
+metadata:
+ name: benchmark-psp
+ namespace: {{ ns }}
+spec:
+ storageConfigName: benchmark-pssc-gcs
+ selector:
+ matchLabels:
+ app: snapshot-benchmark-workload
+ triggerConfig:
+ type: manual
+ postCheckpoint: resume
diff --git a/perfkitbenchmarker/data/k8s_agents/manifests/snapshot-sandbox-template.yaml.j2 b/perfkitbenchmarker/data/k8s_agents/manifests/snapshot-sandbox-template.yaml.j2
new file mode 100644
index 0000000000..11850eb444
--- /dev/null
+++ b/perfkitbenchmarker/data/k8s_agents/manifests/snapshot-sandbox-template.yaml.j2
@@ -0,0 +1,46 @@
+---
+apiVersion: extensions.agents.x-k8s.io/v1alpha1
+kind: SandboxTemplate
+metadata:
+ name: {{ template_name }}
+ namespace: {{ namespace }}
+spec:
+ podTemplate:
+ metadata:
+ labels:
+ app: snapshot-benchmark-workload
+ spec:
+ serviceAccountName: {{ ksa_name }}
+ runtimeClassName: gvisor
+ containers:
+ - name: preloader
+ image: python:3.11-slim
+ command: ["python3", "-c"]
+ args:
+ - |
+ import time, os
+ preload_mb = int(os.environ.get("PRELOAD_MB", "10"))
+ print(f"Preloading {preload_mb} MB of memory...", flush=True)
+ _ballast = bytearray(preload_mb * 1024 * 1024)
+ print(f"Preload complete. Starting counter.", flush=True)
+ i = 0
+ while True:
+ print(f"Count: {i}", flush=True)
+ i += 1
+ time.sleep(1)
+ env:
+ - name: PRELOAD_MB
+ value: "{{ preload_mb }}"
+ resources:
+ requests:
+ cpu: "250m"
+ memory: "{{ memory_mi }}Mi"
+ ephemeral-storage: "512Mi"
+ nodeSelector:
+ pkb_nodepool: sandbox
+ tolerations:
+ - key: "sandbox.gke.io/runtime"
+ operator: "Equal"
+ value: "gvisor"
+ effect: "NoSchedule"
+ restartPolicy: "OnFailure"
diff --git a/perfkitbenchmarker/data/k8s_agents/workloads/vibe_coding/README.md b/perfkitbenchmarker/data/k8s_agents/workloads/vibe_coding/README.md
new file mode 100644
index 0000000000..86b33c8486
--- /dev/null
+++ b/perfkitbenchmarker/data/k8s_agents/workloads/vibe_coding/README.md
@@ -0,0 +1,64 @@
+# Vibe Coding Startup Scripts
+
+Pluggable startup scripts for the UC-A snapshot saturation harness (`sweeps/snapshot_saturation_search.py`). Each script simulates a realistic "vibe coding" sandbox cold-start — the kind of environment setup that happens when an AI coding agent provisions a new sandbox for a user.
+
+## How It Works
+
+When `--preload_mode=script:<path>` is passed to the sweep harness:
+
+1. The script is read from disk and embedded into the pod's container entrypoint
+2. The pod runs the script to completion (installs packages, starts services, etc.)
+3. After the script exits 0, the harness prints `SCRIPT_READY` and starts a counter loop
+4. **TTFE** is measured as the total time from SandboxClaim creation to `SCRIPT_READY`
+
+This lets you compare cold-start TTFE (full script execution) against snapshot/restore TTFE (resuming from a pre-snapshotted state where the script already ran).
+
+## Scripts
+
+### startup_pip_fastapi.sh
+
+**Lightweight Python variant.** Runs natively in the `python:3.11-slim` base image.
+
+Steps: `pip install fastapi uvicorn` → create app → start uvicorn → wait for first HTTP response.
+
+Typical cold-start: ~5–8s on GKE with fast network.
+
+```bash
+# Cold-start only
+python sweeps/snapshot_saturation_search.py \
+ --skip_snapshot \
+ --preload_mode=script:workloads/vibe_coding/startup_pip_fastapi.sh \
+ --burst_size=3 --search_mode=binary --search_min=10 --search_max=30 \
+ --ttfe_threshold_s=20
+
+# With snapshot/restore (shows restore speedup vs cold-start)
+python sweeps/snapshot_saturation_search.py \
+ --preload_mode=script:workloads/vibe_coding/startup_pip_fastapi.sh \
+ --burst_size=3 --search_mode=binary --search_min=10 --search_max=30 \
+ --ttfe_threshold_s=20 --restore_threshold_s=10
+```
+
+### startup_npm_vite.sh
+
+**Heavier Node.js variant.** Installs Node.js + npm from apt, then npm-installs Vite and starts a dev server.
+
+Steps: `apt-get install nodejs npm` → `npm install vite` → start Vite dev server → wait for first page served.
+
+Typical cold-start: ~30–60s (apt + npm on cold cache).
+
+```bash
+python sweeps/snapshot_saturation_search.py \
+ --preload_mode=script:workloads/vibe_coding/startup_npm_vite.sh \
+ --burst_size=3 --search_mode=binary --search_min=10 --search_max=30 \
+ --ttfe_threshold_s=120 --restore_threshold_s=10
+```
+
+## Writing Your Own Script
+
+Requirements:
+- Must be a bash script (runs via `bash -c` in a `python:3.11-slim` container)
+- Must exit 0 on success (use `set -e` for fail-fast)
+- Should print progress to stdout (visible in pod logs for debugging)
+- The harness appends `SCRIPT_READY` + counter loop after your script — don't add your own
+
+The `PRELOAD_MB` env var is available but unused by these scripts. The sweep varies it to test different memory request levels on the pod.
diff --git a/perfkitbenchmarker/data/k8s_agents/workloads/vibe_coding/startup_npm_vite.sh b/perfkitbenchmarker/data/k8s_agents/workloads/vibe_coding/startup_npm_vite.sh
new file mode 100644
index 0000000000..f3e9c9c235
--- /dev/null
+++ b/perfkitbenchmarker/data/k8s_agents/workloads/vibe_coding/startup_npm_vite.sh
@@ -0,0 +1,84 @@
+#!/bin/bash
+# Vibe Coding Startup Script — npm + Vite dev server
+#
+# Simulates a typical agentic sandbox "vibe coding" cold-start:
+# 1. Install Node.js dependencies (bun/npm)
+# 2. Start a Vite dev server
+# 3. Wait for the server to be ready (first page served)
+#
+# This script is designed to run inside the sandbox container (python:3.11-slim).
+# It installs Node.js + npm + dependencies from scratch to measure realistic
+# cold-start latency including package installation.
+#
+# Usage (cold-start only):
+# python sweeps/snapshot_saturation_search.py \
+# --skip_snapshot \
+# --preload_mode=script:workloads/vibe_coding/startup_npm_vite.sh \
+# --burst_size=3 \
+# --search_mode=binary --search_min=10 --search_max=30 \
+# --ttfe_threshold_s=120
+#
+# Usage (with snapshot/restore):
+# python sweeps/snapshot_saturation_search.py \
+# --preload_mode=script:workloads/vibe_coding/startup_npm_vite.sh \
+# --burst_size=3 \
+# --search_mode=binary --search_min=10 --search_max=30 \
+# --ttfe_threshold_s=120 --restore_threshold_s=10
+#
+# NOTE: --search_min/--search_max control the PRELOAD_MB env var passed to
+# the container; in script mode this is unused by the script itself but
+# varies memory requests to test different resource pressure levels.
+
+set -e
+
+echo "[vibe-coding] Installing Node.js..."
+apt-get update -qq && apt-get install -y -qq nodejs npm > /dev/null 2>&1
+
+echo "[vibe-coding] Creating project scaffold..."
+mkdir -p /tmp/vibe-project && cd /tmp/vibe-project
+
+# Create a minimal package.json with Vite
+cat > package.json << 'EOF'
+{
+ "name": "vibe-sandbox",
+ "private": true,
+ "scripts": {
+ "dev": "vite --host 0.0.0.0 --port 5173"
+ },
+ "dependencies": {
+ "vite": "^5.0.0"
+ }
+}
+EOF
+
+# Create minimal index.html for Vite to serve
+cat > index.html << 'EOF'
+
+Vibe
+Ready
+
+EOF
+
+echo "[vibe-coding] Installing npm dependencies..."
+npm install --prefer-offline 2>&1 | tail -5
+
+echo "[vibe-coding] Starting Vite dev server..."
+npx vite --host 0.0.0.0 --port 5173 &
+VITE_PID=$!
+
+echo "[vibe-coding] Waiting for server to be ready..."
+MAX_WAIT=60
+ELAPSED=0
+while ! curl -s http://localhost:5173 > /dev/null 2>&1; do
+ sleep 1
+ ELAPSED=$((ELAPSED + 1))
+ if [ $ELAPSED -ge $MAX_WAIT ]; then
+ echo "[vibe-coding] ERROR: Server did not start within ${MAX_WAIT}s"
+ exit 1
+ fi
+done
+
+echo "[vibe-coding] First page served successfully (${ELAPSED}s)"
+
+# Kill the vite server — we only needed to measure startup time
+kill $VITE_PID 2>/dev/null || true
diff --git a/perfkitbenchmarker/data/k8s_agents/workloads/vibe_coding/startup_pip_fastapi.sh b/perfkitbenchmarker/data/k8s_agents/workloads/vibe_coding/startup_pip_fastapi.sh
new file mode 100644
index 0000000000..d54a851bda
--- /dev/null
+++ b/perfkitbenchmarker/data/k8s_agents/workloads/vibe_coding/startup_pip_fastapi.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+# Lightweight Vibe Coding Startup Script — pip install + FastAPI
+#
+# Simulates a Python-based agentic sandbox cold-start:
+#   1. Install Python packages (FastAPI + uvicorn)
+#   2. Start a web server
+#   3. Wait for the server to respond
+#
+# This is lighter weight than the npm/Vite variant and runs natively
+# in the python:3.11-slim base image without needing to install Node.js.
+#
+# Usage (cold-start only):
+#   python sweeps/snapshot_saturation_search.py \
+#     --skip_snapshot \
+#     --preload_mode=script:workloads/vibe_coding/startup_pip_fastapi.sh \
+#     --burst_size=3 \
+#     --search_mode=binary --search_min=10 --search_max=30 \
+#     --ttfe_threshold_s=20
+#
+# Usage (with snapshot/restore):
+#   python sweeps/snapshot_saturation_search.py \
+#     --preload_mode=script:workloads/vibe_coding/startup_pip_fastapi.sh \
+#     --burst_size=3 \
+#     --search_mode=binary --search_min=10 --search_max=30 \
+#     --ttfe_threshold_s=20 --restore_threshold_s=10
+#
+# NOTE: --search_min/--search_max control the PRELOAD_MB env var passed to
+#       the container; in script mode this is unused by the script itself but
+#       varies memory requests to test different resource pressure levels.
+
+set -eo pipefail  # pipefail: a failed `pip install` must not be masked by `| tail`
+
+echo "[vibe-coding] Installing Python packages..."
+pip install --quiet fastapi uvicorn 2>&1 | tail -3
+
+echo "[vibe-coding] Creating app..."
+cat > /tmp/app.py << 'EOF'
+from fastapi import FastAPI
+app = FastAPI()
+
+@app.get("/")
+def root():
+    return {"status": "ready"}
+EOF
+
+echo "[vibe-coding] Starting uvicorn server..."
+python -m uvicorn app:app --host 0.0.0.0 --port 8000 --app-dir /tmp &
+SERVER_PID=$!
+
+echo "[vibe-coding] Waiting for server to be ready..."
+MAX_WAIT=30
+ELAPSED=0
+while ! python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/')" 2>/dev/null; do
+  sleep 1
+  ELAPSED=$((ELAPSED + 1))
+  if [ $ELAPSED -ge $MAX_WAIT ]; then
+    echo "[vibe-coding] ERROR: Server did not start within ${MAX_WAIT}s"
+    exit 1
+  fi
+done
+
+echo "[vibe-coding] First request served successfully (${ELAPSED}s)"
+
+# Kill the server — we only needed to measure startup time
+kill $SERVER_PID 2>/dev/null || true
diff --git a/perfkitbenchmarker/linux_benchmarks/kubernetes/__init__.py b/perfkitbenchmarker/linux_benchmarks/kubernetes/__init__.py
new file mode 100644
index 0000000000..6dfb59b981
--- /dev/null
+++ b/perfkitbenchmarker/linux_benchmarks/kubernetes/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2025 PerfKitBenchmarker Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/__init__.py b/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/__init__.py
new file mode 100644
index 0000000000..6dfb59b981
--- /dev/null
+++ b/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2025 PerfKitBenchmarker Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/gke_deploy_utils.py b/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/gke_deploy_utils.py
new file mode 100644
index 0000000000..b2d31e026b
--- /dev/null
+++ b/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/gke_deploy_utils.py
@@ -0,0 +1,497 @@
+"""Shared workload deployment utilities for GKE Agent Sandbox benchmarks.
+
+Provides idempotent functions to deploy the Agent Sandbox ecosystem
+(CRDs, templates, warm pools, router, ADK agent, PSI reader) onto a
+pre-provisioned GKE cluster. Called by each benchmark's Prepare() stage.
+
+All functions are idempotent -- safe to call repeatedly without side effects.
+"""
+
+import logging
+import os
+
+from absl import flags
+from jinja2 import Template
+from perfkitbenchmarker import data
+from perfkitbenchmarker import vm_util
+from perfkitbenchmarker.resources.container_service import kubectl
+
+FLAGS = flags.FLAGS
+
+# ---------------------------------------------------------------------------
+# Flags (registered once at import time; shared across all benchmarks)
+# ---------------------------------------------------------------------------
+
+flags.DEFINE_string(
+ "agent_sandbox_version",
+ "v0.4.6",
+ "Agent Sandbox controller version (GitHub release tag).",  # used by _InstallCRDs() in the release download URL
+)
+
+flags.DEFINE_string(
+ "agent_sandbox_router_image",
+ "",
+ "Sandbox router container image. If empty, router deployment is skipped.",  # NOTE(review): _DeploySandboxRouter() first falls back to the derived image path
+)
+
+flags.DEFINE_string(
+ "k8s_agent_image",
+ "",
+ "ADK agent container image. If empty, agent deployment is skipped.",  # NOTE(review): _DeployADKAgent() first falls back to the derived image path
+)
+
+flags.DEFINE_string(
+ "k8s_chromium_image",
+ "",
+ "Chromium sandbox container image. If empty, uses placeholder.",  # NOTE(review): the derived image path is tried before the placeholder
+)
+
+flags.DEFINE_integer(
+ "agent_sandbox_warmpool_replicas",
+ 2,
+ "Default warm pool replica count for SandboxWarmPool resources.",
+)
+
+flags.DEFINE_integer(
+ "agent_sandbox_chromium_replicas",
+ 1,
+ "Default Chromium warm pool replica count.",
+)
+
+flags.DEFINE_string(
+ "k8s_python_image",
+ "registry.k8s.io/agent-sandbox/python-runtime-sandbox:v0.1.0",
+ "Python runtime sandbox container image.",
+)
+
+flags.DEFINE_integer(
+ "k8s_deploy_timeout",
+ 120,
+ "Timeout in seconds for workload deployment rollout.",  # passed to `kubectl rollout status --timeout` by _WaitForAgentReady()
+)
+
+
+
+
+# Image paths derived from project/region/arch; rebound by DeployWorkloads().
+_derived_images = {}
+
+# ---------------------------------------------------------------------------
+# Template loading
+# ---------------------------------------------------------------------------
+
+_MANIFESTS_DIR = "k8s_agents/manifests"  # resolved through data.ResourcePath()
+
+
+def _LoadTemplate(template_name):
+    """Read `template_name` from the manifests data dir as a jinja2 Template."""
+    manifests_root = data.ResourcePath(_MANIFESTS_DIR)
+    source_path = os.path.join(manifests_root, template_name)
+    with open(source_path, "r") as src:
+        text = src.read()
+    return Template(text)
+
+
+def _RenderAndApply(template_name, **kwargs):
+    """Render a manifest template, write it to disk, and `kubectl apply` it.
+
+    The file goes under <manifests dir>/tmp because RunKubectlCommand does
+    not support stdin. A failed apply is logged as a warning, not raised.
+
+    Args:
+        template_name: Template file name inside _MANIFESTS_DIR.
+        **kwargs: Variables passed to Template.render().
+
+    Returns:
+        True if kubectl exited with status 0, False otherwise.
+    """
+    rendered = _LoadTemplate(template_name).render(**kwargs)
+    tmp_dir = os.path.join(data.ResourcePath(_MANIFESTS_DIR), "tmp")
+    os.makedirs(tmp_dir, exist_ok=True)
+    stem, ext = os.path.splitext(template_name)  # strip only a trailing ".j2"
+    rendered_path = os.path.join(tmp_dir, stem if ext == ".j2" else template_name)
+    with open(rendered_path, "w") as f:
+        f.write(rendered)
+    _, stderr, retcode = kubectl.RunKubectlCommand(
+        ["apply", "-f", rendered_path], raise_on_failure=False
+    )
+    if retcode != 0:
+        logging.warning("kubectl apply failed for %s: %s", template_name, stderr[:500])
+    return retcode == 0
+
+
+flags.DEFINE_bool(
+ "skip_deploy_snapshots",
+ False,
+ "Skip deployment of Pod Snapshot infrastructure. "
+ "Set to True on non-GKE clusters where pod snapshots are not supported.",
+)  # read at the top of DeploySnapshots()
+
+flags.DEFINE_string(
+ "k8s_snapshot_ksa_name",
+ "pod-snapshot-sa",
+ "Kubernetes service account for pod snapshots.",
+)  # created by DeploySnapshots() and granted objectAdmin by _BindSnapshotIAM()
+
+
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+
+
+def _DeriveImagePaths(project, region, arch):
+    """Build the Artifact Registry image references used by the workloads.
+
+    Args:
+        project: GCP project ID.
+        region: GCP region (e.g. us-central1); prefixes the registry host.
+        arch: Docker platform architecture (amd64 or arm64); used as the tag.
+
+    Returns:
+        Dict with keys: adk_agent, sandbox_router, chromium.
+    """
+    host = f"{region}-docker.pkg.dev/{project}"
+    adk = f"{host}/adk-repo/adk-agent:{arch}"
+    router = f"{host}/agent-sandbox/sandbox-router:{arch}"
+    chromium = f"{host}/agent-sandbox/chrome-sandbox:{arch}"
+    return {"adk_agent": adk, "sandbox_router": router, "chromium": chromium}
+
+def DeployWorkloads(benchmark_spec=None):
+    """Deploy the full Agent Sandbox ecosystem onto the GKE cluster.
+
+    Idempotent: safe to call repeatedly. No images are built here; only
+    their registry paths are derived. Sequence:
+      1. Derive image paths from project / region / architecture
+      2. Create namespace
+      3. Install Agent Sandbox CRDs
+      4. Deploy SandboxTemplates + WarmPools
+      5. Deploy Sandbox Router
+      6. Deploy ADK Agent (Deployment + Service + RBAC)
+      7. Deploy PSI Reader DaemonSet
+      8. Wait for ADK Agent rollout
+
+    Args:
+        benchmark_spec: Optional benchmark spec. Its container_cluster
+            supplies project, zone and cluster name; its container_specs
+            may supply the ADK image. Project and zone fall back to FLAGS.
+    """
+    global _derived_images
+    ns = FLAGS.k8s_namespace
+    logging.info("=== DeployWorkloads: namespace=%s ===", ns)
+
+    def _RegionOf(zone):
+        # A zone is "<region>-<suffix>" (two dashes). A regional location has
+        # one dash and is already the region, so zone[:-2] would truncate it.
+        return zone.rsplit("-", 1)[0] if zone.count("-") >= 2 else zone
+
+    project = ""
+    region = ""
+    cluster_name = ""
+    cluster = None
+    if benchmark_spec:
+        cluster = getattr(benchmark_spec, "container_cluster", None)
+    if cluster:
+        project = getattr(cluster, "project", "") or ""
+        region = _RegionOf(getattr(cluster, "zone", "") or "")
+        cluster_name = getattr(cluster, "name", "") or ""
+    # Fall back to global FLAGS for anything the spec did not provide.
+    if not project:
+        project = getattr(FLAGS, "project", "") or ""
+    if not region:
+        region = _RegionOf(getattr(FLAGS, "zone", "") or "")
+
+    # Chrome and Router images are built during prerequisites
+    # (gke_prerequisites.py) and the ADK agent image by PKB container_specs
+    # during Provision, so only the paths are derived here. The import is
+    # kept for its side effect: the module defines the --target_arch flag.
+    from perfkitbenchmarker.linux_benchmarks.kubernetes.agentic import (
+        gke_image_build_utils,
+    )
+    arch = FLAGS.target_arch or "amd64"
+    _derived_images = _DeriveImagePaths(project, region, arch)
+    logging.info("DeployWorkloads: project=%s region=%s arch=%s", project, region, arch)
+    logging.info("_derived_images: %s", _derived_images)
+
+    _CreateNamespace(ns)
+    _InstallCRDs()
+    _DeploySandboxTemplates(ns)
+    _DeploySandboxRouter(ns)
+    # Prefer ADK image from PKB-native container_specs (built during Provision).
+    # Falls back to FLAGS.k8s_agent_image or derived image path.
+    adk_image_from_specs = ""
+    if benchmark_spec:
+        # Tolerate a spec whose container_specs is missing or None.
+        specs = getattr(benchmark_spec, "container_specs", None) or {}
+        adk_spec = specs.get("adk_agent")
+        if adk_spec and getattr(adk_spec, "image", None):
+            adk_image_from_specs = adk_spec.image
+            logging.info("Using ADK image from container_specs: %s", adk_image_from_specs)
+    _DeployADKAgent(
+        ns, project=project, region=region, cluster_name=cluster_name,
+        adk_image_override=adk_image_from_specs,
+    )
+    _DeployPSIReader(ns)
+    _WaitForAgentReady(ns)
+
+    logging.info("DeployWorkloads complete.")
+
+
+def DeploySnapshots():
+    """Deploy Pod Snapshot infrastructure.
+
+    Idempotent: safe to call repeatedly (each step is best-effort). Sequence:
+      1. Create GCS bucket (hierarchical namespace)
+      2. Create managed folder
+      3. Create KSA for snapshots
+      4. Bind IAM roles (skipped if the project number cannot be read)
+      5. Deploy PodSnapshotStorageConfig + PodSnapshotPolicy
+
+    Does nothing when --skip_deploy_snapshots is set or FLAGS.project is empty.
+    """
+    if FLAGS.skip_deploy_snapshots:
+        logging.info("Skipping snapshot infrastructure (--skip_deploy_snapshots=True).")
+        return
+
+    ns = FLAGS.k8s_namespace
+    project = getattr(FLAGS, "project", "") or ""
+    if not project:
+        logging.warning("DeploySnapshots: FLAGS.project not set, skipping.")
+        return
+
+    zone = getattr(FLAGS, "zone", "") or ""
+    # Strip the zone suffix; a regional location (one dash) is already a region.
+    region = zone.rsplit("-", 1)[0] if zone.count("-") >= 2 else zone
+
+    bucket_name = f"agent-sandbox-snapshots-{project}"
+    snapshot_folder = "benchmark-snapshots"
+    ksa_name = FLAGS.k8s_snapshot_ksa_name
+    logging.info("=== DeploySnapshots: bucket=%s ===", bucket_name)
+
+    # 1. Create GCS bucket
+    vm_util.IssueCommand(
+        [
+            "gcloud", "storage", "buckets", "create",
+            f"gs://{bucket_name}",
+            "--uniform-bucket-level-access",
+            "--enable-hierarchical-namespace",
+            "--soft-delete-duration=0d",
+            f"--location={region}",
+            f"--project={project}",
+        ],
+        raise_on_failure=False,
+    )
+
+    # 2. Create managed folder
+    vm_util.IssueCommand(
+        [
+            "gcloud", "storage", "managed-folders", "create",
+            f"gs://{bucket_name}/{snapshot_folder}/",
+            f"--project={project}",
+        ],
+        raise_on_failure=False,
+    )
+
+    # 3. Create KSA
+    kubectl.RunKubectlCommand(
+        ["create", "serviceaccount", ksa_name, "--namespace", ns],
+        raise_on_failure=False,
+    )
+
+    # 4. IAM bindings
+    project_number = _GetProjectNumber(project)
+    if project_number:
+        _BindSnapshotIAM(bucket_name, project, project_number, ns, ksa_name)
+
+    # 5. Deploy PSSC + PSP
+    _RenderAndApply(
+        "snapshot-crds.yaml.j2",
+        ns=ns, bucket_name=bucket_name, snapshot_folder=snapshot_folder,
+    )
+
+    logging.info("DeploySnapshots complete.")
+
+
+# ---------------------------------------------------------------------------
+# Internal helpers
+# ---------------------------------------------------------------------------
+
+
+def _CreateNamespace(ns):
+    """Issue `kubectl create namespace <ns>`, ignoring a non-zero exit."""
+    create_cmd = ["create", "namespace", ns]
+    # Best-effort: the result is not checked and failures are not raised, so
+    # a failing create (e.g. the namespace already exists) does not abort.
+    kubectl.RunKubectlCommand(create_cmd, raise_on_failure=False)
+
+
+def _InstallCRDs():
+    """Apply the Agent Sandbox release manifests for --agent_sandbox_version.
+
+    Both manifest.yaml and extensions.yaml are referenced by their GitHub
+    release download URL; a failed apply is not raised.
+    """
+    release_tag = FLAGS.agent_sandbox_version
+    release_url = (
+        "https://github.com/kubernetes-sigs/agent-sandbox"
+        f"/releases/download/{release_tag}"
+    )
+    logging.info("Installing Agent Sandbox CRDs (%s)", release_tag)
+    apply_cmd = ["apply"]
+    for manifest in ("manifest.yaml", "extensions.yaml"):
+        apply_cmd += ["-f", f"{release_url}/{manifest}"]
+    kubectl.RunKubectlCommand(apply_cmd, raise_on_failure=False)
+
+
+def _DeploySandboxTemplates(ns):
+    """Render and apply sandbox-templates.yaml.j2 (Python + Chromium pools)."""
+    # Chromium image: explicit flag, else derived path, else a placeholder.
+    chromium_image = FLAGS.k8s_chromium_image or _derived_images.get(
+        "chromium", "chromium-placeholder:latest"
+    )
+
+    _RenderAndApply(
+        "sandbox-templates.yaml.j2",
+        ns=ns,
+        python_image=FLAGS.k8s_python_image,
+        chromium_image=chromium_image,
+        warmpool_replicas=FLAGS.agent_sandbox_warmpool_replicas,
+        chromium_replicas=FLAGS.agent_sandbox_chromium_replicas,
+    )
+
+
+def _DeploySandboxRouter(ns):
+    """Render and apply sandbox-router.yaml.j2 when a router image is known.
+
+    The image is --agent_sandbox_router_image, else the derived image path.
+    """
+    router_image = FLAGS.agent_sandbox_router_image
+    if not router_image:
+        router_image = _derived_images.get("sandbox_router", "")
+    if router_image:
+        _RenderAndApply("sandbox-router.yaml.j2", ns=ns, router_image=router_image)
+        return
+    logging.info("Sandbox router image not set, skipping router deployment.")
+
+
+def _DeployADKAgent(ns, project="", region="", cluster_name="", adk_image_override=""):
+    """Render and apply adk-agent.yaml.j2 for the ADK agent.
+
+    Image precedence: adk_image_override, then --k8s_agent_image, then the
+    derived path. A chosen image that does not contain "docker.pkg.dev" is
+    replaced by the derived path when one exists, because the config YAML
+    default (agentic/adk-agent) is a Dockerfile lookup path rather than a
+    registry reference. Deployment is skipped if no image results.
+
+    Args:
+        ns: Namespace passed to the template.
+        project: Project passed to the template; None becomes "".
+        region: Region passed to the template; None becomes "".
+        cluster_name: Passed to the template as `cluster`; None becomes "".
+        adk_image_override: Image taking precedence over flag and derived path.
+    """
+    derived = _derived_images.get("adk_agent", "")
+    adk_image = adk_image_override or FLAGS.k8s_agent_image or derived
+    if adk_image and derived and "docker.pkg.dev" not in adk_image:
+        logging.warning(
+            "ADK image %s is not a registry path. Using derived: %s",
+            adk_image, derived,
+        )
+        adk_image = derived
+
+    if not adk_image:
+        logging.info("ADK agent image not set, skipping agent deployment.")
+        return
+
+    logging.info("Using ADK image: %s", adk_image)
+    _RenderAndApply(
+        "adk-agent.yaml.j2",
+        ns=ns, adk_image=adk_image,
+        project=project or "", region=region or "", cluster=cluster_name or "",
+    )
+
+
+def _DeployPSIReader(ns):
+    """Render and apply psi-reader.yaml.j2 (PSI Reader DaemonSet) in `ns`."""
+    _RenderAndApply(template_name="psi-reader.yaml.j2", ns=ns)
+
+
+def _WaitForAgentReady(ns):
+    """Block until deployment/adk-agent has rolled out or the timeout hits.
+
+    The wait is attempted unconditionally, whichever way the image was
+    chosen (FLAGS.k8s_agent_image, container_specs, or _derived_images).
+    A non-zero exit from `kubectl rollout status` -- for example when the
+    deployment does not exist -- is only logged as a warning because the
+    command runs with raise_on_failure=False.
+    """
+    timeout = FLAGS.k8s_deploy_timeout
+    logging.info("Waiting for adk-agent rollout (timeout=%ds)...", timeout)
+    rollout_cmd = [
+        "rollout", "status", "deployment/adk-agent",
+        "-n", ns,
+        f"--timeout={timeout}s",
+    ]
+
+    _, stderr, retcode = kubectl.RunKubectlCommand(rollout_cmd, raise_on_failure=False)
+    if retcode == 0:
+        return
+    logging.warning(
+        "adk-agent rollout status returned %d: %s",
+        retcode, stderr.strip()[:200],
+    )
+
+
+def _GetProjectNumber(project):
+    """Return the project number for `project`, or None if gcloud fails."""
+    describe_cmd = [
+        "gcloud", "projects", "describe", project,
+        "--format=value(projectNumber)",
+    ]
+    stdout, _, retcode = vm_util.IssueCommand(describe_cmd, raise_on_failure=False)
+    if retcode != 0:
+        return None
+    return stdout.strip()
+
+
+def _BindSnapshotIAM(bucket_name, project, project_number, ns, ksa_name):
+    """Grant the three bucket-level IAM roles needed for pod snapshots.
+
+    Bindings, in order: roles/storage.bucketViewer for the namespace
+    principal set, roles/storage.objectAdmin for the snapshot KSA, and
+    roles/storage.objectUser for the container-engine-robot service
+    account. Each gcloud call is best-effort (raise_on_failure=False).
+
+    Args:
+        bucket_name: Snapshot bucket name (without the gs:// prefix).
+        project: Project ID; names the <project>.svc.id.goog identity pool.
+        project_number: Numeric project number used in the principals.
+        ns: Kubernetes namespace of the snapshot workloads.
+        ksa_name: Kubernetes service account granted objectAdmin.
+    """
+    pool = (
+        f"iam.googleapis.com/projects/{project_number}"
+        f"/locations/global/workloadIdentityPools/{project}.svc.id.goog"
+    )
+    ns_members = f"principalSet://{pool}/namespace/{ns}"
+    ksa_member = f"principal://{pool}/subject/ns/{ns}/sa/{ksa_name}"
+    gke_agent = (
+        f"serviceAccount:service-{project_number}"
+        "@container-engine-robot.iam.gserviceaccount.com"
+    )
+    bindings = (
+        (ns_members, "roles/storage.bucketViewer"),
+        (ksa_member, "roles/storage.objectAdmin"),
+        (gke_agent, "roles/storage.objectUser"),
+    )
+
+    for member, role in bindings:
+        vm_util.IssueCommand(
+            [
+                "gcloud", "storage", "buckets", "add-iam-policy-binding",
+                f"gs://{bucket_name}",
+                f"--member={member}",
+                f"--role={role}",
+                "--quiet",
+            ],
+            raise_on_failure=False,
+        )
diff --git a/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/gke_image_build_utils.py b/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/gke_image_build_utils.py
new file mode 100644
index 0000000000..2e976207f5
--- /dev/null
+++ b/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/gke_image_build_utils.py
@@ -0,0 +1,355 @@
+"""Shared image build utilities for GKE Agent Sandbox benchmarks.
+
+Builds and pushes container images (Chrome sandbox, Sandbox Router) via
+Google Cloud Build. gke_deploy_utils.DeployWorkloads() imports this module
+but only derives image paths; builds are run via build_images_with_config().
+
+NOTE: The ADK Agent image is built by the PKB native container_specs
+mechanism during the Provision stage, not by this module.
+
+Images built:
+ - Chrome Sandbox: cloned from agent-sandbox repo
+ - Sandbox Router: cloned from agent-sandbox repo
+"""
+
+import logging
+import os
+import shutil
+import subprocess
+import tempfile
+
+from absl import flags
+from perfkitbenchmarker import vm_util
+
+FLAGS = flags.FLAGS
+
+logging.basicConfig(level=logging.INFO)  # NOTE(review): configures the root logger at import time -- confirm this is wanted inside PKB
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Architecture detection
+# ---------------------------------------------------------------------------
+
+flags.DEFINE_string(
+ "target_arch",
+ "",
+ "Target CPU architecture for container images (amd64 or arm64). "
+ "If set, skips gcloud machine-type detection. "
+ "Use this for non-GCP environments or when gcloud is unavailable.",
+)
+
+_ARCH_MAP = {  # GCP architecture name -> Docker platform architecture name
+ "X86_64": "amd64",
+ "ARM64": "arm64",
+}
+
+
+def _DetectArchitecture(machine_type, zone, project):
+    """Return the Docker platform architecture ("amd64" or "arm64").
+
+    A valid --target_arch wins outright. Otherwise gcloud is asked for the
+    machine type's architecture and the GCP name (X86_64/ARM64) is mapped
+    to the Docker name via _ARCH_MAP. Any failure falls back to "amd64".
+
+    Args:
+        machine_type: GCP machine type to describe.
+        zone: Zone passed to gcloud as --zone.
+        project: Project passed to gcloud as --project.
+
+    Returns:
+        "amd64" or "arm64".
+    """
+    requested = FLAGS.target_arch
+    if requested:
+        normalized = requested.lower()
+        if normalized in ("amd64", "arm64"):
+            logging.info("Using user-provided target_arch: %s", normalized)
+            return normalized
+        logging.warning(
+            "Invalid --target_arch='%s'. Must be amd64 or arm64. "
+            "Proceeding with gcloud detection.",
+            requested,
+        )
+
+    try:
+        describe_cmd = [
+            "gcloud", "compute", "machine-types", "describe", machine_type,
+            f"--zone={zone}",
+            f"--project={project}",
+            "--format=value(architecture)",
+        ]
+        stdout, _, retcode = vm_util.IssueCommand(
+            describe_cmd, raise_on_failure=False, timeout=30
+        )
+        gcp_arch = stdout.strip().upper() if retcode == 0 else ""
+        if gcp_arch:
+            docker_arch = _ARCH_MAP.get(gcp_arch)
+            if docker_arch:
+                logging.info(
+                    "Detected architecture for %s: %s -> %s",
+                    machine_type, gcp_arch, docker_arch,
+                )
+                return docker_arch
+            logging.warning(
+                "Unknown GCP architecture '%s' for %s. Falling back to amd64.",
+                gcp_arch, machine_type,
+            )
+        # A non-zero exit or empty output reaches the fallback without a log line.
+    except Exception as e:
+        logging.warning(
+            "gcloud machine-type describe failed: %s. Falling back to amd64.", e
+        )
+
+    return "amd64"
+
+
+def build_images_with_config(project, region, machine_type, zone, arch):
+    """Build and push the Chrome Sandbox and Sandbox Router images.
+
+    Core image build logic with no FLAGS dependency, so it can be called
+    from outside a PKB run. Uses the project's default Cloud Build SA (no
+    custom SA needed). The ADK Agent image is not built here.
+
+    Args:
+        project: GCP project ID.
+        region: GCP region (e.g. "us-central1"); selects the registry host.
+        machine_type: Machine type string (e.g. "c4-standard-8"). Unused
+            here; kept so existing callers keep working.
+        zone: GCP zone. Unused here; kept so existing callers keep working.
+        arch: Target architecture used as the image tag and build platform
+            (e.g. "amd64" or "arm64"), already resolved by the caller.
+
+    Raises:
+        RuntimeError: If a git/gcloud command fails or a Dockerfile is missing.
+    """
+    # Derive image paths
+    registry = f"{region}-docker.pkg.dev/{project}/agent-sandbox"
+    chrome_image = f"{registry}/chrome-sandbox:{arch}"
+    router_image = f"{registry}/sandbox-router:{arch}"
+
+    logger.info("=== Building Container Images (Chrome + Router only) ===")
+    logger.info(" Project: %s", project)
+    logger.info(" Region: %s", region)
+    logger.info(" Architecture: %s", arch)
+    logger.info(" Cloud Build SA: default (project Cloud Build SA)")
+    logger.info(" NOTE: ADK Agent image is built by PKB via container_specs")
+
+    # 1. Build Chrome Sandbox
+    _BuildChromeSandboxImage(
+        project=project,
+        region=region,
+        target_arch=arch,
+        image_path=chrome_image,
+    )
+
+    # 2. Build Sandbox Router
+    _BuildSandboxRouterImage(
+        project=project,
+        region=region,
+        target_arch=arch,
+        image_path=router_image,
+    )
+
+    logger.info("=== Chrome + Router images built successfully ===")
+    logger.info(" Chrome Sandbox: %s", chrome_image)
+    logger.info(" Sandbox Router: %s", router_image)
+    logger.info(" (ADK Agent built by PKB via container_specs)")
+
+
+# ---------------------------------------------------------------------------
+# Internal helpers
+# ---------------------------------------------------------------------------
+
+
+def _BuildChromeSandboxImage(project, region, target_arch, image_path):
+    """Build and push the Chrome Sandbox image via Cloud Build.
+
+    Sparse-clones examples/chrome-sandbox from the agent-sandbox repo into a
+    temporary directory, patches its Dockerfile to also install socat (for
+    the CDP proxy), and submits the build. The directory is always removed.
+
+    Args:
+        project: GCP project that runs the build.
+        region: Unused here; the registry host is already in image_path.
+        target_arch: Docker architecture to build for (amd64 or arm64).
+        image_path: Full registry reference to tag and push.
+
+    Raises:
+        RuntimeError: If a command fails or the Dockerfile is missing.
+    """
+    logger.info("Building Chrome Sandbox image: %s", image_path)
+    install_line = (
+        "RUN apt-get update && apt-get install --yes --no-install-recommends chromium"
+    )
+    tmp_dir = tempfile.mkdtemp(prefix="chrome-sandbox-")
+    try:
+        # Clone agent-sandbox repo (sparse checkout)
+        logger.info("Cloning agent-sandbox chrome-sandbox source...")
+        _RunCmd([
+            "git", "clone", "--depth", "1", "--filter=blob:none", "--sparse",
+            "https://github.com/kubernetes-sigs/agent-sandbox.git", tmp_dir,
+        ])
+        _RunCmd(
+            ["git", "sparse-checkout", "set", "examples/chrome-sandbox"],
+            cwd=tmp_dir,
+        )
+
+        build_dir = os.path.join(tmp_dir, "examples", "chrome-sandbox")
+        dockerfile_path = os.path.join(build_dir, "Dockerfile")
+        if not os.path.isfile(dockerfile_path):
+            raise RuntimeError(f"chrome-sandbox Dockerfile not found at {build_dir}")
+
+        # Patch Dockerfile: add socat for CDP proxy
+        with open(dockerfile_path, "r") as f:
+            content = f.read()
+        if install_line not in content:
+            # Upstream changed the line: the replace below is then a no-op.
+            logger.warning("Dockerfile install line not found; socat not added.")
+        with open(dockerfile_path, "w") as f:
+            f.write(content.replace(install_line, install_line + " socat"))
+        # Submit Cloud Build (generates cloudbuild.yaml in temp dir)
+        _SubmitCloudBuild(build_dir, image_path, target_arch, project)
+        logger.info("Chrome Sandbox image built successfully.")
+    finally:
+        shutil.rmtree(tmp_dir, ignore_errors=True)
+
+
+def _BuildSandboxRouterImage(project, region, target_arch, image_path):
+    """Build and push the Sandbox Router image via Cloud Build.
+
+    Sparse-clones the sandbox-router client directory from the
+    agent-sandbox repo into a temporary directory and submits it to Cloud
+    Build unmodified. The temporary directory is always removed.
+
+    Args:
+        project: GCP project that runs the build.
+        region: Unused here; the registry host is already in image_path.
+        target_arch: Docker architecture to build for (amd64 or arm64).
+        image_path: Full registry reference to tag and push.
+
+    Raises:
+        RuntimeError: If a command fails or the Dockerfile is missing.
+    """
+    logger.info("Building Sandbox Router image: %s", image_path)
+    subdir = "clients/python/agentic-sandbox-client/sandbox-router"
+    checkout_root = tempfile.mkdtemp(prefix="sandbox-router-")
+    try:
+        # Clone agent-sandbox repo (sparse checkout)
+        logger.info("Cloning agent-sandbox router source...")
+        clone_cmd = [
+            "git", "clone", "--depth", "1",
+            "--filter=blob:none", "--sparse",
+            "https://github.com/kubernetes-sigs/agent-sandbox.git",
+            checkout_root,
+        ]
+        _RunCmd(clone_cmd)
+        _RunCmd(["git", "sparse-checkout", "set", subdir], cwd=checkout_root)
+
+        # The build context is the checked-out router directory.
+        build_dir = os.path.join(checkout_root, *subdir.split("/"))
+        if not os.path.isfile(os.path.join(build_dir, "Dockerfile")):
+            raise RuntimeError(f"sandbox-router Dockerfile not found at {build_dir}")
+
+        # Submit Cloud Build (generates cloudbuild.yaml in temp dir)
+        _SubmitCloudBuild(
+            source_dir=build_dir,
+            image_path=image_path,
+            target_arch=target_arch,
+            project=project,
+        )
+
+        logger.info("Sandbox Router image built successfully.")
+    finally:
+        shutil.rmtree(checkout_root, ignore_errors=True)
+
+
+def _SubmitCloudBuild(source_dir, image_path, target_arch, project):
+ """Write cloudbuild.yaml into source_dir and submit it with gcloud.
+
+ Used for Chrome and Router images (built in temp directories), with the
+ project's default Cloud Build SA. target_arch "amd64" gets a plain docker
+ build; any other value gets a QEMU + Docker Buildx cross-build on an
+ E2_HIGHCPU_32 worker to offset emulation overhead.
+
+ Args: source_dir is the build context; image_path and target_arch are
+ passed as the _IMAGE_PATH and _PLATFORM (linux/<arch>) substitutions.
+ """
+ if target_arch == "amd64":
+ # Native build — no emulation needed
+ cloudbuild_content = """steps:
+ - name: 'gcr.io/cloud-builders/docker'
+ args: ['build', '--platform', '${_PLATFORM}', '-t', '${_IMAGE_PATH}', '.']
+ env:
+ - 'DOCKER_BUILDKIT=1'
+images:
+ - '${_IMAGE_PATH}'
+options:
+ logging: CLOUD_LOGGING_ONLY
+substitutions:
+ _IMAGE_PATH: ''
+ _PLATFORM: 'linux/amd64'
+"""
+ else:
+ # Cross-arch build — QEMU + Buildx required.
+ # Cloud Build workers are amd64; QEMU registers binfmt handlers
+ # so the kernel can execute arm64 binaries transparently.
+ # Buildx --push handles the registry push directly, so no
+ # top-level 'images:' key is needed. The 'linux/amd64' default for
+ # _PLATFORM below is always overridden by --substitutions at submit time.
+ cloudbuild_content = """steps:
+ - name: 'gcr.io/cloud-builders/docker'
+ args: ['run', '--privileged', 'multiarch/qemu-user-static', '--reset', '-p', 'yes']
+ id: 'qemu-setup'
+ - name: 'gcr.io/cloud-builders/docker'
+ args: ['buildx', 'create', '--use', '--name', 'multiarch-builder']
+ id: 'create-builder'
+ waitFor: ['qemu-setup']
+ - name: 'gcr.io/cloud-builders/docker'
+ args: ['buildx', 'build', '--platform', '${_PLATFORM}', '-t', '${_IMAGE_PATH}', '--push', '.']
+ id: 'build-and-push'
+ waitFor: ['create-builder']
+options:
+ logging: CLOUD_LOGGING_ONLY
+ machineType: E2_HIGHCPU_32
+substitutions:
+ _IMAGE_PATH: ''
+ _PLATFORM: 'linux/amd64'
+"""
+ cloudbuild_path = os.path.join(source_dir, "cloudbuild.yaml")  # written into the build context directory
+ with open(cloudbuild_path, "w") as f:
+ f.write(cloudbuild_content)
+
+ _RunCmd(
+ [
+ "gcloud",
+ "builds",
+ "submit",
+ source_dir,
+ f"--config={cloudbuild_path}",
+ f"--substitutions=_IMAGE_PATH={image_path},_PLATFORM=linux/{target_arch}",
+ f"--project={project}",
+ ]
+ )
+
+
+def _RunCmd(cmd, cwd=None):
+    """Run `cmd` (an argv list, no shell) and return its stdout.
+
+    Raises RuntimeError on a non-zero exit; subprocess.TimeoutExpired
+    propagates if the command runs longer than 40 minutes.
+    """
+    logger.info(" CMD: %s", " ".join(cmd))
+    # NOTE(review): this appears to turn off gcloud's SSL certificate
+    # validation for every child process -- confirm it is really required.
+    child_env = dict(os.environ, CLOUDSDK_AUTH_DISABLE_SSL_VALIDATION="true")
+    # 40 min: allows for QEMU cross-arch builds
+    completed = subprocess.run(
+        cmd, capture_output=True, text=True, cwd=cwd, timeout=2400, env=child_env
+    )
+    if completed.returncode == 0:
+        return completed.stdout
+    raise RuntimeError(
+        f"Command failed (rc={completed.returncode}): {' '.join(cmd)}\n"
+        f"stderr: {completed.stderr[-500:]}"
+    )
diff --git a/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/gke_post_teardown.py b/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/gke_post_teardown.py
new file mode 100644
index 0000000000..1bae7b41d4
--- /dev/null
+++ b/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/gke_post_teardown.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python3
+"""Post-Teardown Cleanup for GKE Agentic Benchmarking.
+
+Cleans up infrastructure created by gke_prerequisites.py and DeploySnapshots():
+ - Revoke the extra IAM roles granted to the Cloud Build service account(s)
+ - Delete GCS snapshot bucket
+ - Delete Artifact Registry repositories
+
+Run ONCE after all benchmarks are complete (after PKB Teardown has deleted the cluster):
+ python -m perfkitbenchmarker.linux_benchmarks.kubernetes.agentic.gke_post_teardown \
+ --project_id=<PROJECT_ID> --region=<REGION>
+"""
+
+import argparse
+import logging
+import subprocess
+
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+logger = logging.getLogger(__name__)
+
+
+def _run(cmd, check=False, timeout=300):  # best-effort: warns on failure, never raises on rc != 0
+    logger.info("CMD: %s", " ".join(cmd))
+    result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
+    if result.returncode != 0:  # was gated on `check`, which no caller sets, so failures were silent
+        logger.warning("Command failed (rc=%d): %s", result.returncode, result.stderr[-300:])
+    return result
+
+
+def revoke_cloudbuild_sa_permissions(project_id):
+    """Revoke extra IAM roles from Cloud Build SA(s).
+
+    Intended to mirror grant_cloudbuild_sa_permissions() in gke_prerequisites.py.
+    Roles are removed from both candidate service accounts; the accounts
+    themselves are project-managed and are NOT deleted.
+    """
+    logger.info("=== Revoking extra permissions from Cloud Build SA(s) ===")
+    describe = _run(
+        ["gcloud", "projects", "describe", project_id, "--format=value(projectNumber)"]
+    )
+    project_number = describe.stdout.strip()
+    if not project_number:
+        logger.warning("Could not determine project number, skipping SA cleanup")
+        return
+    roles = (
+        "roles/logging.logWriter", "roles/storage.objectViewer",
+        "roles/artifactregistry.writer", "roles/serviceusage.serviceUsageConsumer",
+    )
+    for suffix in ("@cloudbuild", "-compute@developer"):
+        member = f"--member=serviceAccount:{project_number}{suffix}.gserviceaccount.com"
+        for role in roles:
+            _run(["gcloud", "projects", "remove-iam-policy-binding", project_id,
+                  member, f"--role={role}", "--quiet"])
+    logger.info("Cloud Build SA extra permissions revoked.")
+
+
+def teardown_snapshot_bucket(project_id, region):
+    """Empty and delete gs://agent-sandbox-snapshots-<project_id> (region unused)."""
+    logger.info("=== Deleting Snapshot Bucket ===")
+    bucket_url = f"gs://agent-sandbox-snapshots-{project_id}"
+    project_flag = f"--project={project_id}"
+    _run(["gcloud", "storage", "rm", f"{bucket_url}/**", project_flag, "--quiet"])
+    _run(["gcloud", "storage", "buckets", "delete", bucket_url, project_flag, "--quiet"])
+    logger.info("Snapshot bucket deleted.")
+
+
+def teardown_images(project_id, region):
+    """Delete the "agent-sandbox" Artifact Registry repo (Chrome + Router images)."""
+    # "adk-repo" is created/deleted by PKB container_registry lifecycle (Provision
+    # creates it, Teardown deletes it). If you skip PKB Teardown, run:
+    #   gcloud artifacts repositories delete adk-repo --location=<REGION>
+    logger.info("=== Deleting AR repos ===")
+    scope = [f"--location={region}", f"--project={project_id}", "--quiet"]
+    for repo in ("agent-sandbox",):
+        _run(["gcloud", "artifacts", "repositories", "delete", repo] + scope)
+    logger.info("AR repos deleted.")
+
+
+def main():  # CLI entry point: revoke SA roles, then delete bucket and AR repo unless kept
+    parser = argparse.ArgumentParser(description="GKE Agentic Benchmark Post-Teardown")
+    parser.add_argument("--project_id", required=True, help="GCP project ID")
+    parser.add_argument("--region", default="us-central1", help="GCP region")
+    parser.add_argument("--keep_images", action="store_true", help="Skip AR repo deletion")
+    parser.add_argument("--keep_bucket", action="store_true", help="Skip snapshot bucket deletion")
+    opts = parser.parse_args()
+    revoke_cloudbuild_sa_permissions(opts.project_id)
+    if not opts.keep_bucket:
+        teardown_snapshot_bucket(opts.project_id, opts.region)
+    if not opts.keep_images:
+        teardown_images(opts.project_id, opts.region)
+    print("\nPost-teardown complete!")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/gke_prerequisites.py b/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/gke_prerequisites.py
new file mode 100644
index 0000000000..72c32d5b1f
--- /dev/null
+++ b/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/gke_prerequisites.py
@@ -0,0 +1,183 @@
+#!/usr/bin/env python3
+"""Prerequisite Setup for GKE Agentic Benchmarking.
+
+Creates infrastructure that PKB cannot manage natively:
+ - Enable required GCP APIs
+ - Create Artifact Registry repositories
+ - Create Cloud Build service account + IAM bindings
+
+Run ONCE before PKB provisioning:
+ python -m perfkitbenchmarker.linux_benchmarks.kubernetes.agentic.gke_prerequisites \
+ --project_id=<PROJECT_ID> --region=<REGION> --target_arch=<amd64|arm64>
+"""
+
+import argparse
+import logging
+import os
+import subprocess
+import time
+
+# Configure root logging at import time; every record is prefixed with a
+# timestamp and level. `logger` is the module-level logger used by all steps.
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+logger = logging.getLogger(__name__)
+
+
+def _run(cmd, check=True, timeout=300):
+    """Run a command without a shell, capturing its output as text.
+
+    Args:
+        cmd: Command and arguments as a list of strings.
+        check: If True, raise when the command exits non-zero.
+        timeout: Seconds before subprocess.run aborts the command.
+
+    Returns:
+        The subprocess.CompletedProcess; stdout/stderr are str.
+
+    Raises:
+        RuntimeError: If check is True and the exit code is non-zero. The
+            message carries the tail of stderr so the cause survives even
+            when only the exception (not the log) is reported.
+        subprocess.TimeoutExpired: If the command exceeds `timeout`.
+    """
+    import shlex  # Local import: only needed to render the command for logs.
+
+    # Quote each argument so arguments containing spaces stay unambiguous.
+    cmd_text = " ".join(shlex.quote(str(arg)) for arg in cmd)
+    logger.info("CMD: %s", cmd_text)
+    result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
+    if check and result.returncode != 0:
+        stderr_tail = (result.stderr or "")[-500:].strip()
+        logger.error("Command failed (rc=%d): %s", result.returncode, stderr_tail)
+        raise RuntimeError(
+            f"Command failed (rc={result.returncode}): {cmd_text}\n{stderr_tail}"
+        )
+    return result
+
+
+def _exists(cmd):
+    """Return True when `cmd` exits with status 0, False otherwise.
+
+    Used with `gcloud ... describe` commands as an existence probe; the
+    command's output is captured and discarded.
+    """
+    probe = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
+    if probe.returncode != 0:
+        return False
+    return True
+
+
+def enable_apis(project_id):
+    """Enable the GCP service APIs used by the agentic benchmarks.
+
+    All services are passed to a single `gcloud services enable` invocation.
+
+    Args:
+        project_id: GCP project in which to enable the APIs.
+    """
+    logger.info("=== Enabling GCP APIs ===")
+    required_services = (
+        "container.googleapis.com",
+        "artifactregistry.googleapis.com",
+        "cloudbuild.googleapis.com",
+        "aiplatform.googleapis.com",
+        "storage.googleapis.com",
+        "iam.googleapis.com",
+        "connectgateway.googleapis.com",
+        "gkehub.googleapis.com",
+        "gkeconnect.googleapis.com",
+        "iap.googleapis.com",
+    )
+    command = ["gcloud", "services", "enable"]
+    command.extend(required_services)
+    command.append(f"--project={project_id}")
+    _run(command)
+    logger.info("APIs enabled.")
+
+
+def create_artifact_registry(project_id, region):
+    """Create the Docker Artifact Registry repos that PKB does not manage.
+
+    A repo that `gcloud artifacts repositories describe` can already see is
+    left untouched.
+
+    Args:
+        project_id: GCP project that owns the repositories.
+        region: Artifact Registry location for the repositories.
+    """
+    logger.info("=== Creating Artifact Registry Repos ===")
+    # "adk-repo" is intentionally not listed: PKB creates its own AR repo via
+    # container_registry during the Provision stage. Only "agent-sandbox"
+    # (Chrome/Router images) is needed here.
+    location_args = [f"--location={region}", f"--project={project_id}"]
+    repo_cmd = ["gcloud", "artifacts", "repositories"]
+    for repo_name in ("agent-sandbox",):
+        if not _exists(repo_cmd + ["describe", repo_name] + location_args):
+            _run(
+                repo_cmd
+                + ["create", repo_name, "--repository-format=docker"]
+                + location_args
+            )
+            logger.info("AR repo %s created.", repo_name)
+        else:
+            logger.info("AR repo %s already exists.", repo_name)
+
+
+def grant_cloudbuild_sa_permissions(project_id):
+    """Grant required IAM roles to the Cloud Build service account(s).
+
+    Cloud Build may run as either of two service accounts:
+      - Legacy projects: {number}@cloudbuild.gserviceaccount.com
+      - Newer projects:  {number}-compute@developer.gserviceaccount.com
+
+    Each candidate is probed with `gcloud iam service-accounts describe` and
+    the roles are bound to every candidate that exists. Binding is
+    best-effort: a failed binding is logged as a warning and the remaining
+    bindings are still attempted, so the final log line reflects whether
+    everything actually succeeded.
+
+    Args:
+        project_id: GCP project whose IAM policy is updated.
+
+    Raises:
+        RuntimeError: If the project-number lookup command fails.
+    """
+    logger.info("=== Granting permissions to Cloud Build SA(s) ===")
+    result = _run(["gcloud", "projects", "describe", project_id,
+                   "--format=value(projectNumber)"])
+    project_number = result.stdout.strip()
+    if not project_number:
+        logger.error("Could not determine project number for %s", project_id)
+        return
+
+    # Both possible Cloud Build SAs.
+    candidate_sas = [
+        f"{project_number}@cloudbuild.gserviceaccount.com",
+        f"{project_number}-compute@developer.gserviceaccount.com",
+    ]
+
+    # Keep only the SA(s) that exist in this project.
+    sa_emails = []
+    for sa in candidate_sas:
+        if _exists(["gcloud", "iam", "service-accounts", "describe",
+                    sa, f"--project={project_id}"]):
+            sa_emails.append(sa)
+            logger.info("Found Cloud Build SA: %s", sa)
+        else:
+            logger.info("SA not found (skipping): %s", sa)
+
+    if not sa_emails:
+        logger.error("No Cloud Build SA found in project %s", project_id)
+        return
+
+    roles = [
+        "roles/logging.logWriter",
+        "roles/storage.objectViewer",
+        "roles/artifactregistry.writer",
+        "roles/serviceusage.serviceUsageConsumer",
+    ]
+    failed_bindings = []
+    for sa_email in sa_emails:
+        logger.info("Granting roles to %s", sa_email)
+        for role in roles:
+            # check=False keeps the step best-effort, but the outcome is
+            # inspected so a failure is never reported as success.
+            binding = _run(["gcloud", "projects", "add-iam-policy-binding", project_id,
+                            f"--member=serviceAccount:{sa_email}",
+                            f"--role={role}", "--condition=None", "--quiet"],
+                           check=False)
+            if binding.returncode != 0:
+                failed_bindings.append((sa_email, role))
+                logger.warning(
+                    "Failed to grant %s to %s (rc=%d): %s",
+                    role, sa_email, binding.returncode,
+                    (binding.stderr or "")[-500:].strip(),
+                )
+    if failed_bindings:
+        logger.warning(
+            "Cloud Build SA permissions only partially granted; "
+            "%d binding(s) failed.", len(failed_bindings),
+        )
+    else:
+        logger.info("Cloud Build SA permissions granted.")
+
+
+
+
+def build_sandbox_images(project_id, region, target_arch):
+    """Build the Chrome sandbox and sandbox router images.
+
+    Both builds are delegated to gke_image_build_utils; the image tag is the
+    target architecture.
+
+    Args:
+        project_id: GCP project that owns the "agent-sandbox" AR repo.
+        region: Artifact Registry location, used as the registry host prefix.
+        target_arch: CPU architecture string used as the image tag.
+    """
+    logger.info("=== Building Sandbox Images (arch=%s) ===", target_arch)
+    # Function-level import kept from the original.
+    # NOTE(review): presumably deferred so the other steps do not load this
+    # module; confirm before hoisting it to the top of the file.
+    from perfkitbenchmarker.linux_benchmarks.kubernetes.agentic import gke_image_build_utils
+
+    repo_prefix = f"{region}-docker.pkg.dev/{project_id}/agent-sandbox"
+    chrome_image = f"{repo_prefix}/chrome-sandbox:{target_arch}"
+    router_image = f"{repo_prefix}/sandbox-router:{target_arch}"
+    shared_kwargs = {
+        "project": project_id,
+        "region": region,
+        "target_arch": target_arch,
+    }
+
+    gke_image_build_utils._BuildChromeSandboxImage(
+        image_path=chrome_image, **shared_kwargs
+    )
+    gke_image_build_utils._BuildSandboxRouterImage(
+        image_path=router_image, **shared_kwargs
+    )
+
+    logger.info("Sandbox images built successfully.")
+    logger.info(" Chrome: %s", chrome_image)
+    logger.info(" Router: %s", router_image)
+
+def main():
+    """Parse CLI arguments and run each prerequisite step in order."""
+    parser = argparse.ArgumentParser(
+        description="GKE Agentic Benchmark Prerequisites"
+    )
+    parser.add_argument("--project_id", required=True, help="GCP project ID")
+    parser.add_argument("--region", default="us-central1", help="GCP region")
+    parser.add_argument(
+        "--target_arch",
+        required=True,
+        choices=["amd64", "arm64"],
+        help="Target CPU architecture for container images (amd64 or arm64)",
+    )
+    parser.add_argument(
+        "--skip_image_build",
+        action="store_true",
+        help="Skip Chrome and Router image builds (images already in registry)",
+    )
+    opts = parser.parse_args()
+    project, region = opts.project_id, opts.region
+
+    enable_apis(project)
+    create_artifact_registry(project, region)
+    grant_cloudbuild_sa_permissions(project)
+    if opts.skip_image_build:
+        logger.info("Skipping image builds (--skip_image_build)")
+    else:
+        build_sandbox_images(project, region, opts.target_arch)
+    print("\nPrerequisite setup complete!")
+
+
+if __name__ == "__main__":
+    main()
+
diff --git a/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_benchmark_utils.py b/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_benchmark_utils.py
new file mode 100644
index 0000000000..e23aa32a6d
--- /dev/null
+++ b/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_benchmark_utils.py
@@ -0,0 +1,506 @@
+"""Shared utilities for GKE Agent Sandbox benchmarks.
+
+Provides helpers for agent API interaction, kubectl commands, warm pool
+management, and sample construction used by all GKE agent benchmark
+definitions.
+"""
+
+import json
+import logging
+import subprocess
+import time
+import urllib.request
+import urllib.error
+
+from absl import flags
+from perfkitbenchmarker import sample
+from perfkitbenchmarker import vm_util
+from perfkitbenchmarker.resources.container_service import kubectl
+
+FLAGS = flags.FLAGS
+
+# Module-level benchmark_spec reference for metadata derivation.
+# Set by each benchmark's Run() via set_benchmark_spec().
+_current_benchmark_spec = None
+
+
+# ---------------------------------------------------------------------------
+# Shared flags (registered once; importable by benchmark modules)
+# ---------------------------------------------------------------------------
+
+# Namespace targeted by every kubectl call and recorded in sample metadata.
+flags.DEFINE_string(
+    name="k8s_namespace",
+    default="agentic",
+    help="Kubernetes namespace where the agentic workloads are deployed.",
+)
+
+# Informational only: copied into sample metadata by BuildMetadata().
+flags.DEFINE_bool(
+    name="k8s_gvisor",
+    default=True,
+    help="Whether the sandbox node pool uses gVisor. Recorded in sample metadata.",
+)
+
+# Optional free-form tag; attached as metadata["note"] when non-empty.
+flags.DEFINE_string(
+    name="k8s_benchmark_note",
+    default="",
+    help="Arbitrary note string attached to every sample for tagging runs.",
+)
+
+# Base URL used by GetAgentApiUrl(); trailing slashes are stripped there.
+flags.DEFINE_string(
+    name="k8s_agent_api_url",
+    default="http://localhost:8080",
+    help="Base URL of the ADK Agent API.",
+)
+
+# Default timeout for CallAgentApi() when the caller passes none.
+flags.DEFINE_integer(
+    name="k8s_agent_api_timeout",
+    default=600,
+    help="HTTP timeout in seconds for agent API benchmark calls.",
+)
+
+
+# ---------------------------------------------------------------------------
+# Agent API helpers
+# ---------------------------------------------------------------------------
+
+
+def GetAgentApiUrl():
+    """Return --k8s_agent_api_url with any trailing slashes removed."""
+    configured_url = FLAGS.k8s_agent_api_url
+    return configured_url.rstrip("/")
+
+
+def CheckAgentHealthz(api_url=None, required=True):
+    """Verify the agent API is reachable via GET /healthz.
+
+    Args:
+        api_url: Base URL to check. Defaults to GetAgentApiUrl().
+        required: If True (default), raise on failure. If False, only log a
+            warning and return.
+
+    Raises:
+        RuntimeError: If the endpoint cannot be reached and `required` is True.
+    """
+    if api_url is None:
+        api_url = GetAgentApiUrl()
+    healthz_url = f"{api_url}/healthz"
+    try:
+        req = urllib.request.Request(healthz_url)
+        with urllib.request.urlopen(req, timeout=15) as resp:
+            logging.info("Agent healthz: %s", resp.read().decode())
+    # OSError is the common base of URLError/HTTPError *and* of the socket
+    # timeout / connection-reset errors that can escape un-wrapped from the
+    # response read. Catching only URLError let those bypass required=False.
+    except OSError as e:
+        # Build the hint from the configured flags so it is a runnable command.
+        hint_cmd = (
+            f"kubectl port-forward {FLAGS.k8s_portforward_service} "
+            f"-n {FLAGS.k8s_namespace} "
+            f"{FLAGS.k8s_portforward_local_port}:"
+            f"{FLAGS.k8s_portforward_remote_port}"
+        )
+        msg = (
+            f"Agent API is not reachable at {healthz_url}: {e}\n"
+            f"Hint: ensure kubectl port-forward is running ({hint_cmd})."
+        )
+        if required:
+            raise RuntimeError(msg) from e
+        logging.warning("Health check deferred (non-fatal): %s", msg)
+
+
+def CallAgentApi(endpoint, payload, timeout=None):
+    """POST JSON to an agent API endpoint and return the parsed response.
+
+    Args:
+        endpoint: Path appended to the base URL (e.g. "/benchmark/...").
+        payload: JSON-serializable request body.
+        timeout: HTTP timeout in seconds. Defaults to
+            FLAGS.k8s_agent_api_timeout.
+
+    Returns:
+        The response body decoded from JSON.
+
+    Raises:
+        RuntimeError: On an HTTP error status, an unreachable server, a
+            timeout/connection error while reading, or a non-JSON body. The
+            original exception is chained as __cause__.
+    """
+    if timeout is None:
+        timeout = FLAGS.k8s_agent_api_timeout
+    base_url = GetAgentApiUrl()
+    url = f"{base_url}{endpoint}"
+    data = json.dumps(payload).encode("utf-8")
+    req = urllib.request.Request(
+        url, data=data,
+        headers={"Content-Type": "application/json"},
+        method="POST",
+    )
+    logging.info("POST %s payload=%s timeout=%ds", url, payload, timeout)
+    try:
+        with urllib.request.urlopen(req, timeout=timeout) as resp:
+            body = resp.read().decode("utf-8")
+    except urllib.error.HTTPError as e:
+        # HTTPError must be handled before URLError (it is a subclass).
+        body = e.read().decode("utf-8", errors="replace")
+        raise RuntimeError(
+            f"Agent API returned HTTP {e.code}: {body[:500]}"
+        ) from e
+    except urllib.error.URLError as e:
+        raise RuntimeError(
+            f"Cannot reach agent API at {url}: {e.reason}"
+        ) from e
+    except OSError as e:
+        # Read timeouts and connection resets are raised un-wrapped (not as
+        # URLError); convert them so callers see one exception type.
+        raise RuntimeError(f"Agent API request to {url} failed: {e!r}") from e
+    try:
+        return json.loads(body)
+    except json.JSONDecodeError as e:
+        raise RuntimeError(
+            f"Agent API returned non-JSON response:\n{body[:500]}"
+        ) from e
+
+
+# ---------------------------------------------------------------------------
+# kubectl helpers
+# ---------------------------------------------------------------------------
+
+
+def RunKubectl(args, timeout=120, raise_on_failure=True):
+    """Run a kubectl command through PKB's kubectl module.
+
+    Args:
+        args: Iterable of kubectl arguments (without the "kubectl" binary).
+        timeout: Timeout in seconds forwarded to RunKubectlCommand.
+        raise_on_failure: Forwarded to RunKubectlCommand.
+
+    Returns:
+        The (stdout, stderr, retcode) tuple from kubectl.RunKubectlCommand.
+        NOTE(review): the original docstring says that helper handles
+        kubeconfig and retries transient connection errors; not verifiable
+        from this module.
+    """
+    kubectl_args = list(args)  # pass a fresh list, as the original did
+    return kubectl.RunKubectlCommand(
+        kubectl_args, timeout=timeout, raise_on_failure=raise_on_failure
+    )
+
+
+def CountPods(namespace, label, phase=None):
+    """Count pods in `namespace` matching `label` (and optionally `phase`).
+
+    Returns 0 both when no pod matches and when kubectl fails, so callers
+    cannot distinguish the two cases.
+    """
+    kubectl_args = ["get", "pods", "-n", namespace, "-l", label, "-o", "name"]
+    if phase:
+        kubectl_args.append(f"--field-selector=status.phase={phase}")
+    stdout, _, retcode = RunKubectl(kubectl_args, raise_on_failure=False)
+    succeeded = retcode == 0 and bool(stdout)
+    if not succeeded:
+        return 0
+    # `-o name` prints one resource name per line.
+    pod_names = stdout.strip().splitlines()
+    return len(pod_names)
+
+
+def PatchWarmPool(namespace, warmpool_name, replicas, label, wait_timeout=180):
+    """Patch SandboxWarmPool replicas and wait for the pods to be Running.
+
+    Args:
+        namespace: Kubernetes namespace of the warm pool.
+        warmpool_name: Name of the SandboxWarmPool resource to patch.
+        replicas: Desired replica count.
+        label: Label selector identifying the warm pool's pods.
+        wait_timeout: Max seconds to wait for `replicas` Running pods.
+
+    Returns:
+        True if `replicas` is 0 or enough pods reached Running in time,
+        False on timeout.
+    """
+    logging.info("Patching %s replicas -> %d", warmpool_name, replicas)
+    patch_json = json.dumps({"spec": {"replicas": replicas}})
+    RunKubectl([
+        "patch", "sandboxwarmpool", warmpool_name,
+        "-n", namespace, "--type=merge", f"-p={patch_json}",
+    ])
+    if replicas == 0:
+        return True
+    # Monotonic clock: the deadline must not move if the wall clock is
+    # adjusted while waiting.
+    deadline = time.monotonic() + wait_timeout
+    while True:
+        running = CountPods(namespace, label, phase="Running")
+        logging.info("%d/%d warm pool pods Running", running, replicas)
+        if running >= replicas:
+            return True
+        time_left = deadline - time.monotonic()
+        if time_left <= 0:
+            break
+        # Never sleep past the deadline, so the last poll happens at it
+        # rather than up to 3s before it.
+        time.sleep(min(3, time_left))
+    logging.warning("Timed out waiting for %d warm pool pods", replicas)
+    return False
+
+
+def DrainWarmPool(namespace, warmpool_name, label, timeout=120):
+    """Scale the warm pool to 0 and wait for all its pods to terminate.
+
+    Args:
+        namespace: Kubernetes namespace of the warm pool.
+        warmpool_name: Name of the SandboxWarmPool resource.
+        label: Label selector identifying the warm pool's pods.
+        timeout: Max seconds to wait for the pod count to reach 0.
+
+    Returns:
+        True once CountPods reports 0, False on timeout. CountPods also
+        reports 0 when kubectl itself fails.
+    """
+    logging.info("Draining warm pool %s to 0", warmpool_name)
+    patch_json = json.dumps({"spec": {"replicas": 0}})
+    RunKubectl([
+        "patch", "sandboxwarmpool", warmpool_name,
+        "-n", namespace, "--type=merge", f"-p={patch_json}",
+    ], raise_on_failure=False)
+
+    # Delete lingering SandboxClaims that may prevent pod termination.
+    RunKubectl([
+        "delete", "sandboxclaims", "--all",
+        "-n", namespace, "--ignore-not-found=true",
+    ], timeout=60, raise_on_failure=False)
+
+    # Monotonic clock so wall-clock adjustments cannot stretch or cut the wait.
+    deadline = time.monotonic() + timeout
+    while True:
+        remaining_pods = CountPods(namespace, label)
+        if remaining_pods == 0:
+            logging.info("Warm pool drained successfully")
+            return True
+        time_left = deadline - time.monotonic()
+        if time_left <= 0:
+            break
+        logging.info("Draining... %d pods remaining", remaining_pods)
+        time.sleep(min(2, time_left))
+    # Report the count that caused the timeout instead of re-querying, which
+    # could log "0 pods still present" while still returning False.
+    logging.warning("Drain timed out, %d pods still present", remaining_pods)
+    return False
+
+
+def set_benchmark_spec(benchmark_spec):
+    """Remember `benchmark_spec` in module state for BuildMetadata().
+
+    Intended to be called by each benchmark's Run() before samples are built.
+    """
+    global _current_benchmark_spec
+    _current_benchmark_spec = benchmark_spec
+
+
+
+
+# ---------------------------------------------------------------------------
+# Sample construction
+# ---------------------------------------------------------------------------
+
+
+def _DeriveMachineType(spec):
+    """Return the machine type found on `spec`'s container cluster, or None.
+
+    The "sandbox" nodepool's vm_spec is preferred; the cluster-level vm_spec
+    is the fallback.
+    """
+    if not spec:
+        return None
+    cluster = getattr(spec, "container_cluster", None)
+    if not cluster:
+        return None
+    machine_type = None
+    nodepools = getattr(cluster, "nodepools", None)
+    if nodepools and isinstance(nodepools, dict):
+        sandbox_pool = nodepools.get("sandbox")
+        if sandbox_pool and hasattr(sandbox_pool, "vm_spec"):
+            machine_type = getattr(sandbox_pool.vm_spec, "machine_type", None)
+    if not machine_type and hasattr(cluster, "vm_spec"):
+        machine_type = getattr(cluster.vm_spec, "machine_type", None)
+    return machine_type
+
+
+def BuildMetadata(namespace, extra=None):
+    """Construct the common metadata dict attached to every sample.
+
+    Args:
+        namespace: Kubernetes namespace recorded under "namespace".
+        extra: Optional dict merged last, so its keys override the defaults.
+
+    Returns:
+        A new dict with "namespace" and "gvisor", plus "machine_type" and
+        "note" when they are available.
+    """
+    metadata = {"namespace": namespace, "gvisor": FLAGS.k8s_gvisor}
+    # The spec is whatever set_benchmark_spec() stored last (None before that).
+    machine_type = _DeriveMachineType(_current_benchmark_spec)
+    if machine_type:
+        metadata["machine_type"] = machine_type
+    note = FLAGS.k8s_benchmark_note
+    if note:
+        metadata["note"] = note
+    if extra:
+        metadata.update(extra)
+    return metadata
+
+
+def MakeSample(metric, value, unit, namespace, extra_metadata=None):
+    """Build one sample.Sample carrying the shared benchmark metadata.
+
+    Args:
+        metric: Metric name.
+        value: Metric value.
+        unit: Unit string for the value.
+        namespace: Kubernetes namespace, recorded in the metadata.
+        extra_metadata: Optional dict merged into the metadata.
+    """
+    sample_metadata = BuildMetadata(namespace, extra_metadata)
+    return sample.Sample(
+        metric=metric, value=value, unit=unit, metadata=sample_metadata
+    )
+
+
+# ---------------------------------------------------------------------------
+# Port-forward flags
+# ---------------------------------------------------------------------------
+
+# When False, EnsurePortForward() returns without starting anything.
+flags.DEFINE_bool(
+    name="k8s_auto_portforward",
+    default=True,
+    help="Automatically manage kubectl port-forward to the agent service.",
+)
+
+# Local side of the LOCAL:REMOTE pair passed to kubectl port-forward.
+flags.DEFINE_integer(
+    name="k8s_portforward_local_port",
+    default=8080,
+    help="Local port for kubectl port-forward.",
+)
+
+# Remote (service) side of the LOCAL:REMOTE pair.
+flags.DEFINE_integer(
+    name="k8s_portforward_remote_port",
+    default=80,
+    help="Remote service port for kubectl port-forward.",
+)
+
+# Resource reference handed to kubectl port-forward.
+flags.DEFINE_string(
+    name="k8s_portforward_service",
+    default="svc/adk-agent",
+    help="Kubernetes service to port-forward to.",
+)
+
+# Pause between a kubectl exit and the next reconnect attempt.
+flags.DEFINE_float(
+    name="k8s_portforward_reconnect_delay",
+    default=1.0,
+    help="Seconds to wait before reconnecting after port-forward drops.",
+)
+
+# Upper bound on the /healthz polling done by EnsurePortForward().
+flags.DEFINE_float(
+    name="k8s_portforward_health_timeout",
+    default=30.0,
+    help="Seconds to wait for agent health check after starting port-forward.",
+)
+
+
+# ---------------------------------------------------------------------------
+# Port-forward manager
+# ---------------------------------------------------------------------------
+
+import atexit
+import os as _os
+import signal
+import threading
+
+
+_PID_FILE = "/tmp/pkb_portforward.pid"
+
+
+class _PortForwardManager:
+    """Manages a kubectl port-forward subprocess with auto-reconnect.
+
+    Mimics the shell pattern:
+        while true; do
+            kubectl port-forward svc/adk-agent -n agentic 8080:80
+            echo "Reconnecting..."
+            sleep 1
+        done
+
+    start()/stop() are serialized by a lock and are idempotent. A PID file
+    (_PID_FILE) records the current kubectl PID so that a later run can
+    clean up a process left behind by a crashed one.
+    """
+
+    def __init__(self):
+        self._proc = None  # Popen of the current kubectl, if any
+        self._thread = None  # background reconnect thread
+        self._stop_event = threading.Event()
+        self._lock = threading.Lock()
+        self._started = False
+
+    @property
+    def is_running(self):
+        """True between a start() and the next stop()."""
+        return self._started and not self._stop_event.is_set()
+
+    def start(self):
+        """Start the port-forward loop (idempotent)."""
+        with self._lock:
+            if self.is_running:
+                return
+
+            self._kill_orphan()
+            self._stop_event.clear()
+            self._started = True
+            self._thread = threading.Thread(
+                target=self._loop, daemon=True, name="pkb-portforward"
+            )
+            self._thread.start()
+
+    def stop(self):
+        """Stop the port-forward loop and kill the subprocess."""
+        with self._lock:
+            if not self._started:
+                return
+            self._stop_event.set()
+            self._kill_proc()
+            # Wait for the loop thread so a kubectl it spawned concurrently
+            # with this call is terminated by its own cleanup rather than
+            # leaked. The loop never takes self._lock, so this cannot deadlock.
+            worker = self._thread
+            if worker is not None and worker is not threading.current_thread():
+                worker.join(timeout=10)
+            self._thread = None
+            self._started = False
+            self._cleanup_pid_file()
+
+    def _loop(self):
+        """Background reconnect loop; runs until the stop event is set."""
+        ns = FLAGS.k8s_namespace
+        svc = FLAGS.k8s_portforward_service
+        local_port = FLAGS.k8s_portforward_local_port
+        remote_port = FLAGS.k8s_portforward_remote_port
+        delay = FLAGS.k8s_portforward_reconnect_delay
+
+        cmd = ["kubectl"]
+        if FLAGS.kubeconfig:
+            cmd += ["--kubeconfig", FLAGS.kubeconfig]
+        cmd += [
+            "port-forward", svc,
+            "-n", ns,
+            f"{local_port}:{remote_port}",
+        ]
+
+        while not self._stop_event.is_set():
+            logging.info("Starting port-forward: %s", " ".join(cmd))
+            # Local handle: stop() may reset self._proc to None at any time.
+            proc = None
+            try:
+                # Output is discarded. Un-drained PIPEs would eventually fill
+                # (kubectl logs a line per forwarded connection) and block
+                # kubectl, stalling the tunnel.
+                proc = subprocess.Popen(
+                    cmd,
+                    stdout=subprocess.DEVNULL,
+                    stderr=subprocess.DEVNULL,
+                )
+                self._proc = proc
+                self._write_pid_file(proc.pid)
+
+                while not self._stop_event.is_set():
+                    if proc.poll() is not None:
+                        logging.info(
+                            "Port-forward exited (rc=%s)", proc.returncode
+                        )
+                        break
+                    self._stop_event.wait(timeout=0.5)
+
+            except Exception as e:  # thread boundary: log and keep looping
+                logging.warning("Port-forward error: %s", e)
+            finally:
+                # Covers the race where stop() ran before this iteration's
+                # Popen was published in self._proc.
+                if proc is not None:
+                    self._terminate(proc)
+
+            if not self._stop_event.is_set():
+                logging.info(
+                    "Port-forward disconnected. Reconnecting in %.1fs...", delay
+                )
+                self._stop_event.wait(timeout=delay)
+
+    @staticmethod
+    def _terminate(proc):
+        """Terminate `proc`, escalating to kill if SIGTERM is ignored for 5s."""
+        if proc.poll() is not None:
+            return
+        try:
+            proc.terminate()
+            try:
+                proc.wait(timeout=5)
+            except subprocess.TimeoutExpired:
+                proc.kill()
+                proc.wait(timeout=5)
+        except (OSError, subprocess.TimeoutExpired) as e:
+            logging.warning(
+                "Could not stop port-forward (PID %s): %s", proc.pid, e
+            )
+
+    def _kill_proc(self):
+        """Kill the current subprocess if alive."""
+        proc, self._proc = self._proc, None
+        if proc is not None:
+            self._terminate(proc)
+
+    def _write_pid_file(self, pid):
+        """Write PID to file for orphan detection (best-effort)."""
+        try:
+            with open(_PID_FILE, "w") as f:
+                f.write(str(pid))
+        except OSError as e:
+            logging.debug("Could not write %s: %s", _PID_FILE, e)
+
+    def _cleanup_pid_file(self):
+        """Remove PID file; a missing file is not an error."""
+        try:
+            _os.unlink(_PID_FILE)
+        except OSError:
+            pass
+
+    def _kill_orphan(self):
+        """Kill a port-forward process left by a previous PKB run."""
+        self._kill_pid_file_orphan()
+        self._free_local_port()
+
+    def _kill_pid_file_orphan(self):
+        """Stop the process recorded in the PID file, if it is a port-forward."""
+        try:
+            with open(_PID_FILE, "r") as f:
+                pid = int(f.read().strip())
+        except (OSError, ValueError):
+            # Missing, unreadable or corrupt PID file: nothing to kill.
+            self._cleanup_pid_file()
+            return
+        # PIDs are recycled: only signal the process if it still looks like
+        # a kubectl port-forward, and never signal ourselves.
+        if pid != _os.getpid() and self._is_port_forward(pid):
+            logging.info("Killing orphan port-forward (PID %d)", pid)
+            self._stop_pid(pid)
+        self._cleanup_pid_file()
+
+    @staticmethod
+    def _is_port_forward(pid):
+        """Best-effort check that `pid`'s command line contains "port-forward"."""
+        try:
+            result = subprocess.run(
+                ["ps", "-p", str(pid), "-o", "command="],
+                capture_output=True, text=True, timeout=5,
+            )
+        except (OSError, subprocess.TimeoutExpired):
+            return False
+        return result.returncode == 0 and "port-forward" in result.stdout
+
+    @staticmethod
+    def _stop_pid(pid, grace=2.0):
+        """Send SIGTERM to `pid`; send SIGKILL only if it outlives `grace` s."""
+        try:
+            _os.kill(pid, signal.SIGTERM)
+            deadline = time.monotonic() + grace
+            while time.monotonic() < deadline:
+                time.sleep(0.1)
+                _os.kill(pid, 0)  # raises OSError once the process is gone
+            _os.kill(pid, signal.SIGKILL)
+        except OSError:
+            pass  # already exited, or not ours to signal
+
+    def _free_local_port(self):
+        """SIGTERM processes *listening* on the local port (best-effort).
+
+        Restricting lsof to LISTEN sockets and skipping our own PID avoids
+        killing this process or unrelated clients that merely hold a
+        connection on the same port number.
+        """
+        local_port = FLAGS.k8s_portforward_local_port
+        try:
+            result = subprocess.run(
+                ["lsof", "-t", f"-iTCP:{local_port}", "-sTCP:LISTEN"],
+                capture_output=True, text=True, timeout=5,
+            )
+        except (OSError, subprocess.TimeoutExpired):
+            return  # lsof unavailable or hung
+        if result.returncode != 0:
+            return
+        own_pid = _os.getpid()
+        for pid_str in result.stdout.split():
+            try:
+                pid = int(pid_str)
+            except ValueError:
+                continue
+            if pid == own_pid:
+                continue
+            try:
+                _os.kill(pid, signal.SIGTERM)
+            except OSError:
+                continue
+            logging.info("Killed process %d on port %d", pid, local_port)
+
+
+# Singleton instance shared by EnsurePortForward() and StopPortForward().
+_port_forward_manager = _PortForwardManager()
+
+# Ensure cleanup on interpreter exit; stop() returns early if the manager
+# was never started.
+atexit.register(_port_forward_manager.stop)
+
+
+def EnsurePortForward():
+    """Start port-forward if auto_portforward is enabled (idempotent).
+
+    Blocks until GET <agent API URL>/healthz succeeds or
+    --k8s_portforward_health_timeout seconds have passed. A timeout only
+    logs a warning; it does not raise. Safe to call multiple times: the
+    manager starts at most one background loop.
+    """
+    if not FLAGS.k8s_auto_portforward:
+        logging.info("Auto port-forward disabled (--k8s_auto_portforward=false)")
+        return
+
+    _port_forward_manager.start()
+
+    timeout = FLAGS.k8s_portforward_health_timeout
+    # Monotonic clock so wall-clock adjustments cannot change the wait.
+    deadline = time.monotonic() + timeout
+    healthz_url = f"{GetAgentApiUrl()}/healthz"
+
+    while time.monotonic() < deadline:
+        try:
+            req = urllib.request.Request(healthz_url)
+            with urllib.request.urlopen(req, timeout=3) as resp:
+                logging.info("Port-forward healthy: %s", resp.read().decode())
+                return
+        except Exception:
+            # Any failure just means "not up yet"; retry until the deadline.
+            time.sleep(1)
+
+    logging.warning(
+        "Port-forward health check did not pass within %.0fs. "
+        "Continuing anyway (Run() will fail if agent is unreachable).",
+        timeout,
+    )
+
+
+def StopPortForward():
+    """Stop the shared port-forward manager and log that it was stopped."""
+    manager = _port_forward_manager
+    manager.stop()
+    logging.info("Port-forward stopped.")
diff --git a/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_chromium_density_benchmark.py b/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_chromium_density_benchmark.py
new file mode 100644
index 0000000000..bd9114877c
--- /dev/null
+++ b/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_chromium_density_benchmark.py
@@ -0,0 +1,284 @@
+"""PKB Benchmark: GKE Agent Chromium Density Saturation.
+
+Atomic single-point measurement of Chromium browser sandbox density on a
+pre-provisioned GKE cluster with gVisor isolation. Measures interaction
+latency, screenshot generation time, cold start, navigation, evaluation,
+fill, click latencies, and RSS memory at a given concurrent session count.
+
+This benchmark is designed to be invoked repeatedly by an external sweep
+controller that varies the density parameter across iterations to find
+the saturation point.
+
+Usage:
+ python pkb.py --benchmarks=k8s_chromium_density \\
+ --k8s_chromium_density_concurrent_sessions=4 \\
+ --k8s_chromium_density_task_count=10 \\
+ --k8s_chromium_density_warmup_tasks=5 \\
+ --k8s_namespace=agentic \\
+ --k8s_agent_api_url=http://localhost:8080
+
+Samples emitted (per run):
+ - k8s_chromium_density_interaction_mean (ms)
+ - k8s_chromium_density_interaction_p95 (ms)
+ - k8s_chromium_density_navigate_mean (ms)
+ - k8s_chromium_density_navigate_p95 (ms)
+ - k8s_chromium_density_evaluate_mean (ms)
+ - k8s_chromium_density_evaluate_p95 (ms)
+ - k8s_chromium_density_fill_mean (ms)
+ - k8s_chromium_density_fill_p95 (ms)
+ - k8s_chromium_density_click_mean (ms)
+ - k8s_chromium_density_click_p95 (ms)
+ - k8s_chromium_density_screenshot_mean (ms)
+ - k8s_chromium_density_screenshot_p95 (ms)
+ - k8s_chromium_density_cold_start_mean (ms)
+ - k8s_chromium_density_cold_start_p95 (ms)
+ - k8s_chromium_density_rss_end (MB)
+ - k8s_chromium_density_rss_growth (MB)
+ - k8s_chromium_density_wall_time (seconds)
+"""
+
+import logging
+import time
+
+from absl import flags
+from perfkitbenchmarker import configs
+from perfkitbenchmarker.linux_benchmarks.kubernetes.agentic import (
+ k8s_benchmark_utils as utils,
+)
+from perfkitbenchmarker.linux_benchmarks.kubernetes.agentic import (
+ gke_deploy_utils as deploy_utils,
+)
+
+FLAGS = flags.FLAGS
+
+BENCHMARK_NAME = "k8s_chromium_density"
+BENCHMARK_CONFIG = """
+k8s_chromium_density:
+ description: >
+ Atomic single-point Chromium browser sandbox density measurement on a
+ pre-provisioned GKE cluster with gVisor isolation.
+"""
+
+_WARMPOOL_NAME = "chromium-sandbox-warmpool"
+_WARMPOOL_LABEL = "sandbox=chromium-sandbox-example"
+
+# ---------------------------------------------------------------------------
+# Benchmark-specific flags
+# ---------------------------------------------------------------------------
+
+# Density under test; also used as the warm pool replica count in Run().
+flags.DEFINE_integer(
+    name="k8s_chromium_density_concurrent_sessions",
+    default=1,
+    help="Number of concurrent Chromium browser sessions to run.",
+)
+
+# Sent to the agent API as "task_count".
+flags.DEFINE_integer(
+    name="k8s_chromium_density_task_count",
+    default=10,
+    help="Number of browser task iterations per Chromium session.",
+)
+
+# Sent to the agent API as "warmup_tasks".
+flags.DEFINE_integer(
+    name="k8s_chromium_density_warmup_tasks",
+    default=5,
+    help="Number of warmup iterations per session (excluded from stats).",
+)
+
+# When False, Run() leaves the SandboxWarmPool replica count untouched.
+flags.DEFINE_bool(
+    name="k8s_chromium_density_patch_warmpool",
+    default=True,
+    help="Patch SandboxWarmPool replicas to match density before measurement.",
+)
+
+# Sent to the agent API as "sandbox_exec_timeout_s".
+flags.DEFINE_integer(
+    name="k8s_chromium_density_exec_timeout",
+    default=120,
+    help="Sandbox command execution timeout in seconds.",
+)
+
+# Passed to PatchWarmPool() as wait_timeout.
+flags.DEFINE_integer(
+    name="k8s_chromium_density_provision_timeout",
+    default=300,
+    help="Max seconds to wait for warm pool pods to reach Running.",
+)
+
+
+# ---------------------------------------------------------------------------
+# Lifecycle
+# ---------------------------------------------------------------------------
+
+
+def GetConfig(user_config):
+    """Merge `user_config` into this benchmark's default config and return it.
+
+    The default config declares no vm_groups. NOTE(review): the original
+    docstring states that PKB therefore skips Provision() and Teardown();
+    confirm against the PKB runner.
+    """
+    merged_config = configs.LoadConfig(
+        BENCHMARK_CONFIG, user_config, BENCHMARK_NAME
+    )
+    return merged_config
+
+
+def Prepare(benchmark_spec):
+    """Deploy the workloads, then bring up access to the agent API.
+
+    Args:
+        benchmark_spec: PKB benchmark spec; forwarded to DeployWorkloads.
+    """
+    # Request that Cleanup() runs even when a later stage fails.
+    benchmark_spec.always_call_cleanup = True
+    logging.info("=== Prepare: deploying workloads ===")
+    deploy_utils.DeployWorkloads(benchmark_spec)
+    # Non-fatal probe first (the port-forward may not exist yet); then start
+    # the managed port-forward, which polls /healthz itself.
+    utils.CheckAgentHealthz(required=False)
+    utils.EnsurePortForward()
+    logging.info("Prepare complete.")
+
+
+def Run(benchmark_spec):
+    """Execute a single Chromium density measurement and return samples.
+
+    Patches the warm pool to the requested density (unless disabled), POSTs
+    one request to /benchmark/chromium/density, and converts the response's
+    "aggregate" dict into samples. Metrics missing from the aggregate are
+    skipped; the wall-time sample is always emitted.
+
+    Args:
+        benchmark_spec: PKB benchmark spec; stored for metadata derivation.
+
+    Returns:
+        List of sample.Sample objects.
+
+    Raises:
+        RuntimeError: Propagated from utils.CallAgentApi on API failure.
+    """
+    utils.set_benchmark_spec(benchmark_spec)
+
+    ns = FLAGS.k8s_namespace
+    density = FLAGS.k8s_chromium_density_concurrent_sessions
+    task_count = FLAGS.k8s_chromium_density_task_count
+    warmup_tasks = FLAGS.k8s_chromium_density_warmup_tasks
+
+    logging.info("=== Run: chromium_density=%d ===", density)
+
+    # Ensure port-forward is active (needed when sweeps skip Prepare).
+    utils.EnsurePortForward()
+
+    # Patch warm pool (done here rather than in Prepare for sweep compatibility).
+    if FLAGS.k8s_chromium_density_patch_warmpool:
+        utils.PatchWarmPool(
+            namespace=ns,
+            warmpool_name=_WARMPOOL_NAME,
+            replicas=density,
+            label=_WARMPOOL_LABEL,
+            wait_timeout=FLAGS.k8s_chromium_density_provision_timeout,
+        )
+
+    payload = {
+        "task_count": task_count,
+        "warmup_tasks": warmup_tasks,
+        "concurrent_sessions": density,
+        "sandbox_exec_timeout_s": FLAGS.k8s_chromium_density_exec_timeout,
+    }
+
+    # Monotonic clock: an elapsed-time measurement must not be affected by
+    # wall-clock adjustments during a long request.
+    t0 = time.monotonic()
+    result = utils.CallAgentApi("/benchmark/chromium/density", payload)
+    wall_time = time.monotonic() - t0
+
+    successful = result.get("successful_sessions", 0)
+    failed = result.get("failed_sessions", 0)
+    # `or {}` also covers an explicit JSON null, which .get's default does not.
+    agg = result.get("aggregate") or {}
+
+    # %s rather than %d: the counts come from the API and may not be ints.
+    logging.info(
+        "API response: %s successful, %s failed sessions (%.1fs)",
+        successful,
+        failed,
+        wall_time,
+    )
+
+    extra = {
+        "density": density,
+        "successful_sessions": successful,
+        "failed_sessions": failed,
+        "task_count": task_count,
+        "warmup_tasks": warmup_tasks,
+        "wall_time_s": round(wall_time, 2),
+    }
+
+    samples = []
+
+    # Per-task-type latency: mean and P95 for each, read from "<name>_ms".
+    task_types = (
+        "interaction",
+        "navigate",
+        "evaluate",
+        "fill",
+        "click",
+        "screenshot",
+        "cold_start",
+    )
+    for task_type in task_types:
+        for stat in ("mean", "p95"):
+            metric = f"{task_type}_{stat}"
+            _emit(samples, agg, f"{metric}_ms", metric, "ms", ns, extra)
+
+    # RSS memory, read from "<name>_mb".
+    for metric in ("rss_end", "rss_growth"):
+        _emit(samples, agg, f"{metric}_mb", metric, "MB", ns, extra)
+
+    # Wall time of the API call as measured by this process.
+    samples.append(
+        utils.MakeSample(
+            f"{BENCHMARK_NAME}_wall_time",
+            round(wall_time, 2),
+            "seconds",
+            ns,
+            extra,
+        )
+    )
+
+    logging.info("Emitted %d samples for chromium_density=%d.", len(samples), density)
+    return samples
+
+
+def Cleanup(benchmark_spec):
+    """Delete SandboxClaims, drain the warm pool and stop the port-forward.
+
+    The cluster itself is left in place.
+
+    Args:
+        benchmark_spec: PKB benchmark spec (unused).
+    """
+    ns = FLAGS.k8s_namespace
+    logging.info("Cleanup: deleting SandboxClaims and draining warm pool.")
+
+    try:
+        # Delete any lingering SandboxClaims to release claimed pods.
+        utils.RunKubectl(
+            [
+                "delete",
+                "sandboxclaims",
+                "--all",
+                "-n",
+                ns,
+                "--ignore-not-found=true",
+            ],
+            timeout=60,
+            raise_on_failure=False,
+        )
+
+        # Drain warm pool to 0.
+        utils.DrainWarmPool(
+            namespace=ns,
+            warmpool_name=_WARMPOOL_NAME,
+            label=_WARMPOOL_LABEL,
+        )
+    finally:
+        # Always release the local port-forward, even if a kubectl step
+        # raises; otherwise the kubectl child and its port would linger.
+        utils.StopPortForward()
+    logging.info("Cleanup complete (cluster persists).")
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _emit(samples, agg, agg_key, metric_suffix, unit, namespace, extra):
+    """Append a sample for `agg[agg_key]` unless the value is absent or None.
+
+    Args:
+        samples: List that receives the new sample.Sample.
+        agg: Aggregate metrics dict from the agent API response.
+        agg_key: Key to read from `agg` (e.g. "navigate_mean_ms").
+        metric_suffix: Suffix joined to BENCHMARK_NAME with "_" to form the
+            metric name (e.g. "navigate_mean").
+        unit: Unit string for the sample (e.g. "ms", "MB").
+        namespace: Kubernetes namespace, recorded in the sample metadata.
+        extra: Dict of additional metadata attached to the sample.
+    """
+    value = agg.get(agg_key)
+    if value is None:
+        return
+    metric_name = f"{BENCHMARK_NAME}_{metric_suffix}"
+    new_sample = utils.MakeSample(metric_name, value, unit, namespace, extra)
+    samples.append(new_sample)
diff --git a/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_deletion_benchmark.py b/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_deletion_benchmark.py
new file mode 100644
index 0000000000..418b5c1ed9
--- /dev/null
+++ b/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_deletion_benchmark.py
@@ -0,0 +1,481 @@
+"""PKB Benchmark: GKE Agent Deletion & Cleanup.
+
+Atomic single-point measurement of bulk deletion efficiency and IP
+reclamation on a pre-provisioned GKE cluster with gVisor isolation.
+Provisions N sandbox pods via SandboxWarmPool, then bulk-deletes them
+and measures per-pod deletion latency, aggregate deletion stats, and
+IP address reclamation timing.
+
+This benchmark is designed to be invoked repeatedly by an external sweep
+controller that varies the batch_size parameter across iterations to find
+the deletion saturation point.
+
+Usage:
+    python pkb.py --benchmarks=k8s_deletion \\
+ --k8s_deletion_batch_size=100 \\
+ --k8s_deletion_warmpool_name=python-sandbox-warmpool \\
+ --k8s_deletion_pod_label=sandbox=python-sandbox-example \\
+ --k8s_deletion_poll_interval_s=1.0 \\
+ --k8s_deletion_provision_timeout_s=120.0 \\
+ --k8s_deletion_drain_timeout_s=300.0 \\
+ --k8s_namespace=agentic \\
+ --gke_machine_type=c4-standard-8
+
+Samples emitted (per run):
+  - k8s_deletion_provision_time (seconds)
+  - k8s_deletion_total_drain_time (seconds)
+  - k8s_deletion_latency_p50 (seconds)
+  - k8s_deletion_latency_p95 (seconds)
+  - k8s_deletion_latency_p99 (seconds)
+  - k8s_deletion_latency_max (seconds)
+  - k8s_deletion_rate (pods/sec)
+  - k8s_deletion_ip_before (count)
+  - k8s_deletion_ip_after (count)
+  - k8s_deletion_ip_reclaim_time (seconds; only when the IP count was seen at 0)
+  - k8s_deletion_final_running_count (count)
+  - k8s_deletion_wall_time (seconds)
+"""
+
+import json
+import logging
+import time
+
+from absl import flags
+from perfkitbenchmarker import configs
+from perfkitbenchmarker.linux_benchmarks.kubernetes.agentic import (
+ k8s_benchmark_utils as utils,
+)
+from perfkitbenchmarker.linux_benchmarks.kubernetes.agentic import (
+ gke_deploy_utils as deploy_utils,
+)
+
+FLAGS = flags.FLAGS
+# Prefix of every metric name built in Run(); also passed to configs.LoadConfig.
+BENCHMARK_NAME = "k8s_deletion"
+BENCHMARK_CONFIG = """
+k8s_deletion:
+  description: >
+    Atomic single-point bulk deletion and IP reclamation measurement on a
+    pre-provisioned GKE cluster with gVisor isolation.
+"""
+
+# ---------------------------------------------------------------------------
+# Benchmark-specific flags
+# ---------------------------------------------------------------------------
+# Replica count Run() patches onto the warm pool before scaling it back to 0.
+flags.DEFINE_integer(
+    "k8s_deletion_batch_size",
+    100,
+    "Number of sandbox pods to provision then bulk-delete.",
+)
+# Name of the SandboxWarmPool object patched by _PatchReplicas and drained in Run()/Cleanup().
+flags.DEFINE_string(
+    "k8s_deletion_warmpool_name",
+    "python-sandbox-warmpool",
+    "SandboxWarmPool resource name.",
+)
+# Selector given to `kubectl get pods -l` when listing pod names and pod IPs.
+flags.DEFINE_string(
+    "k8s_deletion_pod_label",
+    "sandbox=python-sandbox-example",
+    "Label selector for warm pool pods.",
+)
+# Sleep between iterations of the drain polling loop in Run().
+flags.DEFINE_float(
+    "k8s_deletion_poll_interval_s",
+    1.0,
+    "Seconds between kubectl polls during deletion.",
+)
+# Deadline of the provisioning wait loop in Run(); a shortfall afterwards raises RuntimeError.
+flags.DEFINE_float(
+    "k8s_deletion_provision_timeout_s",
+    120.0,
+    "Max seconds to wait for pods to reach Running before deletion.",
+)
+# Deadline of the drain polling loop; also truncated with int() for utils.DrainWarmPool.
+flags.DEFINE_float(
+    "k8s_deletion_drain_timeout_s",
+    300.0,
+    "Max seconds to wait for all pods to terminate after scale-to-0.",
+)
+
+
+# ---------------------------------------------------------------------------
+# Lifecycle
+# ---------------------------------------------------------------------------
+
+
+def GetConfig(user_config):
+    """Return configs.LoadConfig's result for BENCHMARK_CONFIG and user_config.
+
+    BENCHMARK_CONFIG declares no vm_groups (original note: PKB skips Provision/Teardown).
+    """
+    return configs.LoadConfig(BENCHMARK_CONFIG, user_config, BENCHMARK_NAME)
+
+
+def Prepare(benchmark_spec):
+    """Deploy workloads onto the cluster, then call utils.EnsurePortForward()."""
+    benchmark_spec.always_call_cleanup = True  # presumably makes PKB run Cleanup() even on failure — confirm
+    logging.info("=== Prepare: deploying workloads ===")
+    deploy_utils.DeployWorkloads(benchmark_spec)
+    utils.EnsurePortForward()
+    logging.info("Prepare complete.")
+
+
+def Run(benchmark_spec):
+    """Provision N pods, bulk-delete, measure deletion latency and IP reclamation.
+
+    Raises RuntimeError if fewer than batch_size pods are Running after the wait.
+    Returns the list of samples built with utils.MakeSample.
+    """
+    utils.set_benchmark_spec(benchmark_spec)
+
+    ns = FLAGS.k8s_namespace
+    batch_size = FLAGS.k8s_deletion_batch_size
+    warmpool_name = FLAGS.k8s_deletion_warmpool_name
+    label = FLAGS.k8s_deletion_pod_label
+    poll_interval = FLAGS.k8s_deletion_poll_interval_s
+    provision_timeout = FLAGS.k8s_deletion_provision_timeout_s
+    drain_timeout = FLAGS.k8s_deletion_drain_timeout_s
+
+    logging.info("=== Run: batch_size=%d ===", batch_size)
+
+    # Drain to 0 for clean measurement (moved from Prepare for sweep compatibility)
+    utils.DrainWarmPool(ns, warmpool_name, label, timeout=int(drain_timeout))
+    time.sleep(2)  # fixed pause; it happens before the wall-clock timer starts
+
+    t_wall_start = time.time()
+
+    # 1. Provision N pods
+    logging.info("Provisioning %d pods...", batch_size)
+    provision_start = time.time()
+    _PatchReplicas(ns, warmpool_name, batch_size)
+
+    deadline = time.time() + provision_timeout
+    while time.time() < deadline:
+        running = utils.CountPods(ns, label, phase="Running")
+        pct = (running / batch_size * 100) if batch_size > 0 else 0
+        logging.info("Provisioning... %d/%d (%.0f%%)", running, batch_size, pct)
+        if running >= batch_size:
+            break
+        time.sleep(3)  # fixed period; k8s_deletion_poll_interval_s is not used here
+
+    provision_time = time.time() - provision_start
+    final_running = utils.CountPods(ns, label, phase="Running")  # counted again after the loop
+
+    logging.info(
+        "Provisioned %d/%d pods in %.1fs",
+        final_running,
+        batch_size,
+        provision_time,
+    )
+
+    # If not all pods reached Running, this is a failure (no samples are returned)
+    if final_running < batch_size:
+        raise RuntimeError(
+            f"Provisioning failed: only {final_running}/{batch_size} pods "
+            f"reached Running within {provision_timeout}s"
+        )
+
+    # 2. Record pod names and IP count before deletion
+    pod_names_before = set(_GetPodNames(ns, label))
+    ip_before = _CountAllocatedIPs(ns, label)
+
+    logging.info(
+        "Recorded %d pods, %d IPs allocated",
+        len(pod_names_before),
+        ip_before,
+    )
+
+    # Brief settle
+    time.sleep(1)
+
+    # 3. Bulk delete: scale to 0
+    logging.info("Scaling to 0 (bulk delete of %d pods)...", len(pod_names_before))
+    _PatchReplicas(ns, warmpool_name, 0)
+
+    # 4. Poll: track pod disappearance and IP reclamation
+    t_delete = time.time()
+    deadline_drain = t_delete + drain_timeout
+    pod_gone_times = {}  # pod_name -> elapsed_s when first absent
+    ip_reclaim_time = None  # stays None if the IP count is never observed at 0
+
+    while time.time() < deadline_drain:
+        elapsed = time.time() - t_delete
+
+        # Current pod names still present.
+        # NOTE(review): _GetPodNames returns [] on kubectl failure, which reads as "all gone".
+        current_pods = set(_GetPodNames(ns, label))
+        remaining = len(current_pods)
+
+        # Track which pods have disappeared (only the first absence is timestamped)
+        gone_now = pod_names_before - current_pods
+        for pn in gone_now:
+            if pn not in pod_gone_times:
+                pod_gone_times[pn] = elapsed
+
+        # IP count (scoped to warm pool label); the helper also returns 0 on kubectl failure
+        ips = _CountAllocatedIPs(ns, label)
+        if ip_reclaim_time is None and ips == 0:
+            ip_reclaim_time = elapsed
+
+        deleted = len(pod_names_before) - remaining  # for the progress log line only
+        pct = (deleted / len(pod_names_before) * 100) if pod_names_before else 0
+        logging.info(
+            "[%.1fs] Deleted: %d/%d (%.0f%%) IPs: %d",
+            elapsed,
+            deleted,
+            len(pod_names_before),
+            pct,
+            ips,
+        )
+
+        if remaining == 0:
+            break
+
+        time.sleep(poll_interval)
+
+    total_drain_time = time.time() - t_delete
+
+    # Pods we never saw disappear (stuck) get the full drain time
+    for pn in pod_names_before:
+        if pn not in pod_gone_times:
+            pod_gone_times[pn] = total_drain_time
+
+    # 5. Compute per-pod deletion latencies (resolution is bounded by the poll cadence)
+    deletion_latencies = sorted(pod_gone_times.values())
+    n = len(deletion_latencies)
+
+    ip_after = _CountAllocatedIPs(ns, label)
+    deletion_rate = (
+        (len(pod_names_before) / total_drain_time) if total_drain_time > 0 else 0
+    )
+
+    logging.info(
+        "Drain complete: %.1fs, rate=%.1f pods/sec, IPs: %d->%d",
+        total_drain_time,
+        deletion_rate,
+        ip_before,
+        ip_after,
+    )
+
+    wall_time = time.time() - t_wall_start
+
+    # 6. Build samples; `extra` is passed to utils.MakeSample with every sample
+    extra = {
+        "batch_size": batch_size,
+        "final_running_count": final_running,
+        "ip_before": ip_before,
+        "ip_after": ip_after,
+        "wall_time_s": round(wall_time, 2),
+    }
+
+    samples = []
+
+    samples.append(
+        utils.MakeSample(
+            f"{BENCHMARK_NAME}_provision_time",
+            round(provision_time, 2),
+            "seconds",
+            ns,
+            extra,
+        )
+    )
+    samples.append(
+        utils.MakeSample(
+            f"{BENCHMARK_NAME}_total_drain_time",
+            round(total_drain_time, 2),
+            "seconds",
+            ns,
+            extra,
+        )
+    )
+
+    if n > 0:  # latency samples are skipped when no pods were recorded before deletion
+        samples.append(
+            utils.MakeSample(
+                f"{BENCHMARK_NAME}_latency_p50",
+                round(_Percentile(deletion_latencies, 50), 3),
+                "seconds",
+                ns,
+                extra,
+            )
+        )
+        samples.append(
+            utils.MakeSample(
+                f"{BENCHMARK_NAME}_latency_p95",
+                round(_Percentile(deletion_latencies, 95), 3),
+                "seconds",
+                ns,
+                extra,
+            )
+        )
+        samples.append(
+            utils.MakeSample(
+                f"{BENCHMARK_NAME}_latency_p99",
+                round(_Percentile(deletion_latencies, 99), 3),
+                "seconds",
+                ns,
+                extra,
+            )
+        )
+        samples.append(
+            utils.MakeSample(
+                f"{BENCHMARK_NAME}_latency_max",
+                round(deletion_latencies[-1], 3),
+                "seconds",
+                ns,
+                extra,
+            )
+        )
+
+    samples.append(
+        utils.MakeSample(
+            f"{BENCHMARK_NAME}_rate",
+            round(deletion_rate, 2),
+            "pods/sec",
+            ns,
+            extra,
+        )
+    )
+    samples.append(
+        utils.MakeSample(
+            f"{BENCHMARK_NAME}_ip_before",
+            float(ip_before),
+            "count",
+            ns,
+            extra,
+        )
+    )
+    samples.append(
+        utils.MakeSample(
+            f"{BENCHMARK_NAME}_ip_after",
+            float(ip_after),
+            "count",
+            ns,
+            extra,
+        )
+    )
+
+    if ip_reclaim_time is not None:  # omitted when the IP count never reached 0 while polling
+        samples.append(
+            utils.MakeSample(
+                f"{BENCHMARK_NAME}_ip_reclaim_time",
+                round(ip_reclaim_time, 2),
+                "seconds",
+                ns,
+                extra,
+            )
+        )
+
+    samples.append(
+        utils.MakeSample(
+            f"{BENCHMARK_NAME}_final_running_count",
+            float(final_running),
+            "count",
+            ns,
+            extra,
+        )
+    )
+    samples.append(
+        utils.MakeSample(
+            f"{BENCHMARK_NAME}_wall_time",
+            round(wall_time, 2),
+            "seconds",
+            ns,
+            extra,
+        )
+    )
+
+    logging.info("Emitted %d samples for batch_size=%d.", len(samples), batch_size)
+    return samples
+
+
+def Cleanup(benchmark_spec):
+    """Drain the flag-selected warm pool and stop the port-forward (benchmark_spec is unused)."""
+    ns = FLAGS.k8s_namespace
+    warmpool_name = FLAGS.k8s_deletion_warmpool_name
+    label = FLAGS.k8s_deletion_pod_label
+    # The drain timeout flag is a float; int() truncates it for DrainWarmPool.
+    logging.info("Cleanup: draining warm pool to 0.")
+    utils.DrainWarmPool(ns, warmpool_name, label, timeout=int(FLAGS.k8s_deletion_drain_timeout_s))
+    utils.StopPortForward()
+    logging.info("Cleanup complete.")
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _PatchReplicas(namespace, warmpool_name, replicas):
+    """Merge-patch spec.replicas on the named SandboxWarmPool; kubectl failure is not raised."""
+    patch_json = json.dumps({"spec": {"replicas": replicas}})
+    utils.RunKubectl(
+        [
+            "patch",
+            "sandboxwarmpool",
+            warmpool_name,
+            "-n",
+            namespace,
+            "--type=merge",
+            f"-p={patch_json}",
+        ],
+        raise_on_failure=False,  # NOTE(review): the result is discarded, so a failed patch goes unnoticed here
+    )
+
+
+def _GetPodNames(namespace, label):
+    """List the names of pods selected by `label` in `namespace` ([] on failure or no output)."""
+    kubectl_args = [
+        "get",
+        "pods",
+        "-n",
+        namespace,
+        "-l",
+        label,
+        "-o",
+        "jsonpath={.items[*].metadata.name}",
+    ]
+    output, _, exit_code = utils.RunKubectl(
+        kubectl_args, timeout=30, raise_on_failure=False
+    )
+    # The jsonpath output is whitespace-separated, so split() yields one name per pod.
+    if exit_code == 0 and output:
+        return output.split()
+    return []
+
+
+def _CountAllocatedIPs(namespace, label):
+    """Return how many pods selected by `label` currently report a status.podIP.
+
+    The query is limited to the given label selector, so only the pods under
+    test are counted. A kubectl failure or empty output yields 0.
+    """
+    kubectl_args = [
+        "get",
+        "pods",
+        "-n",
+        namespace,
+        "-l",
+        label,
+        "-o",
+        "jsonpath={.items[*].status.podIP}",
+    ]
+    output, _, exit_code = utils.RunKubectl(
+        kubectl_args, timeout=30, raise_on_failure=False
+    )
+    # Each whitespace-separated token in the jsonpath output is one pod IP.
+    if exit_code != 0 or not output:
+        return 0
+    return sum(1 for address in output.split() if address)
+
+
+def _Percentile(sorted_values, pct):
+    """Return the pct-th percentile (0-100) of ascending values, linearly interpolated."""
+    if not sorted_values:
+        return 0.0
+    last = len(sorted_values) - 1
+    rank = (pct / 100) * last
+    below = int(rank)
+    weight = rank - below
+    return sorted_values[below] * (1 - weight) + sorted_values[min(below + 1, last)] * weight
diff --git a/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_payload_benchmark.py b/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_payload_benchmark.py
new file mode 100644
index 0000000000..109ab0efe6
--- /dev/null
+++ b/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_payload_benchmark.py
@@ -0,0 +1,617 @@
+"""PKB Benchmark: GKE Agent Payload Transfer Saturation.
+
+Atomic single-point measurement of payload transfer latency from a gVisor
+sandbox back to the orchestrator on a pre-provisioned GKE cluster. Measures
+generation time, serialization time, stdout write time, total transfer time,
+throughput, and RSS at a given payload_size_mb and concurrent_sessions count.
+
+This benchmark is designed to be invoked repeatedly by an external sweep
+controller that varies the payload_size_mb parameter across iterations to
+find the saturation point.
+
+Usage:
+    python pkb.py --benchmarks=k8s_payload \\
+        --k8s_payload_size_mb=50 \\
+        --k8s_payload_iterations=20 \\
+        --k8s_payload_concurrent_sessions=5 \\
+        --k8s_namespace=agentic \\
+ --k8s_agent_api_url=http://localhost:8080
+
+Samples emitted (per run):
+  - k8s_payload_orchestrator_transfer_mean (ms)
+  - k8s_payload_orchestrator_transfer_p50 (ms)
+  - k8s_payload_orchestrator_transfer_p95 (ms)
+  - k8s_payload_orchestrator_transfer_p99 (ms)
+  - k8s_payload_orchestrator_transfer_min (ms)
+  - k8s_payload_orchestrator_transfer_max (ms)
+  - k8s_payload_sandbox_payload_size_bytes (bytes)
+  - k8s_payload_sandbox_payload_encoded_size_bytes (bytes)
+  - k8s_payload_sandbox_payload_iterations (count)
+  - k8s_payload_sandbox_generation_time_mean (ms)
+  - k8s_payload_sandbox_generation_time_p50 (ms)
+  - k8s_payload_sandbox_generation_time_p95 (ms)
+  - k8s_payload_sandbox_generation_time_p99 (ms)
+  - k8s_payload_sandbox_generation_time_min (ms)
+  - k8s_payload_sandbox_generation_time_max (ms)
+  - k8s_payload_sandbox_serialization_time_mean (ms)
+  - k8s_payload_sandbox_serialization_time_p50 (ms)
+  - k8s_payload_sandbox_serialization_time_p95 (ms)
+  - k8s_payload_sandbox_serialization_time_p99 (ms)
+  - k8s_payload_sandbox_serialization_time_min (ms)
+  - k8s_payload_sandbox_serialization_time_max (ms)
+  - k8s_payload_sandbox_stdout_time_mean (ms)
+  - k8s_payload_sandbox_stdout_time_p50 (ms)
+  - k8s_payload_sandbox_stdout_time_p95 (ms)
+  - k8s_payload_sandbox_stdout_time_p99 (ms)
+  - k8s_payload_sandbox_stdout_time_min (ms)
+  - k8s_payload_sandbox_stdout_time_max (ms)
+  - k8s_payload_sandbox_transfer_time_mean (ms)
+  - k8s_payload_sandbox_transfer_time_p50 (ms)
+  - k8s_payload_sandbox_transfer_time_p95 (ms)
+  - k8s_payload_sandbox_transfer_time_p99 (ms)
+  - k8s_payload_sandbox_transfer_time_min (ms)
+  - k8s_payload_sandbox_transfer_time_max (ms)
+  - k8s_payload_sandbox_throughput_mean (MB/s)
+  - k8s_payload_sandbox_throughput_p50 (MB/s)
+  - k8s_payload_sandbox_throughput_min (MB/s)
+  - k8s_payload_sandbox_rss_start (MB)
+  - k8s_payload_sandbox_rss_end (MB)
+  - k8s_payload_sandbox_rss_growth (MB)
+  - k8s_payload_wall_time (seconds)
+"""
+
+import logging
+import time
+
+from absl import flags
+from perfkitbenchmarker import configs
+from perfkitbenchmarker.linux_benchmarks.kubernetes.agentic import (
+ k8s_benchmark_utils as utils,
+)
+from perfkitbenchmarker.linux_benchmarks.kubernetes.agentic import (
+ gke_deploy_utils as deploy_utils,
+)
+
+FLAGS = flags.FLAGS
+# Prefix of every metric name built in Run()/_emit(); also passed to configs.LoadConfig.
+BENCHMARK_NAME = "k8s_payload"
+BENCHMARK_CONFIG = """
+k8s_payload:
+  description: >
+    Atomic single-point payload transfer saturation measurement on a
+    pre-provisioned GKE cluster with gVisor isolation.
+"""
+# Warm pool object name and pod label handed to utils.PatchWarmPool / utils.DrainWarmPool.
+_WARMPOOL_NAME = "python-sandbox-warmpool"
+_WARMPOOL_LABEL = "sandbox=python-sandbox-example"
+
+# ---------------------------------------------------------------------------
+# Benchmark-specific flags
+# ---------------------------------------------------------------------------
+# Sent to the agent as "payload_size_mb" in Run()'s request body.
+flags.DEFINE_float(
+    "k8s_payload_size_mb",
+    1.0,
+    "Payload size in megabytes to transfer from the sandbox.",
+)
+# Sent to the agent as "payload_iterations".
+flags.DEFINE_integer(
+    "k8s_payload_iterations",
+    20,
+    "Number of transfer iterations per sandbox session.",
+)
+# Sent as "concurrent_sessions"; also the replica count given to utils.PatchWarmPool.
+flags.DEFINE_integer(
+    "k8s_payload_concurrent_sessions",
+    5,
+    "Number of parallel sandbox sessions.",
+)
+# Sent to the agent as "sandbox_exec_timeout_s".
+flags.DEFINE_integer(
+    "k8s_payload_exec_timeout",
+    300,
+    "Sandbox command execution timeout in seconds.",
+)
+# Gates the utils.PatchWarmPool call in Run(); Cleanup() does not consult it.
+flags.DEFINE_bool(
+    "k8s_payload_patch_warmpool",
+    True,
+    "Patch SandboxWarmPool replicas to match concurrent_sessions before measurement.",
+)
+
+
+# ---------------------------------------------------------------------------
+# Lifecycle
+# ---------------------------------------------------------------------------
+
+
+def GetConfig(user_config):
+    """Return configs.LoadConfig's result for BENCHMARK_CONFIG and user_config.
+
+    BENCHMARK_CONFIG declares no vm_groups (original note: PKB skips Provision/Teardown).
+    """
+    return configs.LoadConfig(BENCHMARK_CONFIG, user_config, BENCHMARK_NAME)
+
+
+def Prepare(benchmark_spec):
+    """Deploy workloads, run a non-required agent health check, ensure the port-forward."""
+    benchmark_spec.always_call_cleanup = True  # presumably makes PKB run Cleanup() even on failure — confirm
+    logging.info("=== Prepare: deploying workloads ===")
+    deploy_utils.DeployWorkloads(benchmark_spec)
+    utils.CheckAgentHealthz(required=False)  # NOTE(review): runs before EnsurePortForward() — confirm the order
+    utils.EnsurePortForward()
+    logging.info("Prepare complete.")
+
+
+def Run(benchmark_spec):
+    """Execute a single payload transfer measurement and return samples.
+
+    Posts one request to /benchmark/python/payload and turns its "aggregate" dict
+    into samples (keys that are missing or None are skipped); returns that list.
+    """
+    utils.set_benchmark_spec(benchmark_spec)
+
+    ns = FLAGS.k8s_namespace
+    payload_size_mb = FLAGS.k8s_payload_size_mb
+    iterations = FLAGS.k8s_payload_iterations
+    concurrent = FLAGS.k8s_payload_concurrent_sessions
+
+    logging.info(
+        "=== Run: payload_size_mb=%s, iterations=%d, concurrent=%d ===",
+        payload_size_mb,
+        iterations,
+        concurrent,
+    )
+
+    # Ensure port-forward is active (needed when sweeps skip Prepare)
+    utils.EnsurePortForward()
+
+    # Patch warm pool (moved from Prepare for sweep compatibility)
+    if FLAGS.k8s_payload_patch_warmpool:
+        utils.PatchWarmPool(
+            namespace=ns,
+            warmpool_name=_WARMPOOL_NAME,
+            replicas=concurrent,
+            label=_WARMPOOL_LABEL,
+        )
+
+    # POST to agent API
+    payload = {
+        "payload_size_mb": payload_size_mb,
+        "payload_iterations": iterations,
+        "concurrent_sessions": concurrent,
+        "sandbox_exec_timeout_s": FLAGS.k8s_payload_exec_timeout,
+    }
+
+    t0 = time.time()
+    result = utils.CallAgentApi("/benchmark/python/payload", payload)
+    wall_time = time.time() - t0  # covers the API call only, not the warm pool patch above
+
+    successful = result.get("successful_sessions", 0)
+    failed = result.get("failed_sessions", 0)
+    agg = result.get("aggregate", {})
+
+    logging.info(
+        "API response: %d successful, %d failed sessions (%.1fs)",
+        successful,
+        failed,
+        wall_time,
+    )
+
+    # Build samples; `extra` is passed to utils.MakeSample with every sample
+    extra = {
+        "payload_size_mb": payload_size_mb,
+        "payload_iterations": iterations,
+        "concurrent_sessions": concurrent,
+        "successful_sessions": successful,
+        "failed_sessions": failed,
+        "wall_time_s": round(wall_time, 2),
+    }
+
+    samples = []
+
+    # Orchestrator-side transfer latency
+    _emit(
+        samples,
+        agg,
+        "orchestrator_transfer_mean_ms",
+        "orchestrator_transfer_mean",
+        "ms",
+        ns,
+        extra,
+    )
+    _emit(
+        samples,
+        agg,
+        "orchestrator_transfer_p50_ms",
+        "orchestrator_transfer_p50",
+        "ms",
+        ns,
+        extra,
+    )
+    _emit(
+        samples,
+        agg,
+        "orchestrator_transfer_p95_ms",
+        "orchestrator_transfer_p95",
+        "ms",
+        ns,
+        extra,
+    )
+    _emit(
+        samples,
+        agg,
+        "orchestrator_transfer_p99_ms",
+        "orchestrator_transfer_p99",
+        "ms",
+        ns,
+        extra,
+    )
+    _emit(
+        samples,
+        agg,
+        "orchestrator_transfer_min_ms",
+        "orchestrator_transfer_min",
+        "ms",
+        ns,
+        extra,
+    )
+    _emit(
+        samples,
+        agg,
+        "orchestrator_transfer_max_ms",
+        "orchestrator_transfer_max",
+        "ms",
+        ns,
+        extra,
+    )
+
+    # Payload metadata
+    _emit(
+        samples,
+        agg,
+        "sandbox_payload_size_bytes",
+        "sandbox_payload_size_bytes",
+        "bytes",
+        ns,
+        extra,
+    )
+    _emit(
+        samples,
+        agg,
+        "sandbox_payload_encoded_size_bytes",
+        "sandbox_payload_encoded_size_bytes",
+        "bytes",
+        ns,
+        extra,
+    )
+    _emit(
+        samples,
+        agg,
+        "sandbox_payload_iterations",
+        "sandbox_payload_iterations",
+        "count",
+        ns,
+        extra,
+    )
+
+    # Generation time (os.urandom)
+    _emit(
+        samples,
+        agg,
+        "sandbox_generation_time_mean_ms",
+        "sandbox_generation_time_mean",
+        "ms",
+        ns,
+        extra,
+    )
+    _emit(
+        samples,
+        agg,
+        "sandbox_generation_time_p50_ms",
+        "sandbox_generation_time_p50",
+        "ms",
+        ns,
+        extra,
+    )
+    _emit(
+        samples,
+        agg,
+        "sandbox_generation_time_p95_ms",
+        "sandbox_generation_time_p95",
+        "ms",
+        ns,
+        extra,
+    )
+    _emit(
+        samples,
+        agg,
+        "sandbox_generation_time_p99_ms",
+        "sandbox_generation_time_p99",
+        "ms",
+        ns,
+        extra,
+    )
+    _emit(
+        samples,
+        agg,
+        "sandbox_generation_time_min_ms",
+        "sandbox_generation_time_min",
+        "ms",
+        ns,
+        extra,
+    )
+    _emit(
+        samples,
+        agg,
+        "sandbox_generation_time_max_ms",
+        "sandbox_generation_time_max",
+        "ms",
+        ns,
+        extra,
+    )
+
+    # Serialization time (base64 encode)
+    _emit(
+        samples,
+        agg,
+        "sandbox_serialization_time_mean_ms",
+        "sandbox_serialization_time_mean",
+        "ms",
+        ns,
+        extra,
+    )
+    _emit(
+        samples,
+        agg,
+        "sandbox_serialization_time_p50_ms",
+        "sandbox_serialization_time_p50",
+        "ms",
+        ns,
+        extra,
+    )
+    _emit(
+        samples,
+        agg,
+        "sandbox_serialization_time_p95_ms",
+        "sandbox_serialization_time_p95",
+        "ms",
+        ns,
+        extra,
+    )
+    _emit(
+        samples,
+        agg,
+        "sandbox_serialization_time_p99_ms",
+        "sandbox_serialization_time_p99",
+        "ms",
+        ns,
+        extra,
+    )
+    _emit(
+        samples,
+        agg,
+        "sandbox_serialization_time_min_ms",
+        "sandbox_serialization_time_min",
+        "ms",
+        ns,
+        extra,
+    )
+    _emit(
+        samples,
+        agg,
+        "sandbox_serialization_time_max_ms",
+        "sandbox_serialization_time_max",
+        "ms",
+        ns,
+        extra,
+    )
+
+    # Stdout write time (gVisor Gofer write syscall)
+    _emit(
+        samples,
+        agg,
+        "sandbox_stdout_time_mean_ms",
+        "sandbox_stdout_time_mean",
+        "ms",
+        ns,
+        extra,
+    )
+    _emit(
+        samples,
+        agg,
+        "sandbox_stdout_time_p50_ms",
+        "sandbox_stdout_time_p50",
+        "ms",
+        ns,
+        extra,
+    )
+    _emit(
+        samples,
+        agg,
+        "sandbox_stdout_time_p95_ms",
+        "sandbox_stdout_time_p95",
+        "ms",
+        ns,
+        extra,
+    )
+    _emit(
+        samples,
+        agg,
+        "sandbox_stdout_time_p99_ms",
+        "sandbox_stdout_time_p99",
+        "ms",
+        ns,
+        extra,
+    )
+    _emit(
+        samples,
+        agg,
+        "sandbox_stdout_time_min_ms",
+        "sandbox_stdout_time_min",
+        "ms",
+        ns,
+        extra,
+    )
+    _emit(
+        samples,
+        agg,
+        "sandbox_stdout_time_max_ms",
+        "sandbox_stdout_time_max",
+        "ms",
+        ns,
+        extra,
+    )
+
+    # Transfer time (serialization + stdout write — threshold metric)
+    _emit(
+        samples,
+        agg,
+        "sandbox_transfer_time_mean_ms",
+        "sandbox_transfer_time_mean",
+        "ms",
+        ns,
+        extra,
+    )
+    _emit(
+        samples,
+        agg,
+        "sandbox_transfer_time_p50_ms",
+        "sandbox_transfer_time_p50",
+        "ms",
+        ns,
+        extra,
+    )
+    _emit(
+        samples,
+        agg,
+        "sandbox_transfer_time_p95_ms",
+        "sandbox_transfer_time_p95",
+        "ms",
+        ns,
+        extra,
+    )
+    _emit(
+        samples,
+        agg,
+        "sandbox_transfer_time_p99_ms",
+        "sandbox_transfer_time_p99",
+        "ms",
+        ns,
+        extra,
+    )
+    _emit(
+        samples,
+        agg,
+        "sandbox_transfer_time_min_ms",
+        "sandbox_transfer_time_min",
+        "ms",
+        ns,
+        extra,
+    )
+    _emit(
+        samples,
+        agg,
+        "sandbox_transfer_time_max_ms",
+        "sandbox_transfer_time_max",
+        "ms",
+        ns,
+        extra,
+    )
+
+    # Throughput. NOTE(review): keys end in "_mbps" but the unit emitted is "MB/s" — confirm
+    _emit(
+        samples,
+        agg,
+        "sandbox_throughput_mean_mbps",
+        "sandbox_throughput_mean",
+        "MB/s",
+        ns,
+        extra,
+    )
+    _emit(
+        samples,
+        agg,
+        "sandbox_throughput_p50_mbps",
+        "sandbox_throughput_p50",
+        "MB/s",
+        ns,
+        extra,
+    )
+    _emit(
+        samples,
+        agg,
+        "sandbox_throughput_min_mbps",
+        "sandbox_throughput_min",
+        "MB/s",
+        ns,
+        extra,
+    )
+
+    # RSS
+    _emit(samples, agg, "sandbox_rss_start_mb", "sandbox_rss_start", "MB", ns, extra)
+    _emit(samples, agg, "sandbox_rss_end_mb", "sandbox_rss_end", "MB", ns, extra)
+    _emit(samples, agg, "sandbox_rss_growth_mb", "sandbox_rss_growth", "MB", ns, extra)
+
+    # Wall time (always emitted, even when the aggregate dict is empty)
+    samples.append(
+        utils.MakeSample(
+            f"{BENCHMARK_NAME}_wall_time",
+            round(wall_time, 2),
+            "seconds",
+            ns,
+            extra,
+        )
+    )
+
+    logging.info(
+        "Emitted %d samples for payload_size_mb=%s.", len(samples), payload_size_mb
+    )
+    return samples
+
+
+def Cleanup(benchmark_spec):
+    """Drain the benchmark's warm pool and stop the port-forward (benchmark_spec is unused)."""
+    ns = FLAGS.k8s_namespace
+    logging.info("Cleanup: draining warm pool.")
+    # NOTE(review): drains even when --k8s_payload_patch_warmpool is False — confirm intended.
+    utils.DrainWarmPool(
+        namespace=ns,
+        warmpool_name=_WARMPOOL_NAME,
+        label=_WARMPOOL_LABEL,
+    )
+
+    utils.StopPortForward()
+    logging.info("Cleanup complete (cluster persists).")
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _emit(samples, agg, agg_key, metric_suffix, unit, namespace, extra):
+    """Append one sample for `agg_key` when the aggregate dict has a value for it.
+
+    A key that is absent from `agg`, or present with a None value, is
+    skipped silently and `samples` is left untouched.
+
+    Args:
+        samples: List that receives the sample built by utils.MakeSample.
+        agg: Aggregate metrics dict taken from the agent API response.
+        agg_key: Key to read from `agg` (e.g. "orchestrator_transfer_mean_ms").
+        metric_suffix: Text joined to BENCHMARK_NAME with an underscore to
+            form the metric name (e.g. "orchestrator_transfer_mean").
+        unit: Unit string for the sample (e.g. "ms", "MB/s", "bytes").
+        namespace: Kubernetes namespace, forwarded to utils.MakeSample.
+        extra: Dict of additional metadata, forwarded to utils.MakeSample.
+
+    Returns:
+        None. `samples` is extended in place.
+    """
+    measured = agg.get(agg_key)
+    # Missing or null aggregate values produce no sample.
+    if measured is None:
+        return
+    metric_name = f"{BENCHMARK_NAME}_{metric_suffix}"
+    samples.append(utils.MakeSample(metric_name, measured, unit, namespace, extra))
diff --git a/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_python_density_benchmark.py b/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_python_density_benchmark.py
new file mode 100644
index 0000000000..7760f23ff7
--- /dev/null
+++ b/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_python_density_benchmark.py
@@ -0,0 +1,378 @@
+"""PKB Benchmark: GKE Agent Python Sandbox Density.
+
+Atomic single-point measurement of Python sandbox density on a
+pre-provisioned GKE cluster with gVisor isolation. Measures Code Execution
+Latency (CEL), Time To First Execution (TTFE), RSS memory growth, and
+per-type latency breakdown (compute, syscall, import) at a given
+concurrent session count.
+
+Workflow per session:
+ 1. Claim a pre-warmed sandbox pod from the SandboxWarmPool
+ 2. Upload and execute the benchmark script inside the gVisor sandbox
+ 3. Run `sample_warmup` iterations (results discarded - stabilizes caches)
+ 4. Run `sample_count` measured iterations (results recorded)
+ 5. Report TTFE, per-iteration CEL, RSS, and per-task-type breakdown
+ 6. Release the sandbox claim
+
+This benchmark is designed to be invoked repeatedly by an external sweep
+controller that varies the density parameter across iterations to find
+the saturation point.
+
+Usage:
+    python pkb.py --benchmarks=k8s_python_density \\
+ --k8s_python_density_concurrent_sandbox_count=16 \\
+ --k8s_python_density_sample_count=20 \\
+ --k8s_python_density_sample_warmup=0 \\
+ --k8s_namespace=agentic \\
+ --k8s_agent_api_url=http://localhost:8080
+
+Samples emitted (per run):
+  - k8s_python_density_orchestrator_cel_mean (ms)
+  - k8s_python_density_orchestrator_cel_p50 (ms)
+  - k8s_python_density_orchestrator_cel_p95 (ms)
+  - k8s_python_density_orchestrator_cel_p99 (ms)
+  - k8s_python_density_orchestrator_cel_min (ms)
+  - k8s_python_density_orchestrator_cel_max (ms)
+  - k8s_python_density_sandbox_total_cel_mean (ms)
+  - k8s_python_density_sandbox_total_cel_p50 (ms)
+  - k8s_python_density_sandbox_total_cel_p95 (ms)
+  - k8s_python_density_sandbox_total_cel_p99 (ms)
+  - k8s_python_density_sandbox_total_cel_min (ms)
+  - k8s_python_density_sandbox_total_cel_max (ms)
+  - k8s_python_density_sandbox_ttfe (ms)
+  - k8s_python_density_sandbox_rss_start (MB)
+  - k8s_python_density_sandbox_rss_end (MB)
+  - k8s_python_density_sandbox_rss_growth (MB)
+  - k8s_python_density_sandbox_compute_cel_mean (ms)
+  - k8s_python_density_sandbox_syscall_cel_mean (ms)
+  - k8s_python_density_sandbox_import_cel_mean (ms)
+  - k8s_python_density_wall_time (seconds)
+"""
+
+import logging
+import time
+
+from absl import flags
+from perfkitbenchmarker import configs
+from perfkitbenchmarker.linux_benchmarks.kubernetes.agentic import (
+ k8s_benchmark_utils as utils,
+)
+from perfkitbenchmarker.linux_benchmarks.kubernetes.agentic import (
+ gke_deploy_utils as deploy_utils,
+)
+
+FLAGS = flags.FLAGS
+# Prefix of every metric name built in Run()/_emit(); also passed to configs.LoadConfig.
+BENCHMARK_NAME = "k8s_python_density"
+BENCHMARK_CONFIG = """
+k8s_python_density:
+  description: >
+    Atomic single-point Python sandbox density measurement on a
+    pre-provisioned GKE cluster with gVisor isolation.
+"""
+# Warm pool object name and pod label handed to utils.PatchWarmPool / utils.DrainWarmPool.
+_WARMPOOL_NAME = "python-sandbox-warmpool"
+_WARMPOOL_LABEL = "sandbox=python-sandbox-example"
+
+# ---------------------------------------------------------------------------
+# Benchmark-specific flags
+# ---------------------------------------------------------------------------
+# Sent to the agent as "concurrent_sessions"; also the replica count for utils.PatchWarmPool.
+flags.DEFINE_integer(
+    "k8s_python_density_concurrent_sandbox_count",
+    1,
+    "Number of concurrent sandbox sessions to run.",
+)
+# Sent to the agent as "sample_count".
+flags.DEFINE_integer(
+    "k8s_python_density_sample_count",
+    20,
+    "Number of sample iterations per sandbox session.",
+)
+# Sent to the agent as "sample_warmup".
+flags.DEFINE_integer(
+    "k8s_python_density_sample_warmup",
+    0,
+    "Number of warmup iterations per session (excluded from stats). "
+    "Warmup iterations execute the same benchmark tasks as measured "
+    "iterations but their latency results are discarded. This allows "
+    "JIT compilation, caches, and gVisor page faults to stabilize "
+    "before measurement begins.",
+)
+# Gates both the utils.PatchWarmPool call in Run() and the drain in Cleanup().
+flags.DEFINE_bool(
+    "k8s_python_density_patch_warmpool",
+    True,
+    "Patch SandboxWarmPool replicas to match density before measurement.",
+)
+# Sent to the agent as "sandbox_exec_timeout_s" in Run()'s request body.
+flags.DEFINE_integer(
+    "k8s_python_density_exec_timeout",
+    600,
+    "Timeout in seconds for the API call.",
+)
+
+
+# ---------------------------------------------------------------------------
+# Lifecycle
+# ---------------------------------------------------------------------------
+
+
+def GetConfig(user_config):
+    """Return configs.LoadConfig's result for BENCHMARK_CONFIG and user_config.
+
+    BENCHMARK_CONFIG declares no vm_groups (original note: PKB skips Provision/Teardown).
+    """
+    return configs.LoadConfig(BENCHMARK_CONFIG, user_config, BENCHMARK_NAME)
+
+
+def Prepare(benchmark_spec):
+    """Deploy workloads, run a non-required agent health check, ensure the port-forward."""
+    benchmark_spec.always_call_cleanup = True  # presumably makes PKB run Cleanup() even on failure — confirm
+    logging.info("=== Prepare: deploying workloads ===")
+    deploy_utils.DeployWorkloads(benchmark_spec)
+    utils.CheckAgentHealthz(required=False)  # NOTE(review): runs before EnsurePortForward() — confirm the order
+    utils.EnsurePortForward()
+    logging.info("Prepare complete.")
+
+
+def Run(benchmark_spec):
+    """Execute a single density measurement and return samples.
+
+    Posts one request to /benchmark/python/density and turns its "aggregate" dict
+    into samples (keys that are missing or None are skipped); returns that list.
+    """
+    utils.set_benchmark_spec(benchmark_spec)
+
+    ns = FLAGS.k8s_namespace
+    density = FLAGS.k8s_python_density_concurrent_sandbox_count
+
+    logging.info("=== Run: density=%d ===", density)
+
+    # Ensure port-forward is active (needed when sweeps skip Prepare)
+    utils.EnsurePortForward()
+
+    # Patch warm pool to match density (moved from Prepare for sweep compatibility)
+    if FLAGS.k8s_python_density_patch_warmpool:
+        utils.PatchWarmPool(
+            namespace=ns,
+            warmpool_name=_WARMPOOL_NAME,
+            replicas=density,
+            label=_WARMPOOL_LABEL,
+        )
+
+    # POST to agent API
+    payload = {
+        "sample_count": FLAGS.k8s_python_density_sample_count,
+        "sample_warmup": FLAGS.k8s_python_density_sample_warmup,
+        "concurrent_sessions": density,
+        "sandbox_exec_timeout_s": FLAGS.k8s_python_density_exec_timeout,
+    }
+
+    t0 = time.time()
+    result = utils.CallAgentApi("/benchmark/python/density", payload)
+    wall_time = time.time() - t0  # covers the API call only, not the warm pool patch above
+
+    successful = result.get("successful_sessions", 0)
+    failed = result.get("failed_sessions", 0)
+    agg = result.get("aggregate", {})
+
+    logging.info(
+        "API response: %d successful, %d failed sessions (%.1fs)",
+        successful,
+        failed,
+        wall_time,
+    )
+
+    # Build samples; `extra` is passed to utils.MakeSample with every sample
+    extra = {
+        "density": density,
+        "successful_sessions": successful,
+        "failed_sessions": failed,
+        "sample_count": FLAGS.k8s_python_density_sample_count,
+        "sample_warmup": FLAGS.k8s_python_density_sample_warmup,
+        "wall_time_s": round(wall_time, 2),
+    }
+
+    samples = []
+
+    # Orchestrator-side CEL
+    _emit(
+        samples,
+        agg,
+        "orchestrator_cel_mean_ms",
+        "orchestrator_cel_mean",
+        "ms",
+        ns,
+        extra,
+    )
+    _emit(
+        samples, agg, "orchestrator_cel_p50_ms", "orchestrator_cel_p50", "ms", ns, extra
+    )
+    _emit(
+        samples, agg, "orchestrator_cel_p95_ms", "orchestrator_cel_p95", "ms", ns, extra
+    )
+    _emit(
+        samples, agg, "orchestrator_cel_p99_ms", "orchestrator_cel_p99", "ms", ns, extra
+    )
+    _emit(
+        samples, agg, "orchestrator_cel_min_ms", "orchestrator_cel_min", "ms", ns, extra
+    )
+    _emit(
+        samples, agg, "orchestrator_cel_max_ms", "orchestrator_cel_max", "ms", ns, extra
+    )
+
+    # Sandbox-side total CEL
+    _emit(
+        samples,
+        agg,
+        "sandbox_total_cel_mean_ms",
+        "sandbox_total_cel_mean",
+        "ms",
+        ns,
+        extra,
+    )
+    _emit(
+        samples,
+        agg,
+        "sandbox_total_cel_p50_ms",
+        "sandbox_total_cel_p50",
+        "ms",
+        ns,
+        extra,
+    )
+    _emit(
+        samples,
+        agg,
+        "sandbox_total_cel_p95_ms",
+        "sandbox_total_cel_p95",
+        "ms",
+        ns,
+        extra,
+    )
+    _emit(
+        samples,
+        agg,
+        "sandbox_total_cel_p99_ms",
+        "sandbox_total_cel_p99",
+        "ms",
+        ns,
+        extra,
+    )
+    _emit(
+        samples,
+        agg,
+        "sandbox_total_cel_min_ms",
+        "sandbox_total_cel_min",
+        "ms",
+        ns,
+        extra,
+    )
+    _emit(
+        samples,
+        agg,
+        "sandbox_total_cel_max_ms",
+        "sandbox_total_cel_max",
+        "ms",
+        ns,
+        extra,
+    )
+
+    # TTFE
+    _emit(samples, agg, "sandbox_ttfe_ms", "sandbox_ttfe", "ms", ns, extra)
+
+    # RSS
+    _emit(samples, agg, "sandbox_rss_start_mb", "sandbox_rss_start", "MB", ns, extra)
+    _emit(samples, agg, "sandbox_rss_end_mb", "sandbox_rss_end", "MB", ns, extra)
+    _emit(samples, agg, "sandbox_rss_growth_mb", "sandbox_rss_growth", "MB", ns, extra)
+
+    # Per-type CEL breakdown
+    _emit(
+        samples,
+        agg,
+        "sandbox_compute_cel_mean_ms",
+        "sandbox_compute_cel_mean",
+        "ms",
+        ns,
+        extra,
+    )
+    _emit(
+        samples,
+        agg,
+        "sandbox_syscall_cel_mean_ms",
+        "sandbox_syscall_cel_mean",
+        "ms",
+        ns,
+        extra,
+    )
+    _emit(
+        samples,
+        agg,
+        "sandbox_import_cel_mean_ms",
+        "sandbox_import_cel_mean",
+        "ms",
+        ns,
+        extra,
+    )
+
+    # Wall time (always emitted, even when the aggregate dict is empty)
+    samples.append(
+        utils.MakeSample(
+            f"{BENCHMARK_NAME}_wall_time",
+            round(wall_time, 2),
+            "seconds",
+            ns,
+            extra,
+        )
+    )
+
+    logging.info("Emitted %d samples for density=%d.", len(samples), density)
+    return samples
+
+
+def Cleanup(benchmark_spec):
+    """Drain the warm pool (only if this benchmark patched it) and stop the port-forward."""
+    ns = FLAGS.k8s_namespace
+    logging.info("Cleanup: draining warm pool.")
+    # Only drain when Run() was allowed to patch the pool; the log line above is unconditional.
+    if FLAGS.k8s_python_density_patch_warmpool:
+        utils.DrainWarmPool(
+            namespace=ns,
+            warmpool_name=_WARMPOOL_NAME,
+            label=_WARMPOOL_LABEL,
+        )
+
+    utils.StopPortForward()
+    logging.info("Cleanup complete (cluster persists).")
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _emit(samples, agg, agg_key, metric_suffix, unit, namespace, extra):
+    """Append one sample for `agg_key` when the aggregate dict has a value for it.
+
+    A key that is absent from `agg`, or present with a None value, is
+    skipped silently and `samples` is left untouched.
+
+    Args:
+        samples: List that receives the sample built by utils.MakeSample.
+        agg: Aggregate metrics dict taken from the agent API response.
+        agg_key: Key to read from `agg` (e.g. "orchestrator_cel_mean_ms").
+        metric_suffix: Text joined to BENCHMARK_NAME with an underscore to
+            form the metric name (e.g. "orchestrator_cel_mean").
+        unit: Unit string for the sample (e.g. "ms", "MB", "seconds").
+        namespace: Kubernetes namespace, forwarded to utils.MakeSample.
+        extra: Dict of additional metadata, forwarded to utils.MakeSample.
+
+    Returns:
+        None. `samples` is extended in place.
+    """
+    measured = agg.get(agg_key)
+    # Missing or null aggregate values produce no sample.
+    if measured is None:
+        return
+    metric_name = f"{BENCHMARK_NAME}_{metric_suffix}"
+    samples.append(utils.MakeSample(metric_name, measured, unit, namespace, extra))
diff --git a/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_qps_benchmark.py b/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_qps_benchmark.py
new file mode 100644
index 0000000000..feb82c8614
--- /dev/null
+++ b/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_qps_benchmark.py
@@ -0,0 +1,805 @@
+"""PKB Benchmark: GKE Agent QPS Saturation.
+
+Atomic single-point measurement of scheduling throughput on a pre-provisioned
+GKE cluster. Fires sandbox claim requests at a controlled QPS rate for a
+fixed duration and measures per-request TTFE (Time To First Execution).
+
+Supports two operating modes:
+ - **agent**: POST to the orchestrator /benchmark/python/qps endpoint
+ - **raw_claim**: Bypass the agent, create SandboxClaims directly via kubectl
+
+This benchmark is designed to be invoked repeatedly by an external sweep
+controller that varies the target_qps parameter across iterations to find
+the QPS saturation point.
+
+Usage:
+ # Agent mode
+ python pkb.py --benchmarks=k8s_qps \\
+ --k8s_qps_target_qps=5.0 \\
+ --k8s_qps_pool_size=70 \\
+ --k8s_qps_step_duration_s=30.0 \\
+ --k8s_qps_mode=agent \\
+ --k8s_namespace=agentic \\
+ --k8s_agent_api_url=http://localhost:8080
+
+ # Raw claim mode
+ python pkb.py --benchmarks=k8s_qps \\
+ --k8s_qps_target_qps=5.0 \\
+ --k8s_qps_pool_size=70 \\
+ --k8s_qps_step_duration_s=30.0 \\
+ --k8s_qps_mode=raw_claim \\
+ --k8s_qps_claim_timeout_s=60.0 \\
+ --k8s_namespace=agentic
+
+Samples emitted (per run):
+ - k8s_qps_ttfe_mean (ms)
+ - k8s_qps_ttfe_p50 (ms)
+ - k8s_qps_ttfe_p95 (ms)
+ - k8s_qps_ttfe_p99 (ms)
+ - k8s_qps_ttfe_min (ms)
+ - k8s_qps_ttfe_max (ms)
+ - k8s_qps_claim_mean (ms)
+ - k8s_qps_claim_p95 (ms)
+ - k8s_qps_actual_qps (requests/sec)
+ - k8s_qps_duration (seconds)
+ - k8s_qps_total_requests (count)
+ - k8s_qps_successful_requests (count)
+ - k8s_qps_failed_requests (count)
+ - k8s_qps_pool_before (count)
+ - k8s_qps_pool_after (count)
+ - k8s_qps_wall_time (seconds)
+"""
+
+import json
+import os
+import logging
+import threading
+import time
+import uuid
+
+from absl import flags
+from perfkitbenchmarker import configs
+from perfkitbenchmarker import data
+from perfkitbenchmarker.resources.container_service import kubectl
+from perfkitbenchmarker.linux_benchmarks.kubernetes.agentic import (
+ k8s_benchmark_utils as utils,
+)
+from perfkitbenchmarker.linux_benchmarks.kubernetes.agentic import (
+ gke_deploy_utils as deploy_utils,
+)
+
+FLAGS = flags.FLAGS
+
+# Prefix of every metric name emitted by this module and the top-level key
+# of BENCHMARK_CONFIG.
+BENCHMARK_NAME = "k8s_qps"
+BENCHMARK_CONFIG = """
+k8s_qps:
+ description: >
+ Atomic single-point QPS saturation measurement on a
+ pre-provisioned GKE cluster with gVisor isolation.
+"""
+
+# Warm pool patched in Run() and drained in Cleanup().
+_WARMPOOL_NAME = "python-sandbox-warmpool"
+# Label selector used when counting warm pool pods.
+_WARMPOOL_LABEL = "sandbox=python-sandbox-example"
+# SandboxTemplate referenced by the claims created in raw_claim mode.
+_SANDBOX_TEMPLATE = "python-sandbox-template"
+# Label selector matching the claims created by _CreateClaim; used by
+# _DeleteBenchmarkClaims to find and delete them.
+_QPS_CLAIM_LABEL = "created-by=pkb-qps-benchmark"
+
+# ---------------------------------------------------------------------------
+# Benchmark-specific flags
+# ---------------------------------------------------------------------------
+
+# In raw_claim mode the number of claims fired is
+# max(1, int(target_qps * step_duration_s)).
+flags.DEFINE_float(
+ "k8s_qps_target_qps",
+ 5.0,
+ "Target requests per second (sandbox claims per second).",
+)
+
+# Passed to utils.PatchWarmPool as the replica count at the start of Run().
+flags.DEFINE_integer(
+ "k8s_qps_pool_size",
+ 70,
+ "Warm pool size maintained during the measurement.",
+)
+
+flags.DEFINE_float(
+ "k8s_qps_step_duration_s",
+ 30.0,
+ "Duration of the QPS burst in seconds.",
+)
+
+# Only sent to the agent API (agent mode); raw_claim mode does not read it.
+flags.DEFINE_integer(
+ "k8s_qps_sandbox_exec_timeout_s",
+ 30,
+ "Sandbox command execution timeout in seconds.",
+)
+
+# Truncated with int() before being handed to utils.PatchWarmPool.
+flags.DEFINE_float(
+ "k8s_qps_provision_timeout_s",
+ 180.0,
+ "Max seconds to wait for pool pods to reach Running.",
+)
+
+# NOTE: Run() treats every value other than "raw_claim" as agent mode, while
+# Prepare() only probes the agent when the value is exactly "agent".
+flags.DEFINE_string(
+ "k8s_qps_mode",
+ "agent",
+ "Operating mode: 'agent' (POST to orchestrator API) or "
+ "'raw_claim' (create SandboxClaims directly via kubectl).",
+)
+
+# Also bounds how long Run() waits on each worker thread
+# (join timeout = claim timeout + 30 seconds).
+flags.DEFINE_float(
+ "k8s_qps_claim_timeout_s",
+ 60.0,
+ "Max seconds to wait for a raw claim to bind " "(only used with mode=raw_claim).",
+)
+
+
+# ---------------------------------------------------------------------------
+# Lifecycle
+# ---------------------------------------------------------------------------
+
+
+def GetConfig(user_config):
+    """Build the benchmark configuration for this run.
+
+    The config declares no vm_groups, so PKB skips Provision() and Teardown().
+
+    Args:
+        user_config: User-supplied config overrides handed to LoadConfig.
+
+    Returns:
+        Whatever configs.LoadConfig returns for this benchmark.
+    """
+    loaded_config = configs.LoadConfig(
+        BENCHMARK_CONFIG,
+        user_config,
+        BENCHMARK_NAME,
+    )
+    return loaded_config
+
+
+def Prepare(benchmark_spec):
+ """Deploy workloads and, in agent mode, probe the agent API.
+
+ Args:
+ benchmark_spec: The PKB benchmark spec; forwarded to
+ deploy_utils.DeployWorkloads.
+ """
+ # Presumably asks PKB to run Cleanup() even when an earlier stage fails —
+ # TODO confirm against the PKB stage runner.
+ benchmark_spec.always_call_cleanup = True
+ logging.info("=== Prepare: deploying workloads ===")
+ deploy_utils.DeployWorkloads(benchmark_spec)
+
+ mode = FLAGS.k8s_qps_mode
+ if mode == "agent":
+ # required=False: presumably a failed probe is tolerated rather than
+ # fatal — verify in k8s_benchmark_utils.
+ utils.CheckAgentHealthz(required=False)
+ # NOTE(review): the health probe runs before the port-forward is ensured;
+ # confirm CheckAgentHealthz does not depend on it. _RunAgent calls
+ # EnsurePortForward again before using the API.
+ utils.EnsurePortForward()
+ logging.info("Prepare complete.")
+
+
+def Run(benchmark_spec):
+    """Scale the warm pool, then run one QPS measurement in the chosen mode.
+
+    The warm pool is patched here rather than in Prepare() so that sweeps
+    invoking Run() repeatedly start each iteration from the requested size.
+    "raw_claim" mode dispatches to _RunRawClaim; every other mode value is
+    handled by _RunAgent.
+
+    Args:
+        benchmark_spec: The PKB benchmark spec.
+
+    Returns:
+        List of sample.Sample objects.
+    """
+    utils.set_benchmark_spec(benchmark_spec)
+
+    namespace = FLAGS.k8s_namespace
+    replicas = FLAGS.k8s_qps_pool_size
+    provision_timeout = int(FLAGS.k8s_qps_provision_timeout_s)
+
+    utils.PatchWarmPool(
+        namespace=namespace,
+        warmpool_name=_WARMPOOL_NAME,
+        replicas=replicas,
+        label=_WARMPOOL_LABEL,
+        wait_timeout=provision_timeout,
+    )
+
+    use_raw_claims = FLAGS.k8s_qps_mode == "raw_claim"
+    runner = _RunRawClaim if use_raw_claims else _RunAgent
+    return runner(benchmark_spec)
+
+
+def Cleanup(benchmark_spec):
+    """Remove benchmark claims, drain the warm pool and stop the port-forward.
+
+    Args:
+        benchmark_spec: The PKB benchmark spec (not used by this function).
+    """
+    namespace = FLAGS.k8s_namespace
+    logging.info("Cleanup: deleting benchmark claims and draining warm pool.")
+
+    # Claims first, so nothing created by the benchmark lingers while the
+    # pool is drained.
+    _DeleteBenchmarkClaims(namespace)
+
+    drain_args = {
+        "namespace": namespace,
+        "warmpool_name": _WARMPOOL_NAME,
+        "label": _WARMPOOL_LABEL,
+    }
+    utils.DrainWarmPool(**drain_args)
+
+    utils.StopPortForward()
+    logging.info("Cleanup complete.")
+
+
+# ---------------------------------------------------------------------------
+# Agent mode
+# ---------------------------------------------------------------------------
+
+
+def _RunAgent(benchmark_spec):
+    """Drive one QPS burst through the orchestrator API and build samples.
+
+    POSTs the burst parameters to /benchmark/python/qps, counts Running
+    warm-pool pods before and after the call, and converts the response into
+    samples. Latency metrics are emitted only when the response's
+    "aggregate" dict carries them; throughput, count, pool-state and
+    wall-time metrics are always emitted.
+
+    Args:
+        benchmark_spec: The PKB benchmark spec (not used by this function).
+
+    Returns:
+        List of samples built with utils.MakeSample.
+    """
+    namespace = FLAGS.k8s_namespace
+    qps = FLAGS.k8s_qps_target_qps
+    warm_pool_size = FLAGS.k8s_qps_pool_size
+    burst_seconds = FLAGS.k8s_qps_step_duration_s
+
+    logging.info(
+        "=== Run (agent): target_qps=%s, pool_size=%d, duration=%ss ===",
+        qps,
+        warm_pool_size,
+        burst_seconds,
+    )
+
+    # Sweeps may call Run() without Prepare(), so make sure the port-forward
+    # is up before talking to the agent.
+    utils.EnsurePortForward()
+
+    running_before = utils.CountPods(namespace, _WARMPOOL_LABEL, phase="Running")
+
+    request_body = {
+        "target_qps": qps,
+        "duration_s": burst_seconds,
+        "sandbox_exec_timeout_s": FLAGS.k8s_qps_sandbox_exec_timeout_s,
+    }
+
+    started = time.time()
+    # Allow the whole burst plus a 300 s margin before the HTTP call times out.
+    response = utils.CallAgentApi(
+        "/benchmark/python/qps",
+        request_body,
+        timeout=int(burst_seconds + 300),
+    )
+    elapsed = time.time() - started
+
+    running_after = utils.CountPods(namespace, _WARMPOOL_LABEL, phase="Running")
+
+    aggregate = response.get("aggregate", {})
+    ok_count = response.get("successful_requests", 0)
+    failed_count = response.get("failed_requests", 0)
+    request_count = response.get("total_requests", 0)
+    measured_qps = response.get("actual_qps", 0)
+    reported_duration = response.get("duration_s", 0)
+
+    logging.info(
+        "API response: actual_qps=%s, %d/%d requests ok (%.1fs)",
+        measured_qps,
+        ok_count,
+        request_count,
+        elapsed,
+    )
+
+    elapsed_rounded = round(elapsed, 2)
+
+    # Metadata attached to every sample of this run.
+    extra = {
+        "target_qps": qps,
+        "pool_size": warm_pool_size,
+        "step_duration_s": burst_seconds,
+        "mode": "agent",
+        "actual_qps": measured_qps,
+        "total_requests": request_count,
+        "successful_requests": ok_count,
+        "failed_requests": failed_count,
+        "pool_before": running_before,
+        "pool_after": running_after,
+        "wall_time_s": elapsed_rounded,
+    }
+
+    samples = []
+
+    # Optional latency stats: the aggregate key is the metric suffix + "_ms".
+    optional_latency_metrics = (
+        "ttfe_mean",
+        "ttfe_p50",
+        "ttfe_p95",
+        "ttfe_p99",
+        "ttfe_min",
+        "ttfe_max",
+        "claim_mean",
+        "claim_p95",
+    )
+    for suffix in optional_latency_metrics:
+        _emit(samples, aggregate, f"{suffix}_ms", suffix, "ms", namespace, extra)
+
+    # Always-emitted metrics: (suffix, raw value, unit, optional converter).
+    # Converters are applied inside the loop so values are coerced in the
+    # same order in which the samples are created.
+    unconditional_metrics = (
+        ("actual_qps", measured_qps, "requests/sec", None),
+        ("duration", reported_duration, "seconds", None),
+        ("total_requests", request_count, "count", float),
+        ("successful_requests", ok_count, "count", float),
+        ("failed_requests", failed_count, "count", float),
+        ("pool_before", running_before, "count", float),
+        ("pool_after", running_after, "count", float),
+        ("wall_time", elapsed_rounded, "seconds", None),
+    )
+    for suffix, raw_value, unit, convert in unconditional_metrics:
+        value = raw_value if convert is None else convert(raw_value)
+        samples.append(
+            utils.MakeSample(
+                f"{BENCHMARK_NAME}_{suffix}",
+                value,
+                unit,
+                namespace,
+                extra,
+            )
+        )
+
+    logging.info("Emitted %d samples for target_qps=%s.", len(samples), qps)
+    return samples
+
+
+# ---------------------------------------------------------------------------
+# Raw claim mode
+# ---------------------------------------------------------------------------
+
+
+def _RunRawClaim(benchmark_spec):
+    """Fire SandboxClaims directly at target_qps (no agent) and build samples.
+
+    One daemon thread is started per claim. Each thread creates the claim,
+    polls until it is bound (or claim_timeout elapses) and records a result
+    dict. The main thread paces thread start-up so that claim i is fired at
+    roughly t0 + i / target_qps, then joins every worker.
+
+    Claims whose worker has not reported by the time its join times out are
+    counted as failed, so successful + failed always equals total.
+
+    Args:
+        benchmark_spec: The PKB benchmark spec (not used by this function).
+
+    Returns:
+        List of samples built with utils.MakeSample. TTFE/claim latency
+        samples are only present when at least one claim was bound.
+    """
+    ns = FLAGS.k8s_namespace
+    target_qps = FLAGS.k8s_qps_target_qps
+    pool_size = FLAGS.k8s_qps_pool_size
+    step_duration = FLAGS.k8s_qps_step_duration_s
+    claim_timeout = FLAGS.k8s_qps_claim_timeout_s
+
+    logging.info(
+        "=== Run (raw_claim): target_qps=%s, pool_size=%d, duration=%ss ===",
+        target_qps,
+        pool_size,
+        step_duration,
+    )
+
+    # Record pool state before burst
+    pool_before = utils.CountPods(ns, _WARMPOOL_LABEL, phase="Running")
+
+    # Always fire at least one claim; fall back to a 1 s interval when the
+    # target rate is not positive.
+    total_claims = max(1, int(target_qps * step_duration))
+    interval = 1.0 / target_qps if target_qps > 0 else 1.0
+
+    logging.info(
+        "Firing %d raw SandboxClaims at %s req/s",
+        total_claims,
+        target_qps,
+    )
+
+    claim_results = []
+    lock = threading.Lock()
+
+    def _fire_and_wait(idx, fire_time):
+        """Create one claim, wait for it to bind and record the outcome."""
+        claim_name = f"pkb-qps-0-{idx}-{uuid.uuid4().hex[:6]}"
+        result = {"request_id": idx, "fire_time_s": round(fire_time, 3)}
+        try:
+            t_create = _CreateClaim(ns, _SANDBOX_TEMPLATE, claim_name)
+            result["create_ts"] = t_create
+            t_bound = _WaitClaimBound(ns, claim_name, claim_timeout)
+            if t_bound is not None:
+                ttfe_ms = (t_bound - t_create) * 1000.0
+                result["ttfe_ms"] = round(ttfe_ms, 3)
+                # In raw_claim mode claim latency and TTFE are the same value.
+                result["claim_ms"] = round(ttfe_ms, 3)
+                result["error"] = None
+            else:
+                result["ttfe_ms"] = None
+                result["error"] = "Timeout waiting for claim to bind"
+        except Exception as e:
+            # Worker boundary: a failing claim is recorded as an error
+            # instead of killing the thread silently.
+            result["ttfe_ms"] = None
+            result["error"] = f"{type(e).__name__}: {e}"
+        with lock:
+            claim_results.append(result)
+
+    t0 = time.time()
+    threads = []
+    for i in range(total_claims):
+        fire_time = time.time() - t0
+        t = threading.Thread(target=_fire_and_wait, args=(i, fire_time), daemon=True)
+        threads.append(t)
+        t.start()
+        if i < total_claims - 1:
+            # Pace against absolute deadlines so scheduling jitter does not
+            # accumulate across the burst.
+            next_fire = t0 + (i + 1) * interval
+            sleep_time = next_fire - time.time()
+            if sleep_time > 0:
+                time.sleep(sleep_time)
+
+    for t in threads:
+        t.join(timeout=claim_timeout + 30)
+
+    wall_time = time.time() - t0
+    # NOTE(review): wall_time includes the time spent joining workers, so
+    # this is claims / total elapsed time, not the dispatch rate — confirm
+    # this is the intended definition of actual_qps.
+    actual_qps = round(total_claims / wall_time, 2) if wall_time > 0 else 0
+
+    # Record pool state after burst
+    pool_after = utils.CountPods(ns, _WARMPOOL_LABEL, phase="Running")
+
+    # Workers that outlived their join timeout may still append; take a
+    # consistent snapshot under the lock before aggregating.
+    with lock:
+        reported = list(claim_results)
+    unreported = total_claims - len(reported)
+    if unreported:
+        logging.warning(
+            "%d claim worker(s) did not report before the join timeout; "
+            "counting them as failed.",
+            unreported,
+        )
+
+    successful = [r for r in reported if r.get("ttfe_ms") is not None]
+    failed_count = sum(1 for r in reported if r.get("error")) + unreported
+    ttfe_values = sorted(r["ttfe_ms"] for r in successful)
+
+    logging.info(
+        "Raw claim burst complete: %d/%d ok, actual_qps=%s (%.1fs)",
+        len(successful),
+        total_claims,
+        actual_qps,
+        wall_time,
+    )
+
+    wall_time_rounded = round(wall_time, 2)
+
+    # Metadata attached to every sample of this run.
+    extra = {
+        "target_qps": target_qps,
+        "pool_size": pool_size,
+        "step_duration_s": step_duration,
+        "mode": "raw_claim",
+        "actual_qps": actual_qps,
+        "total_requests": total_claims,
+        "successful_requests": len(successful),
+        "failed_requests": failed_count,
+        "pool_before": pool_before,
+        "pool_after": pool_after,
+        "wall_time_s": wall_time_rounded,
+    }
+
+    # (metric suffix, value, unit) in emission order.
+    metrics = []
+    if ttfe_values:
+        mean_ms = round(sum(ttfe_values) / len(ttfe_values), 3)
+        p95_ms = round(_percentile(ttfe_values, 95), 3)
+        metrics.extend(
+            [
+                ("ttfe_mean", mean_ms, "ms"),
+                ("ttfe_p50", round(_percentile(ttfe_values, 50), 3), "ms"),
+                ("ttfe_p95", p95_ms, "ms"),
+                ("ttfe_p99", round(_percentile(ttfe_values, 99), 3), "ms"),
+                ("ttfe_min", round(ttfe_values[0], 3), "ms"),
+                ("ttfe_max", round(ttfe_values[-1], 3), "ms"),
+                # Claim latency (same as TTFE in raw_claim mode)
+                ("claim_mean", mean_ms, "ms"),
+                ("claim_p95", p95_ms, "ms"),
+            ]
+        )
+    metrics.extend(
+        [
+            ("actual_qps", actual_qps, "requests/sec"),
+            ("duration", wall_time_rounded, "seconds"),
+            ("total_requests", float(total_claims), "count"),
+            ("successful_requests", float(len(successful)), "count"),
+            ("failed_requests", float(failed_count), "count"),
+            ("pool_before", float(pool_before), "count"),
+            ("pool_after", float(pool_after), "count"),
+            ("wall_time", wall_time_rounded, "seconds"),
+        ]
+    )
+
+    samples = [
+        utils.MakeSample(f"{BENCHMARK_NAME}_{suffix}", value, unit, ns, extra)
+        for suffix, value, unit in metrics
+    ]
+
+    # Cleanup benchmark claims
+    _DeleteBenchmarkClaims(ns)
+
+    logging.info("Emitted %d samples for target_qps=%s.", len(samples), target_qps)
+    return samples
+
+
+# ---------------------------------------------------------------------------
+# Raw claim helpers
+# ---------------------------------------------------------------------------
+
+
+def _CreateClaim(namespace, template, claim_name):
+    """Create a single SandboxClaim via kubectl and return a timestamp.
+
+    The manifest is written to a temporary JSON file under the
+    k8s_agents/manifests resource directory, applied with `kubectl apply`,
+    and the file is removed again whether or not the apply succeeded.
+
+    Args:
+        namespace: Kubernetes namespace the claim is created in.
+        template: Name of the SandboxTemplate the claim references.
+        claim_name: Name of the SandboxClaim to create.
+
+    Returns:
+        time.time() captured immediately after `kubectl apply` returned.
+
+    Raises:
+        RuntimeError: If kubectl exits with a non-zero return code.
+    """
+    # Derive the label from the selector used by _DeleteBenchmarkClaims so
+    # creation and cleanup cannot drift apart.
+    label_key, _, label_value = _QPS_CLAIM_LABEL.partition("=")
+    manifest = {
+        "apiVersion": "extensions.agents.x-k8s.io/v1alpha1",
+        "kind": "SandboxClaim",
+        "metadata": {
+            "name": claim_name,
+            "namespace": namespace,
+            "labels": {label_key: label_value},
+        },
+        "spec": {
+            "sandboxTemplateRef": {"name": template},
+        },
+    }
+    tmp_dir = os.path.join(
+        data.ResourcePath("k8s_agents/manifests"), "tmp"
+    )
+    os.makedirs(tmp_dir, exist_ok=True)
+    tmp_path = os.path.join(tmp_dir, f"qps-claim-{claim_name}.json")
+    try:
+        with open(tmp_path, "w", encoding="utf-8") as f:
+            json.dump(manifest, f)
+        _, stderr, retcode = kubectl.RunKubectlCommand(
+            ["apply", "-f", tmp_path],
+            timeout=30,
+            raise_on_failure=False,
+        )
+        # Stamp before the temp file is removed so cleanup latency is not
+        # folded into the creation timestamp used for TTFE.
+        t_create = time.time()
+    finally:
+        try:
+            os.unlink(tmp_path)
+        except FileNotFoundError:
+            pass
+    if retcode != 0:
+        raise RuntimeError(
+            f"Failed to create claim {claim_name}: {stderr.strip()}"
+        )
+    return t_create
+
+
+def _WaitClaimBound(namespace, claim_name, timeout_s):
+    """Poll a SandboxClaim until its phase is "bound" or "ready".
+
+    The phase is read with `kubectl get ... -o jsonpath={.status.phase}`
+    every 0.1 s and compared case-insensitively, ignoring surrounding
+    whitespace.
+
+    Args:
+        namespace: Kubernetes namespace of the claim.
+        claim_name: Name of the SandboxClaim to poll.
+        timeout_s: Maximum number of seconds to keep polling.
+
+    Returns:
+        time.time() at the moment the accepted phase was observed, or None
+        if the deadline passed first.
+    """
+    deadline = time.time() + timeout_s
+    while time.time() < deadline:
+        stdout, _, rc = utils.RunKubectl(
+            [
+                "get",
+                "sandboxclaim",
+                claim_name,
+                "-n",
+                namespace,
+                "-o",
+                "jsonpath={.status.phase}",
+            ],
+            timeout=10,
+            raise_on_failure=False,
+        )
+        # Tolerate missing output and stray whitespace/newlines so a bound
+        # claim is not mistaken for a pending one.
+        phase = (stdout or "").strip().lower()
+        if rc == 0 and phase in ("bound", "ready"):
+            return time.time()
+        time.sleep(0.1)
+    return None
+
+
+def _DeleteBenchmarkClaims(namespace):
+    """Delete SandboxClaims labelled created-by=pkb-qps-benchmark.
+
+    Issues a non-blocking label-selected delete and then polls for up to
+    120 seconds until no matching claim is listed. A timeout is logged as a
+    warning; it does not raise.
+
+    Args:
+        namespace: Kubernetes namespace to clean.
+
+    Returns:
+        Number of claims that matched the label before deletion (0 when
+        none were found).
+    """
+    stdout, _, rc = utils.RunKubectl(
+        [
+            "get",
+            "sandboxclaim",
+            "-l",
+            _QPS_CLAIM_LABEL,
+            "-n",
+            namespace,
+            "-o",
+            "jsonpath={.items[*].metadata.name}",
+        ],
+        timeout=30,
+        raise_on_failure=False,
+    )
+    # str.split() never yields empty strings, so an empty list is the only
+    # "nothing to delete" case.
+    names = stdout.split() if stdout else []
+    if not names:
+        return 0
+
+    count = len(names)
+    logging.info("Deleting %d pkb-qps SandboxClaim(s)", count)
+    utils.RunKubectl(
+        [
+            "delete",
+            "sandboxclaim",
+            "-l",
+            _QPS_CLAIM_LABEL,
+            "-n",
+            namespace,
+            "--wait=false",
+        ],
+        timeout=60,
+        raise_on_failure=False,
+    )
+
+    # Wait for claims to be fully removed
+    t0 = time.time()
+    while time.time() - t0 < 120:
+        stdout, _, _ = utils.RunKubectl(
+            [
+                "get",
+                "sandboxclaim",
+                "-l",
+                _QPS_CLAIM_LABEL,
+                "-n",
+                namespace,
+                "--no-headers",
+                "--ignore-not-found",
+            ],
+            timeout=10,
+            raise_on_failure=False,
+        )
+        remaining = sum(1 for line in (stdout or "").splitlines() if line.strip())
+        if remaining == 0:
+            logging.info("Claims cleaned up in %.1fs", time.time() - t0)
+            break
+        time.sleep(2)
+    else:
+        # Loop ended without break: claims were still listed at the deadline.
+        logging.warning(
+            "Timed out after %.1fs waiting for SandboxClaims to be removed",
+            time.time() - t0,
+        )
+
+    return count
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _percentile(sorted_values, pct):
+    """Calculate a percentile with linear interpolation.
+
+    Args:
+        sorted_values: Sequence of numbers already sorted in ascending order.
+        pct: Percentile to compute, in the inclusive range 0-100.
+
+    Returns:
+        The interpolated percentile, or 0.0 when `sorted_values` is empty.
+
+    Raises:
+        ValueError: If `pct` is outside [0, 100] and `sorted_values` is not
+            empty.
+    """
+    if not sorted_values:
+        return 0.0
+    if not 0 <= pct <= 100:
+        # Out-of-range values would index past the end (pct > 100) or
+        # silently extrapolate (pct < 0).
+        raise ValueError(f"pct must be within [0, 100], got {pct}")
+    last = len(sorted_values) - 1
+    idx = (pct / 100) * last
+    lo = int(idx)
+    hi = min(lo + 1, last)
+    frac = idx - lo
+    return sorted_values[lo] * (1 - frac) + sorted_values[hi] * frac
+
+
+def _emit(samples, data, data_key, metric_suffix, unit, namespace, extra):
+    """Append one sample for `data_key`, skipping keys that have no value.
+
+    Nothing is appended when `data_key` is absent from `data` or maps to
+    None.
+
+    Args:
+        samples: List that receives the newly built sample.
+        data: Dict holding the metric values.
+        data_key: Key to read from `data`.
+        metric_suffix: Text appended to BENCHMARK_NAME (joined with "_") to
+            form the metric name.
+        unit: Unit string for the sample.
+        namespace: Kubernetes namespace, forwarded to utils.MakeSample.
+        extra: Dict of additional metadata forwarded to utils.MakeSample.
+    """
+    metric_value = data.get(data_key)
+    if metric_value is None:
+        return
+
+    metric_name = f"{BENCHMARK_NAME}_{metric_suffix}"
+    new_sample = utils.MakeSample(metric_name, metric_value, unit, namespace, extra)
+    samples.append(new_sample)
diff --git a/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_snapshot_benchmark.py b/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_snapshot_benchmark.py
new file mode 100644
index 0000000000..8d78c6649b
--- /dev/null
+++ b/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_snapshot_benchmark.py
@@ -0,0 +1,1037 @@
+"""PKB Benchmark: GKE Agent Pod Snapshot Saturation.
+
+Atomic single-point measurement of GKE Pod Snapshot create/restore latency
+on a pre-provisioned GKE cluster with gVisor isolation. Measures snapshot
+time, restore time, TTFE (Time To First Execution), and restore correctness
+at a given preload_mb and burst_size.
+
+This benchmark is designed to be invoked repeatedly by an external sweep
+controller that varies the preload_mb parameter across iterations to find
+the saturation point.
+
+Usage:
+ python pkb.py --benchmarks=k8s_snapshot \\
+ --k8s_snapshot_preload_mb=50 \\
+ --k8s_snapshot_burst_size=3 \\
+ --k8s_namespace=agentic \\
+ --k8s_snapshot_skip_snapshot=false
+
+Samples emitted (per run):
+ - k8s_snapshot_snapshot_p50 (seconds)
+ - k8s_snapshot_snapshot_p95 (seconds)
+ - k8s_snapshot_snapshot_max (seconds)
+ - k8s_snapshot_restore_p50 (seconds)
+ - k8s_snapshot_restore_p95 (seconds)
+ - k8s_snapshot_restore_max (seconds)
+ - k8s_snapshot_ttfe_p50 (seconds)
+ - k8s_snapshot_ttfe_p95 (seconds)
+ - k8s_snapshot_ttfe_max (seconds)
+ - k8s_snapshot_startup_time (seconds)
+ - k8s_snapshot_restore_correct_count (count)
+ - k8s_snapshot_wall_time (seconds)
+"""
+
+import json
+import logging
+import os
+import re
+import time
+from concurrent.futures import ThreadPoolExecutor
+
+from jinja2 import Template
+
+from absl import flags
+from perfkitbenchmarker import configs
+from perfkitbenchmarker import data
+from perfkitbenchmarker.resources.container_service import kubectl
+from perfkitbenchmarker import sample
+from perfkitbenchmarker.linux_benchmarks.kubernetes.agentic import (
+ k8s_benchmark_utils as utils,
+)
+from perfkitbenchmarker.linux_benchmarks.kubernetes.agentic import (
+ gke_deploy_utils as deploy_utils,
+)
+
+FLAGS = flags.FLAGS
+
+# Prefix of every metric name emitted by this module and the top-level key
+# of BENCHMARK_CONFIG.
+BENCHMARK_NAME = "k8s_snapshot"
+BENCHMARK_CONFIG = """
+k8s_snapshot:
+ description: >
+ Atomic single-point Pod Snapshot saturation measurement on a
+ pre-provisioned GKE cluster with gVisor isolation.
+"""
+
+# ---------------------------------------------------------------------------
+# Benchmark-specific flags
+# ---------------------------------------------------------------------------
+
+# Also used to name the per-step SandboxTemplate ("snap-bench-<N>mb").
+flags.DEFINE_integer(
+ "k8s_snapshot_preload_mb",
+ 10,
+ "Megabytes of memory to pre-allocate in the sandbox before snapshot.",
+)
+
+# Determines how many source/restore/trigger names are generated per run;
+# the thread pools used for them are capped at 50 workers.
+flags.DEFINE_integer(
+ "k8s_snapshot_burst_size",
+ 1,
+ "Number of concurrent source/snapshot/restore pods per measurement.",
+)
+
+# k8s_snapshot_ksa_name is defined in gke_deploy_utils.py
+# (where DeploySnapshots() consumes it) and is available here
+# via the deploy_utils import.
+
+flags.DEFINE_integer(
+ "k8s_snapshot_pod_timeout",
+ 180,
+ "Max seconds to wait for pod Running / preload.",
+)
+
+# When set, the run returns after the source pods are ready and reports
+# TTFE only; snapshot_* and restore_* metrics are not emitted.
+flags.DEFINE_boolean(
+ "k8s_snapshot_skip_snapshot",
+ False,
+ "Skip snapshot/restore phases — measure cold-start TTFE only.",
+)
+
+flags.DEFINE_string(
+ "k8s_snapshot_preload_mode",
+ "synthetic",
+ "Preload mode: 'synthetic' (os.urandom fill) or "
+ "'script:' to run a custom startup script.",
+)
+
+
+# ---------------------------------------------------------------------------
+# Lifecycle
+# ---------------------------------------------------------------------------
+
+
+def GetConfig(user_config):
+    """Build the benchmark configuration for this run.
+
+    The config declares no vm_groups, so PKB skips Provision() and Teardown().
+
+    Args:
+        user_config: User-supplied config overrides handed to LoadConfig.
+
+    Returns:
+        Whatever configs.LoadConfig returns for this benchmark.
+    """
+    loaded_config = configs.LoadConfig(
+        BENCHMARK_CONFIG,
+        user_config,
+        BENCHMARK_NAME,
+    )
+    return loaded_config
+
+
+def Prepare(benchmark_spec):
+    """Deploy workloads, snapshot infra, and validate readiness.
+
+    Pod Snapshot infrastructure is deployed only for a GCP container cluster
+    and only when --skip_deploy_snapshots is not set; every skip reason is
+    logged. The function then checks the prerequisites the run depends on.
+
+    Args:
+        benchmark_spec: The PKB benchmark spec; forwarded to
+            deploy_utils.DeployWorkloads and inspected for container_cluster.
+
+    Raises:
+        RuntimeError: If the PodSnapshotStorageConfig resource type cannot be
+            listed, the service account is missing, or the snapshot template
+            file does not exist.
+    """
+    benchmark_spec.always_call_cleanup = True
+    ns = FLAGS.k8s_namespace
+    preload_mb = FLAGS.k8s_snapshot_preload_mb
+
+    logging.info(
+        "=== Prepare: preload_mb=%d, burst_size=%d ===",
+        preload_mb,
+        FLAGS.k8s_snapshot_burst_size,
+    )
+
+    # Deploy Agent Sandbox ecosystem (idempotent)
+    deploy_utils.DeployWorkloads(benchmark_spec)
+
+    # Deploy Pod Snapshot infrastructure (idempotent).
+    # Pod Snapshots are GKE-specific, so deployment is attempted only for a
+    # confirmed GCP cluster; benchmark_spec.container_cluster may be None on
+    # pre-existing clusters.
+    cluster = getattr(benchmark_spec, "container_cluster", None)
+    if not cluster:
+        logging.info(
+            "Pod Snapshot infrastructure skipped (no container_cluster in "
+            "benchmark_spec); it must already be installed on the cluster."
+        )
+    elif getattr(cluster, "cloud", None) != "GCP":
+        logging.info(
+            "Pod Snapshot infrastructure skipped (cloud=%s, GKE required).",
+            getattr(cluster, "cloud", "unknown"),
+        )
+    elif FLAGS.skip_deploy_snapshots:
+        logging.info(
+            "Pod Snapshot infrastructure skipped (--skip_deploy_snapshots is set)."
+        )
+    else:
+        deploy_utils.DeploySnapshots()
+
+    # 1. Verify the PodSnapshotStorageConfig resource type is served
+    #    (cluster-scoped). A non-zero exit means the CRD is missing.
+    stdout, _, retcode = utils.RunKubectl(
+        ["get", "podsnapshotstorageconfigs.podsnapshot.gke.io", "--no-headers"],
+        timeout=30,
+        raise_on_failure=False,
+    )
+    if retcode != 0:
+        raise RuntimeError(
+            "PodSnapshotStorageConfig CRD not found. "
+            "Ensure pod snapshots are enabled on the cluster."
+        )
+    if (stdout or "").strip():
+        logging.info("PodSnapshotStorageConfig verified.")
+    else:
+        # kubectl exits 0 for an empty list, so an installed CRD with no
+        # objects would otherwise pass unnoticed.
+        logging.warning(
+            "PodSnapshotStorageConfig CRD is installed but no "
+            "PodSnapshotStorageConfig objects were found."
+        )
+
+    # 2. Verify PodSnapshotPolicy exists in the namespace. An empty listing
+    #    also exits 0, so the output must be checked as well.
+    stdout, _, retcode = utils.RunKubectl(
+        ["get", "podsnapshotpolicies.podsnapshot.gke.io", "-n", ns, "--no-headers"],
+        timeout=30,
+        raise_on_failure=False,
+    )
+    if retcode != 0 or not (stdout or "").strip():
+        logging.warning("PodSnapshotPolicy not found in namespace %s.", ns)
+
+    # 3. Verify the service account exists (a named get fails when absent).
+    ksa = FLAGS.k8s_snapshot_ksa_name
+    _, _, retcode = utils.RunKubectl(
+        ["get", "serviceaccount", ksa, "-n", ns],
+        timeout=30,
+        raise_on_failure=False,
+    )
+    if retcode != 0:
+        raise RuntimeError(
+            f"ServiceAccount {ksa} not found in namespace {ns}. "
+            "Run setup_snapshot_gke.sh or ensure DeploySnapshots() succeeded."
+        )
+    logging.info("ServiceAccount %s verified.", ksa)
+
+    # 4. Verify the template file exists.
+    template_path = _GetTemplatePath()
+    if not os.path.isfile(template_path):
+        raise RuntimeError(f"Snapshot template not found: {template_path}")
+    logging.info("Template file verified: %s", template_path)
+
+    utils.EnsurePortForward()
+    logging.info("Prepare complete.")
+
+
+def Run(benchmark_spec):
+    """Run one snapshot/restore measurement and convert it into samples.
+
+    Percentile and startup metrics are emitted only when _RunSnapshotCycle
+    produced a value for them; the restore-correctness count (when not None)
+    and the wall time are appended afterwards. An error reported by the
+    cycle is attached to the sample metadata under "error".
+
+    Args:
+        benchmark_spec: The PKB benchmark spec.
+
+    Returns:
+        List of sample.Sample objects.
+    """
+    utils.set_benchmark_spec(benchmark_spec)
+
+    namespace = FLAGS.k8s_namespace
+    preload_mb = FLAGS.k8s_snapshot_preload_mb
+    burst = FLAGS.k8s_snapshot_burst_size
+    cold_start_only = FLAGS.k8s_snapshot_skip_snapshot
+    mode = FLAGS.k8s_snapshot_preload_mode
+    service_account = FLAGS.k8s_snapshot_ksa_name
+    timeout_s = FLAGS.k8s_snapshot_pod_timeout
+
+    logging.info(
+        "=== Run: preload_mb=%d, burst_size=%d, skip_snapshot=%s ===",
+        preload_mb,
+        burst,
+        cold_start_only,
+    )
+
+    template_file = _GetTemplatePath()
+    started = time.time()
+
+    outcome = _RunSnapshotCycle(
+        namespace=namespace,
+        preload_mb=preload_mb,
+        burst_size=burst,
+        skip_snapshot=cold_start_only,
+        preload_mode=mode,
+        ksa_name=service_account,
+        pod_timeout=timeout_s,
+        template_path=template_file,
+    )
+
+    elapsed_rounded = round(time.time() - started, 2)
+
+    # Metadata attached to every sample of this run.
+    extra = {
+        "preload_mb": preload_mb,
+        "burst_size": burst,
+        "skip_snapshot": cold_start_only,
+        "preload_mode": mode,
+        "restore_correct_count": outcome.get("restore_correct_count", 0),
+        "wall_time_s": elapsed_rounded,
+    }
+
+    cycle_error = outcome.get("error")
+    if cycle_error:
+        extra["error"] = outcome["error"]
+
+    samples = []
+
+    # Optional timing metrics: the result key is the metric suffix + "_s".
+    optional_metrics = (
+        "snapshot_p50",
+        "snapshot_p95",
+        "snapshot_max",
+        "restore_p50",
+        "restore_p95",
+        "restore_max",
+        "ttfe_p50",
+        "ttfe_p95",
+        "ttfe_max",
+        "startup_time",
+    )
+    for suffix in optional_metrics:
+        _emit(samples, outcome, f"{suffix}_s", suffix, "seconds", namespace, extra)
+
+    # (metric suffix, value, unit); a None value means "do not emit".
+    trailing_metrics = (
+        ("restore_correct_count", outcome.get("restore_correct_count"), "count"),
+        ("wall_time", elapsed_rounded, "seconds"),
+    )
+    for suffix, value, unit in trailing_metrics:
+        if value is None:
+            continue
+        samples.append(
+            utils.MakeSample(
+                f"{BENCHMARK_NAME}_{suffix}",
+                value,
+                unit,
+                namespace,
+                extra,
+            )
+        )
+
+    logging.info("Emitted %d samples for preload_mb=%d.", len(samples), preload_mb)
+    return samples
+
+
+def Cleanup(benchmark_spec):
+ """Clean up any leftover benchmark resources.
+
+ Issues a best-effort, label-selected `kubectl delete` for each resource
+ kind the benchmark creates and stops the port-forward.
+
+ Args:
+ benchmark_spec: The PKB benchmark spec (not used by this function).
+ """
+ ns = FLAGS.k8s_namespace
+ logging.info("Cleanup — deleting any leftover snapshot-benchmark resources.")
+
+ # Each kind is deleted by the shared app=snapshot-benchmark-workload label.
+ for kind in (
+ "sandboxclaim",
+ "sandboxtemplate",
+ "podsnapshotmanualtrigger",
+ "podsnapshots.podsnapshot.gke.io",
+ ):
+ # raise_on_failure=False and --ignore-not-found: a failed or empty delete
+ # for one kind does not stop the remaining kinds from being processed.
+ utils.RunKubectl(
+ [
+ "delete",
+ kind,
+ "-l",
+ "app=snapshot-benchmark-workload",
+ "-n",
+ ns,
+ "--ignore-not-found=true",
+ ],
+ timeout=60,
+ raise_on_failure=False,
+ )
+ # NOTE(review): presumably runs once after the loop rather than per kind —
+ # confirm against the original indentation.
+ utils.StopPortForward()
+ logging.info("Cleanup complete.")
+
+
+# ---------------------------------------------------------------------------
+# Core snapshot/restore logic
+# ---------------------------------------------------------------------------
+
+
+def _RunSnapshotCycle(
+ namespace,
+ preload_mb,
+ burst_size,
+ skip_snapshot,
+ preload_mode,
+ ksa_name,
+ pod_timeout,
+ template_path,
+):
+ """Execute one full snapshot/restore cycle and return a result dict.
+
+ Handles source creation, snapshot, restore, TTFE measurement,
+ correctness verification, and cleanup.
+ """
+ step_template = f"snap-bench-{preload_mb}mb"
+ source_names = [f"snap-src-0-{i}" for i in range(burst_size)]
+ restore_names = [f"snap-restore-0-{i}" for i in range(burst_size)]
+ trigger_names = [f"snap-trigger-0-{i}" for i in range(burst_size)]
+
+ result = {
+ "preload_mb": preload_mb,
+ "burst_size": burst_size,
+ "snapshot_p50_s": None,
+ "snapshot_p95_s": None,
+ "snapshot_max_s": None,
+ "restore_p50_s": None,
+ "restore_p95_s": None,
+ "restore_max_s": None,
+ "ttfe_p50_s": None,
+ "ttfe_p95_s": None,
+ "ttfe_max_s": None,
+ "startup_time_s": None,
+ "snapshot_counter": None,
+ "restore_correct_count": 0,
+ "burst_results": [],
+ "error": None,
+ }
+
+ try:
+ # 1. Create step-specific SandboxTemplate
+ logging.info(
+ "Creating SandboxTemplate '%s' (PRELOAD_MB=%d, memory=%dMi)",
+ step_template,
+ preload_mb,
+ max(512, preload_mb + 256),
+ )
+ if not _RenderAndApplyTemplate(
+ template_path,
+ step_template,
+ namespace,
+ ksa_name,
+ preload_mb,
+ preload_mode,
+ ):
+ raise RuntimeError("Failed to create SandboxTemplate")
+
+ time.sleep(2)
+
+ # 2. Create source claims and wait for Running + preload
+ logging.info("Creating %d source SandboxClaim(s)", burst_size)
+ t0_sources = time.time()
+ workers = min(burst_size, 50)
+ with ThreadPoolExecutor(max_workers=workers) as pool:
+ for sname in source_names:
+ pool.submit(_ApplyClaim, sname, namespace, step_template)
+
+ logging.info("Waiting for %d source pod(s) Running + preload", burst_size)
+ with ThreadPoolExecutor(max_workers=workers) as pool:
+ source_futs = [
+ pool.submit(
+ _MeasureSingleSource,
+ sname,
+ namespace,
+ t0_sources,
+ pod_timeout,
+ preload_mode,
+ )
+ for sname in source_names
+ ]
+ source_results = [f.result() for f in source_futs]
+
+ src_failed = [r for r in source_results if r.get("error")]
+ if src_failed:
+ fail_msgs = "; ".join(f"{r['pod']}: {r['error']}" for r in src_failed)
+ raise RuntimeError(
+ f"{len(src_failed)}/{burst_size} source pod(s) failed: {fail_msgs}"
+ )
+
+ startup_times = [
+ r["startup_time_s"]
+ for r in source_results
+ if r["startup_time_s"] is not None
+ ]
+ result["startup_time_s"] = (
+ round(_Percentile(startup_times, 50), 3) if startup_times else None
+ )
+
+ snapshot_counters = {r["pod"]: r["snapshot_counter"] for r in source_results}
+ min_counter = min(
+ (c for c in snapshot_counters.values() if c is not None), default=None
+ )
+ result["snapshot_counter"] = min_counter
+ logging.info("%d source pod(s) ready. Min counter: %s", burst_size, min_counter)
+
+ # --skip_snapshot: measure cold-start TTFE only
+ if skip_snapshot:
+ logging.info("skip_snapshot mode: measuring cold-start TTFE")
+ ttfe_times = []
+ burst_results = []
+ for i, sname in enumerate(source_names):
+ startup = source_results[i]["startup_time_s"]
+ counter = source_results[i]["snapshot_counter"]
+ preload_done = source_results[i].get("preload_complete_time_s")
+ ttfe_s = preload_done if preload_done else startup
+ ttfe_times.append(ttfe_s)
+ burst_results.append(
+ {
+ "pod": sname,
+ "source_pod": sname,
+ "startup_time_s": startup,
+ "snapshot_counter": None,
+ "snapshot_time_s": None,
+ "restore_time_s": None,
+ "ttfe_s": ttfe_s,
+ "restore_counter": counter,
+ "restore_correct": True,
+ "error": None,
+ }
+ )
+
+ result["burst_results"] = burst_results
+ result["restore_correct_count"] = burst_size
+
+ if ttfe_times:
+ result["ttfe_p50_s"] = round(_Percentile(ttfe_times, 50), 3)
+ result["ttfe_p95_s"] = round(_Percentile(ttfe_times, 95), 3)
+ result["ttfe_max_s"] = round(max(ttfe_times), 3)
+
+ # Skip to cleanup
+ return result
+
+ # 3. Trigger snapshots concurrently
+ logging.info("Triggering %d snapshot(s)", burst_size)
+ t0_snap = time.time()
+ with ThreadPoolExecutor(max_workers=workers) as pool:
+ snap_futs = [
+ pool.submit(
+ _TriggerAndWaitSnapshot,
+ tname,
+ sname,
+ namespace,
+ t0_snap,
+ )
+ for tname, sname in zip(trigger_names, source_names)
+ ]
+ snap_results = [f.result() for f in snap_futs]
+
+ snap_failed = [r for r in snap_results if r.get("error")]
+ snap_times = [
+ r["snapshot_time_s"]
+ for r in snap_results
+ if r["snapshot_time_s"] is not None
+ ]
+ if snap_times:
+ result["snapshot_p50_s"] = round(_Percentile(snap_times, 50), 3)
+ result["snapshot_p95_s"] = round(_Percentile(snap_times, 95), 3)
+ result["snapshot_max_s"] = round(max(snap_times), 3)
+
+ if snap_failed:
+ fail_msgs = "; ".join(f"{r['trigger']}: {r['error']}" for r in snap_failed)
+ raise RuntimeError(
+ f"{len(snap_failed)}/{burst_size} snapshot(s) failed: {fail_msgs}"
+ )
+
+ # 4. Create restore claims concurrently
+ logging.info("Creating %d restore SandboxClaim(s)", burst_size)
+ t0_burst = time.time()
+ with ThreadPoolExecutor(max_workers=workers) as pool:
+ create_futs = [
+ pool.submit(_ApplyClaim, rname, namespace, step_template)
+ for rname in restore_names
+ ]
+ for f in create_futs:
+ f.result()
+
+ # 5. Poll restore pods for Running + TTFE
+ logging.info("Measuring restore + TTFE across %d pod(s)", burst_size)
+ with ThreadPoolExecutor(max_workers=workers) as pool:
+ measure_futs = [
+ pool.submit(
+ _MeasureSingleRestore,
+ rname,
+ namespace,
+ t0_burst,
+ min_counter,
+ pod_timeout,
+ )
+ for rname in restore_names
+ ]
+ burst_results = [f.result() for f in measure_futs]
+
+ # Merge source + snapshot info
+ for i in range(burst_size):
+ burst_results[i]["source_pod"] = source_names[i]
+ burst_results[i]["startup_time_s"] = source_results[i]["startup_time_s"]
+ burst_results[i]["snapshot_counter"] = source_results[i]["snapshot_counter"]
+ burst_results[i]["snapshot_time_s"] = snap_results[i]["snapshot_time_s"]
+
+ result["burst_results"] = burst_results
+
+ # 6. Aggregate
+ restore_times = [
+ r["restore_time_s"]
+ for r in burst_results
+ if r["restore_time_s"] is not None
+ ]
+ ttfe_times = [r["ttfe_s"] for r in burst_results if r["ttfe_s"] is not None]
+ correct_count = sum(1 for r in burst_results if r["restore_correct"])
+
+ result["restore_correct_count"] = correct_count
+
+ if restore_times:
+ result["restore_p50_s"] = round(_Percentile(restore_times, 50), 3)
+ result["restore_p95_s"] = round(_Percentile(restore_times, 95), 3)
+ result["restore_max_s"] = round(max(restore_times), 3)
+
+ if ttfe_times:
+ result["ttfe_p50_s"] = round(_Percentile(ttfe_times, 50), 3)
+ result["ttfe_p95_s"] = round(_Percentile(ttfe_times, 95), 3)
+ result["ttfe_max_s"] = round(max(ttfe_times), 3)
+
+ logging.info("Counter correct: %d/%d", correct_count, burst_size)
+
+ except Exception as e:
+ result["error"] = str(e)
+ logging.error("Snapshot cycle failed: %s", e)
+
+ finally:
+ # Cleanup
+ logging.info("Cleaning up step resources")
+ _CleanupStep(
+ source_names,
+ restore_names,
+ trigger_names,
+ step_template,
+ namespace,
+ )
+ time.sleep(5)
+
+ return result
+
+
+# ---------------------------------------------------------------------------
+# Kubernetes interaction helpers
+# ---------------------------------------------------------------------------
+
+
+def _ApplyClaim(name, namespace, template_name):
+  """Create a SandboxClaim that references an existing SandboxTemplate.
+
+  The manifest is serialized to a scratch JSON file under the
+  k8s_agents/manifests resource directory, applied with kubectl, and the
+  scratch file is removed again whether or not the apply succeeded.
+
+  Args:
+    name: Name of the SandboxClaim to create.
+    namespace: Namespace the claim is created in.
+    template_name: Name written to spec.sandboxTemplateRef.name.
+
+  Raises:
+    RuntimeError: If kubectl apply exits with a non-zero return code.
+  """
+  claim = {
+      "apiVersion": "extensions.agents.x-k8s.io/v1alpha1",
+      "kind": "SandboxClaim",
+      "metadata": {
+          "name": name,
+          "namespace": namespace,
+          "labels": {"app": "snapshot-benchmark-workload"},
+      },
+      "spec": {"sandboxTemplateRef": {"name": template_name}},
+  }
+  payload = json.dumps(claim)
+
+  scratch_dir = os.path.join(data.ResourcePath("k8s_agents/manifests"), "tmp")
+  os.makedirs(scratch_dir, exist_ok=True)
+  claim_file = os.path.join(scratch_dir, f"snap-claim-{name}.json")
+  try:
+    with open(claim_file, "w") as out:
+      out.write(payload)
+    _, err_text, exit_code = kubectl.RunKubectlCommand(
+        ["apply", "-f", claim_file],
+        timeout=30,
+        raise_on_failure=False,
+    )
+  finally:
+    # The file may be missing if open() itself failed, hence the check.
+    if os.path.isfile(claim_file):
+      os.unlink(claim_file)
+
+  if exit_code != 0:
+    raise RuntimeError(f"Failed to create SandboxClaim {name}: {err_text}")
+
+
+def _RenderAndApplyTemplate(
+    template_path,
+    template_name,
+    namespace,
+    ksa_name,
+    preload_mb,
+    preload_mode,
+):
+  """Render the SandboxTemplate manifest and kubectl-apply it.
+
+  When preload_mode starts with "script:", the work is delegated to
+  _RenderAndApplyScriptTemplate and template_path is not read. Otherwise
+  the Jinja2 file at template_path is rendered with the step-specific
+  values and applied from a scratch file that is deleted afterwards.
+
+  Args:
+    template_path: Path of the Jinja2 manifest template.
+    template_name: Name passed to the template as `template_name`.
+    namespace: Namespace passed to the template.
+    ksa_name: Service account name passed to the template.
+    preload_mb: Preload size in MB; also drives the memory request.
+    preload_mode: Preload mode string; a "script:" prefix selects the
+      script-based template.
+
+  Returns:
+    True if kubectl apply exited with 0 (or the delegate returned True),
+    False otherwise.
+  """
+  if preload_mode.startswith("script:"):
+    return _RenderAndApplyScriptTemplate(
+        template_name, namespace, ksa_name, preload_mb, preload_mode
+    )
+
+  with open(template_path) as src:
+    jinja_source = src.read()
+
+  # Memory request: preload size plus 256 Mi headroom, never below 512 Mi.
+  memory_mi = max(512, preload_mb + 256)
+
+  manifest_text = Template(jinja_source).render(
+      template_name=template_name,
+      namespace=namespace,
+      ksa_name=ksa_name,
+      preload_mb=preload_mb,
+      memory_mi=memory_mi,
+  )
+
+  scratch_dir = os.path.join(data.ResourcePath("k8s_agents/manifests"), "tmp")
+  os.makedirs(scratch_dir, exist_ok=True)
+  manifest_file = os.path.join(
+      scratch_dir, f"snap-template-{template_name}.yaml"
+  )
+  try:
+    with open(manifest_file, "w") as out:
+      out.write(manifest_text)
+    _, err_text, exit_code = kubectl.RunKubectlCommand(
+        ["apply", "-f", manifest_file],
+        timeout=30,
+        raise_on_failure=False,
+    )
+  finally:
+    if os.path.isfile(manifest_file):
+      os.unlink(manifest_file)
+
+  applied = exit_code == 0
+  if not applied:
+    logging.warning("kubectl apply stderr: %s", err_text)
+  return applied
+
+
+def _get_sandbox_node_selector():
+  """Build the nodeSelector mapping placed on sandbox pod specs.
+
+  Returns:
+    A new dict selecting nodes labelled pkb_nodepool=sandbox.
+  """
+  selector = {}
+  selector["pkb_nodepool"] = "sandbox"
+  return selector
+
+
+def _get_sandbox_tolerations():
+  """Build the tolerations list placed on sandbox pod specs.
+
+  Returns:
+    A new single-element list tolerating the NoSchedule taint
+    sandbox.gke.io/runtime=gvisor.
+  """
+  gvisor_toleration = dict(
+      key="sandbox.gke.io/runtime",
+      operator="Equal",
+      value="gvisor",
+      effect="NoSchedule",
+  )
+  return [gvisor_toleration]
+
+
+def _RenderAndApplyScriptTemplate(
+    template_name,
+    namespace,
+    ksa_name,
+    preload_mb,
+    preload_mode,
+):
+  """Apply a SandboxTemplate whose container runs a user-provided script.
+
+  The script path is whatever follows the first ":" in preload_mode. Its
+  contents are embedded verbatim in a bash entrypoint that afterwards
+  prints "SCRIPT_READY" and then an endless "Count: N" line once per
+  second.
+
+  Args:
+    template_name: Name of the SandboxTemplate to create.
+    namespace: Namespace of the SandboxTemplate.
+    ksa_name: serviceAccountName set on the pod spec.
+    preload_mb: Exposed to the container as PRELOAD_MB; also drives the
+      memory request.
+    preload_mode: String of the form "script:<path>".
+
+  Returns:
+    True if kubectl apply exited with 0. False if the script file does not
+    exist or the apply failed.
+  """
+  script_path = preload_mode.split(":", 1)[1]
+  if not os.path.isfile(script_path):
+    logging.error("Script not found: %s", script_path)
+    return False
+
+  with open(script_path) as src:
+    user_script = src.read()
+
+  # Memory request: preload size plus 256 Mi headroom, never below 512 Mi.
+  memory_mi = max(512, preload_mb + 256)
+
+  # Every entry becomes one newline-terminated line of the entrypoint.
+  entrypoint_lines = [
+      "#!/bin/bash",
+      "set -e",
+      'echo "Running startup script..."',
+      "# --- User script start ---",
+      user_script,
+      "# --- User script end ---",
+      'echo "SCRIPT_READY"',
+      'echo "Starting counter."',
+      "i=0",
+      "while true; do",
+      ' echo "Count: $i"',
+      " i=$((i + 1))",
+      " sleep 1",
+      "done",
+  ]
+  entrypoint = "".join(f"{line}\n" for line in entrypoint_lines)
+
+  container = {
+      "name": "preloader",
+      "image": "python:3.11-slim",
+      "command": ["bash", "-c"],
+      "args": [entrypoint],
+      "env": [{"name": "PRELOAD_MB", "value": str(preload_mb)}],
+      "resources": {
+          "requests": {
+              "cpu": "250m",
+              "memory": f"{memory_mi}Mi",
+              "ephemeral-storage": "512Mi",
+          }
+      },
+  }
+  pod_spec = {
+      "serviceAccountName": ksa_name,
+      "runtimeClassName": "gvisor",
+      "containers": [container],
+      "nodeSelector": _get_sandbox_node_selector(),
+      "tolerations": _get_sandbox_tolerations(),
+      "restartPolicy": "OnFailure",
+  }
+  sandbox_template = {
+      "apiVersion": "extensions.agents.x-k8s.io/v1alpha1",
+      "kind": "SandboxTemplate",
+      "metadata": {
+          "name": template_name,
+          "namespace": namespace,
+      },
+      "spec": {
+          "podTemplate": {
+              "metadata": {
+                  "labels": {"app": "snapshot-benchmark-workload"},
+              },
+              "spec": pod_spec,
+          }
+      },
+  }
+  payload = json.dumps(sandbox_template)
+
+  scratch_dir = os.path.join(data.ResourcePath("k8s_agents/manifests"), "tmp")
+  os.makedirs(scratch_dir, exist_ok=True)
+  manifest_file = os.path.join(
+      scratch_dir, f"snap-script-template-{template_name}.json"
+  )
+  try:
+    with open(manifest_file, "w") as out:
+      out.write(payload)
+    _, err_text, exit_code = kubectl.RunKubectlCommand(
+        ["apply", "-f", manifest_file],
+        timeout=30,
+        raise_on_failure=False,
+    )
+  finally:
+    if os.path.isfile(manifest_file):
+      os.unlink(manifest_file)
+
+  applied = exit_code == 0
+  if not applied:
+    logging.warning("kubectl apply stderr: %s", err_text)
+  return applied
+
+
+def _MeasureSingleSource(name, namespace, t0, pod_timeout, preload_mode):
+  """Time a source pod's path to Running and to preload completion.
+
+  Args:
+    name: Pod name to poll.
+    namespace: Namespace of the pod.
+    t0: Epoch seconds that all reported durations are measured from.
+    pod_timeout: Seconds after t0 allowed for reaching Running. The same
+      value is handed to _WaitForPreload as its own timeout.
+    preload_mode: Forwarded unchanged to _WaitForPreload.
+
+  Returns:
+    A dict with keys "pod", "startup_time_s", "preload_complete_time_s",
+    "snapshot_counter" and "error". "error" is None on success; on failure
+    the fields measured after the failing stage stay None.
+  """
+  outcome = {
+      "pod": name,
+      "startup_time_s": None,
+      "preload_complete_time_s": None,
+      "snapshot_counter": None,
+      "error": None,
+  }
+
+  # Stage 1: poll the pod phase once per second until Running.
+  running_deadline = t0 + pod_timeout
+  while True:
+    if time.time() >= running_deadline:
+      outcome["error"] = (
+          f"Pod {name} did not reach Running within {pod_timeout}s"
+      )
+      return outcome
+    phase, _, _ = utils.RunKubectl(
+        ["get", "pod", name, "-n", namespace, "-o", "jsonpath={.status.phase}"],
+        timeout=10,
+        raise_on_failure=False,
+    )
+    if phase == "Running":
+      outcome["startup_time_s"] = round(time.time() - t0, 3)
+      break
+    time.sleep(1)
+
+  # Stage 2: wait until the pod logs show the preload has finished.
+  preloaded = _WaitForPreload(name, namespace, pod_timeout, preload_mode)
+  if not preloaded:
+    outcome["error"] = f"Preload did not complete within {pod_timeout}s"
+    return outcome
+  outcome["preload_complete_time_s"] = round(time.time() - t0, 3)
+
+  # Stage 3: give the in-pod counter a few ticks, then record its value.
+  time.sleep(3)
+  outcome["snapshot_counter"] = _GetLastCounter(name, namespace)
+  return outcome
+
+
+def _WaitForPreload(name, namespace, timeout_s, preload_mode):
+  """Poll pod logs until they show that preloading has finished.
+
+  The last 20 log lines are fetched every two seconds. Preload counts as
+  done when they contain "SCRIPT_READY", "Starting counter", or any
+  "Count: <n>" line.
+
+  Args:
+    name: Pod name.
+    namespace: Namespace of the pod.
+    timeout_s: Seconds to keep polling, counted from this call.
+    preload_mode: Accepted for signature parity; not read by this function.
+
+  Returns:
+    True as soon as a readiness marker is seen, False on timeout.
+  """
+  give_up_at = time.time() + timeout_s
+  while time.time() < give_up_at:
+    log_tail, _, _ = utils.RunKubectl(
+        ["logs", name, "-n", namespace, "--tail=20"],
+        timeout=10,
+        raise_on_failure=False,
+    )
+    marker_seen = (
+        "SCRIPT_READY" in log_tail
+        or "Starting counter" in log_tail
+        or re.search(r"Count:\s*\d+", log_tail) is not None
+    )
+    if marker_seen:
+      return True
+    time.sleep(2)
+  return False
+
+
+def _GetLastCounter(name, namespace):
+  """Read the most recent "Count: N" value from a pod's log tail.
+
+  Args:
+    name: Pod name.
+    namespace: Namespace of the pod.
+
+  Returns:
+    The integer from the last "Count:" line among the final 10 log lines,
+    or None if kubectl failed or no such line is present.
+  """
+  log_tail, _, exit_code = utils.RunKubectl(
+      ["logs", name, "-n", namespace, "--tail=10"],
+      timeout=10,
+      raise_on_failure=False,
+  )
+  if exit_code != 0:
+    return None
+
+  counts = re.findall(r"Count:\s*(\d+)", log_tail)
+  if not counts:
+    return None
+  return int(counts[-1])
+
+
+def _TriggerAndWaitSnapshot(trigger_name, target_pod, namespace, t0, timeout_s=300):
+  """Apply a PodSnapshotManualTrigger and poll until it reports Complete.
+
+  Args:
+    trigger_name: Name of the trigger resource to create.
+    target_pod: Pod named in spec.targetPod.
+    namespace: Namespace of the trigger.
+    t0: Epoch seconds that the snapshot duration and deadline are measured
+      from.
+    timeout_s: Seconds after t0 before giving up.
+
+  Returns:
+    A dict with keys "trigger", "pod", "snapshot_time_s" and "error".
+    "snapshot_time_s" is set only when the first status condition's
+    reason became "Complete"; otherwise "error" describes the failure.
+  """
+  outcome = {
+      "trigger": trigger_name,
+      "pod": target_pod,
+      "snapshot_time_s": None,
+      "error": None,
+  }
+  trigger_spec = {
+      "apiVersion": "podsnapshot.gke.io/v1",
+      "kind": "PodSnapshotManualTrigger",
+      "metadata": {"name": trigger_name, "namespace": namespace},
+      "spec": {"targetPod": target_pod},
+  }
+  payload = json.dumps(trigger_spec)
+
+  scratch_dir = os.path.join(data.ResourcePath("k8s_agents/manifests"), "tmp")
+  os.makedirs(scratch_dir, exist_ok=True)
+  trigger_file = os.path.join(scratch_dir, f"snap-trigger-{trigger_name}.json")
+  try:
+    with open(trigger_file, "w") as out:
+      out.write(payload)
+    _, err_text, exit_code = kubectl.RunKubectlCommand(
+        ["apply", "-f", trigger_file],
+        timeout=30,
+        raise_on_failure=False,
+    )
+  finally:
+    if os.path.isfile(trigger_file):
+      os.unlink(trigger_file)
+
+  if exit_code != 0:
+    outcome["error"] = f"Failed to create trigger: {err_text}"
+    return outcome
+
+  # Poll the first status condition every two seconds until the deadline.
+  give_up_at = t0 + timeout_s
+  while True:
+    if time.time() >= give_up_at:
+      outcome["error"] = (
+          f"Snapshot {trigger_name} did not complete within {timeout_s}s"
+      )
+      return outcome
+    reason, _, _ = utils.RunKubectl(
+        [
+            "get",
+            "podsnapshotmanualtriggers.podsnapshot.gke.io",
+            trigger_name,
+            "-n",
+            namespace,
+            "-o",
+            "jsonpath={.status.conditions[0].reason}",
+        ],
+        timeout=10,
+        raise_on_failure=False,
+    )
+    if reason == "Complete":
+      outcome["snapshot_time_s"] = round(time.time() - t0, 3)
+      return outcome
+    time.sleep(2)
+
+
+def _MeasureSingleRestore(name, namespace, t0, snapshot_counter, pod_timeout):
+  """Time a restored pod's path to Running and to its first counter line.
+
+  Args:
+    name: Pod name to poll.
+    namespace: Namespace of the pod.
+    t0: Epoch seconds that durations and both deadlines are measured from.
+    snapshot_counter: Counter value recorded before the snapshot, or None.
+      The restore is marked correct when the first observed counter is at
+      least this value.
+    pod_timeout: Seconds after t0 allowed for each of the two stages.
+
+  Returns:
+    A dict with keys "pod", "restore_time_s", "ttfe_s", "restore_counter",
+    "restore_correct" and "error".
+  """
+  outcome = {
+      "pod": name,
+      "restore_time_s": None,
+      "ttfe_s": None,
+      "restore_counter": None,
+      "restore_correct": False,
+      "error": None,
+  }
+
+  # Both stages share the same absolute deadline.
+  give_up_at = t0 + pod_timeout
+
+  # Stage 1: poll the pod phase once per second until Running.
+  while True:
+    if time.time() >= give_up_at:
+      outcome["error"] = (
+          f"Pod {name} did not reach Running within {pod_timeout}s"
+      )
+      return outcome
+    phase, _, _ = utils.RunKubectl(
+        ["get", "pod", name, "-n", namespace, "-o", "jsonpath={.status.phase}"],
+        timeout=10,
+        raise_on_failure=False,
+    )
+    if phase == "Running":
+      outcome["restore_time_s"] = round(time.time() - t0, 3)
+      break
+    time.sleep(1)
+
+  # Stage 2: poll logs until the first "Count: N" line appears (TTFE).
+  while time.time() < give_up_at:
+    log_tail, _, exit_code = utils.RunKubectl(
+        ["logs", name, "-n", namespace, "--tail=50"],
+        timeout=10,
+        raise_on_failure=False,
+    )
+    counts = re.findall(r"Count:\s*(\d+)", log_tail) if exit_code == 0 else []
+    if counts:
+      first_seen = int(counts[0])
+      outcome["ttfe_s"] = round(time.time() - t0, 3)
+      outcome["restore_counter"] = first_seen
+      outcome["restore_correct"] = (
+          snapshot_counter is not None and first_seen >= snapshot_counter
+      )
+      return outcome
+    time.sleep(1)
+
+  outcome["error"] = f"Pod {name}: no Count output within timeout"
+  return outcome
+
+
+def _CleanupStep(source_names, restore_names, trigger_names, template_name, namespace):
+  """Delete every resource a snapshot step created, best effort.
+
+  Deletion order is: the SandboxTemplate, source claims, restore claims,
+  snapshot triggers, and finally all PodSnapshot objects in the namespace.
+  Failures are ignored (raise_on_failure=False, --ignore-not-found).
+
+  Args:
+    source_names: Names of source SandboxClaims.
+    restore_names: Names of restore SandboxClaims.
+    trigger_names: Names of PodSnapshotManualTriggers.
+    template_name: Name of the SandboxTemplate.
+    namespace: Namespace holding all of the above.
+  """
+  targets = [("sandboxtemplate", template_name)]
+  targets += [("sandboxclaim", claim) for claim in source_names]
+  targets += [("sandboxclaim", claim) for claim in restore_names]
+  targets += [("podsnapshotmanualtrigger", trig) for trig in trigger_names]
+
+  for resource_kind, resource_name in targets:
+    utils.RunKubectl(
+        [
+            "delete",
+            resource_kind,
+            resource_name,
+            "-n",
+            namespace,
+            "--ignore-not-found=true",
+        ],
+        timeout=60,
+        raise_on_failure=False,
+    )
+
+  # Sweeps every PodSnapshot in the namespace, not only this step's.
+  utils.RunKubectl(
+      [
+          "delete",
+          "podsnapshots.podsnapshot.gke.io",
+          "--all",
+          "-n",
+          namespace,
+          "--ignore-not-found=true",
+      ],
+      timeout=60,
+      raise_on_failure=False,
+  )
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _GetTemplatePath():
+  """Locate the Jinja2 SandboxTemplate manifest used for snapshot steps.
+
+  Returns:
+    The path of snapshot-sandbox-template.yaml.j2 inside the
+    k8s_agents/manifests resource directory.
+  """
+  manifests_dir = data.ResourcePath("k8s_agents/manifests")
+  return os.path.join(manifests_dir, "snapshot-sandbox-template.yaml.j2")
+
+
+def _Percentile(values, pct):
+  """Compute a percentile by linear interpolation between closest ranks.
+
+  Args:
+    values: Iterable of numbers; it is not modified.
+    pct: Percentile in the range 0-100.
+
+  Returns:
+    The interpolated percentile, or 0.0 when values is empty.
+  """
+  if not values:
+    return 0.0
+  ordered = sorted(values)
+  last = len(ordered) - 1
+  # Fractional rank into the sorted list; its integer part is the lower
+  # neighbour and its fractional part the interpolation weight.
+  rank = (pct / 100) * last
+  lower = int(rank)
+  upper = min(lower + 1, last)
+  weight = rank - lower
+  return ordered[lower] * (1 - weight) + ordered[upper] * weight
+
+
+def _emit(samples, data, data_key, metric_suffix, unit, namespace, extra):
+  """Append one sample for data[data_key] unless the value is missing.
+
+  Args:
+    samples: List that the new sample is appended to.
+    data: Mapping of result values (shadows the module-level `data` import
+      inside this function only).
+    data_key: Key to look up in `data`.
+    metric_suffix: Appended to BENCHMARK_NAME to form the metric name.
+    unit: Unit string for the sample.
+    namespace: Passed through to utils.MakeSample.
+    extra: Passed through to utils.MakeSample.
+  """
+  measured = data.get(data_key)
+  if measured is None:
+    # Absent key or explicit None: emit nothing.
+    return
+  metric_name = f"{BENCHMARK_NAME}_{metric_suffix}"
+  samples.append(
+      utils.MakeSample(metric_name, measured, unit, namespace, extra)
+  )
diff --git a/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_warmpool_benchmark.py b/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_warmpool_benchmark.py
new file mode 100644
index 0000000000..9024f9f28e
--- /dev/null
+++ b/perfkitbenchmarker/linux_benchmarks/kubernetes/agentic/k8s_warmpool_benchmark.py
@@ -0,0 +1,426 @@
+"""PKB Benchmark: GKE Agent Warmpool Scale-Up (Use Case E).
+
+Atomic single-point measurement of warm pool provisioning speed on a
+pre-provisioned GKE cluster. Measures how quickly N sandbox pods can be
+provisioned from zero via the SandboxWarmPool controller. No agent API
+is needed; this benchmark interacts directly with the Kubernetes API.
+
+This benchmark is designed to be invoked repeatedly by an external sweep
+controller that varies the target_replicas parameter across iterations to
+find the provisioning saturation point.
+
+Usage:
+ python pkb.py --benchmarks=k8s_warmpool \
+ --k8s_warmpool_target_replicas=100 \
+ --k8s_warmpool_name=python-sandbox-warmpool \
+ --k8s_warmpool_pod_label=sandbox=python-sandbox-example \
+ --k8s_warmpool_ready_threshold_s=300 \
+ --k8s_warmpool_poll_interval_s=2.0 \
+ --k8s_warmpool_drain_timeout_s=300 \
+ --k8s_namespace=agentic \
+ --gke_machine_type=c4-standard-8
+
+Samples emitted (per run):
+ - k8s_warmpool_total_time_to_ready (seconds)
+ - k8s_warmpool_refill_rate (pods/sec)
+ - k8s_warmpool_drain_time (seconds)
+ - k8s_warmpool_first_pod_running (seconds)
+ - k8s_warmpool_final_running_count (count)
+ - k8s_warmpool_final_pending_count (count)
+ - k8s_warmpool_time_to_created_p50 (seconds)
+ - k8s_warmpool_time_to_created_p95 (seconds)
+ - k8s_warmpool_time_to_created_max (seconds)
+ - k8s_warmpool_time_to_created_count (count)
+ - k8s_warmpool_time_to_scheduled_p50 (seconds)
+ - k8s_warmpool_time_to_scheduled_p95 (seconds)
+ - k8s_warmpool_time_to_scheduled_max (seconds)
+ - k8s_warmpool_time_to_scheduled_count (count)
+ - k8s_warmpool_time_to_running_p50 (seconds)
+ - k8s_warmpool_time_to_running_p95 (seconds)
+ - k8s_warmpool_time_to_running_max (seconds)
+ - k8s_warmpool_time_to_running_count (count)
+ - k8s_warmpool_wall_time (seconds)
+"""
+
+import json
+import logging
+import time
+
+from absl import flags
+from datetime import datetime, timezone
+from perfkitbenchmarker import configs
+from perfkitbenchmarker.linux_benchmarks.kubernetes.agentic import (
+ k8s_benchmark_utils as utils,
+)
+from perfkitbenchmarker.linux_benchmarks.kubernetes.agentic import (
+ gke_deploy_utils as deploy_utils,
+)
+
+# Shared absl flag registry; this module reads its own k8s_warmpool_* flags
+# plus k8s_namespace from it.
+FLAGS = flags.FLAGS
+
+# Passed to configs.LoadConfig and used as the prefix of every metric name
+# this module emits (f"{BENCHMARK_NAME}_<metric>").
+BENCHMARK_NAME = "k8s_warmpool"
+# YAML config handed to configs.LoadConfig in GetConfig(); it carries only a
+# description entry under the benchmark name.
+BENCHMARK_CONFIG = """
+k8s_warmpool:
+ description: >
+ Atomic single-point warm pool scale-up measurement on a
+ pre-provisioned GKE cluster with gVisor isolation.
+"""
+
+# ---------------------------------------------------------------------------
+# Benchmark-specific flags
+# ---------------------------------------------------------------------------
+
+# Replica count written to spec.replicas of the SandboxWarmPool in Run().
+flags.DEFINE_integer(
+ "k8s_warmpool_target_replicas",
+ 100,
+ "Number of warm pool replicas to provision from zero.",
+)
+
+# Resource name used for the kubectl patch and for draining.
+flags.DEFINE_string(
+ "k8s_warmpool_name",
+ "python-sandbox-warmpool",
+ "SandboxWarmPool resource name.",
+)
+
+# Selector passed to utils.CountPods and to `kubectl get pods -l`.
+flags.DEFINE_string(
+ "k8s_warmpool_pod_label",
+ "sandbox=python-sandbox-example",
+ "Label selector for warm pool pods.",
+)
+
+# Run() stops polling this many seconds after scale-up starts, even if fewer
+# than the target number of pods are Running.
+flags.DEFINE_float(
+ "k8s_warmpool_ready_threshold_s",
+ 300.0,
+ "Max seconds allowed for all pods to reach Running.",
+)
+
+# Sleep between successive Running/Pending pod counts in Run().
+flags.DEFINE_float(
+ "k8s_warmpool_poll_interval_s",
+ 2.0,
+ "Seconds between kubectl polls during provisioning.",
+)
+
+# Truncated with int() before being passed to utils.DrainWarmPool.
+flags.DEFINE_float(
+ "k8s_warmpool_drain_timeout_s",
+ 300.0,
+ "Max seconds to wait for drain to 0.",
+)
+
+
+# ---------------------------------------------------------------------------
+# Lifecycle
+# ---------------------------------------------------------------------------
+
+
+def GetConfig(user_config):
+  """Merge user overrides into this benchmark's config.
+
+  BENCHMARK_CONFIG declares no vm_groups, only a description.
+
+  Args:
+    user_config: User-supplied configuration overrides.
+
+  Returns:
+    The config produced by configs.LoadConfig for BENCHMARK_NAME.
+  """
+  loaded = configs.LoadConfig(BENCHMARK_CONFIG, user_config, BENCHMARK_NAME)
+  return loaded
+
+
+def Prepare(benchmark_spec):
+  """Deploy the workloads and make sure the port-forward is up.
+
+  Args:
+    benchmark_spec: The benchmark specification; its always_call_cleanup
+      attribute is set to True before anything is deployed.
+  """
+  # Set first so the flag is already in place if deployment fails.
+  benchmark_spec.always_call_cleanup = True
+
+  logging.info("=== Prepare: deploying workloads ===")
+  deploy_utils.DeployWorkloads(benchmark_spec)
+  utils.EnsurePortForward()
+
+  logging.info("Prepare complete.")
+
+
+def Run(benchmark_spec):
+  """Scale the warm pool from 0 to the target and measure provisioning.
+
+  Steps: drain the pool to zero, time a second (expected no-op) drain,
+  patch spec.replicas to the target, poll Running/Pending pod counts until
+  the target is reached or the ready threshold expires, then scrape pod
+  lifecycle timestamps.
+
+  The scale-up clock starts immediately BEFORE the kubectl patch is
+  issued, so total_time_to_ready includes the request latency and pods
+  created while the patch call is still returning are not attributed a
+  start time later than their own creation.
+
+  Args:
+    benchmark_spec: The benchmark specification, registered with utils.
+
+  Returns:
+    List of sample.Sample objects.
+  """
+  utils.set_benchmark_spec(benchmark_spec)
+
+  ns = FLAGS.k8s_namespace
+  target = FLAGS.k8s_warmpool_target_replicas
+  warmpool_name = FLAGS.k8s_warmpool_name
+  label = FLAGS.k8s_warmpool_pod_label
+  threshold_s = FLAGS.k8s_warmpool_ready_threshold_s
+  poll_interval = FLAGS.k8s_warmpool_poll_interval_s
+  drain_timeout = int(FLAGS.k8s_warmpool_drain_timeout_s)
+
+  # Drain to 0 for a clean measurement. Done here rather than in Prepare so
+  # that every invocation of Run in a sweep starts from an empty pool.
+  utils.DrainWarmPool(ns, warmpool_name, label, timeout=drain_timeout)
+  time.sleep(3)
+
+  logging.info("=== Run: scaling %s to %d replicas ===", warmpool_name, target)
+
+  t_wall_start = time.time()
+
+  # 1. Measure drain time (should be near-zero: the pool was drained above).
+  t0 = time.time()
+  utils.DrainWarmPool(ns, warmpool_name, label, timeout=drain_timeout)
+  drain_time_s = round(time.time() - t0, 2)
+
+  time.sleep(2)
+
+  # 2. Scale up. The clock starts before the request is sent.
+  logging.info("Patching %s replicas -> %d", warmpool_name, target)
+  patch_json = json.dumps({"spec": {"replicas": target}})
+  t_scale = time.time()
+  utils.RunKubectl(
+      [
+          "patch",
+          "sandboxwarmpool",
+          warmpool_name,
+          "-n",
+          ns,
+          "--type=merge",
+          f"-p={patch_json}",
+      ]
+  )
+
+  # 3. Poll until ready or timeout.
+  deadline = t_scale + threshold_s
+  first_pod_time = None
+
+  while time.time() < deadline:
+    elapsed = time.time() - t_scale
+    running = utils.CountPods(ns, label, "Running")
+    pending = utils.CountPods(ns, label, "Pending")
+
+    if first_pod_time is None and running > 0:
+      first_pod_time = elapsed
+
+    pct = (running / target * 100) if target > 0 else 0
+    logging.info(
+        "[%.1fs] Running: %d/%d (%.0f%%) Pending: %d",
+        elapsed,
+        running,
+        target,
+        pct,
+        pending,
+    )
+
+    if running >= target:
+      break
+
+    time.sleep(poll_interval)
+
+  total_time = round(time.time() - t_scale, 2)
+  final_running = utils.CountPods(ns, label, "Running")
+  final_pending = utils.CountPods(ns, label, "Pending")
+  rate = round(final_running / total_time, 2) if total_time > 0 else 0
+
+  logging.info(
+      "Scale-up complete: %d/%d Running in %.1fs (%.1f pods/sec)",
+      final_running,
+      target,
+      total_time,
+      rate,
+  )
+  if final_running < target:
+    # Make a timed-out run visible; the samples are still emitted.
+    logging.warning(
+        "Target not reached: %d/%d Running after %.1fs (threshold %.1fs)",
+        final_running,
+        target,
+        total_time,
+        threshold_s,
+    )
+
+  # 4. Scrape pod lifecycle timestamps relative to the scale-up start.
+  lifecycle = _ScrapeLifecycle(ns, label, t_scale)
+
+  wall_time = round(time.time() - t_wall_start, 2)
+
+  # 5. Build samples. Every sample carries the same metadata.
+  extra = {
+      "target_replicas": target,
+      "final_running_count": final_running,
+      "final_pending_count": final_pending,
+      "wall_time_s": wall_time,
+  }
+
+  # (metric suffix, value, unit), in emission order.
+  headline = [
+      ("total_time_to_ready", total_time, "seconds"),
+      ("refill_rate", rate, "pods/sec"),
+      ("drain_time", drain_time_s, "seconds"),
+  ]
+  if first_pod_time is not None:
+    headline.append(("first_pod_running", round(first_pod_time, 2), "seconds"))
+  headline.append(("final_running_count", float(final_running), "count"))
+  headline.append(("final_pending_count", float(final_pending), "count"))
+
+  samples = [
+      utils.MakeSample(f"{BENCHMARK_NAME}_{suffix}", value, unit, ns, extra)
+      for suffix, value, unit in headline
+  ]
+
+  # Pod lifecycle percentiles
+  _EmitLifecycleSamples(samples, lifecycle, ns, extra)
+
+  # Wall time
+  samples.append(
+      utils.MakeSample(
+          f"{BENCHMARK_NAME}_wall_time",
+          wall_time,
+          "seconds",
+          ns,
+          extra,
+      )
+  )
+
+  logging.info("Emitted %d samples for target_replicas=%d.", len(samples), target)
+  return samples
+
+
+def Cleanup(benchmark_spec):
+  """Return the warm pool to zero replicas and stop the port-forward.
+
+  Args:
+    benchmark_spec: The benchmark specification (not read here).
+  """
+  logging.info("Cleanup: draining warm pool to 0.")
+  utils.DrainWarmPool(
+      FLAGS.k8s_namespace,
+      FLAGS.k8s_warmpool_name,
+      FLAGS.k8s_warmpool_pod_label,
+      timeout=int(FLAGS.k8s_warmpool_drain_timeout_s),
+  )
+  utils.StopPortForward()
+  logging.info("Cleanup complete.")
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _ScrapeLifecycle(namespace, label, scale_start_epoch):
+  """Scrape pod metadata to compute time-to-created/scheduled/running.
+
+  For every pod matching the label, three timestamps are compared with
+  scale_start_epoch: metadata.creationTimestamp, and the
+  lastTransitionTime of the PodScheduled and Ready conditions when their
+  status is "True". The Ready transition feeds the "time_to_running"
+  figures.
+
+  Scraping is best effort: a failed kubectl call or unparseable output
+  yields an empty dict rather than an exception, so the samples already
+  measured by the caller are not lost.
+
+  Args:
+    namespace: Namespace to list pods in.
+    label: Label selector for the pods.
+    scale_start_epoch: Epoch seconds that all deltas are relative to.
+
+  Returns:
+    A dict with keys "time_to_created_s", "time_to_scheduled_s" and
+    "time_to_running_s", each mapping to {"p50", "p95", "max", "count"}
+    (or {} when no pod contributed a value); {} if the pod list could not
+    be obtained.
+  """
+  stdout, _, rc = utils.RunKubectl(
+      ["get", "pods", "-n", namespace, "-l", label, "-o", "json"],
+      timeout=60,
+      raise_on_failure=False,
+  )
+  if rc != 0 or not stdout:
+    return {}
+
+  try:
+    pods = json.loads(stdout).get("items", [])
+  except json.JSONDecodeError as e:
+    logging.warning("Could not parse pod list JSON: %s", e)
+    return {}
+
+  def _delta(ts_str):
+    """Seconds from scale start to a timestamp such as 2024-01-01T00:00:00Z."""
+    # fromisoformat does not accept a trailing "Z" on older Pythons.
+    ts = datetime.fromisoformat(ts_str.replace("Z", "+00:00")).timestamp()
+    return ts - scale_start_epoch
+
+  created_deltas = []
+  # Condition type -> deltas collected from that condition.
+  condition_deltas = {"PodScheduled": [], "Ready": []}
+
+  for pod in pods:
+    created_str = pod.get("metadata", {}).get("creationTimestamp")
+    if created_str:
+      created_deltas.append(_delta(created_str))
+
+    for cond in pod.get("status", {}).get("conditions") or []:
+      bucket = condition_deltas.get(cond.get("type"))
+      if bucket is None or cond.get("status") != "True":
+        continue
+      ts_str = cond.get("lastTransitionTime")
+      if ts_str:
+        bucket.append(_delta(ts_str))
+
+  def _pcts(vals):
+    """Summarize deltas by picking elements of the sorted list."""
+    if not vals:
+      return {}
+    ordered = sorted(vals)
+    n = len(ordered)
+    return {
+        "p50": round(ordered[n // 2], 2),
+        "p95": round(ordered[int(n * 0.95)], 2) if n > 1 else round(ordered[-1], 2),
+        "max": round(ordered[-1], 2),
+        "count": n,
+    }
+
+  return {
+      "time_to_created_s": _pcts(created_deltas),
+      "time_to_scheduled_s": _pcts(condition_deltas["PodScheduled"]),
+      "time_to_running_s": _pcts(condition_deltas["Ready"]),
+  }
+
+
+def _EmitLifecycleSamples(samples, lifecycle, namespace, extra):
+  """Append p50/p95/max/count samples for each pod lifecycle phase.
+
+  Phases are emitted in the order created, scheduled, running; within a
+  phase the order is p50, p95, max, count. Statistics that are missing or
+  None are skipped.
+
+  Args:
+    samples: List that new samples are appended to.
+    lifecycle: Dict with the keys "time_to_created_s",
+      "time_to_scheduled_s" and "time_to_running_s"; may be empty.
+    namespace: Passed through to utils.MakeSample.
+    extra: Passed through to utils.MakeSample.
+  """
+  phases = (
+      ("time_to_created_s", "time_to_created"),
+      ("time_to_scheduled_s", "time_to_scheduled"),
+      ("time_to_running_s", "time_to_running"),
+  )
+  for lifecycle_key, metric_base in phases:
+    stats = lifecycle.get(lifecycle_key, {})
+    for stat_name in ("p50", "p95", "max", "count"):
+      raw = stats.get(stat_name)
+      if raw is None:
+        continue
+      # The count is reported as a float with its own unit.
+      if stat_name == "count":
+        value, unit = float(raw), "count"
+      else:
+        value, unit = raw, "seconds"
+      samples.append(
+          utils.MakeSample(
+              f"{BENCHMARK_NAME}_{metric_base}_{stat_name}",
+              value,
+              unit,
+              namespace,
+              extra,
+          )
+      )
diff --git a/perfkitbenchmarker/providers/gcp/flags.py b/perfkitbenchmarker/providers/gcp/flags.py
index a56fe72b99..eeabaae0b3 100644
--- a/perfkitbenchmarker/providers/gcp/flags.py
+++ b/perfkitbenchmarker/providers/gcp/flags.py
@@ -580,6 +580,21 @@
' the size derived from max_vm_count. Use when the cluster will scale'
' beyond the default node pool (e.g. kubernetes_node_scale with 5k nodes).',
)
+
+
+# Each list entry is appended as-is to the argument list of the cluster
+# create command (see the loop over GKE_ADDITIONAL_FLAGS.value in _Create).
+# NOTE(review): list flags are comma-separated, so a gcloud flag whose own
+# value contains a comma presumably cannot be passed this way - confirm.
+GKE_ADDITIONAL_FLAGS = flags.DEFINE_list(
+ 'gke_additional_flags',
+ [],
+ 'Additional flags to pass to gcloud container clusters create. '
+ 'Example: --gke_additional_flags=--enable-pod-snapshots,--enable-dataplane-v2',
+)
+
+# Each list entry is appended as-is to the node pool create command built in
+# _CreateNodePools, i.e. to every node pool created there.
+GKE_ADDITIONAL_NODEPOOL_FLAGS = flags.DEFINE_list(
+ 'gke_additional_nodepool_flags',
+ [],
+ 'Additional flags to pass to gcloud container node-pools create. '
+ 'Example: --gke_additional_nodepool_flags=--max-pods-per-node=250',
+)
GCE_PERFORMANCE_MONITORING_UNIT = flags.DEFINE_enum(
'gce_performance_monitoring_unit',
None,
diff --git a/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py b/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py
index f943a53ff1..06d4a295dc 100644
--- a/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py
+++ b/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py
@@ -102,14 +102,25 @@ def _Delete(self):
).Issue()
def RemoteBuild(self, image: container.ContainerImage):
- """Builds the image remotely."""
- if not gcp_flags.CONTAINER_REMOTE_BUILD_CONFIG.value:
- full_tag = self.GetFullRegistryTag(image.name)
+ """Builds the image remotely.
+
+ If --container_remote_build_config is set, uses it as the
+ --config argument to `gcloud builds submit` and passes the
+ image tag via --substitutions _IMAGE=.
+ Otherwise uses the simple --tag shorthand.
+ """
+ full_tag = self.GetFullRegistryTag(image.name)
+ if gcp_flags.CONTAINER_REMOTE_BUILD_CONFIG.value:
+ build_cmd = util.GcloudCommand(
+ self, 'builds', 'submit',
+ '--config', gcp_flags.CONTAINER_REMOTE_BUILD_CONFIG.value,
+ '--substitutions', f'_IMAGE={full_tag}',
+ image.directory,
+ )
else:
- full_tag = gcp_flags.CONTAINER_REMOTE_BUILD_CONFIG.value
- build_cmd = util.GcloudCommand(
- self, 'builds', 'submit', '--tag', full_tag, image.directory
- )
+ build_cmd = util.GcloudCommand(
+ self, 'builds', 'submit', '--tag', full_tag, image.directory,
+ )
build_cmd.Issue(timeout=None)
@@ -417,6 +428,10 @@ def _Create(self):
if self.enable_aam:
cmd.args.append('--auto-monitoring-scope=ALL')
+ # --- PKB Extension: additional cluster create flags ---
+ for additional_flag in gcp_flags.GKE_ADDITIONAL_FLAGS.value:
+ cmd.args.append(additional_flag)
+
self._RunClusterCreateCommand(cmd)
self._GetKubeconfig()
self._CreateCustomComputeClass(self.default_nodepool)
@@ -432,6 +447,10 @@ def _CreateNodePools(self):
nodepool,
cmd,
)
+ # --- PKB Extension: additional node pool create flags ---
+ for additional_flag in gcp_flags.GKE_ADDITIONAL_NODEPOOL_FLAGS.value:
+ cmd.args.append(additional_flag)
+
self._IssueResourceCreationCommand(cmd)
self._CreateCustomComputeClass(nodepool)