diff --git a/README.md b/README.md
index 3da3717..1e96973 100644
--- a/README.md
+++ b/README.md
@@ -36,6 +36,7 @@ OpenAdapt Evals is a unified framework for evaluating GUI automation agents agai
 - **Agent interfaces** including `ApiAgent` (Claude / GPT), `ClaudeComputerUseAgent` (with coordinate clamping and fail-safe recovery), `RetrievalAugmentedAgent`, `RandomAgent`, and `PolicyAgent`
 - **Multi-cloud VM infrastructure** with `AzureVMManager`, `AWSVMManager`, `PoolManager`, `SSHTunnelManager`, and `VMMonitor` for running evaluations at scale on Azure or AWS
 - **End-to-end eval pipeline** (`scripts/run_eval_pipeline.py`) -- orchestrates demo generation, VM lifecycle, SSH tunnels, and ZS/DC evaluation in a single command
+- **Deterministic desktop parity mode** -- `--clean-desktop` suppresses OneDrive/toast/popover noise, `--force-tray-icons` keeps network/audio tray controls visible, and run metadata records requested/observed environment flags
 - **RL training environment** -- `RLEnvironment` wrapper provides a Gymnasium-style `reset`/`step`/`evaluate` interface for online RL (GRPO, PPO) with outcome-based rewards from WAA scores
 - **Annotation pipeline** -- VLM-based screenshot annotation (`annotation.py`, `vlm.py`) migrated from openadapt-ml so the full record-annotate-evaluate workflow runs within this repo
 - **4-layer WAA probe** -- `probe --detailed` checks screenshot capture, accessibility tree, action pipeline, and scoring independently; supports `--json` and `--layers` filtering
@@ -155,6 +156,13 @@ python scripts/run_eval_pipeline.py --tasks 04d9aeaf --dry-run
 
 # AWS instead of Azure
 python scripts/run_eval_pipeline.py --cloud aws --vm-name waa-pool-00
+
+# Deterministic desktop parity + pinned image version metadata
+python scripts/run_eval_pipeline.py \
+  --tasks 04d9aeaf \
+  --clean-desktop \
+  --force-tray-icons \
+  --waa-image-version win11-24h2-2026-03-04
 ```
 
 ### Parallel evaluation
diff --git a/openadapt_evals/adapters/waa/live.py b/openadapt_evals/adapters/waa/live.py
index 25136fd..050eeea 100644
--- a/openadapt_evals/adapters/waa/live.py
+++ b/openadapt_evals/adapters/waa/live.py
@@ -25,6 +25,7 @@
 from __future__ import annotations
 
 import base64
+import json
 import logging
 import re
 import time
@@ -263,6 +264,14 @@ class WAALiveConfig:
         waa_examples_path: Path to WAA evaluation_examples_windows directory
             for loading task configs with evaluator specs. If not set, tasks
             are loaded from server or created as minimal placeholders.
+        clean_desktop: If True, applies deterministic desktop policy before
+            task setup (suppresses OneDrive/toast/taskbar noise).
+        force_tray_icons: If True, enforces network/audio tray icons visible
+            via policy keys before task setup.
+        reapply_clean_desktop_each_reset: Re-apply clean desktop policy on
+            every reset instead of once per adapter lifecycle.
+        waa_image_version: Optional pinned WAA image version identifier to
+            record in environment metadata.
     """
 
     server_url: str = "http://localhost:5000"
@@ -274,6 +283,10 @@ class WAALiveConfig:
     action_delay: float = 0.5
     timeout: float = 90.0
     waa_examples_path: str | None = None
+    clean_desktop: bool = False
+    force_tray_icons: bool = False
+    reapply_clean_desktop_each_reset: bool = False
+    waa_image_version: str | None = None
 
 
 class WAALiveAdapter(BenchmarkAdapter):
@@ -298,6 +311,8 @@ def __init__(self, config: WAALiveConfig | None = None):
         self._current_screenshot: bytes | None = None
         self._actions: list[BenchmarkAction] = []
         self._actual_screen_size: tuple[int, int] | None = None
+        self._clean_desktop_applied = False
+        self._environment_profile: dict[str, Any] = {}
 
     @property
     def name(self) -> str:
@@ -314,6 +329,10 @@ def supports_parallel(self) -> bool:
         """Whether parallel execution is supported."""
         return False  # Single VM for now
 
+    def get_environment_profile(self) -> dict[str, Any]:
+        """Return a shallow copy of the last captured environment profile ({} if none yet)."""
+        return dict(self._environment_profile)
+
     def check_connection(self) -> bool:
         """Check if WAA server is reachable.
 
@@ -524,8 +543,12 @@ def reset(self, task: BenchmarkTask) -> BenchmarkObservation:
         except Exception as e:
             logger.warning(f"Failed to close windows: {e}")
 
-        # Dismiss system notifications (OneDrive, etc.) that persist through close_all
-        self._dismiss_notifications(requests)
+        # Optionally apply deterministic desktop policy before task setup.
+        if self.config.clean_desktop or self.config.force_tray_icons:
+            self._apply_clean_desktop_policy(requests)
+        else:
+            # Best-effort cleanup even when full clean policy is disabled.
+            self._dismiss_notifications(requests)
 
         # If task has setup commands in raw_config, execute them
         if task.raw_config:
@@ -1357,30 +1380,217 @@ def _dismiss_notifications(self, requests_module) -> None:
         are not closeable via close_all (they're system toasts, not windows).
         Kill the notification processes and dismiss via keyboard.
         """
-        evaluate_base = self.config.evaluate_url or self.config.server_url
-        # Kill OneDrive and related notification processes
         commands = [
             "taskkill /F /IM OneDrive.exe /T",
             "taskkill /F /IM OneDriveStandaloneUpdater.exe /T",
-            # Dismiss any remaining toast notifications via Action Center
+            "taskkill /F /IM ApplicationFrameHost.exe /T",
+            # Open/close action center to dismiss any currently surfaced toast.
             (
-                "powershell -Command \""
-                "Get-Process -Name 'ShellExperienceHost' -ErrorAction SilentlyContinue | "
-                "ForEach-Object { $_.CloseMainWindow() }\""
+                "python -c \""
+                "import pyautogui; "
+                "pyautogui.hotkey('win','a'); "
+                "pyautogui.press('esc')"
+                "\""
             ),
         ]
-        for cmd in commands:
-            try:
-                requests_module.post(
-                    f"{evaluate_base}/setup",
-                    json={"config": [
-                        {"type": "execute", "parameters": {"command": cmd, "shell": True}},
-                    ]},
-                    timeout=10.0,
+        self._run_setup_execute_commands(
+            requests_module,
+            commands,
+            label="notification cleanup",
+            timeout=15.0,
+        )
+
+    def _run_setup_execute_commands(
+        self,
+        requests_module,
+        commands: list[str],
+        *,
+        label: str,
+        timeout: float = 20.0,
+    ) -> bool:
+        """Run a batch of `execute` setup commands on the evaluate server.
+
+        All commands are sent in one POST to ``/setup``. Failures are logged
+        instead of raised so callers can treat the batch as best-effort.
+
+        Returns:
+            True if the batch was empty or the server answered HTTP 200;
+            False for any other status or if the request raised.
+        """
+        if not commands:
+            return True
+        evaluate_base = self.config.evaluate_url or self.config.server_url
+        payload = {
+            "config": [
+                {"type": "execute", "parameters": {"command": cmd, "shell": True}}
+                for cmd in commands
+            ]
+        }
+        try:
+            resp = requests_module.post(
+                f"{evaluate_base}/setup",
+                json=payload,
+                timeout=timeout,
+            )
+            if resp.status_code == 200:
+                logger.info("%s applied (%d command(s))", label, len(commands))
+                return True
+            else:
+                logger.warning(
+                    "%s failed: HTTP %s %s",
+                    label,
+                    resp.status_code,
+                    resp.text[:200],
                 )
-            except Exception:
-                pass  # Best-effort; don't fail reset if notification kill fails
-        logger.debug("Dismissed system notifications")
+        except Exception as e:
+            logger.warning("%s request failed: %s", label, e)
+        return False
+
+    def _apply_clean_desktop_policy(self, requests_module) -> None:
+        """Apply deterministic desktop policy for train/eval UI parity.
+
+        The registry policy batch is sent once per adapter lifecycle, or on
+        every call when ``reapply_clean_desktop_each_reset`` is set. The
+        one-time latch is only set after the batch request succeeded, so a
+        failed attempt is retried on the next call instead of being skipped.
+        """
+        if self._clean_desktop_applied and not self.config.reapply_clean_desktop_each_reset:
+            # Keep notifications suppressed each reset even if policy is one-time.
+            self._dismiss_notifications(requests_module)
+            return
+
+        commands: list[str] = []
+        if self.config.clean_desktop:
+            commands.extend([
+                # Suppress OneDrive first-run backup prompts.
+                (
+                    'reg add "HKLM\\SOFTWARE\\Policies\\Microsoft\\Windows\\OneDrive" '
+                    '/v DisableFileSyncNGSC /t REG_DWORD /d 1 /f'
+                ),
+                # Prevent OneDrive auto-start prompts on login.
+                'reg delete "HKCU\\Software\\Microsoft\\Windows\\CurrentVersion\\Run" /v OneDrive /f',
+                # Suppress notification toasts/popovers.
+                (
+                    'reg add "HKCU\\Software\\Microsoft\\Windows\\CurrentVersion\\PushNotifications" '
+                    '/v ToastEnabled /t REG_DWORD /d 0 /f'
+                ),
+                # Disable common Windows suggestion surfaces/popups.
+                (
+                    'reg add "HKCU\\Software\\Microsoft\\Windows\\CurrentVersion\\ContentDeliveryManager" '
+                    '/v SubscribedContent-338389Enabled /t REG_DWORD /d 0 /f'
+                ),
+                (
+                    'reg add "HKCU\\Software\\Microsoft\\Windows\\CurrentVersion\\ContentDeliveryManager" '
+                    '/v SubscribedContent-338388Enabled /t REG_DWORD /d 0 /f'
+                ),
+                (
+                    'reg add "HKCU\\Software\\Microsoft\\Windows\\CurrentVersion\\ContentDeliveryManager" '
+                    '/v SubscribedContent-353694Enabled /t REG_DWORD /d 0 /f'
+                ),
+                (
+                    'reg add "HKCU\\Software\\Microsoft\\Windows\\CurrentVersion\\Explorer\\Advanced" '
+                    '/v ShowCopilotButton /t REG_DWORD /d 0 /f'
+                ),
+                (
+                    'reg add "HKCU\\Software\\Microsoft\\Windows\\CurrentVersion\\Explorer\\Advanced" '
+                    '/v TaskbarDa /t REG_DWORD /d 0 /f'
+                ),
+                (
+                    'reg add "HKCU\\Software\\Microsoft\\Windows\\CurrentVersion\\Explorer\\Advanced" '
+                    '/v TaskbarMn /t REG_DWORD /d 0 /f'
+                ),
+            ])
+
+        if self.config.clean_desktop or self.config.force_tray_icons:
+            commands.extend([
+                (
+                    'reg add "HKCU\\Software\\Microsoft\\Windows\\CurrentVersion\\Policies\\Explorer" '
+                    '/v HideSCANetwork /t REG_DWORD /d 0 /f'
+                ),
+                (
+                    'reg add "HKCU\\Software\\Microsoft\\Windows\\CurrentVersion\\Policies\\Explorer" '
+                    '/v HideSCAVolume /t REG_DWORD /d 0 /f'
+                ),
+                (
+                    'reg add "HKCU\\Software\\Microsoft\\Windows\\CurrentVersion\\Explorer" '
+                    '/v EnableAutoTray /t REG_DWORD /d 0 /f'
+                ),
+            ])
+
+        applied = True
+        if commands:
+            applied = self._run_setup_execute_commands(
+                requests_module,
+                commands,
+                label="clean desktop policy",
+                timeout=25.0,
+            )
+
+        # Keep this as a final pass to clear already displayed popups.
+        self._dismiss_notifications(requests_module)
+        self._environment_profile = self._collect_environment_profile()
+        if self._environment_profile:
+            logger.info("Desktop environment profile: %s", self._environment_profile)
+        # Latch only on success so a failed policy request is retried next reset.
+        self._clean_desktop_applied = applied
+
+    def _collect_environment_profile(self) -> dict[str, Any]:
+        """Capture key environment flags for reproducibility metadata.
+
+        Runs a PowerShell script via ``self.run_powershell`` that reads OS
+        version values and the desktop-policy registry values, then parses
+        the last JSON-object line of its output.
+
+        Returns:
+            The parsed profile plus the requested config flags, or ``{}`` if
+            the script produced no JSON object or any step raised.
+        """
+        script = r"""
+$ErrorActionPreference='SilentlyContinue'
+$os = Get-ItemProperty 'HKLM:\SOFTWARE\Microsoft\Windows NT\CurrentVersion'
+$one = (Get-ItemProperty 'HKLM:\SOFTWARE\Policies\Microsoft\Windows\OneDrive' -Name DisableFileSyncNGSC).DisableFileSyncNGSC
+$toast = (Get-ItemProperty 'HKCU:\Software\Microsoft\Windows\CurrentVersion\PushNotifications' -Name ToastEnabled).ToastEnabled
+$hideNet = (Get-ItemProperty 'HKCU:\Software\Microsoft\Windows\CurrentVersion\Policies\Explorer' -Name HideSCANetwork).HideSCANetwork
+$hideVol = (Get-ItemProperty 'HKCU:\Software\Microsoft\Windows\CurrentVersion\Policies\Explorer' -Name HideSCAVolume).HideSCAVolume
+$autoTray = (Get-ItemProperty 'HKCU:\Software\Microsoft\Windows\CurrentVersion\Explorer' -Name EnableAutoTray).EnableAutoTray
+$widgets = (Get-ItemProperty 'HKCU:\Software\Microsoft\Windows\CurrentVersion\Explorer\Advanced' -Name TaskbarDa).TaskbarDa
+$copilot = (Get-ItemProperty 'HKCU:\Software\Microsoft\Windows\CurrentVersion\Explorer\Advanced' -Name ShowCopilotButton).ShowCopilotButton
+$obj = [ordered]@{
+  image_version = $env:WAA_IMAGE_VERSION
+  os_product = $os.ProductName
+  os_release = $os.DisplayVersion
+  os_build = $os.CurrentBuildNumber
+  one_drive_disable_sync_ngsc = $one
+  toast_enabled = $toast
+  hide_network_icon = $hideNet
+  hide_volume_icon = $hideVol
+  enable_auto_tray = $autoTray
+  taskbar_widgets = $widgets
+  show_copilot_button = $copilot
+}
+$obj | ConvertTo-Json -Compress
+"""
+        try:
+            output = self.run_powershell(script).strip()
+            if not output:
+                return {}
+            profile: dict[str, Any] | None = None
+            # Take the last line that looks like a JSON object; other output is ignored.
+            for line in reversed(output.splitlines()):
+                line = line.strip()
+                if line.startswith("{") and line.endswith("}"):
+                    profile = json.loads(line)
+                    break
+            if profile is None:
+                return {}
+            profile["clean_desktop_requested"] = self.config.clean_desktop
+            profile["force_tray_icons_requested"] = self.config.force_tray_icons
+            if self.config.waa_image_version:
+                profile["configured_image_version"] = self.config.waa_image_version
+            return profile
+        except Exception as e:
+            logger.debug("Failed to collect environment profile: %s", e)
+            return {}
 
     # --- App-name to window-title mapping for post-setup focus ---
 
diff --git a/openadapt_evals/benchmarks/cli.py b/openadapt_evals/benchmarks/cli.py
index 6a6eb30..0b473d0 100644
--- a/openadapt_evals/benchmarks/cli.py
+++ b/openadapt_evals/benchmarks/cli.py
@@ -142,6 +142,50 @@ def _resolve_vm_context(args: argparse.Namespace) -> tuple[str, str] | None:
     return name, rg
 
 
+def _requested_environment_flags(args: argparse.Namespace) -> dict[str, str | bool | None]:
+    """Collect desktop-parity flags from CLI args; absent attributes become False/None."""
+    return {
+        "clean_desktop": bool(getattr(args, "clean_desktop", False)),
+        "force_tray_icons": bool(getattr(args, "force_tray_icons", False)),
+        "waa_image_version": getattr(args, "waa_image_version", None),
+    }
+
+
+def _write_run_environment_metadata(
+    benchmark_dir: Path,
+    *,
+    requested: dict[str, str | bool | None],
+    adapter,
+    server_url: str,
+    evaluate_url: str | None,
+) -> None:
+    """Persist run environment flags/profile under "environment" in benchmark metadata.json."""
+    metadata_path = benchmark_dir / "metadata.json"
+    metadata: dict = {}
+    if metadata_path.exists():
+        try:
+            loaded = json.loads(metadata_path.read_text())
+            metadata = loaded if isinstance(loaded, dict) else {}  # non-object JSON can't be patched
+        except Exception as e:
+            logger.warning("Could not parse metadata.json for environment patch: %s", e)
+
+    observed = {}
+    if hasattr(adapter, "get_environment_profile"):
+        try:
+            observed = adapter.get_environment_profile() or {}
+        except Exception as e:
+            logger.debug("Could not read adapter environment profile: %s", e)
+
+    metadata["environment"] = {
+        "requested": requested,
+        "observed": observed,
+        "server_url": server_url,
+        "evaluate_url": evaluate_url,
+    }
+    benchmark_dir.mkdir(parents=True, exist_ok=True)
+    metadata_path.write_text(json.dumps(metadata, indent=2))
+
+
 def cmd_mock(args: argparse.Namespace) -> int:
     """Run mock evaluation (no Windows VM required)."""
     from openadapt_evals.benchmarks import (
@@ -289,6 +333,9 @@ def cmd_run(args: argparse.Namespace) -> int:
         evaluate_url=evaluate_url,
         max_steps=args.max_steps,
         waa_examples_path=waa_examples_path,
+        clean_desktop=getattr(args, "clean_desktop", False),
+        force_tray_icons=getattr(args, "force_tray_icons", False),
+        waa_image_version=getattr(args, "waa_image_version", None),
     )
     adapter = WAALiveAdapter(config)
 
@@ -437,6 +484,13 @@ def cmd_run(args: argparse.Namespace) -> int:
     print(f"Avg score:    {metrics['avg_score']:.3f}")
     print(f"Avg steps:    {metrics['avg_steps']:.1f}")
     benchmark_dir = Path(eval_config.output_dir) / eval_config.run_name
+    _write_run_environment_metadata(
+        benchmark_dir,
+        requested=_requested_environment_flags(args),
+        adapter=adapter,
+        server_url=server_url,
+        evaluate_url=evaluate_url,
+    )
     print(f"\nResults saved to: {benchmark_dir}")
 
     no_open = getattr(args, "no_open", False)
@@ -479,6 +533,9 @@ def cmd_live(args: argparse.Namespace) -> int:
         evaluate_url=evaluate_url,
         max_steps=args.max_steps,
         waa_examples_path=waa_examples_path,
+        clean_desktop=getattr(args, "clean_desktop", False),
+        force_tray_icons=getattr(args, "force_tray_icons", False),
+        waa_image_version=getattr(args, "waa_image_version", None),
     )
     adapter = WAALiveAdapter(config)
 
@@ -633,6 +690,14 @@ def cmd_live(args: argparse.Namespace) -> int:
     print(f"Avg steps:    {metrics['avg_steps']:.1f}")
 
     if eval_config:
+        benchmark_dir = Path(eval_config.output_dir) / eval_config.run_name
+        _write_run_environment_metadata(
+            benchmark_dir,
+            requested=_requested_environment_flags(args),
+            adapter=adapter,
+            server_url=args.server,
+            evaluate_url=evaluate_url,
+        )
         print(f"\nResults saved to: {eval_config.output_dir}/{eval_config.run_name}")
 
     return 0
@@ -865,7 +930,15 @@ def patch_evaluate_endpoint() -> bool:
         print("[6/6] Running single-task live evaluation...")
 
         agent = ScriptedAgent([BenchmarkAction(type="done")])
-        adapter = WAALiveAdapter(WAALiveConfig(server_url=server_url, max_steps=args.max_steps))
+        adapter = WAALiveAdapter(
+            WAALiveConfig(
+                server_url=server_url,
+                max_steps=args.max_steps,
+                clean_desktop=getattr(args, "clean_desktop", False),
+                force_tray_icons=getattr(args, "force_tray_icons", False),
+                waa_image_version=getattr(args, "waa_image_version", None),
+            )
+        )
 
         eval_config = EvaluationConfig(
             max_steps=args.max_steps,
@@ -891,6 +964,15 @@ def patch_evaluate_endpoint() -> bool:
         print(f"Success rate: {metrics['success_rate']:.1%}")
         print(f"Avg score:    {metrics['avg_score']:.3f}")
         print(f"Avg steps:    {metrics['avg_steps']:.1f}")
+        benchmark_dir = Path(eval_config.output_dir) / eval_config.run_name
+        _write_run_environment_metadata(
+            benchmark_dir,
+            requested=_requested_environment_flags(args),
+            adapter=adapter,
+            server_url=server_url,
+            evaluate_url=None,
+        )
+        print(f"\nResults saved to: {benchmark_dir}")
 
         return 0
 
@@ -2299,6 +2381,12 @@ def main() -> int:
                            help="Max retries per step when using --controller (default: 2)")
     run_parser.add_argument("--max-replans", type=int, default=2,
                            help="Max replans when using --controller (default: 2)")
+    run_parser.add_argument("--clean-desktop", action="store_true",
+                           help="Apply deterministic clean-desktop policy (disable OneDrive/toast/taskbar noise)")
+    run_parser.add_argument("--force-tray-icons", action="store_true",
+                           help="Force network/audio tray icons visible for stable click-coordinate tasks")
+    run_parser.add_argument("--waa-image-version", type=str, default=None,
+                           help="Pinned WAA image version label to record in run metadata")
 
     # Live evaluation (full control)
     live_parser = subparsers.add_parser("live", help="Run live evaluation against WAA server (full control)")
@@ -2321,6 +2409,12 @@ def main() -> int:
                             help="Path to WAA evaluation_examples_windows directory for task configs")
     live_parser.add_argument("--output", type=str, help="Output directory for traces")
     live_parser.add_argument("--run-name", type=str, help="Name for this evaluation run")
+    live_parser.add_argument("--clean-desktop", action="store_true",
+                            help="Apply deterministic clean-desktop policy (disable OneDrive/toast/taskbar noise)")
+    live_parser.add_argument("--force-tray-icons", action="store_true",
+                            help="Force network/audio tray icons visible for stable click-coordinate tasks")
+    live_parser.add_argument("--waa-image-version", type=str, default=None,
+                            help="Pinned WAA image version label to record in run metadata")
 
     # Probe server
     probe_parser = subparsers.add_parser("probe", help="Check if WAA server is reachable")
@@ -2487,6 +2581,12 @@ def main() -> int:
                                   help="Save execution traces (viewer artifacts)")
     smoke_live_parser.add_argument("--no-stop-vm", dest="stop_vm", action="store_false",
                                   help="Do not deallocate VM after smoke test")
+    smoke_live_parser.add_argument("--clean-desktop", action="store_true",
+                                  help="Apply deterministic clean-desktop policy before smoke task")
+    smoke_live_parser.add_argument("--force-tray-icons", action="store_true",
+                                  help="Force network/audio tray icons visible during smoke task")
+    smoke_live_parser.add_argument("--waa-image-version", type=str, default=None,
+                                  help="Pinned WAA image version label to record in run metadata")
     smoke_live_parser.set_defaults(stop_vm=True)
 
     dashboard_parser = subparsers.add_parser("dashboard", help="Generate VM usage dashboard")
diff --git a/scripts/run_dc_eval.py b/scripts/run_dc_eval.py
index 3379f65..21ca8fa 100644
--- a/scripts/run_dc_eval.py
+++ b/scripts/run_dc_eval.py
@@ -14,6 +14,10 @@
 
     # All 12 tasks
     python scripts/run_dc_eval.py --agent api-claude-cu
+
+    # Deterministic desktop parity + pinned image version metadata
+    python scripts/run_dc_eval.py --agent api-claude-cu --clean-desktop \\
+      --force-tray-icons --waa-image-version win11-24h2-2026-03-04
 """
 
 from __future__ import annotations
@@ -209,6 +213,21 @@ def main() -> int:
     parser.add_argument("--evaluate-url", default="http://localhost:5050")
     parser.add_argument("--max-steps", type=int, default=15)
     parser.add_argument("--output", default="benchmark_results")
+    parser.add_argument(
+        "--clean-desktop",
+        action="store_true",
+        help="Apply deterministic clean-desktop policy (OneDrive/toast suppression + tray parity)",
+    )
+    parser.add_argument(
+        "--force-tray-icons",
+        action="store_true",
+        help="Force network/audio tray icons visible before each run",
+    )
+    parser.add_argument(
+        "--waa-image-version",
+        default=None,
+        help="Pinned WAA image version label to record in run metadata",
+    )
     parser.add_argument("--tasks", help="Comma-separated task IDs or prefixes (default: all 12)")
     parser.add_argument("--start-from", type=int, default=0, help="Task index to start from")
     parser.add_argument("--vm-ip", default=None, help="VM IP (auto-detected if omitted)")
@@ -334,6 +353,12 @@ def main() -> int:
                 "--max-retries", str(args.max_retries),
                 "--max-replans", str(args.max_replans),
             ])
+        if args.clean_desktop:
+            cmd.append("--clean-desktop")
+        if args.force_tray_icons:
+            cmd.append("--force-tray-icons")
+        if args.waa_image_version:
+            cmd.extend(["--waa-image-version", args.waa_image_version])
 
         result = subprocess.run(cmd)
         elapsed = time.time() - task_start
diff --git a/scripts/run_eval_pipeline.py b/scripts/run_eval_pipeline.py
index 02bca0a..aa56021 100644
--- a/scripts/run_eval_pipeline.py
+++ b/scripts/run_eval_pipeline.py
@@ -28,6 +28,10 @@
 
     # Use AWS instead of Azure
     python scripts/run_eval_pipeline.py --cloud aws --vm-name waa-pool-00
+
+    # Train/eval UI parity mode (suppresses OneDrive/toast noise, pins image tag)
+    python scripts/run_eval_pipeline.py --tasks 04d9aeaf --clean-desktop \\
+        --force-tray-icons --waa-image-version win11-24h2-2026-03-04
 """
 
 from __future__ import annotations
@@ -436,6 +440,9 @@ def _run_eval(
     vm_ip: str,
     vm_user: str,
     tunnel_manager: SSHTunnelManager,
+    clean_desktop: bool = False,
+    force_tray_icons: bool = False,
+    waa_image_version: str | None = None,
 ) -> dict[str, dict]:
     """Run all eval conditions sequentially with health checks."""
     results = {}
@@ -479,6 +486,12 @@ def _run_eval(
             "--output", str(output_dir),
             "--run-name", run_name,
         ]
+        if clean_desktop:
+            cmd.append("--clean-desktop")
+        if force_tray_icons:
+            cmd.append("--force-tray-icons")
+        if waa_image_version:
+            cmd.extend(["--waa-image-version", waa_image_version])
         if demo_path:
             cmd.extend(["--demo", str(demo_path.resolve())])
 
@@ -577,6 +590,21 @@ def build_parser() -> argparse.ArgumentParser:
     parser.add_argument("--agent", default="api-claude-cu", help="Agent type")
     parser.add_argument("--max-steps", type=int, default=15)
     parser.add_argument("--output", default=str(DEFAULT_OUTPUT))
+    parser.add_argument(
+        "--clean-desktop",
+        action="store_true",
+        help="Apply deterministic clean-desktop policy (OneDrive/toast suppression + tray parity)",
+    )
+    parser.add_argument(
+        "--force-tray-icons",
+        action="store_true",
+        help="Force network/audio tray icons visible before each run",
+    )
+    parser.add_argument(
+        "--waa-image-version",
+        default=None,
+        help="Pinned WAA image version label to record in run metadata",
+    )
     parser.add_argument("--server", default="http://localhost:5001")
     parser.add_argument("--evaluate-url", default="http://localhost:5050")
     parser.add_argument(
@@ -823,6 +851,9 @@ def main() -> int:
         vm_ip=vm_ip or "",
         vm_user=vm_user,
         tunnel_manager=tunnel_manager,
+        clean_desktop=args.clean_desktop,
+        force_tray_icons=args.force_tray_icons,
+        waa_image_version=args.waa_image_version,
     )
 
     # ── Phase 4: Summary ──────────────────────────────────────────────