fix(test): unbreak qwen serve integration suites after daemon batch merge (#5041)

tanzhenxin · web-flow · commit fa684552b07e · 2026-06-12T19:22:23.000+08:00
Three integration tests have failed every nightly Release and E2E run since the daemon-mode feature batch (#4490) merged, because these suites only run post-merge:

- routes: resync the capabilities envelope baseline with the features the batch added (verified against a live daemon), and strip the env toggles that flip conditional tags so the exact-equality assertion is hermetic on dev machines.
- baseline: the 2xN MCP grandchildren tripwire fired as designed — the workspace MCP pool eliminated the bootstrap/session duplicate discovery. Assert exactly N pooled children and cross-check the pool's per-server accounting against pgrep.
- streaming: the permission test could finish with its turn still blocked on a second permission request nobody would ever answer; the abandoned request wedges the shared session's prompt FIFO and the downstream Last-Event-ID resume test times out waiting for a turn_complete that never comes (reproduced empirically). Pin the session to default approval mode (hermetic vs host user settings) and cancel the possibly-in-flight turn before finishing.

The daemon-side wedge (abandoned permission request blocks the FIFO until an explicit cancel) is real beyond tests and tracked separately.
diff --git a/integration-tests/cli/qwen-serve-baseline.test.ts b/integration-tests/cli/qwen-serve-baseline.test.ts
@@ -443,41 +443,35 @@ async function measureRssAtSessionCount(sessionCount: number): Promise<{
       }, 120_000);
 
       // PR 14b cross-check: validate the daemon's in-process MCP
-      // accounting on `GET /workspace/mcp` (`clientCount`, the field
-      // SDK consumers and dashboards see, and the same source the
-      // push-event channel — `mcp_budget_warning` /
-      // `mcp_child_refused_batch` — reads) against external `pgrep -P`
+      // accounting on `GET /workspace/mcp` against external `pgrep -P`
       // measurement.
       //
-      // Architectural note (PR 22a): a `qwen serve` ACP child runs
-      // two `Config` objects, each carrying its own
-      // `McpClientManager`. The bootstrap Config (`runAcpAgent` →
-      // `config.initialize`) discovers MCP servers when the child
-      // starts, and `/workspace/mcp` reads its manager via
-      // `buildWorkspaceMcpStatus(this.config)` (`acpAgent.ts:1399`).
-      // The per-session Config (`newSessionConfig` →
-      // `config.initialize`) spawns a SECOND set of MCP children for
-      // the SAME servers — its accounting is NOT what the
-      // workspace-level snapshot reflects. So pgrep observes
-      // `(1 + sessionCount) * MCP_SERVERS_CONFIGURED` grandchildren
-      // while `clientCount` stays at `MCP_SERVERS_CONFIGURED`.
+      // Architectural note (F2 workspace pool): the daemon hosts a
+      // workspace-shared MCP transport pool (`QwenAgent.mcpPool`).
+      // All sessions of a workspace share ONE transport per configured
+      // server, so pgrep observes exactly `MCP_SERVERS_CONFIGURED`
+      // grandchildren regardless of session count. (Pre-F2, bootstrap
+      // + per-session Configs each ran their own `McpClientManager`,
+      // and this test asserted the historical 2×N duplication.)
+      // Pool accounting surfaces per server cell as `entryCount` /
+      // `entrySummary`; the top-level `clientCount` field reflects the
+      // workspace budget controller's reserved count — 0 when budgets
+      // are off (this suite), NOT the live transport count.
       //
       // What this test validates:
-      // 1. `clientCount` is exactly the configured server count
-      //    (bootstrap manager accounting is honest).
-      // 2. pgrep observes the architectural 2×N grandchildren after
-      //    one session is created — encoded literally so a future
-      //    refactor that unifies bootstrap + session managers (#4175
-      //    follow-up to drop the duplicate discovery) fails this
-      //    assertion and forces a deliberate test update.
+      // 1. pgrep observes exactly N grandchildren after a session is
+      //    created — encoded literally so a refactor that reintroduces
+      //    per-session MCP children fails this assertion and forces a
+      //    deliberate test update (same tripwire spirit as the pre-F2
+      //    2×N assertion this replaces).
+      // 2. Pool accounting is honest: per-server `entryCount` sums to
+      //    the observed pgrep count (no amplification slack at idle —
+      //    the fixtures are stdio-only).
       // 3. `clientCount` NEVER exceeds the observed pgrep count —
       //    the original "snapshot must never over-report" guard.
       //
-      // Skip-gated like the parent describe (POSIX, non-sandbox);
-      // idle MCP fixtures are stdio-only so the relationship between
-      // `clientCount` and pgrep is exact (no amplification slack
-      // required at idle).
-      it('clientCount matches external pgrep observation', async () => {
+      // Skip-gated like the parent describe (POSIX, non-sandbox).
+      it('pool accounting matches external pgrep observation', async () => {
         const ws = makeTempWorkspace('mcp-counter');
         let daemon: SpawnedDaemon | undefined;
         try {
@@ -490,28 +484,35 @@ async function measureRssAtSessionCount(sessionCount: number): Promise<{
           daemon = await spawnDaemon({ workspaceCwd: ws });
           await daemon.client.createOrAttachSession({ workspaceCwd: ws });
 
-          // Wait until the OS sees the FULL post-session set
-          // (`MCP_SERVERS_CONFIGURED * 2` grandchildren — see the
+          // Wait until the OS sees the full pooled set
+          // (`MCP_SERVERS_CONFIGURED` grandchildren — see the
           // architectural note above), then read the snapshot.
           // pgrep first to lock the comparison floor; snapshot
           // second so the daemon can't sneak in a new connect
           // between the two reads.
-          const expectedGrandchildren = MCP_SERVERS_CONFIGURED * 2;
           const observed = await waitForMcpGrandchildren(
             daemon.daemon.pid!,
-            expectedGrandchildren,
+            MCP_SERVERS_CONFIGURED,
           );
           const snapshot = await daemon.client.workspaceMcp();
 
-          // (1) Bootstrap manager accounting is honest.
-          expect(snapshot.clientCount).toBe(MCP_SERVERS_CONFIGURED);
-          // (2) pgrep observes both managers' children. If a future
-          // refactor unifies them, change this to
-          // `MCP_SERVERS_CONFIGURED` (and update the architectural
-          // note above).
-          expect(observed.mcpGrandchildren.length).toBe(expectedGrandchildren);
-          // (3) Snapshot never over-reports OS reality. Holds under
-          // both the current 2× regime and the unified 1× future.
+          // (1) One pooled transport per configured server — no
+          // per-session amplification. If this fails with MORE
+          // children, per-session MCP spawning has been reintroduced;
+          // update the architectural note above deliberately.
+          expect(observed.mcpGrandchildren.length).toBe(MCP_SERVERS_CONFIGURED);
+          // (2) Pool accounting is honest: entryCount sums to the
+          // observed process count. Structural narrowing: the daemon
+          // emits `entryCount` on pool-backed cells but the SDK's
+          // `DaemonWorkspaceMcpServerStatus` doesn't carry the F2
+          // pool fields yet.
+          const pooledEntries = snapshot.servers.reduce(
+            (sum, server) =>
+              sum + ((server as { entryCount?: number }).entryCount ?? 0),
+            0,
+          );
+          expect(pooledEntries).toBe(observed.mcpGrandchildren.length);
+          // (3) Snapshot never over-reports OS reality.
           expect(snapshot.clientCount).toBeLessThanOrEqual(
             observed.mcpGrandchildren.length,
           );
diff --git a/integration-tests/cli/qwen-serve-routes.test.ts b/integration-tests/cli/qwen-serve-routes.test.ts
@@ -70,7 +70,26 @@ beforeAll(async () => {
       '--workspace',
       REPO_ROOT,
     ],
-    { stdio: ['ignore', 'pipe', 'pipe'] },
+    {
+      stdio: ['ignore', 'pipe', 'pipe'],
+      // Strip the env toggles that flip conditional capability tags
+      // (`prompt_absolute_deadline`, `writer_idle_timeout`,
+      // `rate_limit`, and the pool tags via the kill switch). The
+      // capabilities baseline below assumes their default state; a
+      // dev machine exporting any of these would otherwise fail the
+      // exact-equality assertion.
+      env: Object.fromEntries(
+        Object.entries(process.env).filter(
+          ([k]) =>
+            ![
+              'QWEN_SERVE_PROMPT_DEADLINE_MS',
+              'QWEN_SERVE_WRITER_IDLE_TIMEOUT_MS',
+              'QWEN_SERVE_RATE_LIMIT',
+              'QWEN_SERVE_NO_MCP_POOL',
+            ].includes(k),
+        ),
+      ),
+    },
   );
   // Read stdout until we see the listening line + parse the port.
   port = await new Promise<number>((resolve, reject) => {
@@ -187,6 +206,15 @@ describe('qwen serve — capabilities envelope', () => {
     // Order must match `SERVE_CAPABILITY_REGISTRY` in
     // `packages/cli/src/serve/capabilities.ts` and the unit-level
     // baseline features in `packages/cli/src/serve/server.test.ts`.
+    //
+    // Conditional tags absent under this suite's spawn flags (no
+    // `--require-auth` / `--allow-origin` / deadline env vars /
+    // rate-limit opt-in): `require_auth`, `allow_origin`,
+    // `prompt_absolute_deadline`, `writer_idle_timeout`, `rate_limit`.
+    // Pool tags (`mcp_workspace_pool`, `mcp_pool_restart`) ARE present
+    // because the workspace MCP pool is on by default, as are
+    // `workspace_settings` / `workspace_reload` (the CLI serve path
+    // always wires `persistSetting` and the workspace service).
     expect(caps.features).toEqual([
       'health',
       'capabilities',
@@ -209,25 +237,45 @@ describe('qwen serve — capabilities envelope', () => {
       'workspace_mcp',
       'workspace_skills',
       'workspace_providers',
+      'auth_provider_install',
       'workspace_memory',
       'workspace_agents',
+      'workspace_agent_generate',
       'workspace_env',
       'workspace_preflight',
       'session_context',
+      'session_context_usage',
       'session_supported_commands',
       'session_tasks',
+      'session_stats',
       'session_close',
       'session_metadata',
       'mcp_guardrails',
+      'workspace_mcp_manage',
       'mcp_guardrail_events',
+      'mcp_server_runtime_mutation',
       'workspace_file_read',
       'workspace_file_bytes',
       'workspace_file_write',
       'session_approval_mode_control',
       'workspace_tool_toggle',
+      'workspace_settings',
       'workspace_init',
       'workspace_mcp_restart',
+      'session_recap',
+      'session_btw',
+      'mcp_workspace_pool',
+      'mcp_pool_restart',
       'auth_device_flow',
+      'permission_mediation',
+      'non_blocking_prompt',
+      'session_language',
+      'session_rewind',
+      'workspace_hooks',
+      'session_hooks',
+      'workspace_extensions',
+      'session_branch',
+      'workspace_reload',
     ]);
   });
 });
diff --git a/integration-tests/cli/qwen-serve-streaming.test.ts b/integration-tests/cli/qwen-serve-streaming.test.ts
@@ -226,6 +226,13 @@ describeLLM('qwen serve — multi-client first-responder permission', () => {
       workspaceCwd: REPO_ROOT,
     });
 
+    // Pin the session to `default` approval mode. The ACP child
+    // inherits the host's user-level settings — a developer machine
+    // with `approvalMode: yolo` auto-approves the write below, no
+    // permission_request ever fires, and this test fails only
+    // locally. CI passes because its HOME has no user settings.
+    await client.setSessionApprovalMode(session.sessionId, 'default');
+
     const ac1 = new AbortController();
     const ac2 = new AbortController();
     const seen1: DaemonEvent[] = [];
@@ -315,6 +322,21 @@ describeLLM('qwen serve — multi-client first-responder permission', () => {
       promptTask.catch(() => undefined),
       new Promise((r) => setTimeout(r, 30_000)),
     ]);
+    // The race above tolerates the turn still running (slow model).
+    // But ABANDONING an in-flight turn wedges the shared session: if
+    // the model asks for a SECOND permission after the allow_once
+    // vote, nobody is left to answer it, the pending request blocks
+    // the turn forever, and the per-session prompt FIFO holds every
+    // later prompt behind it — the Last-Event-ID resume test below
+    // then times out waiting for a turn_complete that never comes
+    // (the exact 60s × 3-retry hang from the 2026-06-12 nightly).
+    // Cancel the active prompt so the session is clean for the next
+    // test; harmless when the turn already finished.
+    await client.cancel(session.sessionId).catch(() => undefined);
+    await Promise.race([
+      promptTask.catch(() => undefined),
+      new Promise((r) => setTimeout(r, 5_000)),
+    ]);
     ac1.abort();
     ac2.abort();
     await Promise.all([sub1, sub2]);