Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 44 additions & 8 deletions ios-qa/daemon/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -56,18 +56,31 @@ export async function startDaemon(opts: DaemonOptions): Promise<RunningDaemon |

const tokenStore = new SessionTokenStore();
let tunnel: DeviceTunnel | null = null;
let cachedTunnelAt = 0;

// No TTL on the tunnel cache. Rationale: bootstrapTunnel rotates the boot
// token and the iOS-side StateServer *deletes* the boot-token file on disk
// immediately after handling /auth/rotate (see StateServer.swift.template
// handleAuthRotate). The rotated bearer lives only in this daemon's memory.
// If we invalidate on a wall-clock timer, the next request re-runs
// bootstrapTunnel which can no longer read the now-deleted boot token →
// boot_token_unavailable → every subsequent request returns 503
// device_not_connected. Instead, hold the tunnel for the lifetime of the
// daemon and only drop it when the proxy reports the underlying CoreDevice
// route is dead (ECONNREFUSED / EHOSTUNREACH → 503 device_disconnected).
// The keepalive in devicectl.startTunnelKeepalive prevents the route from
// going stale in practice.
// Bootstrap currently in flight, shared by every caller that arrives while the
// cache is empty. The tunnel provider rotates a one-shot boot token, so two
// overlapping provider calls would race: the loser can no longer read the
// deleted token and could overwrite a good tunnel with a failed result.
let tunnelBootstrap: Promise<DeviceTunnel | null> | null = null;

/**
 * Returns the cached device tunnel, bootstrapping it on first use.
 *
 * There is deliberately no TTL: once obtained, the tunnel is reused until
 * invalidateTunnel() clears it. Concurrent callers that find the cache empty
 * all await the same provider invocation instead of each starting their own.
 *
 * @returns the tunnel, or null when no tunnelProvider is configured or the
 *          provider itself resolved to null.
 * @throws whatever opts.tunnelProvider throws or rejects with; a failed
 *         attempt is not cached, so the next call retries.
 */
const getTunnel = async (): Promise<DeviceTunnel | null> => {
  if (tunnel) return tunnel;

  const provider = opts.tunnelProvider;
  if (!provider) return tunnel;

  let attempt = tunnelBootstrap;
  if (!attempt) {
    // Wrap in an async thunk so a synchronous throw from the provider becomes
    // a rejection and goes through the same release path as an async failure.
    const started: Promise<DeviceTunnel | null> = (async () => provider())();
    const release = (): void => {
      if (tunnelBootstrap === started) tunnelBootstrap = null;
    };
    // Release the slot on success and on failure; handling both arms here also
    // keeps this bookkeeping chain from raising an unhandled rejection.
    started.then(release, release);
    tunnelBootstrap = started;
    attempt = started;
  }

  tunnel = await attempt;
  return tunnel;
};

// Drops the cached tunnel. getTunnel() only consults opts.tunnelProvider when
// `tunnel` is null, so the first request after this call re-runs the provider.
// The request handlers invoke this when the proxy answers 503
// device_disconnected.
const invalidateTunnel = (): void => {
tunnel = null;
};

// 2. Tailnet probe (fail-closed).
const probe = opts.tailnetEnabled
? (opts.probeImpl ? await opts.probeImpl() : await probeTailscale(opts.tailnetSocketPath))
Expand All @@ -80,7 +93,7 @@ export async function startDaemon(opts: DaemonOptions): Promise<RunningDaemon |

// 3. Loopback listener (full surface).
const loopbackServer = createServer(async (req, res) => {
await handleLoopback({ req, res, tokenStore, getTunnel });
await handleLoopback({ req, res, tokenStore, getTunnel, invalidateTunnel });
});
// Use port 0 for OS-assigned port when test/random port collisions are a risk.
const requestedPort = opts.loopbackPort;
Expand All @@ -91,7 +104,7 @@ export async function startDaemon(opts: DaemonOptions): Promise<RunningDaemon |
// mode this can collide; we try the actualPort first and skip ipv6 if it
// fails (tests don't exercise ::1 explicitly).
const loopbackServerV6 = createServer(async (req, res) => {
await handleLoopback({ req, res, tokenStore, getTunnel });
await handleLoopback({ req, res, tokenStore, getTunnel, invalidateTunnel });
});
let v6Bound = false;
try {
Expand All @@ -112,6 +125,7 @@ export async function startDaemon(opts: DaemonOptions): Promise<RunningDaemon |
res,
tokenStore,
getTunnel,
invalidateTunnel,
whoIsImpl: opts.whoIsImpl ?? ((addr) => whoIs(addr, opts.tailnetSocketPath)),
});
});
Expand Down Expand Up @@ -172,6 +186,7 @@ interface HandlerCtx {
res: ServerResponse;
tokenStore: SessionTokenStore;
getTunnel: () => Promise<DeviceTunnel | null>;
invalidateTunnel: () => void;
}

function readBody(req: IncomingMessage, maxBytes = 1_048_576): Promise<Buffer | { error: 'body_too_large' }> {
Expand Down Expand Up @@ -215,7 +230,7 @@ function sendJson(res: ServerResponse, status: number, body: unknown): void {
* loopback bind itself is the boundary).
*/
async function handleLoopback(ctx: HandlerCtx): Promise<void> {
const { req, res, tokenStore, getTunnel } = ctx;
const { req, res, tokenStore, getTunnel, invalidateTunnel } = ctx;
const url = parseUrl(req.url ?? '/');
const path = url.pathname ?? '/';
const method = req.method ?? 'GET';
Expand Down Expand Up @@ -259,13 +274,28 @@ async function handleLoopback(ctx: HandlerCtx): Promise<void> {
const sessionId = (req.headers['x-session-id'] as string | undefined) ?? null;
const agentIdentity = (req.headers['x-agent-identity'] as string | undefined) ?? undefined;
const upstream = await proxyToDevice({ inbound: req, body, tunnel, sessionId, agentIdentity });
// If the underlying CoreDevice tunnel route went stale (ECONNREFUSED /
// EHOSTUNREACH surface as 503 device_disconnected from proxy.ts), drop
// the cached tunnel so the next request triggers a fresh bootstrap.
if (upstream.status === 503 && isDeviceDisconnected(upstream.body)) {
invalidateTunnel();
}
res.writeHead(upstream.status, upstream.headers);
res.end(upstream.body);
} catch (err) {
sendJson(res, 500, { error: 'internal_error', detail: (err as Error).message });
}
}

/**
 * Reports whether a proxied response body is a JSON document whose top-level
 * `error` field equals `device_disconnected`.
 *
 * Any failure while decoding or inspecting the body (invalid JSON, a JSON
 * `null`, ...) is treated as "not a disconnect" and yields false.
 */
function isDeviceDisconnected(body: Buffer): boolean {
  let disconnected = false;
  try {
    const { error } = JSON.parse(body.toString('utf-8')) as { error?: string };
    disconnected = error === 'device_disconnected';
  } catch {
    // Unparseable or non-object body: leave the flag at false.
  }
  return disconnected;
}

/**
 * Handler context for the tailnet listener: everything in HandlerCtx plus the
 * peer-lookup function.
 */
interface TailnetCtx extends HandlerCtx {
// Resolves an address to an identity record. startDaemon defaults this to
// whoIs(addr, opts.tailnetSocketPath) when opts.whoIsImpl is not supplied.
// NOTE(review): presumably `addr` is the remote peer address — confirm at the call site.
whoIsImpl: (addr: string) => Promise<{ identity: string; raw: unknown }>;
}
Expand All @@ -274,7 +304,7 @@ interface TailnetCtx extends HandlerCtx {
* Tailnet handler — locked allowlist + capability tiers.
*/
async function handleTailnet(ctx: TailnetCtx): Promise<void> {
const { req, res, tokenStore, getTunnel, whoIsImpl } = ctx;
const { req, res, tokenStore, getTunnel, invalidateTunnel, whoIsImpl } = ctx;
const url = parseUrl(req.url ?? '/');
const path = url.pathname ?? '/';
const method = req.method ?? 'GET';
Expand Down Expand Up @@ -372,6 +402,12 @@ async function handleTailnet(ctx: TailnetCtx): Promise<void> {
agentIdentity: session.identity,
});

// If the CoreDevice tunnel went stale, drop the cached tunnel — next
// request will re-bootstrap. See handleLoopback for the same path.
if (upstream.status === 503 && isDeviceDisconnected(upstream.body)) {
invalidateTunnel();
}

// Audit the action (mutating endpoints only).
if (requiredCapability !== 'observe') {
await writeAudit({
Expand Down
173 changes: 173 additions & 0 deletions ios-qa/daemon/test/tunnel-cache.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
// tunnel-cache.test.ts
//
// Regression coverage for the tunnel-cache invalidation policy.
//
// Background: bootstrapTunnel rotates the StateServer boot token and the
// iOS-side StateServer *deletes* the boot-token file on disk immediately
// after handling /auth/rotate. The rotated bearer lives only in the
// daemon's memory. A wall-clock TTL on the tunnel cache therefore caused
// the daemon to re-bootstrap after the TTL window and fail with
// boot_token_unavailable on every subsequent request (observed live on an
// iPhone 12 Pro: first ~30s of /ios-qa worked, then 100% 503s).
//
// Correct policy:
// - Cache the tunnel for the lifetime of the daemon (no TTL).
// - Invalidate the cache only when the proxy reports the underlying
// CoreDevice route is dead (503 device_disconnected).
//
// These tests exercise both legs via the tunnelProvider injection point.

import { describe, test, expect, beforeEach, afterEach } from 'bun:test';
import { createServer } from 'http';
import type { Server } from 'http';
import { mkdtempSync, rmSync } from 'fs';
import { tmpdir } from 'os';
import { join } from 'path';
import { startDaemon, type RunningDaemon } from '../src/index';
import type { DeviceTunnel } from '../src/proxy';

// Handle to the fake upstream device server created by startStub().
interface Stub {
// The underlying HTTP server; tests close it during teardown.
server: Server;
// Port the server is listening on (0 if the address could not be read).
port: number;
// Mutable liveness flag: while `value` is false the stub destroys every
// incoming request's socket instead of answering.
alive: { value: boolean };
}

/**
 * Starts a throwaway HTTP server on an OS-assigned 127.0.0.1 port that stands
 * in for the device-side upstream.
 *
 * While `alive.value` is true it answers `/screenshot` with a 200 JSON payload
 * and every other URL with a 404 JSON error. Once the flag is flipped to false
 * each incoming request has its socket destroyed, so the client's HTTP request
 * fails with an error instead of receiving a response.
 *
 * @returns a promise resolving to the server, its bound port, and the flag.
 */
function startStub(): Promise<Stub> {
  const alive = { value: true };

  return new Promise<Stub>((resolve) => {
    const server = createServer((req, res) => {
      // "Dead" mode: tear the connection down without replying.
      if (!alive.value) {
        req.socket.destroy();
        return;
      }

      // Drain the request body so the 'end' event fires.
      const received: Buffer[] = [];
      req.on('data', (piece) => received.push(piece));
      req.on('end', () => {
        const isScreenshot = req.url === '/screenshot';
        const status = isScreenshot ? 200 : 404;
        const payload = isScreenshot ? { png_base64: 'abc=' } : { error: 'not_found' };
        res.writeHead(status, { 'content-type': 'application/json' });
        res.end(JSON.stringify(payload));
      });
    });

    server.listen(0, '127.0.0.1', () => {
      const bound = server.address();
      let port = 0;
      if (typeof bound === 'object' && bound) {
        port = bound.port;
      }
      resolve({ server, port, alive });
    });
  });
}

/**
 * Issues a single HTTP request with the given method and returns the response
 * status together with the full body decoded as text.
 */
async function fetchWith(method: string, url: string): Promise<{ status: number; bodyText: string }> {
  const response = await fetch(url, { method });
  const { status } = response;
  const bodyText = await response.text();
  return { status, bodyText };
}

// Suite: drives a real daemon (started via startDaemon) against a local stub
// upstream and counts tunnelProvider invocations to observe when the daemon
// bootstraps a tunnel versus reusing its cached one.
describe('daemon — tunnel cache invalidation', () => {
let workDir: string;
let pidPath: string;
let stub: Stub;
let daemon: RunningDaemon | null = null;

// Each test gets its own temp directory (for the pidfile) and a fresh stub.
beforeEach(async () => {
workDir = mkdtempSync(join(tmpdir(), 'ios-qa-tunnel-cache-'));
pidPath = join(workDir, 'daemon.pid');
stub = await startStub();
});

// Tear down in order: daemon first, then the stub, then the temp directory.
afterEach(async () => {
if (daemon) {
await daemon.close();
daemon = null;
}
stub.server.close();
rmSync(workDir, { recursive: true, force: true });
});

test('tunnelProvider is called exactly once across many sequential requests (no TTL re-bootstrap)', async () => {
let bootstrapCount = 0;
// Tunnel descriptor pointing the daemon's proxy at the local stub.
const tunnel: DeviceTunnel = {
udid: 'STUB-UDID',
ipv6Addr: '127.0.0.1',
port: stub.port,
bootTokenRotated: 'token-after-rotate',
};

const d = await startDaemon({
loopbackPort: 0,
tailnetEnabled: false,
pidfilePath: pidPath,
tunnelProvider: async () => {
bootstrapCount += 1;
return tunnel;
},
});
if ('error' in d) throw new Error(d.error);
daemon = d;

// Requests are issued one at a time; each must be proxied successfully.
for (let i = 0; i < 25; i++) {
const r = await fetchWith('GET', `http://127.0.0.1:${d.loopbackPort}/screenshot`);
expect(r.status).toBe(200);
}

// Bootstrap MUST have run exactly once. The pre-fix 30s TTL would have
// re-bootstrapped on the first call after the window expired; even
// ignoring time, this asserts a single bootstrap regardless.
expect(bootstrapCount).toBe(1);
});

test('tunnel cache is dropped when proxy reports device_disconnected, then re-bootstrapped on next call', async () => {
let bootstrapCount = 0;
// Shared, mutable descriptor: the provider returns this same object every
// time, so changing `port` below redirects the next bootstrap.
const tunnel: DeviceTunnel = {
udid: 'STUB-UDID',
ipv6Addr: '127.0.0.1',
port: stub.port,
bootTokenRotated: 'token-after-rotate',
};

const d = await startDaemon({
loopbackPort: 0,
tailnetEnabled: false,
pidfilePath: pidPath,
tunnelProvider: async () => {
bootstrapCount += 1;
return tunnel;
},
});
if ('error' in d) throw new Error(d.error);
daemon = d;

// First call: bootstrap + 200.
const r1 = await fetchWith('GET', `http://127.0.0.1:${d.loopbackPort}/screenshot`);
expect(r1.status).toBe(200);
expect(bootstrapCount).toBe(1);

// Kill the upstream so the proxy surfaces ECONNREFUSED → 503 device_disconnected.
stub.alive.value = false;
stub.server.close();
// Brief pause before the next request so the close above can take effect.
await new Promise((r) => setTimeout(r, 10));

const r2 = await fetchWith('GET', `http://127.0.0.1:${d.loopbackPort}/screenshot`);
expect(r2.status).toBe(503);
expect(JSON.parse(r2.bodyText).error).toBe('device_disconnected');

// After a device_disconnected, the cached tunnel must have been dropped.
// Restart the stub on a fresh port and mutate the shared DeviceTunnel; the
// next call must trigger a fresh tunnelProvider invocation.
const stub2 = await startStub();
try {
tunnel.port = stub2.port;
const r3 = await fetchWith('GET', `http://127.0.0.1:${d.loopbackPort}/screenshot`);
expect(r3.status).toBe(200);
// Critical assertion: the daemon re-bootstrapped after the disconnect.
expect(bootstrapCount).toBe(2);
} finally {
// stub2 is local to this test, so afterEach will not close it.
stub2.server.close();
}
});
});
23 changes: 14 additions & 9 deletions ios-qa/templates/StateServer.swift.template
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,15 @@ public final class StateServer {
private let port: UInt16
private let bootTokenPath: String

// Two listeners for dual-stack loopback. The fork's single-listener IPv6-only
// binding was caught in eng + outside-voice review as incomplete.
// One listener for dual-stack loopback. NWListener with default
// NWParameters.tcp on iOS binds a dual-stack socket (IN6ADDR_ANY without
// IPV6_V6ONLY) — the ::1 listener accepts IPv4 connections via 4-in-6
// mapped addresses (e.g. ::ffff:127.0.0.1). An earlier revision opened a
// second 127.0.0.1 listener on the same port; that second bind reliably
// failed with POSIXErrorCode 48 (EADDRINUSE) on iOS 26.x and spammed the
// log with `LISTENER_FAILED family=ipv4 err=POSIXErrorCode(rawValue: 48):
// Address already in use` on every launch. Verified on iPhone 12 Pro.
private var ipv6Listener: NWListener?
private var ipv4Listener: NWListener?

// Auth state. The boot token is what we wrote to os_log on first launch.
// It exists ONLY long enough for the daemon to call /auth/rotate.
Expand Down Expand Up @@ -100,10 +105,10 @@ public final class StateServer {
// seconds.
logger.notice("gstack-ios-qa-bootstrap token=\(self.bootToken, privacy: .public) port=\(self.port, privacy: .public) build=\(self.appBuildId, privacy: .public)")

// 3. Bind both IPv6 and IPv4 loopback. CoreDevice tunnel uses IPv6;
// local tooling may use IPv4. Never bind 0.0.0.0 or ::.
// 3. Bind ::1. CoreDevice tunnel uses IPv6; IPv4 loopback callers are
// accepted via 4-in-6 mapped addresses on the same dual-stack
// socket. Never bind 0.0.0.0 or ::.
startListener(family: .ipv6)
startListener(family: .ipv4)
}

public func register(buildId: String, accessorHash: String, atomicRestore: @escaping AtomicRestoreFn) {
Expand All @@ -121,12 +126,13 @@ public final class StateServer {
// MARK: Listener setup

private enum AddressFamily {
case ipv4
// Only .ipv6 is bound — see startListener comment block. The enum is
// kept (rather than inlining ::1) so future readers see the explicit
// decision and don't reintroduce a second IPv4 listener.
case ipv6

var host: NWEndpoint.Host {
switch self {
case .ipv4: return NWEndpoint.Host("127.0.0.1")
case .ipv6: return NWEndpoint.Host("::1")
}
}
Expand Down Expand Up @@ -172,7 +178,6 @@ public final class StateServer {

switch family {
case .ipv6: ipv6Listener = listener
case .ipv4: ipv4Listener = listener
}
} catch {
logger.error("Listener bind failed (\(String(describing: family))): \(error.localizedDescription, privacy: .public)")
Expand Down
Loading