From 94a53fb3cc5fe363cb7db5e8f2dcb300e2c83a97 Mon Sep 17 00:00:00 2001 From: enieuwy Date: Thu, 25 Jun 2026 13:07:14 +0800 Subject: [PATCH 1/3] serve: bound /usage per provider so one slow provider can't stall the response MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `serveUsage` collected providers in a sequential loop with no per-provider timeout, so a single slow or hung provider (e.g. a CLI/web fetch that never returns) blocked the whole `/usage` handler. The only backstop was the outer request deadline, which returns a 504 with an empty body and discards every provider already collected — and because that 504 is not `.ok`, the last-known-good merge (`mergeLastGoodUsageItems`, which requires `.ok`) never ran. Net effect: one stuck provider made the entire endpoint return nothing, which pushed shell/Zellij consumers onto degraded CLI fallback. Collect providers concurrently, bounding each with `BoundedTaskJoin` at a budget strictly below the outer request deadline (`serveProviderTimeout`). A provider over budget now contributes a provider error row instead of blocking the others, so the response stays `.ok`, the last-known-good merge still runs for per-account error rows that carry a cache key (the timeout row itself is account-agnostic, carries no cache key, and is intentionally not reconstructed), and every healthy provider still renders. Each provider's timeout clock starts when its task is spawned, so a hung provider cannot serialize the others' deadlines. Results merge in caller-provided provider order regardless of completion order. The serve usage context's `webTimeout` is aligned to the per-provider budget (was a fixed 60s that exceeded the 30s request deadline). Adds CLIServeRouterTests coverage for the timeout budget and for a hung provider degrading to an error row without blocking siblings. 
--- CHANGELOG.md | 1 + Sources/CodexBarCLI/CLIServeCommand.swift | 102 ++++++++++++++++-- Tests/CodexBarTests/CLIServeRouterTests.swift | 46 ++++++++ 3 files changed, 142 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2a6302153..78a78cc14 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ ### Fixed - Mistral: restore Vibe monthly-plan usage by forwarding only required console session cookies. Thanks @lfmundim! +- CLI server: collect providers concurrently with a per-provider timeout when serving `/usage` so one slow or hung provider degrades to its own error row while the others still return fresh data, instead of stalling the whole response into an empty timeout. Thanks @enieuwy! ## 0.37.3 — 2026-06-23 diff --git a/Sources/CodexBarCLI/CLIServeCommand.swift b/Sources/CodexBarCLI/CLIServeCommand.swift index c9ae89b09..e5ebfeee7 100644 --- a/Sources/CodexBarCLI/CLIServeCommand.swift +++ b/Sources/CodexBarCLI/CLIServeCommand.swift @@ -419,6 +419,14 @@ private enum CLIServeArgumentError: LocalizedError { } } +private struct CLIServeProviderTimeoutError: LocalizedError { + let provider: UsageProvider + + var errorDescription: String? 
{ + "\(self.provider.rawValue) usage timed out" + } +} + extension CodexBarCLI { static let defaultServeRequestTimeout: TimeInterval = 30 @@ -561,7 +569,8 @@ extension CodexBarCLI { await Self.serveUsage( provider: provider, config: snapshot.config, - refreshInterval: runtime.refreshInterval) + refreshInterval: runtime.refreshInterval, + requestTimeout: runtime.requestTimeout) } case let .cost(provider): let snapshot: CLIServeConfigSnapshot @@ -691,7 +700,8 @@ extension CodexBarCLI { private static func serveUsage( provider rawProvider: String?, config: CodexBarConfig, - refreshInterval: TimeInterval) async -> CLILocalHTTPResponse + refreshInterval: TimeInterval, + requestTimeout: TimeInterval) async -> CLILocalHTTPResponse { let selection: ProviderSelection do { @@ -710,6 +720,12 @@ extension CodexBarCLI { return Self.serveError(status: .internalServerError, message: error.localizedDescription) } + // Bound each provider strictly below the outer request deadline so a + // single slow/hung provider degrades to its own error row while every + // other provider still returns fresh data, instead of stalling the whole + // response and being discarded by the deadline's empty 504. 
+ let providerTimeout = Self.serveProviderTimeout(requestTimeout: requestTimeout) + let browserDetection = BrowserDetection() let command = UsageCommandContext( format: .json, @@ -718,7 +734,7 @@ extension CodexBarCLI { antigravityPlanDebug: false, augmentDebug: false, webDebugDumpHTML: false, - webTimeout: 60, + webTimeout: providerTimeout, verbose: false, useColor: false, resetStyle: Self.resetTimeDisplayStyleFromDefaults(), @@ -731,16 +747,17 @@ extension CodexBarCLI { persistCLISessions: true, persistentCLISessionIdleWindow: Self.serveCLISessionIdleWindow(refreshInterval: refreshInterval)) - var output = UsageCommandOutput() - for provider in selection.asList { - let providerOutput = await ProviderInteractionContext.$current.withValue(.background) { + let output = await Self.serveCollectUsageOutputs( + providers: selection.asList, + providerTimeout: providerTimeout) + { provider in + await ProviderInteractionContext.$current.withValue(.background) { await Self.fetchUsageOutputs( provider: provider, status: nil, tokenContext: tokenContext, command: command) } - output.merge(providerOutput) } return Self.serveJSON( @@ -748,6 +765,77 @@ extension CodexBarCLI { usageCacheKeys: output.payload.map(\.cacheAccountKey)) } + /// Per-provider fetch budget for `/usage`. Each provider is bounded strictly + /// below the outer request deadline so the deadline (which yields an empty + /// 504 and discards every collected provider) stays a last resort, never the + /// primary cutoff. When the outer deadline is disabled (`requestTimeout == 0`) + /// each provider is still bounded so `/usage` cannot hang indefinitely. 
+ static func serveProviderTimeout(requestTimeout: TimeInterval) -> TimeInterval { + let disabledDeadlineBudget: TimeInterval = 25 + guard requestTimeout > 0, requestTimeout.isFinite else { return disabledDeadlineBudget } + // 0.8x keeps the budget strictly below the finite deadline at every + // value (including sub-second timeouts), so the empty-504 deadline can + // never preempt a provider's own bound. + return requestTimeout * 0.8 + } + + /// Collects usage for each provider concurrently, bounding every provider by + /// `providerTimeout`. A provider that exceeds its budget contributes a + /// provider error row instead of blocking the others, so the overall response + /// still renders every healthy provider. (Per-account error rows that carry a + /// cache key are merged with last-known-good by `CLIServeResponseCache`; a + /// timeout row is account-agnostic and is not reconstructed, matching the + /// existing "a timeout cannot prove the active account" cache rule.) Each + /// provider's timeout clock starts when its task is spawned, so a hung + /// provider cannot serialize the others' deadlines; results are merged in the + /// caller's provider order regardless of completion order. 
+ static func serveCollectUsageOutputs( + providers: [UsageProvider], + providerTimeout: TimeInterval, + fetch: @Sendable @escaping (UsageProvider) async -> UsageCommandOutput) async -> UsageCommandOutput + { + let grace = Duration.seconds(max(0, providerTimeout)) + let indexed = await withTaskGroup(of: (Int, UsageCommandOutput).self) { group in + for (index, provider) in providers.enumerated() { + group.addTask { + let task = Task { await fetch(provider) } + let join = BoundedTaskJoin(sourceTask: task) + switch await join.value(joinGrace: grace) { + case let .value(output): + return (index, output) + case .failure, .timedOut: + return (index, Self.serveProviderTimeoutOutput(provider: provider)) + } + } + } + var collected: [(Int, UsageCommandOutput)] = [] + for await item in group { + collected.append(item) + } + return collected + } + + var output = UsageCommandOutput() + for (_, providerOutput) in indexed.sorted(by: { $0.0 < $1.0 }) { + output.merge(providerOutput) + } + return output + } + + /// Provider-level error row for a fetch that exceeded its per-provider budget. 
+ static func serveProviderTimeoutOutput(provider: UsageProvider) -> UsageCommandOutput { + var output = UsageCommandOutput() + output.exitCode = .failure + output.payload.append(Self.makeProviderErrorPayload( + provider: provider, + account: nil, + source: "auto", + status: nil, + error: CLIServeProviderTimeoutError(provider: provider), + kind: .provider)) + return output + } + private static func serveCost(provider rawProvider: String?, config: CodexBarConfig) async -> CLILocalHTTPResponse { let selection: ProviderSelection do { diff --git a/Tests/CodexBarTests/CLIServeRouterTests.swift b/Tests/CodexBarTests/CLIServeRouterTests.swift index 0e75871cd..f6142e483 100644 --- a/Tests/CodexBarTests/CLIServeRouterTests.swift +++ b/Tests/CodexBarTests/CLIServeRouterTests.swift @@ -221,6 +221,52 @@ struct CLIServeRouterTests { #expect(!CodexBarCLI.shouldCacheServeResponse(routeError)) } + @Test + func `serve provider timeout stays below the request deadline`() { + #expect(abs(CodexBarCLI.serveProviderTimeout(requestTimeout: 30) - 24) < 1e-9) + #expect(abs(CodexBarCLI.serveProviderTimeout(requestTimeout: 10) - 8) < 1e-9) + // Outer deadline disabled (0) or non-finite: still bound each provider. + #expect(CodexBarCLI.serveProviderTimeout(requestTimeout: 0) == 25) + #expect(CodexBarCLI.serveProviderTimeout(requestTimeout: .infinity) == 25) + // Finite deadlines stay strictly below the request timeout at every + // value, including sub-second ones. + #expect(CodexBarCLI.serveProviderTimeout(requestTimeout: 1) < 1) + #expect(abs(CodexBarCLI.serveProviderTimeout(requestTimeout: 0.5) - 0.4) < 1e-9) + } + + @Test + func `serve usage collection bounds a hung provider without blocking others`() async { + let providers: [UsageProvider] = [.codex, .claude, .gemini] + let start = Date() + let output = await CodexBarCLI.serveCollectUsageOutputs( + providers: providers, + providerTimeout: 0.1) + { provider in + if provider == .claude { + try? 
await Task.sleep(for: .seconds(30)) + return UsageCommandOutput(sections: ["late:\(provider.rawValue)"]) + } + return UsageCommandOutput(sections: ["ok:\(provider.rawValue)"]) + } + let elapsed = Date().timeIntervalSince(start) + + // The hung provider must not serialize or stall the others. + #expect(elapsed < 5) + // Fast providers render in caller order; the hung one yields no section. + #expect(output.sections == ["ok:codex", "ok:gemini"]) + // The hung provider degrades to a single provider error row. + #expect(output.payload.count == 1) + #expect(output.payload.first?.provider == UsageProvider.claude.rawValue) + #expect(output.payload.first?.error != nil) + #expect(output.payload.first?.error?.kind == .provider) + // The timeout row is account-agnostic: it carries no cache key, so the + // cache's keyed last-good merge intentionally does not reconstruct it + // (a timeout cannot prove which account is active). + #expect(output.payload.first?.cacheAccountKey == nil) + #expect(output.payload.first?.account == nil) + #expect(output.exitCode == .failure) + } + @Test func `serve cache uses stable Codex account identities`() { let storedID = UUID() From 86add329bd145520d4eb0e0a7f58e1b762aa95a4 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 27 Jun 2026 03:01:42 +0100 Subject: [PATCH 2/3] fix: preserve serve timeout opt-out --- Sources/CodexBarCLI/CLIServeCommand.swift | 42 ++++++++++--------- Tests/CodexBarTests/CLIServeRouterTests.swift | 41 ++++++++++++++---- 2 files changed, 55 insertions(+), 28 deletions(-) diff --git a/Sources/CodexBarCLI/CLIServeCommand.swift b/Sources/CodexBarCLI/CLIServeCommand.swift index e5ebfeee7..c95dae8d8 100644 --- a/Sources/CodexBarCLI/CLIServeCommand.swift +++ b/Sources/CodexBarCLI/CLIServeCommand.swift @@ -530,7 +530,7 @@ extension CodexBarCLI { } else { parsed = Self.defaultServeRequestTimeout } - guard parsed >= 0 else { return nil } + guard parsed.isFinite, parsed >= 0 else { return nil } return parsed } @@ -720,10 
+720,10 @@ extension CodexBarCLI { return Self.serveError(status: .internalServerError, message: error.localizedDescription) } - // Bound each provider strictly below the outer request deadline so a - // single slow/hung provider degrades to its own error row while every - // other provider still returns fresh data, instead of stalling the whole - // response and being discarded by the deadline's empty 504. + // For finite request deadlines, bound each provider early enough to + // return the healthy rows before the outer deadline discards them all. + // A disabled request deadline adds no serve-level provider bound; the + // providers' existing internal timeouts still apply. let providerTimeout = Self.serveProviderTimeout(requestTimeout: requestTimeout) let browserDetection = BrowserDetection() @@ -734,7 +734,7 @@ extension CodexBarCLI { antigravityPlanDebug: false, augmentDebug: false, webDebugDumpHTML: false, - webTimeout: providerTimeout, + webTimeout: providerTimeout ?? 60, verbose: false, useColor: false, resetStyle: Self.resetTimeDisplayStyleFromDefaults(), @@ -765,24 +765,22 @@ extension CodexBarCLI { usageCacheKeys: output.payload.map(\.cacheAccountKey)) } - /// Per-provider fetch budget for `/usage`. Each provider is bounded strictly - /// below the outer request deadline so the deadline (which yields an empty - /// 504 and discards every collected provider) stays a last resort, never the - /// primary cutoff. When the outer deadline is disabled (`requestTimeout == 0`) - /// each provider is still bounded so `/usage` cannot hang indefinitely. - static func serveProviderTimeout(requestTimeout: TimeInterval) -> TimeInterval { - let disabledDeadlineBudget: TimeInterval = 25 - guard requestTimeout > 0, requestTimeout.isFinite else { return disabledDeadlineBudget } + /// Per-provider fetch budget for `/usage`. Finite provider work is bounded + /// below the outer request deadline so the empty 504 stays a last resort. 
+ /// `nil` preserves the documented disabled serve deadline without changing + /// provider-specific internal timeouts. + static func serveProviderTimeout(requestTimeout: TimeInterval) -> TimeInterval? { + guard requestTimeout > 0, requestTimeout.isFinite else { return nil } // 0.8x keeps the budget strictly below the finite deadline at every // value (including sub-second timeouts), so the empty-504 deadline can // never preempt a provider's own bound. return requestTimeout * 0.8 } - /// Collects usage for each provider concurrently, bounding every provider by - /// `providerTimeout`. A provider that exceeds its budget contributes a - /// provider error row instead of blocking the others, so the overall response - /// still renders every healthy provider. (Per-account error rows that carry a + /// Collects usage for each provider concurrently. When `providerTimeout` is + /// non-nil, a provider that exceeds its budget contributes a provider error + /// row instead of blocking the others, so the overall response still renders + /// every healthy provider. (Per-account error rows that carry a /// cache key are merged with last-known-good by `CLIServeResponseCache`; a /// timeout row is account-agnostic and is not reconstructed, matching the /// existing "a timeout cannot prove the active account" cache rule.) Each @@ -791,13 +789,17 @@ extension CodexBarCLI { /// caller's provider order regardless of completion order. 
static func serveCollectUsageOutputs( providers: [UsageProvider], - providerTimeout: TimeInterval, + providerTimeout: TimeInterval?, fetch: @Sendable @escaping (UsageProvider) async -> UsageCommandOutput) async -> UsageCommandOutput { - let grace = Duration.seconds(max(0, providerTimeout)) + let grace = providerTimeout.map { Duration.seconds(max(0, $0)) } let indexed = await withTaskGroup(of: (Int, UsageCommandOutput).self) { group in for (index, provider) in providers.enumerated() { group.addTask { + guard let grace else { + let output = await fetch(provider) + return (index, output) + } let task = Task { await fetch(provider) } let join = BoundedTaskJoin(sourceTask: task) switch await join.value(joinGrace: grace) { diff --git a/Tests/CodexBarTests/CLIServeRouterTests.swift b/Tests/CodexBarTests/CLIServeRouterTests.swift index f6142e483..00cf2af1c 100644 --- a/Tests/CodexBarTests/CLIServeRouterTests.swift +++ b/Tests/CodexBarTests/CLIServeRouterTests.swift @@ -157,6 +157,10 @@ struct CLIServeRouterTests { positional: [], options: ["requestTimeout": ["-0.5"]], flags: [])) == nil) + #expect(CodexBarCLI.decodeServeRequestTimeout(from: ParsedValues( + positional: [], + options: ["requestTimeout": ["inf"]], + flags: [])) == nil) #expect(CodexBarCLI.decodeServeRequestTimeout(from: ParsedValues( positional: [], options: ["requestTimeout": ["0"]], @@ -222,16 +226,20 @@ struct CLIServeRouterTests { } @Test - func `serve provider timeout stays below the request deadline`() { - #expect(abs(CodexBarCLI.serveProviderTimeout(requestTimeout: 30) - 24) < 1e-9) - #expect(abs(CodexBarCLI.serveProviderTimeout(requestTimeout: 10) - 8) < 1e-9) - // Outer deadline disabled (0) or non-finite: still bound each provider. 
- #expect(CodexBarCLI.serveProviderTimeout(requestTimeout: 0) == 25) - #expect(CodexBarCLI.serveProviderTimeout(requestTimeout: .infinity) == 25) + func `serve provider timeout stays below the request deadline`() throws { + let thirtySecondTimeout = try #require(CodexBarCLI.serveProviderTimeout(requestTimeout: 30)) + let tenSecondTimeout = try #require(CodexBarCLI.serveProviderTimeout(requestTimeout: 10)) + #expect(abs(thirtySecondTimeout - 24) < 1e-9) + #expect(abs(tenSecondTimeout - 8) < 1e-9) + // Outer deadline disabled (0) or non-finite: add no serve-level provider bound. + #expect(CodexBarCLI.serveProviderTimeout(requestTimeout: 0) == nil) + #expect(CodexBarCLI.serveProviderTimeout(requestTimeout: .infinity) == nil) // Finite deadlines stay strictly below the request timeout at every // value, including sub-second ones. - #expect(CodexBarCLI.serveProviderTimeout(requestTimeout: 1) < 1) - #expect(abs(CodexBarCLI.serveProviderTimeout(requestTimeout: 0.5) - 0.4) < 1e-9) + let oneSecondTimeout = try #require(CodexBarCLI.serveProviderTimeout(requestTimeout: 1)) + let halfSecondTimeout = try #require(CodexBarCLI.serveProviderTimeout(requestTimeout: 0.5)) + #expect(oneSecondTimeout < 1) + #expect(abs(halfSecondTimeout - 0.4) < 1e-9) } @Test @@ -267,6 +275,23 @@ struct CLIServeRouterTests { #expect(output.exitCode == .failure) } + @Test + func `serve usage collection adds no join bound when request deadline is disabled`() async { + let output = await CodexBarCLI.serveCollectUsageOutputs( + providers: [.codex, .claude], + providerTimeout: nil) + { provider in + if provider == .codex { + try? 
await Task.sleep(for: .milliseconds(25)) + } + return UsageCommandOutput(sections: ["ok:\(provider.rawValue)"]) + } + + #expect(output.sections == ["ok:codex", "ok:claude"]) + #expect(output.payload.isEmpty) + #expect(output.exitCode == .success) + } + @Test func `serve cache uses stable Codex account identities`() { let storedID = UUID() From 2c3454f1a2bb0143ca0488f919879f3e9d3bc226 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 27 Jun 2026 03:09:37 +0100 Subject: [PATCH 3/3] fix: clamp serve provider timeout --- Sources/CodexBarCLI/CLIServeCommand.swift | 10 ++++++---- Tests/CodexBarTests/CLIServeRouterTests.swift | 6 ++++++ 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/Sources/CodexBarCLI/CLIServeCommand.swift b/Sources/CodexBarCLI/CLIServeCommand.swift index c95dae8d8..7882748a5 100644 --- a/Sources/CodexBarCLI/CLIServeCommand.swift +++ b/Sources/CodexBarCLI/CLIServeCommand.swift @@ -429,6 +429,7 @@ private struct CLIServeProviderTimeoutError: LocalizedError { extension CodexBarCLI { static let defaultServeRequestTimeout: TimeInterval = 30 + private static let maximumServeRequestTimeout: TimeInterval = 86400 static func runServe(_ values: ParsedValues) async { let output = CLIOutputPreferences(format: .json, jsonOnly: true, pretty: false) @@ -644,7 +645,7 @@ extension CodexBarCLI { seconds timeout: TimeInterval, makeResponse: @Sendable @escaping () async -> CLILocalHTTPResponse) async -> CLILocalHTTPResponse { - let clampedTimeout = min(max(timeout, 0), 86400) + let clampedTimeout = min(max(timeout, 0), Self.maximumServeRequestTimeout) guard clampedTimeout > 0 else { return await makeResponse() } @@ -771,10 +772,11 @@ extension CodexBarCLI { /// provider-specific internal timeouts. static func serveProviderTimeout(requestTimeout: TimeInterval) -> TimeInterval? 
{ guard requestTimeout > 0, requestTimeout.isFinite else { return nil } + let clampedTimeout = min(requestTimeout, Self.maximumServeRequestTimeout) // 0.8x keeps the budget strictly below the finite deadline at every - // value (including sub-second timeouts), so the empty-504 deadline can - // never preempt a provider's own bound. - return requestTimeout * 0.8 + // value (including sub-second and capped timeouts), so the empty-504 + // deadline can never preempt a provider's own bound. + return clampedTimeout * 0.8 } /// Collects usage for each provider concurrently. When `providerTimeout` is diff --git a/Tests/CodexBarTests/CLIServeRouterTests.swift b/Tests/CodexBarTests/CLIServeRouterTests.swift index 00cf2af1c..346681951 100644 --- a/Tests/CodexBarTests/CLIServeRouterTests.swift +++ b/Tests/CodexBarTests/CLIServeRouterTests.swift @@ -240,6 +240,12 @@ struct CLIServeRouterTests { let halfSecondTimeout = try #require(CodexBarCLI.serveProviderTimeout(requestTimeout: 0.5)) #expect(oneSecondTimeout < 1) #expect(abs(halfSecondTimeout - 0.4) < 1e-9) + // Oversized finite deadlines share the outer 24-hour cap and cannot + // overflow Duration conversion. + let oversizedTimeout = try #require(CodexBarCLI.serveProviderTimeout( + requestTimeout: .greatestFiniteMagnitude)) + #expect(abs(oversizedTimeout - 69120) < 1e-9) + #expect(oversizedTimeout < 86400) } @Test