Skip to content

Commit 94a53fb

Browse files
committed
serve: bound /usage per provider so one slow provider can't stall the response
`serveUsage` collected providers in a sequential loop with no per-provider timeout, so a single slow or hung provider (e.g. a CLI/web fetch that never returns) blocked the whole `/usage` handler. The only backstop was the outer request deadline, which returns a 504 with an empty body and discards every provider already collected — and because that 504 is not `.ok`, the last-known-good merge (`mergeLastGoodUsageItems`, which requires `.ok`) never ran. Net effect: one stuck provider made the entire endpoint return nothing, which pushed shell/Zellij consumers onto degraded CLI fallback. Collect providers concurrently, bounding each with `BoundedTaskJoin` at a budget strictly below the outer request deadline (`serveProviderTimeout`). A provider over budget now contributes a provider error row instead of blocking the others, so the response stays `.ok`, the cache can restore that row from last-known-good, and every healthy provider still renders. Each provider's timeout clock starts when its task is spawned, so a hung provider cannot serialize the others' deadlines. Results merge in caller-provided provider order regardless of completion order. The serve usage context's `webTimeout` is aligned to the per-provider budget (was a fixed 60s that exceeded the 30s request deadline). Adds CLIServeRouterTests coverage for the timeout budget and for a hung provider degrading to an error row without blocking siblings.
1 parent ada3660 commit 94a53fb

3 files changed

Lines changed: 142 additions & 7 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
### Fixed
99
- Mistral: restore Vibe monthly-plan usage by forwarding only required console session cookies. Thanks @lfmundim!
10+
- CLI server: collect providers concurrently with a per-provider timeout when serving `/usage` so one slow or hung provider degrades to its own error row while the others still return fresh data, instead of stalling the whole response into an empty timeout. Thanks @enieuwy!
1011

1112
## 0.37.3 — 2026-06-23
1213

Sources/CodexBarCLI/CLIServeCommand.swift

Lines changed: 95 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -419,6 +419,14 @@ private enum CLIServeArgumentError: LocalizedError {
419419
}
420420
}
421421

422+
/// Error attached to the provider row emitted when a `/usage` fetch for a
/// single provider does not finish within its per-provider budget.
private struct CLIServeProviderTimeoutError: LocalizedError {
    /// Provider whose fetch ran over budget.
    let provider: UsageProvider

    /// Message of the form "<provider raw value> usage timed out".
    var errorDescription: String? {
        return "\(self.provider.rawValue) usage timed out"
    }
}
429+
422430
extension CodexBarCLI {
423431
static let defaultServeRequestTimeout: TimeInterval = 30
424432

@@ -561,7 +569,8 @@ extension CodexBarCLI {
561569
await Self.serveUsage(
562570
provider: provider,
563571
config: snapshot.config,
564-
refreshInterval: runtime.refreshInterval)
572+
refreshInterval: runtime.refreshInterval,
573+
requestTimeout: runtime.requestTimeout)
565574
}
566575
case let .cost(provider):
567576
let snapshot: CLIServeConfigSnapshot
@@ -691,7 +700,8 @@ extension CodexBarCLI {
691700
private static func serveUsage(
692701
provider rawProvider: String?,
693702
config: CodexBarConfig,
694-
refreshInterval: TimeInterval) async -> CLILocalHTTPResponse
703+
refreshInterval: TimeInterval,
704+
requestTimeout: TimeInterval) async -> CLILocalHTTPResponse
695705
{
696706
let selection: ProviderSelection
697707
do {
@@ -710,6 +720,12 @@ extension CodexBarCLI {
710720
return Self.serveError(status: .internalServerError, message: error.localizedDescription)
711721
}
712722

723+
// Bound each provider strictly below the outer request deadline so a
724+
// single slow/hung provider degrades to its own error row while every
725+
// other provider still returns fresh data, instead of stalling the whole
726+
// response and being discarded by the deadline's empty 504.
727+
let providerTimeout = Self.serveProviderTimeout(requestTimeout: requestTimeout)
728+
713729
let browserDetection = BrowserDetection()
714730
let command = UsageCommandContext(
715731
format: .json,
@@ -718,7 +734,7 @@ extension CodexBarCLI {
718734
antigravityPlanDebug: false,
719735
augmentDebug: false,
720736
webDebugDumpHTML: false,
721-
webTimeout: 60,
737+
webTimeout: providerTimeout,
722738
verbose: false,
723739
useColor: false,
724740
resetStyle: Self.resetTimeDisplayStyleFromDefaults(),
@@ -731,23 +747,95 @@ extension CodexBarCLI {
731747
persistCLISessions: true,
732748
persistentCLISessionIdleWindow: Self.serveCLISessionIdleWindow(refreshInterval: refreshInterval))
733749

734-
var output = UsageCommandOutput()
735-
for provider in selection.asList {
736-
let providerOutput = await ProviderInteractionContext.$current.withValue(.background) {
750+
let output = await Self.serveCollectUsageOutputs(
751+
providers: selection.asList,
752+
providerTimeout: providerTimeout)
753+
{ provider in
754+
await ProviderInteractionContext.$current.withValue(.background) {
737755
await Self.fetchUsageOutputs(
738756
provider: provider,
739757
status: nil,
740758
tokenContext: tokenContext,
741759
command: command)
742760
}
743-
output.merge(providerOutput)
744761
}
745762

746763
return Self.serveJSON(
747764
output.payload,
748765
usageCacheKeys: output.payload.map(\.cacheAccountKey))
749766
}
750767

768+
/// Derives the per-provider fetch budget for `/usage` from the outer request
/// deadline.
///
/// The outer deadline answers with an empty 504 and throws away every
/// provider already collected, so it must remain a last resort: each provider
/// is cut off earlier, at 80% of the deadline.
///
/// - Parameter requestTimeout: Outer request deadline in seconds. A value
///   that is zero, negative, or non-finite is treated as "deadline disabled".
/// - Returns: `requestTimeout * 0.8` for a usable deadline; otherwise a fixed
///   25 second budget, so `/usage` stays bounded even without an outer
///   deadline.
static func serveProviderTimeout(requestTimeout: TimeInterval) -> TimeInterval {
    let fallbackBudget: TimeInterval = 25
    // A fraction (rather than a fixed margin) stays below the deadline for
    // sub-second timeouts too.
    let deadlineFraction: TimeInterval = 0.8

    if requestTimeout.isFinite, requestTimeout > 0 {
        return requestTimeout * deadlineFraction
    }
    return fallbackBudget
}
781+
782+
/// Collects usage for each provider concurrently, bounding every provider by
783+
/// `providerTimeout`. A provider that exceeds its budget contributes a
784+
/// provider error row instead of blocking the others, so the overall response
785+
/// still renders every healthy provider. (Per-account error rows that carry a
786+
/// cache key are merged with last-known-good by `CLIServeResponseCache`; a
787+
/// timeout row is account-agnostic and is not reconstructed, matching the
788+
/// existing "a timeout cannot prove the active account" cache rule.) Each
789+
/// provider's timeout clock starts when its task is spawned, so a hung
790+
/// provider cannot serialize the others' deadlines; results are merged in the
791+
/// caller's provider order regardless of completion order.
792+
static func serveCollectUsageOutputs(
793+
providers: [UsageProvider],
794+
providerTimeout: TimeInterval,
795+
fetch: @Sendable @escaping (UsageProvider) async -> UsageCommandOutput) async -> UsageCommandOutput
796+
{
797+
let grace = Duration.seconds(max(0, providerTimeout))
798+
let indexed = await withTaskGroup(of: (Int, UsageCommandOutput).self) { group in
799+
for (index, provider) in providers.enumerated() {
800+
group.addTask {
801+
let task = Task<UsageCommandOutput, Error> { await fetch(provider) }
802+
let join = BoundedTaskJoin(sourceTask: task)
803+
switch await join.value(joinGrace: grace) {
804+
case let .value(output):
805+
return (index, output)
806+
case .failure, .timedOut:
807+
return (index, Self.serveProviderTimeoutOutput(provider: provider))
808+
}
809+
}
810+
}
811+
var collected: [(Int, UsageCommandOutput)] = []
812+
for await item in group {
813+
collected.append(item)
814+
}
815+
return collected
816+
}
817+
818+
var output = UsageCommandOutput()
819+
for (_, providerOutput) in indexed.sorted(by: { $0.0 < $1.0 }) {
820+
output.merge(providerOutput)
821+
}
822+
return output
823+
}
824+
825+
/// Provider-level error row for a fetch that exceeded its per-provider budget.
826+
static func serveProviderTimeoutOutput(provider: UsageProvider) -> UsageCommandOutput {
827+
var output = UsageCommandOutput()
828+
output.exitCode = .failure
829+
output.payload.append(Self.makeProviderErrorPayload(
830+
provider: provider,
831+
account: nil,
832+
source: "auto",
833+
status: nil,
834+
error: CLIServeProviderTimeoutError(provider: provider),
835+
kind: .provider))
836+
return output
837+
}
838+
751839
private static func serveCost(provider rawProvider: String?, config: CodexBarConfig) async -> CLILocalHTTPResponse {
752840
let selection: ProviderSelection
753841
do {

Tests/CodexBarTests/CLIServeRouterTests.swift

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,52 @@ struct CLIServeRouterTests {
221221
#expect(!CodexBarCLI.shouldCacheServeResponse(routeError))
222222
}
223223

224+
@Test
func `serve provider timeout stays below the request deadline`() {
    let tolerance = 1e-9

    // Usable deadlines map to 80% of themselves, sub-second ones included.
    let scaledCases: [(requestTimeout: TimeInterval, expected: TimeInterval)] = [
        (30, 24),
        (10, 8),
        (0.5, 0.4),
    ]
    for (requestTimeout, expected) in scaledCases {
        let budget = CodexBarCLI.serveProviderTimeout(requestTimeout: requestTimeout)
        #expect(abs(budget - expected) < tolerance)
    }

    // Outer deadline disabled (0) or non-finite: still bound each provider.
    for requestTimeout in [0, TimeInterval.infinity] {
        #expect(CodexBarCLI.serveProviderTimeout(requestTimeout: requestTimeout) == 25)
    }

    // The budget is strictly below the deadline, not merely equal to it.
    #expect(CodexBarCLI.serveProviderTimeout(requestTimeout: 1) < 1)
}
236+
237+
@Test
238+
func `serve usage collection bounds a hung provider without blocking others`() async {
239+
let providers: [UsageProvider] = [.codex, .claude, .gemini]
240+
let start = Date()
241+
let output = await CodexBarCLI.serveCollectUsageOutputs(
242+
providers: providers,
243+
providerTimeout: 0.1)
244+
{ provider in
245+
if provider == .claude {
246+
try? await Task.sleep(for: .seconds(30))
247+
return UsageCommandOutput(sections: ["late:\(provider.rawValue)"])
248+
}
249+
return UsageCommandOutput(sections: ["ok:\(provider.rawValue)"])
250+
}
251+
let elapsed = Date().timeIntervalSince(start)
252+
253+
// The hung provider must not serialize or stall the others.
254+
#expect(elapsed < 5)
255+
// Fast providers render in caller order; the hung one yields no section.
256+
#expect(output.sections == ["ok:codex", "ok:gemini"])
257+
// The hung provider degrades to a single provider error row.
258+
#expect(output.payload.count == 1)
259+
#expect(output.payload.first?.provider == UsageProvider.claude.rawValue)
260+
#expect(output.payload.first?.error != nil)
261+
#expect(output.payload.first?.error?.kind == .provider)
262+
// The timeout row is account-agnostic: it carries no cache key, so the
263+
// cache's keyed last-good merge intentionally does not reconstruct it
264+
// (a timeout cannot prove which account is active).
265+
#expect(output.payload.first?.cacheAccountKey == nil)
266+
#expect(output.payload.first?.account == nil)
267+
#expect(output.exitCode == .failure)
268+
}
269+
224270
@Test
225271
func `serve cache uses stable Codex account identities`() {
226272
let storedID = UUID()

0 commit comments

Comments
 (0)