Skip to content

Commit 180e14d

Browse files
authored
Merge pull request #293 from getagentseal/fix/menubar-loading-watchdog
Fix menubar loading recovery deadlocks
2 parents 3380517 + d79deef commit 180e14d

4 files changed

Lines changed: 152 additions & 51 deletions

File tree

mac/Sources/CodeBurnMenubar/AppStore.swift

Lines changed: 63 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,14 @@ final class AppStore {
2525
}
2626
var showingAccentPicker: Bool = false
2727
var currency: String = "USD"
28-
var isLoading: Bool { loadingCount > 0 }
29-
private var loadingCount: Int = 0
30-
var lastError: String?
28+
var isLoading: Bool { loadingCountsByKey.values.contains { $0 > 0 } }
29+
var isCurrentKeyLoading: Bool { loadingCountsByKey[currentKey, default: 0] > 0 }
30+
var hasAttemptedCurrentKeyLoad: Bool { attemptedKeys.contains(currentKey) }
31+
var lastError: String? { lastErrorByKey[currentKey] }
32+
private var loadingCountsByKey: [PayloadCacheKey: Int] = [:]
33+
private var loadingStartedAtByKey: [PayloadCacheKey: Date] = [:]
34+
private var attemptedKeys: Set<PayloadCacheKey> = []
35+
private var lastErrorByKey: [PayloadCacheKey: String] = [:]
3136
var subscription: SubscriptionUsage?
3237
var subscriptionError: String?
3338
var subscriptionLoadState: SubscriptionLoadState = ClaudeCredentialStore.isBootstrapCompleted ? .loading : .notBootstrapped
@@ -130,10 +135,51 @@ final class AppStore {
130135
private var inFlightKeys: Set<PayloadCacheKey> = []
131136

132137
/// Wipes all loading bookkeeping in one shot.
///
/// In the post-change (`+`) version this empties the per-key loading counts,
/// the per-key loading start timestamps, and the in-flight key set. The body
/// does not touch `attemptedKeys`, `lastErrorByKey`, or the payload cache.
/// The `-` line below is the pre-change reset of the single global counter.
func resetLoadingState() {
133-
loadingCount = 0
138+
loadingCountsByKey.removeAll()
139+
loadingStartedAtByKey.removeAll()
134140
inFlightKeys.removeAll()
135141
}
136142

143+
private let loadingWatchdogSeconds: TimeInterval = 60
144+
145+
/// Watchdog for per-key loading state.
///
/// Collects every key whose loading start timestamp is more than
/// `loadingWatchdogSeconds` old. For each such key it logs the stall, removes
/// the key's loading count and start timestamp, and removes it from
/// `inFlightKeys`. If the key has no cached payload, it also stores a
/// user-facing message in `lastErrorByKey`.
///
/// - Returns: `true` if at least one stale key was cleared, otherwise `false`.
///
/// NOTE(review): a fetch that is still running when its key is cleared here
/// will later call `finishLoading(for:)` for the same key from its `defer`.
/// If a newer load for that key has begun in between, that call decrements
/// the newer load's count — confirm this is acceptable.
/// NOTE(review): the log format uses `%d` with a Swift `Int` argument; `%ld`
/// is the matching specifier on 64-bit platforms — confirm.
@discardableResult
146+
func clearStaleLoadingIfNeeded() -> Bool {
147+
let now = Date()
148+
let staleEntries = loadingStartedAtByKey.filter {
149+
now.timeIntervalSince($0.value) > loadingWatchdogSeconds
150+
}
151+
guard !staleEntries.isEmpty else { return false }
152+
153+
for (key, started) in staleEntries {
154+
NSLog("CodeBurn: loading stuck for %ds on %@/%@ — auto-clearing",
155+
Int(now.timeIntervalSince(started)), key.period.rawValue, key.provider.rawValue)
156+
loadingCountsByKey[key] = nil
157+
loadingStartedAtByKey[key] = nil
158+
inFlightKeys.remove(key)
159+
// Only surface an error when there is nothing cached to show for this key.
if cache[key] == nil {
160+
lastErrorByKey[key] = "Refresh took longer than expected. CodeBurn will keep retrying in the background."
161+
}
162+
}
163+
return true
164+
}
165+
166+
/// Registers one more visible load for `key`.
///
/// The start timestamp is recorded only when the key's count is currently
/// zero (or absent), so overlapping loads for the same key keep the
/// timestamp of the earliest one. The count is then incremented by one.
///
/// - Parameter key: The period/provider cache key being loaded.
private func beginLoading(for key: PayloadCacheKey) {
167+
if loadingCountsByKey[key, default: 0] == 0 {
168+
loadingStartedAtByKey[key] = Date()
169+
}
170+
loadingCountsByKey[key, default: 0] += 1
171+
}
172+
173+
/// Unregisters one visible load for `key`.
///
/// Does nothing when the key has no positive count, for example after
/// `resetLoadingState()` or `clearStaleLoadingIfNeeded()` has already removed
/// it, so the count never goes negative. When the last load finishes, both
/// the count and the start timestamp for the key are removed; otherwise the
/// count is decremented by one.
///
/// - Parameter key: The period/provider cache key whose load finished.
private func finishLoading(for key: PayloadCacheKey) {
174+
guard let count = loadingCountsByKey[key], count > 0 else { return }
175+
if count == 1 {
176+
loadingCountsByKey[key] = nil
177+
loadingStartedAtByKey[key] = nil
178+
} else {
179+
loadingCountsByKey[key] = count - 1
180+
}
181+
}
182+
137183
private func invalidateStaleDayCache() {
138184
let formatter = DateFormatter()
139185
formatter.dateFormat = "yyyy-MM-dd"
@@ -155,9 +201,11 @@ final class AppStore {
155201
if !force, cache[key]?.isFresh == true { return }
156202
if !force, inFlightKeys.contains(key) { return }
157203
inFlightKeys.insert(key)
204+
attemptedKeys.insert(key)
205+
lastErrorByKey[key] = nil
158206
let didShowLoading = showLoading || cache[key] == nil
159207
if didShowLoading {
160-
loadingCount += 1
208+
beginLoading(for: key)
161209
}
162210
// Diagnostic anchor: if this key has been empty for a long time (the
163211
// popover would currently be showing "Loading..."), log how stale the
@@ -172,7 +220,9 @@ final class AppStore {
172220
}
173221
defer {
174222
inFlightKeys.remove(key)
175-
if didShowLoading { loadingCount = max(loadingCount - 1, 0) }
223+
if didShowLoading {
224+
finishLoading(for: key)
225+
}
176226
}
177227
do {
178228
let fresh = try await DataClient.fetch(period: key.period, provider: key.provider, includeOptimize: includeOptimize)
@@ -194,7 +244,7 @@ final class AppStore {
194244
}
195245
cache[key] = CachedPayload(payload: fresh, fetchedAt: Date())
196246
lastSuccessByKey[key] = Date()
197-
lastError = nil
247+
lastErrorByKey[key] = nil
198248
} catch {
199249
if Task.isCancelled { return }
200250
NSLog("CodeBurn: fetch failed for \(key.period.rawValue)/\(key.provider.rawValue): \(error)")
@@ -205,14 +255,14 @@ final class AppStore {
205255
if cacheDate != cacheDateAtStart { return }
206256
cache[key] = CachedPayload(payload: fallback, fetchedAt: Date())
207257
lastSuccessByKey[key] = Date()
208-
lastError = nil
258+
lastErrorByKey[key] = nil
209259
return
210260
} catch {
211261
if Task.isCancelled { return }
212262
NSLog("CodeBurn: fallback fetch also failed: \(error)")
213263
}
214264
}
215-
lastError = String(describing: error)
265+
lastErrorByKey[key] = String(describing: error)
216266
}
217267

218268
let allKey = PayloadCacheKey(period: selectedPeriod, provider: .all)
@@ -232,7 +282,10 @@ final class AppStore {
232282
// Same day-rollover guard as refresh(): drop yesterday's payload if
233283
// the calendar rolled over during the fetch.
234284
if cacheDate != cacheDateAtStart { return }
235-
cache[PayloadCacheKey(period: period, provider: .all)] = CachedPayload(payload: fresh, fetchedAt: Date())
285+
let key = PayloadCacheKey(period: period, provider: .all)
286+
cache[key] = CachedPayload(payload: fresh, fetchedAt: Date())
287+
lastSuccessByKey[key] = Date()
288+
lastErrorByKey[key] = nil
236289
} catch {
237290
NSLog("CodeBurn: quiet refresh failed for \(period.rawValue): \(error)")
238291
}

mac/Sources/CodeBurnMenubar/CodeBurnApp.swift

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import Observation
55
private let refreshIntervalSeconds: UInt64 = 30
66
private let nanosPerSecond: UInt64 = 1_000_000_000
77
private let refreshIntervalNanos: UInt64 = refreshIntervalSeconds * nanosPerSecond
8+
private let forceRefreshWatchdogSeconds: TimeInterval = 90
89
private let statusItemWidth: CGFloat = NSStatusItem.variableLength
910
private let popoverWidth: CGFloat = 360
1011
private let popoverHeight: CGFloat = 660
@@ -36,6 +37,8 @@ final class AppDelegate: NSObject, NSApplicationDelegate, NSPopoverDelegate {
3637
private var pendingRefreshWork: DispatchWorkItem?
3738
private var refreshLoopTask: Task<Void, Never>?
3839
private var forceRefreshTask: Task<Void, Never>?
40+
private var forceRefreshStartedAt: Date?
41+
private var forceRefreshGeneration: UInt64 = 0
3942

4043
func applicationWillFinishLaunching(_ notification: Notification) {
4144
// Set accessory policy before the app's focus chain forms. On macOS Tahoe
@@ -90,6 +93,8 @@ final class AppDelegate: NSObject, NSApplicationDelegate, NSPopoverDelegate {
9093
Task { @MainActor in
9194
self?.forceRefreshTask?.cancel()
9295
self?.forceRefreshTask = nil
96+
self?.forceRefreshStartedAt = nil
97+
self?.forceRefreshGeneration &+= 1
9398
self?.refreshLoopTask?.cancel()
9499
self?.refreshLoopTask = nil
95100
}
@@ -208,17 +213,42 @@ final class AppDelegate: NSObject, NSApplicationDelegate, NSPopoverDelegate {
208213

209214
private var lastRefreshTime: Date = .distantPast
210215

216+
/// Watchdog for the force-refresh task.
///
/// Acts only when a force-refresh task is recorded together with its start
/// time and more than `forceRefreshWatchdogSeconds` have elapsed since that
/// start. In that case it logs the stall, cancels and forgets the task,
/// clears the start time, bumps `forceRefreshGeneration`, and resets the
/// store's loading state.
///
/// The function itself does not start a new refresh; the log text's
/// "restarting" relies on the caller acting on the returned `true`.
///
/// - Parameter now: Reference time used to measure elapsed time; defaults to the current date.
/// - Returns: `true` if a stuck force refresh was cancelled and cleared, otherwise `false`.
///
/// NOTE(review): the log format uses `%d` with a Swift `Int` argument; `%ld`
/// is the matching specifier on 64-bit platforms — confirm.
@discardableResult
217+
private func clearStaleForceRefreshIfNeeded(now: Date = Date()) -> Bool {
218+
if let started = forceRefreshStartedAt, forceRefreshTask != nil {
219+
let elapsed = now.timeIntervalSince(started)
220+
guard elapsed > forceRefreshWatchdogSeconds else { return false }
221+
NSLog("CodeBurn: force refresh stuck for %ds — cancelling and restarting", Int(elapsed))
222+
forceRefreshTask?.cancel()
223+
forceRefreshTask = nil
224+
forceRefreshStartedAt = nil
225+
// Bumping the generation makes the cancelled task's completion block skip its cleanup.
forceRefreshGeneration &+= 1
226+
store.resetLoadingState()
227+
return true
228+
}
229+
return false
230+
}
231+
211232
/// Starts a forced refresh of the selected view plus a quiet refresh of today.
///
/// Steps in the post-change (`+`) version:
/// 1. Clear a stuck previous force refresh, if the watchdog says so.
/// 2. Throttle: return unless more than 5 seconds have passed since `lastRefreshTime`.
/// 3. Record the start time and take a new generation number.
/// 4. Launch a task that runs both refreshes concurrently and then updates
///    the status button. The task clears `forceRefreshTask` and
///    `forceRefreshStartedAt` and re-stamps `lastRefreshTime` only if the
///    generation is still the one captured here.
///
/// NOTE(review): the `-` line below (`forceRefreshTask?.cancel()`) is removed
/// by this change, so a previous task that is still running is replaced
/// without being cancelled; only the generation check stops it from clearing
/// the newer task's state — confirm this is intended.
/// NOTE(review): `_ =` is redundant because `clearStaleForceRefreshIfNeeded`
/// is marked `@discardableResult`.
private func forceRefresh() {
212233
let now = Date()
234+
_ = clearStaleForceRefreshIfNeeded(now: now)
213235
guard now.timeIntervalSince(lastRefreshTime) > 5 else { return }
214236
lastRefreshTime = now
237+
forceRefreshStartedAt = now
238+
forceRefreshGeneration &+= 1
239+
let generation = forceRefreshGeneration
215240

216-
forceRefreshTask?.cancel()
217241
forceRefreshTask = Task {
218242
async let main: Void = store.refresh(includeOptimize: false, force: true, showLoading: true)
219243
async let today: Void = store.refreshQuietly(period: .today)
220244
_ = await (main, today)
221245
refreshStatusButton()
246+
await MainActor.run { [weak self] in
247+
// A mismatched generation means a newer refresh or the watchdog has taken over; leave its state alone.
guard let self, self.forceRefreshGeneration == generation else { return }
248+
self.forceRefreshTask = nil
249+
self.forceRefreshStartedAt = nil
250+
self.lastRefreshTime = Date()
251+
}
222252
}
223253
}
224254

@@ -259,12 +289,14 @@ final class AppDelegate: NSObject, NSApplicationDelegate, NSPopoverDelegate {
259289
}
260290
while !Task.isCancelled {
261291
guard let self else { return }
292+
let clearedStaleForceRefresh = self.clearStaleForceRefreshIfNeeded()
293+
let clearedStaleLoading = self.store.clearStaleLoadingIfNeeded()
262294
// Skip the loop's tick if a wake / manual / distributed-
263295
// notification refresh just ran. Without this gate, every
264296
// wake produced two refreshes (forceRefresh from the wake
265297
// observer plus the loop's natural tick).
266298
let sinceLast = Date().timeIntervalSince(self.lastRefreshTime)
267-
if sinceLast >= 5 {
299+
if self.forceRefreshTask == nil && (clearedStaleForceRefresh || clearedStaleLoading || sinceLast >= 5) {
268300
if self.store.selectedPeriod != .today || self.store.selectedProvider != .all {
269301
async let quiet: Void = self.store.refreshQuietly(period: .today)
270302
async let main: Void = self.store.refresh(includeOptimize: false, force: true)

mac/Sources/CodeBurnMenubar/Data/DataClient.swift

Lines changed: 46 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -61,41 +61,27 @@ struct DataClient {
6161
throw DataClientError.spawn(error.localizedDescription)
6262
}
6363

64-
// Wall-clock timeout: if the CLI hangs (parser stuck, disk stall), kill it.
65-
// Log when this fires so a recurring stuck-popover state has an actual
66-
// diagnostic — historically users saw "Loading..." forever with no signal
67-
// about what failed; the only way to debug was to read process state at
68-
// the wrong time. The log line names the subcommand so we can correlate
69-
// with a specific period/provider combination.
7064
let timeoutTask = Task.detached(priority: .utility) {
7165
try? await Task.sleep(nanoseconds: spawnTimeoutSeconds * 1_000_000_000)
7266
if process.isRunning {
7367
NSLog("CodeBurn: CLI subprocess timed out after %llus for %@ — terminating",
7468
spawnTimeoutSeconds, subcommand.joined(separator: " "))
75-
process.terminate()
69+
terminateWithEscalation(process)
7670
}
7771
}
7872
defer { timeoutTask.cancel() }
7973

80-
// If the caller cancels its Task (rapid period/provider tab clicks
81-
// cancel switchTask in AppStore), terminate the in-flight subprocess.
82-
// Without this the cancelled Task returns immediately but the spawned
83-
// CLI keeps running to completion, piling up zombie codeburn processes
84-
// on rapid UI interactions. We hold a strong reference to the Process
85-
// in the cancellation handler so the closure can find it even if the
86-
// surrounding scope has gone async.
74+
let outHandle = outPipe.fileHandleForReading
75+
let errHandle = errPipe.fileHandleForReading
8776
let (out, err) = await withTaskCancellationHandler {
88-
// Drain both pipes concurrently so a large stderr can't deadlock stdout
89-
// (the child blocks on write once the pipe buffer fills). `drain`
90-
// also enforces a byte cap.
91-
async let stdoutData = drain(outPipe.fileHandleForReading, limit: maxPayloadBytes)
92-
async let stderrData = drain(errPipe.fileHandleForReading, limit: maxStderrBytes)
77+
async let stdoutData = drain(outHandle, limit: maxPayloadBytes)
78+
async let stderrData = drain(errHandle, limit: maxStderrBytes)
9379
return await (stdoutData, stderrData)
9480
} onCancel: {
95-
if process.isRunning {
96-
process.terminate()
97-
}
81+
terminateWithEscalation(process)
9882
}
83+
try? outHandle.close()
84+
try? errHandle.close()
9985
process.waitUntilExit()
10086

10187
if out.count >= maxPayloadBytes {
@@ -106,22 +92,45 @@ struct DataClient {
10692
return ProcessResult(stdout: out, stderr: stderrString, exitCode: process.terminationStatus)
10793
}
10894

109-
/// Pulls bytes off a pipe until EOF or `limit`. Intentionally uses `availableData`, which
110-
/// returns empty on EOF -- no blocking once the child exits.
95+
/// Stops a running subprocess, escalating if it does not exit promptly.
///
/// Returns immediately when the process is not running. Otherwise it calls
/// `Process.terminate()` and captures the process identifier. After 0.5
/// seconds, on a global utility queue, it sends `SIGKILL` to that pid if the
/// process still reports itself as running.
///
/// - Parameter process: The subprocess to stop.
private static func terminateWithEscalation(_ process: Process) {
96+
guard process.isRunning else { return }
97+
process.terminate()
98+
let pid = process.processIdentifier
99+
DispatchQueue.global(qos: .utility).asyncAfter(deadline: .now() + 0.5) {
100+
if process.isRunning { kill(pid, SIGKILL) }
101+
}
102+
}
103+
111104
/// Reads bytes from `handle` until end-of-file, `limit` bytes, task
/// cancellation, or an unrecoverable read error, and returns what was read.
///
/// This span shows both sides of the diff. The `-` lines are the previous
/// implementation: a detached task looping on `handle.availableData`. The
/// `+` lines are the replacement, which works on the raw file descriptor:
/// - It tries to set `O_NONBLOCK` on the descriptor. If `F_GETFL` fails it
///   logs and carries on, and reads may then block.
/// - It reads in chunks of at most 65 536 bytes, never asking for more than
///   the bytes remaining under `limit`.
/// - `read` returning 0 ends the loop.
/// - `EAGAIN` / `EWOULDBLOCK` sleeps 5 ms and retries.
/// - `EINTR` retries at once.
/// - Any other error is logged and ends the loop.
///
/// - Parameters:
///   - handle: The pipe read end to drain.
///   - limit: Maximum number of bytes to accumulate.
/// - Returns: The bytes read, at most `limit` of them.
///
/// NOTE(review): `errno` is re-read in several branches after `read`
/// returns; capturing it once immediately after the call would rule out it
/// being overwritten in between — confirm.
/// NOTE(review): the result of the `F_SETFL` call is discarded, so a failure
/// to enable non-blocking mode is silent.
private static func drain(_ handle: FileHandle, limit: Int) async -> Data {
112-
await Task.detached(priority: .utility) {
113-
var buffer = Data()
114-
while buffer.count < limit {
115-
let chunk = handle.availableData
116-
if chunk.isEmpty { break }
117-
let remaining = limit - buffer.count
118-
if chunk.count > remaining {
119-
buffer.append(chunk.prefix(remaining))
120-
break
121-
}
122-
buffer.append(chunk)
105+
let fd = handle.fileDescriptor
106+
let flags = Darwin.fcntl(fd, F_GETFL)
107+
if flags >= 0 {
108+
_ = Darwin.fcntl(fd, F_SETFL, flags | O_NONBLOCK)
109+
} else {
110+
NSLog("CodeBurn: fcntl F_GETFL failed on fd %d, drain may block", fd)
111+
}
112+
113+
var buffer = Data()
114+
var chunk = [UInt8](repeating: 0, count: 65_536)
115+
116+
while buffer.count < limit && !Task.isCancelled {
117+
let toRead = min(chunk.count, limit - buffer.count)
118+
let n = chunk.withUnsafeMutableBufferPointer { ptr in
119+
Darwin.read(fd, ptr.baseAddress!, toRead)
123120
}
124-
return buffer
125-
}.value
121+
if n > 0 {
122+
buffer.append(contentsOf: chunk.prefix(n))
123+
} else if n == 0 {
124+
break
125+
} else if errno == EAGAIN || errno == EWOULDBLOCK {
126+
try? await Task.sleep(nanoseconds: 5_000_000)
127+
} else if errno == EINTR {
128+
continue
129+
} else {
130+
NSLog("CodeBurn: drain read() failed on fd %d: errno %d", fd, errno)
131+
break
132+
}
133+
}
134+
return buffer
126135
}
127136
}

mac/Sources/CodeBurnMenubar/Views/MenuBarContent.swift

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,15 +47,22 @@ struct MenuBarContent: View {
4747
// error, etc.), surface a retry card instead of leaving the
4848
// user stuck on a perpetual "Loading..." spinner.
4949
if !store.hasCachedData {
50-
if let err = store.lastError, !store.isLoading {
50+
if store.isCurrentKeyLoading || !store.hasAttemptedCurrentKeyLoad {
51+
BurnLoadingOverlay(periodLabel: store.selectedPeriod.rawValue)
52+
.transition(.opacity)
53+
} else if let err = store.lastError {
5154
FetchErrorOverlay(
5255
error: err,
5356
periodLabel: store.selectedPeriod.rawValue,
5457
retry: { Task { await store.refresh(includeOptimize: false, force: true, showLoading: true) } }
5558
)
5659
.transition(.opacity)
5760
} else {
58-
BurnLoadingOverlay(periodLabel: store.selectedPeriod.rawValue)
61+
FetchErrorOverlay(
62+
error: "The last refresh stopped before returning data. CodeBurn will keep retrying, or you can retry now.",
63+
periodLabel: store.selectedPeriod.rawValue,
64+
retry: { Task { await store.refresh(includeOptimize: false, force: true, showLoading: true) } }
65+
)
5966
.transition(.opacity)
6067
}
6168
}

0 commit comments

Comments
 (0)