Skip to content

Commit 086d7b2

Browse files
anandgupta42claude
andauthored
fix: eliminate stale file race condition and fix error misclassification (#611)
* fix: eliminate stale file race condition and fix error misclassification - Use file's actual `mtime` instead of `new Date()` in `FileTime.read()` to eliminate clock drift between Node.js and filesystem (WSL, networked drives) - Increase staleness tolerance from 50ms to 2s (p50 gap was 651ms, caused 782-retry loops on WSL) - Split `file_stale` out of `validation` error class for cleaner triage - Move `http_error` pattern before `validation` and add `HTTP 4xx` keywords to prevent WebFetch 404s from being misclassified as validation errors - Update tests to use `fs.utimes()` for deterministic mtime manipulation Closes #610 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * wip: add telemetry instrumentation to FileTime and tighten tolerance Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 43e5f9a commit 086d7b2

5 files changed

Lines changed: 184 additions & 30 deletions

File tree

packages/opencode/src/altimate/telemetry/index.ts

Lines changed: 45 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -475,12 +475,40 @@ export namespace Telemetry {
475475
session_id: string
476476
tool_name: string
477477
tool_category: string
478-
error_class: "parse_error" | "connection" | "timeout" | "validation" | "internal" | "permission" | "http_error" | "file_not_found" | "edit_mismatch" | "not_configured" | "resource_exhausted" | "unknown"
478+
error_class: "parse_error" | "connection" | "timeout" | "validation" | "internal" | "permission" | "http_error" | "file_not_found" | "file_stale" | "edit_mismatch" | "not_configured" | "resource_exhausted" | "unknown"
479479
error_message: string
480480
input_signature: string
481481
masked_args?: string
482482
duration_ms: number
483483
}
484+
// altimate_change start — FileTime observability: drift + assertion outcome tracking
485+
// Tracks the gap between Node.js wall-clock and filesystem mtime at read time.
486+
// Use to monitor whether the mtime clock-source change (PR #611) is preventing
487+
// false stale-file errors, and detect environments with significant drift.
488+
// KQL: customEvents | where name == "filetime_drift" | extend drift = todouble(customMeasurements.drift_ms)
489+
| {
490+
type: "filetime_drift"
491+
timestamp: number
492+
session_id: string
493+
/** Absolute difference in ms between Date.now() and filesystem mtime */
494+
drift_ms: number
495+
/** True if filesystem mtime is ahead of wall-clock (the problematic direction) */
496+
mtime_ahead: boolean
497+
}
498+
// Tracks FileTime.assert() outcomes: "stale" when a file fails the check.
499+
// High volume of "stale" with low delta_ms suggests tolerance is too tight.
500+
// KQL: customEvents | where name == "filetime_assert" | extend delta = todouble(customMeasurements.delta_ms)
501+
| {
502+
type: "filetime_assert"
503+
timestamp: number
504+
session_id: string
505+
outcome: "stale"
506+
/** mtime - readTime in ms (how far ahead the file's mtime is) */
507+
delta_ms: number
508+
/** Current tolerance threshold in ms */
509+
tolerance_ms: number
510+
}
511+
// altimate_change end
484512
// altimate_change start — sql quality telemetry for issue prevention metrics
485513
| {
486514
type: "sql_quality"
@@ -761,19 +789,31 @@ export namespace Telemetry {
761789
// altimate_change end
762790
{ class: "timeout", keywords: ["timeout", "etimedout", "bridge timeout", "timed out"] },
763791
{ class: "permission", keywords: ["permission", "access denied", "permission denied", "unauthorized", "forbidden", "authentication"] },
792+
// altimate_change start — http_error before validation to prevent "HTTP 404" matching "invalid"/"missing"
793+
{
794+
class: "http_error",
795+
keywords: ["status code: 4", "status code: 5", "request failed with status", "http 404", "http 410", "http 429", "http 451", "http 403"],
796+
},
797+
// altimate_change end
798+
// altimate_change start — split file_stale out of validation; remove "does not exist" (was catching HTTP 404s)
799+
{
800+
class: "file_stale",
801+
keywords: [
802+
"must read file",
803+
"has been modified since",
804+
"before overwriting",
805+
],
806+
},
764807
{
765808
class: "validation",
766809
keywords: [
767810
"invalid params",
768811
"invalid",
769812
"missing",
770813
"required",
771-
"must read file",
772-
"has been modified since",
773-
"does not exist",
774-
"before overwriting",
775814
],
776815
},
816+
// altimate_change end
777817
{ class: "internal", keywords: ["internal", "assertion"] },
778818
// altimate_change start — resource_exhausted class for OOM/quota errors
779819
{
@@ -788,10 +828,6 @@ export namespace Telemetry {
788828
],
789829
},
790830
// altimate_change end
791-
{
792-
class: "http_error",
793-
keywords: ["status code: 4", "status code: 5", "request failed with status"],
794-
},
795831
]
796832
// altimate_change end
797833

packages/opencode/src/file/time.ts

Lines changed: 84 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,48 @@ import { Instance } from "../project/instance"
22
import { Log } from "../util/log"
33
import { Flag } from "../flag/flag"
44
import { Filesystem } from "../util/filesystem"
5+
// altimate_change start — telemetry for FileTime drift measurement
6+
import { Telemetry } from "../altimate/telemetry"
7+
// altimate_change end
58

69
export namespace FileTime {
710
const log = Log.create({ service: "file.time" })
11+
12+
// altimate_change start — FileTime clock source change documentation
13+
//
14+
// CHANGE: read() now records the file's filesystem mtime instead of Date.now().
15+
//
16+
// WHY: The original code used `new Date()` (wall-clock) as the "last read" timestamp,
17+
// then assert() compared it against `Filesystem.stat(file).mtime`. On WSL (NTFS-over-9P),
18+
// networked drives, and some macOS APFS mounts, the filesystem clock drifts 400ms–1.2s
19+
// behind Node.js's clock. This caused mtime > readTime even for the file's own write,
20+
// triggering false "modified since last read" errors. One user hit 782 consecutive retries.
21+
//
22+
// FIX: Both timestamps now come from the same clock (filesystem mtime), eliminating
23+
// cross-clock skew. The tolerance is kept at 50ms (upstream default) since same-clock
24+
// comparisons don't need a larger window.
25+
//
26+
// TRADE-OFF: On coarse-resolution filesystems (HFS+ = 1s, NFS with stale cache,
27+
// Docker overlayfs after copy-up), two writes within the same resolution window
28+
// produce identical mtimes — so we'd miss a real external modification. This is a
29+
// false-negative risk (missed edit) vs the false-positive risk (retry loop) we're
30+
// fixing. Acceptable because: (a) the file gets re-read on the next attempt anyway,
31+
// (b) HFS+ is rare (macOS defaulted to APFS since 2017), and (c) the wall-clock
32+
// approach was actively causing 782-retry production loops on WSL.
33+
//
34+
// MONITORING: filetime_drift telemetry event tracks the gap between wall-clock and mtime
35+
// at read time. If drift_ms is consistently 0, this change has no effect (good). If
36+
// drift_ms shows large values, this change is preventing false positives (also good).
37+
// If file_stale errors increase post-deploy, the tolerance may need adjustment.
38+
//
39+
// UPSTREAM: sst/opencode issues #19040, #14183, #20354 track the same problem.
40+
// Upstream is pursuing processor-level recovery (PR #19099) rather than fixing the clock
41+
// source. Both approaches are complementary.
42+
//
43+
// ROLLBACK: Set OPENCODE_DISABLE_FILETIME_CHECK=true to bypass all checks, or revert
44+
// this change to restore `new Date()` behavior with a wider tolerance.
45+
// altimate_change end
46+
847
// Per-session read times plus per-file write locks.
948
// All tools that overwrite existing files should run their
1049
// assert/read/write/update sequence inside withLock(filepath, ...)
@@ -26,7 +65,32 @@ export namespace FileTime {
2665
log.info("read", { sessionID, file })
2766
const { read } = state()
2867
read[sessionID] = read[sessionID] || {}
29-
read[sessionID][file] = new Date()
68+
// altimate_change start — use filesystem mtime instead of wall-clock (see doc block above)
69+
const wallClock = new Date()
70+
const mtime = Filesystem.stat(file)?.mtime
71+
read[sessionID][file] = mtime ?? wallClock
72+
73+
// Track drift between wall-clock and filesystem mtime for monitoring.
74+
// This lets us measure the real-world impact of the clock source change
75+
// and detect environments where drift is significant.
76+
if (mtime) {
77+
const driftMs = Math.abs(wallClock.getTime() - mtime.getTime())
78+
if (driftMs > 10) {
79+
// Only emit when drift is non-trivial (>10ms) to avoid noise
80+
try {
81+
Telemetry.track({
82+
type: "filetime_drift",
83+
timestamp: Date.now(),
84+
session_id: sessionID,
85+
drift_ms: driftMs,
86+
mtime_ahead: mtime.getTime() > wallClock.getTime(),
87+
})
88+
} catch {
89+
// Telemetry must never break file operations
90+
}
91+
}
92+
}
93+
// altimate_change end
3094
}
3195

3296
export function get(sessionID: string, file: string) {
@@ -61,11 +125,27 @@ export namespace FileTime {
61125
const time = get(sessionID, filepath)
62126
if (!time) throw new Error(`You must read file ${filepath} before overwriting it. Use the Read tool first`)
63127
const mtime = Filesystem.stat(filepath)?.mtime
64-
// Allow a 50ms tolerance for Windows NTFS timestamp fuzziness / async flushing
65-
if (mtime && mtime.getTime() > time.getTime() + 50) {
128+
// altimate_change start — keep upstream's 50ms tolerance (sufficient now that both
129+
// timestamps come from the same filesystem clock). Track assertion outcomes for monitoring.
130+
const toleranceMs = 50
131+
const deltaMs = mtime ? mtime.getTime() - time.getTime() : 0
132+
if (mtime && deltaMs > toleranceMs) {
133+
try {
134+
Telemetry.track({
135+
type: "filetime_assert",
136+
timestamp: Date.now(),
137+
session_id: sessionID,
138+
outcome: "stale",
139+
delta_ms: deltaMs,
140+
tolerance_ms: toleranceMs,
141+
})
142+
} catch {
143+
// Telemetry must never mask the stale-file error
144+
}
66145
throw new Error(
67-
`File ${filepath} has been modified since it was last read.\nLast modification: ${mtime.toISOString()}\nLast read: ${time.toISOString()}\n\nPlease read the file again before modifying it.`,
146+
`File ${filepath} has been modified since it was last read.\nLast modification: ${mtime.toISOString()}\nLast read: ${time.toISOString()}\nDelta: ${deltaMs}ms (tolerance: ${toleranceMs}ms)\n\nPlease read the file again before modifying it.`,
68147
)
69148
}
149+
// altimate_change end
70150
}
71151
}

packages/opencode/test/file/time.test.ts

Lines changed: 31 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -112,18 +112,19 @@ describe("file/time", () => {
112112
fn: async () => {
113113
FileTime.read(sessionID, filepath)
114114

115-
// Wait to ensure different timestamps
116-
await new Promise((resolve) => setTimeout(resolve, 100))
117-
118-
// Modify file after reading
115+
// Simulate external modification by setting mtime 5s ahead of the
116+
// file's current mtime (read() stores mtime, assert() compares
117+
// against current mtime — both from filesystem clock).
119118
await fs.writeFile(filepath, "modified content", "utf-8")
119+
const future = new Date(Date.now() + 5000)
120+
await fs.utimes(filepath, future, future)
120121

121122
await expect(FileTime.assert(sessionID, filepath)).rejects.toThrow("modified since it was last read")
122123
},
123124
})
124125
})
125126

126-
test("includes timestamps in error message", async () => {
127+
test("includes timestamps and delta in error message", async () => {
127128
await using tmp = await tmpdir()
128129
const filepath = path.join(tmp.path, "file.txt")
129130
await fs.writeFile(filepath, "content", "utf-8")
@@ -132,8 +133,9 @@ describe("file/time", () => {
132133
directory: tmp.path,
133134
fn: async () => {
134135
FileTime.read(sessionID, filepath)
135-
await new Promise((resolve) => setTimeout(resolve, 100))
136136
await fs.writeFile(filepath, "modified", "utf-8")
137+
const future = new Date(Date.now() + 5000)
138+
await fs.utimes(filepath, future, future)
137139

138140
let error: Error | undefined
139141
try {
@@ -144,6 +146,8 @@ describe("file/time", () => {
144146
expect(error).toBeDefined()
145147
expect(error!.message).toContain("Last modification:")
146148
expect(error!.message).toContain("Last read:")
149+
expect(error!.message).toContain("Delta:")
150+
expect(error!.message).toContain("tolerance:")
147151
},
148152
})
149153
})
@@ -346,9 +350,10 @@ describe("file/time", () => {
346350

347351
const originalStat = Filesystem.stat(filepath)
348352

349-
// Wait and modify
350-
await new Promise((resolve) => setTimeout(resolve, 100))
353+
// Simulate external modification with mtime far ahead
351354
await fs.writeFile(filepath, "modified", "utf-8")
355+
const future = new Date(Date.now() + 5000)
356+
await fs.utimes(filepath, future, future)
352357

353358
const newStat = Filesystem.stat(filepath)
354359
expect(newStat!.mtime.getTime()).toBeGreaterThan(originalStat!.mtime.getTime())
@@ -357,5 +362,23 @@ describe("file/time", () => {
357362
},
358363
})
359364
})
365+
366+
test("read() stores filesystem mtime, not wall-clock time", async () => {
367+
await using tmp = await tmpdir()
368+
const filepath = path.join(tmp.path, "file.txt")
369+
await fs.writeFile(filepath, "content", "utf-8")
370+
371+
await Instance.provide({
372+
directory: tmp.path,
373+
fn: async () => {
374+
const statBefore = Filesystem.stat(filepath)
375+
FileTime.read(sessionID, filepath)
376+
const readTime = FileTime.get(sessionID, filepath)
377+
378+
// read() should store the file's mtime, not wall-clock
379+
expect(readTime!.getTime()).toBe(statBefore!.mtime.getTime())
380+
},
381+
})
382+
})
360383
})
361384
})

packages/opencode/test/telemetry/telemetry.test.ts

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1628,11 +1628,13 @@ describe("telemetry.classifyError", () => {
16281628
expect(Telemetry.classifyError("Invalid dialect specified")).toBe("validation")
16291629
expect(Telemetry.classifyError("Missing required field")).toBe("validation")
16301630
expect(Telemetry.classifyError("Required parameter 'query' not provided")).toBe("validation")
1631-
// altimate_change start — expanded validation patterns
1632-
expect(Telemetry.classifyError("You must read file /path/to/file before overwriting it")).toBe("validation")
1633-
expect(Telemetry.classifyError("File has been modified since it was last read")).toBe("validation")
1634-
expect(Telemetry.classifyError("error: column foo does not exist")).toBe("validation")
1635-
expect(Telemetry.classifyError("You must read file before overwriting it. Use the Read tool first")).toBe("validation")
1631+
// altimate_change start — file_stale split out from validation; "does not exist" moved to http_error
1632+
// These are now classified as file_stale, not validation
1633+
expect(Telemetry.classifyError("You must read file /path/to/file before overwriting it")).toBe("file_stale")
1634+
expect(Telemetry.classifyError("File has been modified since it was last read")).toBe("file_stale")
1635+
expect(Telemetry.classifyError("You must read file before overwriting it. Use the Read tool first")).toBe("file_stale")
1636+
// HTTP 404 "does not exist" is now http_error
1637+
expect(Telemetry.classifyError("HTTP 404: https://example.com/page does not exist")).toBe("http_error")
16361638
// altimate_change end
16371639
})
16381640

@@ -1651,12 +1653,25 @@ describe("telemetry.classifyError", () => {
16511653
expect(Telemetry.classifyError("Assertion failed: x > 0")).toBe("internal")
16521654
})
16531655

1656+
// altimate_change start — file_stale class tests
1657+
test("classifies file_stale errors", () => {
1658+
expect(Telemetry.classifyError("You must read file /path/to/file before overwriting it")).toBe("file_stale")
1659+
expect(Telemetry.classifyError("File /foo.ts has been modified since it was last read")).toBe("file_stale")
1660+
expect(Telemetry.classifyError("Read the file before overwriting it")).toBe("file_stale")
1661+
})
1662+
// altimate_change end
1663+
16541664
// altimate_change start — http_error class and priority ordering tests
16551665
test("classifies http errors", () => {
16561666
expect(Telemetry.classifyError("Request failed with status code: 404 (example.com)")).toBe("http_error")
16571667
expect(Telemetry.classifyError("Request failed with status code: 500")).toBe("http_error")
16581668
expect(Telemetry.classifyError("status code: 403")).toBe("http_error")
16591669
expect(Telemetry.classifyError("Request failed with status")).toBe("http_error")
1670+
// altimate_change start — HTTP status codes in webfetch error messages
1671+
expect(Telemetry.classifyError("HTTP 404: https://example.com does not exist. Do NOT retry")).toBe("http_error")
1672+
expect(Telemetry.classifyError("HTTP 410: https://example.com has been permanently removed")).toBe("http_error")
1673+
expect(Telemetry.classifyError("HTTP 429: Rate limited by example.com")).toBe("http_error")
1674+
// altimate_change end
16601675
})
16611676

16621677
test("priority ordering: earlier patterns win over later ones", () => {

packages/opencode/test/tool/edit.test.ts

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -238,11 +238,11 @@ describe("tool.edit", () => {
238238
// Read first
239239
FileTime.read(ctx.sessionID, filepath)
240240

241-
// Wait a bit to ensure different timestamps
242-
await new Promise((resolve) => setTimeout(resolve, 100))
243-
244-
// Simulate external modification
241+
// Simulate external modification with mtime 5s in the future
242+
// to exceed the 50ms tolerance in FileTime.assert()
245243
await fs.writeFile(filepath, "modified externally", "utf-8")
244+
const future = new Date(Date.now() + 5000)
245+
await fs.utimes(filepath, future, future)
246246

247247
// Try to edit with the new content
248248
const edit = await EditTool.init()

0 commit comments

Comments
 (0)