@@ -17,24 +17,77 @@ package flock
1717import (
1818 "context"
1919 "errors"
20+ "io/fs"
2021 "os"
2122 "sync"
2223 "time"
2324
2425 "github.com/cenkalti/backoff/v5"
26+
27+ "github.com/docker/secrets-engine/x/logging"
2528)
2629
30+ // noopLogger discards every record. It is the [logging.Logger]
31+ // implementation used when no logger is provided in the context passed
32+ // to [TryLock] or [TryRLock], so the package never writes to the
33+ // caller's stderr unsolicited.
34+ type noopLogger struct {}
35+
36+ func (noopLogger ) Printf (string , ... any ) {}
37+ func (noopLogger ) Warnf (string , ... any ) {}
38+ func (noopLogger ) Errorf (string , ... any ) {}
39+
40+ // loggerFromCtx returns the [logging.Logger] stored on ctx by
41+ // [logging.WithLogger], or a [noopLogger] when none is set. The library
42+ // must never log to a default sink — silence is the safe default for a
43+ // dependency.
44+ func loggerFromCtx (ctx context.Context ) logging.Logger {
45+ if l , err := logging .FromContext (ctx ); err == nil {
46+ return l
47+ }
48+ return noopLogger {}
49+ }
50+
2751var (
2852 ErrLockUnsuccessful = errors .New ("store is locked" )
2953 ErrUnlockUnsuccessful = errors .New ("could not unlock store" )
54+
55+ // errStaleInode indicates that the file we flocked is no longer the
56+ // file at the lock-file path. This happens when another caller's
57+ // stale-recovery unlinked the file between our open and our flock.
58+ // Locking an unlinked inode would leave us holding a "ghost" lock
59+ // that no other caller can observe.
60+ errStaleInode = errors .New ("lock file inode changed under us" )
3061)
3162
3263const (
3364 lockFileName = ".posixage.lock"
3465)
3566
36- // UnlockFunc is the callback function returned by [TryLock] and [TryRLock]
37- // it should always be called inside a defer.
67+ // heartbeatInterval is how often the [heartbeat] goroutine refreshes the
68+ // lock file's modtime while a caller holds the lock. It must be well
69+ // below [staleThreshold] so that a holder which misses a tick or two
70+ // (GC pause, brief scheduler starvation) still appears live to concurrent
71+ // recovery callers.
72+ //
73+ // Exposed as a var rather than a const so tests can shorten it.
74+ var heartbeatInterval = 10 * time .Second
75+
76+ // UnlockFunc is the callback returned by [TryLock] and [TryRLock]. It
77+ // releases the advisory lock, closes the underlying file descriptor, and
78+ // stops the background heartbeat goroutine that refreshes the lock
79+ // file's modtime.
80+ //
81+ // Callers MUST invoke this function exactly once, typically via defer
82+ // immediately after a successful lock acquisition. Failing to call it
83+ // leaks both the file descriptor and the heartbeat goroutine for the
84+ // remaining lifetime of the process — the goroutine has no other
85+ // termination signal. Calling it more than once is safe and idempotent;
86+ // only the first call performs the release.
87+ //
88+ // The returned error reflects the unlock/close step only. The heartbeat
89+ // goroutine is always stopped and joined before the unlock is attempted,
90+ // so the file descriptor is never touched after it has been closed.
3891type UnlockFunc func () error
3992
4093// openFile is a helper function for internal use by [tryLock]
@@ -49,88 +102,171 @@ func openFile(root *os.Root) (*os.File, error) {
49102 return fl , nil
50103}
51104
52- func tryLock (ctx context.Context , root * os.Root , exclusive bool ) (UnlockFunc , error ) {
53- var (
54- err error
55- fl * os.File
56- )
57-
58- defer func () {
59- // we must always close the file on any error returned
60- if err != nil && fl != nil {
61- _ = fl .Close ()
62- }
63- }()
105+ // acquireOnce performs a single lock acquisition attempt and verifies the
106+ // resulting lock is on the file currently at the lock-file path.
107+ //
108+ // The sequence is open -> flock -> truncate -> compare-inodes. If any step
109+ // fails the function releases the flock (when held) and closes the fd
110+ // before returning. The returned [os.File] is the locked descriptor; the
111+ // caller is responsible for unlocking and closing it.
112+ //
113+ // The inode check is what prevents the "ghost lock" race: when a
114+ // concurrent stale-recovery unlinks the file between our [openFile] and
115+ // our [lockFile] call, [lockFile] will succeed on the unlinked inode but
116+ // the path will resolve to a brand-new inode. Treating that as a failure
117+ // forces the caller to drop the bad lock and try again with a fresh fd.
118+ func acquireOnce (root * os.Root , exclusive bool ) (* os.File , error ) {
119+ fl , err := openFile (root )
120+ if err != nil {
121+ return nil , err
122+ }
64123
65- fl , err = openFile (root )
124+ if err := lockFile (fl .Fd (), exclusive ); err != nil {
125+ _ = fl .Close ()
126+ return nil , err
127+ }
128+
129+ // Truncate first so the modtime refresh is visible to any concurrent
130+ // recovery caller before we check the inode. Doing it the other way
131+ // round leaves a window where a passing inode check is followed by an
132+ // unlink before truncate, and the caller walks away with a lock on an
133+ // already-orphaned inode.
134+ _ = fl .Truncate (0 )
135+
136+ same , err := isCurrentLockFile (fl , root )
66137 if err != nil {
138+ _ = releaseLock (fl )
139+ _ = fl .Close ()
67140 return nil , err
68141 }
142+ if ! same {
143+ _ = releaseLock (fl )
144+ _ = fl .Close ()
145+ return nil , errStaleInode
146+ }
147+ return fl , nil
148+ }
69149
70- if err = lockFile (fl .Fd (), exclusive ); err == nil {
71- // truncate to update the modtime to signal to other processes that the
72- // current lock is valid so they don't attempt a recovery on it.
73- _ = fl .Truncate (0 )
74- return sync .OnceValue (func () error {
75- return unlockFile (fl )
76- }), nil
150+ // isCurrentLockFile reports whether the locked descriptor [fl] still refers
151+ // to the file at the lock-file path. It returns false when the path no
152+ // longer exists or has been replaced by a different inode.
153+ func isCurrentLockFile (fl * os.File , root * os.Root ) (bool , error ) {
154+ fdInfo , err := fl .Stat ()
155+ if err != nil {
156+ return false , err
157+ }
158+ pathInfo , err := root .Stat (lockFileName )
159+ if err != nil {
160+ if errors .Is (err , fs .ErrNotExist ) {
161+ return false , nil
162+ }
163+ return false , err
77164 }
78- err = errors .Join (ErrLockUnsuccessful , err )
165+ return os .SameFile (fdInfo , pathInfo ), nil
166+ }
79167
80- if ctx .Err () != nil {
81- return nil , errors .Join (err , ctx .Err ())
168+ func tryLock (ctx context.Context , root * os.Root , exclusive bool ) (UnlockFunc , error ) {
169+ logger := loggerFromCtx (ctx )
170+ fl , err := acquireOnce (root , exclusive )
171+ if err == nil {
172+ return startHeartbeat (fl , root , logger ), nil
82173 }
174+ firstErr := errors .Join (ErrLockUnsuccessful , err )
83175
84- // recoverStaleLock always closes fl; clear our reference so the deferred
85- // close-on-error cleanup does not double-close it.
86- recoverErr := recoverStaleLock (root , fl )
87- fl = nil
88- if recoverErr != nil && ! errors .Is (recoverErr , errRecoverLock ) {
89- return nil , errors .Join (err , recoverErr )
176+ if ctxErr := ctx .Err (); ctxErr != nil {
177+ return nil , errors .Join (firstErr , ctxErr )
90178 }
91179
92- fl , err = openFile (root )
93- if err != nil {
94- return nil , err
180+ if recoverErr := recoverStaleLock (root ); recoverErr != nil && ! errors .Is (recoverErr , errRecoverLock ) {
181+ return nil , errors .Join (firstErr , recoverErr )
95182 }
96183
97- err = retryLock (ctx , fl , exclusive )
184+ fl , err = retryLock (ctx , root , exclusive )
98185 if err != nil {
99186 return nil , err
100187 }
188+ return startHeartbeat (fl , root , logger ), nil
189+ }
101190
191+ // startHeartbeat launches the modtime-refresh goroutine for a locked file
192+ // and returns an [UnlockFunc] that stops the goroutine, waits for it to
193+ // exit, and then unlocks/closes the file. The wait ensures the goroutine
194+ // never touches the fd after [unlockFile] closes it.
195+ //
196+ // The supplied logger is used by the goroutine to surface truncate
197+ // failures and inode-mismatch hijacks. A [noopLogger] is acceptable when
198+ // the caller has no logging plumbed.
199+ func startHeartbeat (fl * os.File , root * os.Root , logger logging.Logger ) UnlockFunc {
200+ hbCtx , stop := context .WithCancel (context .Background ())
201+ done := make (chan struct {})
202+ go func () {
203+ defer close (done )
204+ heartbeat (hbCtx , fl , root , logger )
205+ }()
102206 return sync .OnceValue (func () error {
207+ stop ()
208+ <- done
103209 return unlockFile (fl )
104- }), nil
210+ })
105211}
106212
107- // retryLock attempts to acquire an advisory lock on the given file
108- // using flock, retrying until the context is canceled or the lock is acquired.
213+ // heartbeat re-truncates the locked file every [heartbeatInterval] so its
214+ // modtime stays younger than [staleThreshold] for the lifetime of the
215+ // lock. Without this, a holder doing work that exceeds [staleThreshold]
216+ // would be misclassified as stale by concurrent callers, which would
217+ // unlink the lock file and let a fresh inode be created at the same path
218+ // — the holder-side half of the ghost-lock race.
109219//
110- // Retries use exponential backoff with a maximum delay of 100ms
111- // between attempts.
220+ // Each tick also re-verifies the locked descriptor still refers to the
221+ // file at the lock-file path. A mismatch means we have been hijacked
222+ // (heartbeat starved past [staleThreshold] long enough for recovery to
223+ // fire). There is no in-band way to fail the caller's outstanding
224+ // operation, so the mismatch is logged via the supplied [logging.Logger]
225+ // and the goroutine keeps running — surfacing the hijack is the best we
226+ // can do.
112227//
113- // Set exclusive to true for write or delete operations to prevent
114- // concurrent reads.
115- func retryLock (ctx context.Context , f * os.File , exclusive bool ) error {
228+ // The goroutine returns when ctx is canceled by [startHeartbeat]'s
229+ // returned [UnlockFunc].
230+ func heartbeat (ctx context.Context , fl * os.File , root * os.Root , logger logging.Logger ) {
231+ ticker := time .NewTicker (heartbeatInterval )
232+ defer ticker .Stop ()
233+ for {
234+ select {
235+ case <- ctx .Done ():
236+ return
237+ case <- ticker .C :
238+ if err := fl .Truncate (0 ); err != nil {
239+ logger .Warnf ("flock heartbeat: truncate failed: %v" , err )
240+ continue
241+ }
242+ same , err := isCurrentLockFile (fl , root )
243+ if err != nil {
244+ logger .Warnf ("flock heartbeat: inode verify failed: %v" , err )
245+ continue
246+ }
247+ if ! same {
248+ logger .Warnf ("flock heartbeat: lock file inode changed under us; lock has likely been hijacked" )
249+ }
250+ }
251+ }
252+ }
253+
254+ // retryLock loops [acquireOnce] with exponential backoff until ctx is
255+ // canceled or a verified lock is obtained. Each iteration opens a fresh
256+ // fd, so a [errStaleInode] result simply causes the next attempt to start
257+ // over against whatever file is currently at the path.
258+ func retryLock (ctx context.Context , root * os.Root , exclusive bool ) (* os.File , error ) {
116259 ep := backoff .NewExponentialBackOff ()
117260 ep .InitialInterval = time .Millisecond * 10
118261 ep .MaxInterval = time .Millisecond * 100
119- _ , err := backoff .Retry (ctx , func () (bool , error ) {
120- if err := lockFile (f .Fd (), exclusive ); err != nil {
121- return false , err
122- }
123- return true , nil
262+
263+ fl , err := backoff .Retry (ctx , func () (* os.File , error ) {
264+ return acquireOnce (root , exclusive )
124265 }, backoff .WithBackOff (ep ), backoff .WithMaxElapsedTime (0 ))
125266 if err != nil {
126- return errors .Join (ErrLockUnsuccessful , err )
267+ return nil , errors .Join (ErrLockUnsuccessful , err )
127268 }
128-
129- // truncate to update the modtime to signal to other processes that the
130- // current lock is valid so they don't attempt a recovery on it.
131- _ = f .Truncate (0 )
132-
133- return nil
269+ return fl , nil
134270}
135271
136272// TryLock acquires an exclusive advisory lock on a lock file.
@@ -140,11 +276,31 @@ func retryLock(ctx context.Context, f *os.File, exclusive bool) error {
140276// lock is acquired.
141277//
142278// As a safeguard, the function attempts to recover from stale locks,
143- // defined as lock files older than 30s. Stale lock recovery is skipped when
144- // ctx has been canceled. If recovery fails, manual intervention may be
145- // required.
279+ // defined as lock files older than 30s. While the lock is held, a
280+ // background goroutine refreshes the lock file's modtime every 10s so
281+ // long-running operations are not misclassified as stale. Stale lock
282+ // recovery is skipped when ctx has been canceled. If recovery fails,
283+ // manual intervention may be required.
146284//
147- // It returns an unlock function that must be called to release the lock.
285+ // The heartbeat goroutine surfaces truncate failures and hijack
286+ // detections through the [logging.Logger] stored on ctx via
287+ // [logging.WithLogger]. When no logger is set, those events are dropped
288+ // silently — the package never writes to a default sink.
289+ //
290+ // On success, the returned [UnlockFunc] MUST be called exactly once to
291+ // release the lock, close the file descriptor, and stop the heartbeat
292+ // goroutine. The idiomatic pattern is to defer it immediately:
293+ //
294+ // ctx = logging.WithLogger(ctx, myLogger) // optional
295+ // unlock, err := flock.TryLock(ctx, root)
296+ // if err != nil {
297+ // return err
298+ // }
299+ // defer unlock()
300+ //
301+ // Failing to call the returned function leaks both the file descriptor
302+ // and the heartbeat goroutine for the remaining lifetime of the process.
303+ // See [UnlockFunc] for details.
148304func TryLock (ctx context.Context , root * os.Root ) (UnlockFunc , error ) {
149305 return tryLock (ctx , root , true )
150306}
@@ -156,11 +312,31 @@ func TryLock(ctx context.Context, root *os.Root) (UnlockFunc, error) {
156312// lock is acquired.
157313//
158314// As a safeguard, the function attempts to recover from stale locks,
159- // defined as lock files older than 30s. Stale lock recovery is skipped when
160- // ctx has been canceled. If recovery fails, manual intervention may be
161- // required.
315+ // defined as lock files older than 30s. While the lock is held, a
316+ // background goroutine refreshes the lock file's modtime every 10s so
317+ // long-running operations are not misclassified as stale. Stale lock
318+ // recovery is skipped when ctx has been canceled. If recovery fails,
319+ // manual intervention may be required.
320+ //
321+ // The heartbeat goroutine surfaces truncate failures and hijack
322+ // detections through the [logging.Logger] stored on ctx via
323+ // [logging.WithLogger]. When no logger is set, those events are dropped
324+ // silently — the package never writes to a default sink.
325+ //
326+ // On success, the returned [UnlockFunc] MUST be called exactly once to
327+ // release the lock, close the file descriptor, and stop the heartbeat
328+ // goroutine. The idiomatic pattern is to defer it immediately:
329+ //
330+ // ctx = logging.WithLogger(ctx, myLogger) // optional
331+ // unlock, err := flock.TryRLock(ctx, root)
332+ // if err != nil {
333+ // return err
334+ // }
335+ // defer unlock()
162336//
163- // It returns an unlock function that must be called to release the lock.
337+ // Failing to call the returned function leaks both the file descriptor
338+ // and the heartbeat goroutine for the remaining lifetime of the process.
339+ // See [UnlockFunc] for details.
164340func TryRLock (ctx context.Context , root * os.Root ) (UnlockFunc , error ) {
165341 return tryLock (ctx , root , false )
166342}
0 commit comments