|
| 1 | +//! Shared file-based lock used to serialize the three operations that |
| 2 | +//! mutate per-profile auth state: switch, refresh's auth-rotation |
| 3 | +//! tail, and the per-card login flow. They cannot interleave safely — |
| 4 | +//! a switch mid-login (or vice versa) would race two writers against |
| 5 | +//! `account_backup/<profile>/auth.json` and possibly `~/.codex/auth.json`. |
| 6 | +//! |
| 7 | +//! The lock is a single file (`get_switch_lock_path` — kept named for |
| 8 | +//! historical compatibility) that all three holders create with |
| 9 | +//! `O_EXCL`. The first holder wins; everyone else gets a |
| 10 | +//! caller-supplied error code so the UI can render context-appropriate |
| 11 | +//! copy ("switch in progress" vs. "login in progress") even though |
| 12 | +//! the underlying contention is the same. |
| 13 | +//! |
| 14 | +//! ## Stale cleanup |
| 15 | +//! |
| 16 | +//! Without cleanup, any holder that crashes (force-quit, OS logout, |
| 17 | +//! browser-cancelled OAuth that left the parent process spinning) |
| 18 | +//! permanently bricks every future operation until the user manually |
| 19 | +//! deletes the file. The 1.6.x line had this fix; rolling back to |
| 20 | +//! 1.5.x dropped it. We restore it here with a single threshold — |
| 21 | +//! locks older than `STALE_LOCK_AGE` are reclaimed by the next caller. |
| 22 | +//! |
| 23 | +//! Threshold = 5 minutes. Long enough for a slow OAuth login (browser |
| 24 | +//! opened, user reads docs, eventually clicks Authorize). Short enough |
| 25 | +//! that a wedged login won't strand the user across an app restart. |
| 26 | +//! If a real login is still legitimately in flight at minute 5+, the |
| 27 | +//! reclaim races against its eventual auth.json write — but at that |
| 28 | +//! point the user has clearly chosen "I want to switch *now*", and a |
| 29 | +//! second click that succeeds is more useful than a 12-hour-stuck |
| 30 | +//! lock. (Future work: store the holder identity in the lock body so |
| 31 | +//! we can distinguish "stuck" from "still working.") |
| 32 | +
|
| 33 | +use std::fs::OpenOptions; |
| 34 | +use std::path::{Path, PathBuf}; |
| 35 | +use std::time::{Duration, SystemTime}; |
| 36 | + |
| 37 | +use crate::errors::{AppError, AppResult}; |
| 38 | + |
| 39 | +use super::paths::get_switch_lock_path; |
| 40 | + |
/// Minimum age of an existing lock file, measured from its modification
/// time, at which the next caller treats it as abandoned and reclaims it.
/// Five minutes.
const STALE_LOCK_AGE: Duration = Duration::from_secs(5 * 60);
| 42 | + |
/// Handle representing ownership of the shared switch / login lock file.
///
/// The lock stays held for exactly as long as this value is alive: the
/// `Drop` implementation deletes the file at `lock_path`. Bind the guard to
/// a named variable (`let _guard = ...`) for the duration of the critical
/// section; discarding it releases the lock immediately, which is why the
/// type is marked `#[must_use]`.
#[derive(Debug)]
#[must_use = "the lock is released as soon as the guard is dropped"]
pub struct ProcessLockGuard {
    /// Location of the lock file that is removed when the guard is dropped.
    lock_path: PathBuf,
}
| 47 | + |
| 48 | +impl Drop for ProcessLockGuard { |
| 49 | + fn drop(&mut self) { |
| 50 | + let _ = std::fs::remove_file(&self.lock_path); |
| 51 | + } |
| 52 | +} |
| 53 | + |
/// Attempts to create the lock file at `lock_path`, failing if it already
/// exists.
///
/// The file is opened with `create_new(true)`, so an existing file yields an
/// `std::io::ErrorKind::AlreadyExists` error instead of being reused. The
/// file handle is closed before returning; only the file's presence on disk
/// acts as the lock.
fn try_create_lock(lock_path: &Path) -> std::io::Result<()> {
    let mut options = OpenOptions::new();
    options.write(true).create_new(true);
    let _handle = options.open(lock_path)?;
    Ok(())
}
| 61 | + |
| 62 | +fn lock_is_stale(lock_path: &Path) -> bool { |
| 63 | + let metadata = match std::fs::metadata(lock_path) { |
| 64 | + Ok(value) => value, |
| 65 | + Err(_) => return false, |
| 66 | + }; |
| 67 | + let modified = match metadata.modified() { |
| 68 | + Ok(value) => value, |
| 69 | + Err(_) => return false, |
| 70 | + }; |
| 71 | + SystemTime::now() |
| 72 | + .duration_since(modified) |
| 73 | + .map(|age| age >= STALE_LOCK_AGE) |
| 74 | + .unwrap_or(false) |
| 75 | +} |
| 76 | + |
| 77 | +/// Acquire the shared switch / login lock. The returned guard releases |
| 78 | +/// the lock on drop. `busy_error_code` is the error code returned to |
| 79 | +/// the caller (and ultimately the front-end) when contention is real |
| 80 | +/// — `SWITCH_IN_PROGRESS` for switch, `LOGIN_BUSY` for login. |
| 81 | +/// `busy_message` is the human-readable companion. |
| 82 | +pub fn acquire_process_lock( |
| 83 | + codex_home: Option<&Path>, |
| 84 | + busy_error_code: &'static str, |
| 85 | + busy_message: &'static str, |
| 86 | +) -> AppResult<ProcessLockGuard> { |
| 87 | + let lock_path = get_switch_lock_path(codex_home); |
| 88 | + if let Some(parent) = lock_path.parent() { |
| 89 | + std::fs::create_dir_all(parent).map_err(|error| { |
| 90 | + AppError::new( |
| 91 | + "FS_CREATE_FAILED", |
| 92 | + format!( |
| 93 | + "Failed to create lock directory {}: {error}", |
| 94 | + parent.display() |
| 95 | + ), |
| 96 | + ) |
| 97 | + })?; |
| 98 | + } |
| 99 | + |
| 100 | + if let Err(error) = try_create_lock(&lock_path) { |
| 101 | + // The cheap branch is "real concurrent contention" — that's |
| 102 | + // what AlreadyExists with a fresh lock looks like. The other |
| 103 | + // branch we observed in the wild is the GUI dying mid-switch |
| 104 | + // (force quit / OS logout / hung OAuth) and leaving the lock |
| 105 | + // behind, which then permanently blocks every future caller. |
| 106 | + // Detect that by mtime and reclaim. |
| 107 | + if error.kind() == std::io::ErrorKind::AlreadyExists && lock_is_stale(&lock_path) { |
| 108 | + let _ = std::fs::remove_file(&lock_path); |
| 109 | + try_create_lock(&lock_path).map_err(|retry_error| { |
| 110 | + AppError::new( |
| 111 | + busy_error_code, |
| 112 | + format!( |
| 113 | + "Stale {busy_error_code} lock cleanup failed: {retry_error}. \ |
| 114 | + Another operation may have started in the meantime." |
| 115 | + ), |
| 116 | + ) |
| 117 | + })?; |
| 118 | + } else { |
| 119 | + return Err(AppError::new(busy_error_code, busy_message)); |
| 120 | + } |
| 121 | + } |
| 122 | + |
| 123 | + Ok(ProcessLockGuard { lock_path }) |
| 124 | +} |
| 125 | + |
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use std::time::{SystemTime, UNIX_EPOCH};

    /// Creates a uniquely named scratch directory (with an `account_backup`
    /// subdirectory) under the system temp dir and returns its path.
    fn temp_codex_home(name: &str) -> PathBuf {
        let unique = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap()
            .as_nanos();
        let path = std::env::temp_dir()
            .join(format!("codex-switch-process-lock-{name}-{unique}"));
        fs::create_dir_all(path.join("account_backup")).unwrap();
        path
    }

    /// Moves the mtime of `path` to `age` before now using only the standard
    /// library, so the stale-lock tests run on every platform and fail
    /// loudly instead of silently skipping.
    fn backdate(path: &Path, age: Duration) {
        let target = SystemTime::now()
            .checked_sub(age)
            .expect("back-dated timestamp must be representable");
        OpenOptions::new()
            .write(true)
            .open(path)
            .expect("open lock file for back-dating")
            .set_modified(target)
            .expect("set lock file mtime");
    }

    #[test]
    fn first_acquire_succeeds_and_drop_releases() {
        let codex_home = temp_codex_home("first-acquire");
        let lock_path = get_switch_lock_path(Some(&codex_home));

        {
            let _guard = acquire_process_lock(Some(&codex_home), "SWITCH_IN_PROGRESS", "busy")
                .expect("first acquire should succeed");
            assert!(lock_path.is_file(), "lock file must exist while held");
        }
        assert!(!lock_path.exists(), "lock file must be removed on drop");
    }

    #[test]
    fn second_concurrent_acquire_returns_busy_with_caller_supplied_code() {
        let codex_home = temp_codex_home("concurrent");
        let _first = acquire_process_lock(Some(&codex_home), "SWITCH_IN_PROGRESS", "busy switch")
            .expect("first acquire");
        let err = acquire_process_lock(Some(&codex_home), "LOGIN_BUSY", "busy login")
            .expect_err("second acquire should fail");
        assert_eq!(err.error_code, "LOGIN_BUSY");
        assert_eq!(err.message, "busy login");
    }

    #[test]
    fn fresh_lock_is_not_flagged_stale() {
        // Pins the "not stale" side of the predicate; the "stale" side and
        // the delete + reacquire flow are covered by the tests below.
        let codex_home = temp_codex_home("stale-predicate");
        let lock_path = get_switch_lock_path(Some(&codex_home));
        std::fs::write(&lock_path, b"").unwrap();
        assert!(
            !lock_is_stale(&lock_path),
            "freshly created lock must not be flagged as stale"
        );
    }

    #[test]
    fn aged_lock_below_threshold_still_blocks() {
        // Four minutes is older than "fresh" but under the five-minute
        // threshold, so the lock must still count as held.
        let codex_home = temp_codex_home("below-threshold");
        let lock_path = get_switch_lock_path(Some(&codex_home));
        std::fs::write(&lock_path, b"").unwrap();
        backdate(&lock_path, Duration::from_secs(4 * 60));

        assert!(
            !lock_is_stale(&lock_path),
            "lock with 4-minute-old mtime must not be flagged stale"
        );
        let err = acquire_process_lock(Some(&codex_home), "SWITCH_IN_PROGRESS", "busy")
            .expect_err("non-stale lock must not be reclaimed");
        assert_eq!(err.error_code, "SWITCH_IN_PROGRESS");
        assert!(lock_path.is_file(), "existing lock must be left in place");
    }

    #[test]
    fn integration_stale_lock_is_swept_when_mtime_old_enough() {
        let codex_home = temp_codex_home("stale-integration");
        let lock_path = get_switch_lock_path(Some(&codex_home));
        std::fs::write(&lock_path, b"").unwrap();
        backdate(&lock_path, Duration::from_secs(6 * 60));

        assert!(
            lock_is_stale(&lock_path),
            "lock with 6-minute-old mtime must be flagged stale"
        );

        {
            let _guard = acquire_process_lock(Some(&codex_home), "SWITCH_IN_PROGRESS", "busy")
                .expect("stale lock must be reclaimable");
            assert!(lock_path.is_file());
            assert!(
                !lock_is_stale(&lock_path),
                "reclaimed lock must be a fresh file"
            );
        }
        assert!(!lock_path.exists(), "reclaimed lock must be removed on drop");
    }
}
0 commit comments