diff --git a/scarb/Cargo.toml b/scarb/Cargo.toml index c05be6f3d..63c256111 100644 --- a/scarb/Cargo.toml +++ b/scarb/Cargo.toml @@ -152,3 +152,4 @@ zip.workspace = true [features] default = ["scarb-lint"] scarb-lint = ["dep:cairo-lint"] +test-utils = [] diff --git a/scarb/src/ops/cache.rs b/scarb/src/ops/cache.rs index 1804a81f4..be4dd14b9 100644 --- a/scarb/src/ops/cache.rs +++ b/scarb/src/ops/cache.rs @@ -2,6 +2,16 @@ use anyhow::{Context, Result}; use crate::core::Config; +// Name of the package cache lock file inside the cache dir. Kept in sync with +// `Config::package_cache_lock`. `cache_clean` must NOT unlink this file: the +// `flock` held by the in-progress clean is per-inode, and unlinking allows a +// concurrent `acquire_async` to create a different inode at the same path +// and succeed against it as if uncontended — both processes would then +// believe they hold the exclusive lock at the same time. The path is also +// part of the on-disk contract with older scarb versions, so we cannot move +// it without losing cross-version mutual exclusion. +const PACKAGE_CACHE_LOCK_FILENAME: &str = ".package-cache.lock"; + #[tracing::instrument(skip_all, level = "debug")] pub fn cache_clean(config: &Config) -> Result<()> { let path = config.dirs().cache_dir.path_unchecked(); @@ -9,7 +19,58 @@ pub fn cache_clean(config: &Config) -> Result<()> { let _lock = config .tokio_handle() .block_on(config.package_cache_lock().acquire_async())?; - scarb_fs_utils::remove_dir_all(path).context("failed to clean cache")?; + + // Remove every entry in the cache dir except the lock file itself. + // See `PACKAGE_CACHE_LOCK_FILENAME` for why the lock file must stay. + for entry in std::fs::read_dir(path).context("failed to clean cache")? { + let entry = entry.context("failed to clean cache")?; + if entry.file_name() == PACKAGE_CACHE_LOCK_FILENAME { + continue; + } + let entry_path = entry.path(); + let file_type = entry + .file_type() + .with_context(|| format!("failed to stat cache entry: {}", entry_path.display()))?; + if file_type.is_dir() { + scarb_fs_utils::remove_dir_all(&entry_path).with_context(|| { + format!("failed to clean cache entry: {}", entry_path.display()) + })?; + } else { + std::fs::remove_file(&entry_path).with_context(|| { + format!("failed to clean cache entry: {}", entry_path.display()) + })?; + } + } + + test_pause_hook(); } Ok(()) } + +// Test-only synchronization point (see +// `cache_clean_race_with_concurrent_lock_acquisition` in `scarb/tests/cache.rs`). +// +// When `SCARB_INTERNAL_CACHE_CLEAN_PAUSE` is set to a path prefix `P`, this: +// 1. creates `P.ready` to signal the test that the lock is held and the +// cache contents (other than the lock file) have just been removed, +// 2. blocks until `P.go` appears. +// +// At this point the cache dir has been emptied except for the lock file, and +// `_lock` in `cache_clean` still holds the `flock` — exactly the state a +// concurrent `acquire_async` must observe and block on. The env var is +// namespaced as `_INTERNAL_` to make clear it is not part of the public +// CLI/env contract. +#[cfg(feature = "test-utils")] +fn test_pause_hook() { + let Ok(prefix) = std::env::var("SCARB_INTERNAL_CACHE_CLEAN_PAUSE") else { + return; + }; + let _ = std::fs::write(format!("{prefix}.ready"), ""); + let go = std::path::PathBuf::from(format!("{prefix}.go")); + while !go.exists() { + std::thread::sleep(std::time::Duration::from_millis(10)); + } +} + +#[cfg(not(feature = "test-utils"))] +fn test_pause_hook() {} diff --git a/scarb/tests/cache.rs b/scarb/tests/cache.rs index 996ddfd69..cee8341d0 100644 --- a/scarb/tests/cache.rs +++ b/scarb/tests/cache.rs @@ -1,4 +1,8 @@ +use std::process::Stdio; +use std::time::{Duration, Instant}; + use assert_fs::{TempDir, prelude::*}; +use ntest::timeout; use snapbox::Data; use scarb_test_support::command::Scarb; @@ -27,7 +31,15 @@ fn simple_clean() { .current_dir(&t) .assert() .success(); - cache_dir.assert(predicates::path::missing()); + // After `cache clean`, the cache dir survives but contains only the + // package cache lock file. See `ops::cache::PACKAGE_CACHE_LOCK_FILENAME` + // for why the lock file is preserved across clean. + cache_dir.assert(predicates::path::is_dir()); + let entries: Vec<_> = std::fs::read_dir(cache_dir.path()) + .unwrap() + .map(|e| e.unwrap().file_name()) + .collect(); + assert_eq!(entries, [".package-cache.lock"]); } #[test] @@ -47,3 +59,112 @@ fn path_print() { .success(); cache_dir.assert(predicates::path::is_dir()); } + +// Regression test for the lock-file race in `ops::cache::cache_clean`: +// 1. clean acquires `flock(EX)` on `/.package-cache.lock`, +// 2. `remove_dir_all()` unlinks the lock file, +// 3. a concurrent `package_cache_lock().acquire_async()` recreates the +// cache dir, opens a brand-new lock file at the same path (different +// inode), and `flock(EX)` succeeds immediately — `flock` is per-inode, +// so it does not conflict with the still-open, now-unlinked old inode. +// +// Without a fix, the two scarb processes both believe they hold the +// exclusive package cache lock at the same time. +// +// This test makes the race deterministic via the +// `SCARB_INTERNAL_CACHE_CLEAN_PAUSE` hook in `cache_clean`: clean is paused +// while still holding `_lock` and after `remove_dir_all` has completed, the +// test then runs `scarb fetch` and signals clean to resume. +// +// Correct behavior: while clean holds the package cache lock, `scarb fetch` +// must block on lock acquisition and emit `flock.rs`'s "Blocking waiting +// for file lock" message. The test asserts that, and therefore fails until +// the lock is kept on a stable inode across `cache clean` (e.g. moved out +// of the cache dir). +#[cfg(unix)] +#[test] +#[timeout(60_000)] +fn cache_clean_race_with_concurrent_lock_acquisition() { + let cache_dir = TempDir::new().unwrap(); + let config_dir = TempDir::new().unwrap(); + let sync_dir = TempDir::new().unwrap(); + let t = TempDir::new().unwrap(); + ProjectBuilder::start() + .name("hello") + .version("0.1.0") + .build(&t); + + // cache_clean only does work when the cache dir already exists. + std::fs::create_dir_all(cache_dir.path()).unwrap(); + + let prefix = sync_dir.child("clean-pause"); + let ready_path = sync_dir.child("clean-pause.ready"); + let go_path = sync_dir.child("clean-pause.go"); + + let mut clean = Scarb::new() + .cache(cache_dir.path()) + .config(config_dir.path()) + .std() + .arg("cache") + .arg("clean") + .env("SCARB_INTERNAL_CACHE_CLEAN_PAUSE", prefix.path()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn() + .unwrap(); + + // Wait until cache_clean has acquired the lock and finished remove_dir_all. + let deadline = Instant::now() + Duration::from_secs(30); + while !ready_path.path().exists() { + if Instant::now() > deadline { + let _ = clean.kill(); + panic!("scarb cache clean never reached the pause sentinel"); + } + std::thread::sleep(Duration::from_millis(20)); + } + + // Clean is now paused mid-flight: cache dir + lock file unlinked, but it + // still holds the flock on the original inode. Spawn fetch — under a + // correct implementation it must block on lock acquisition until clean + // releases. + let fetch = Scarb::new() + .cache(cache_dir.path()) + .config(config_dir.path()) + .std() + .arg("fetch") + .current_dir(&t) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn() + .unwrap(); + + // Give fetch a moment to reach `acquire_async` and either block (correct) + // or grab a brand-new-inode lock (buggy). Then release clean so fetch + // can complete in either case. + std::thread::sleep(Duration::from_millis(500)); + std::fs::write(go_path.path(), "").unwrap(); + + let fetch_output = fetch.wait_with_output().unwrap(); + let clean_output = clean.wait_with_output().unwrap(); + + let fetch_stdout = String::from_utf8_lossy(&fetch_output.stdout); + let fetch_stderr = String::from_utf8_lossy(&fetch_output.stderr); + let clean_stdout = String::from_utf8_lossy(&clean_output.stdout); + let clean_stderr = String::from_utf8_lossy(&clean_output.stderr); + + assert!( + clean_output.status.success(), + "scarb cache clean failed\nstdout: {clean_stdout}\nstderr: {clean_stderr}", + ); + assert!( + fetch_output.status.success(), + "scarb fetch failed\nstdout: {fetch_stdout}\nstderr: {fetch_stderr}", + ); + assert!( + fetch_stdout.contains("Blocking waiting for file lock"), + "scarb fetch did NOT block on the package cache lock while \ + `scarb cache clean` was holding it — both processes acquired what \ + they each consider an exclusive lock at the same time. \ + fetch stdout: {fetch_stdout}", + ); +}