diff --git a/Cargo.lock b/Cargo.lock index b11d20e6d..63e31bf6d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1747,8 +1747,10 @@ dependencies = [ "serde_json", "sha2", "slog", + "slog-term", "static_assertions", "subprocess", + "tempfile", "tokio", "uuid", ] diff --git a/pantry/Cargo.toml b/pantry/Cargo.toml index 41131459f..55bafd1d0 100644 --- a/pantry/Cargo.toml +++ b/pantry/Cargo.toml @@ -35,4 +35,6 @@ crucible-workspace-hack.workspace = true [dev-dependencies] expectorate.workspace = true +slog-term.workspace = true subprocess.workspace = true +tempfile.workspace = true diff --git a/pantry/src/pantry.rs b/pantry/src/pantry.rs index 817027ee7..ff0e5d549 100644 --- a/pantry/src/pantry.rs +++ b/pantry/src/pantry.rs @@ -566,6 +566,17 @@ impl PantryJobs { } } +enum PantryEntryState { + /// Volume construction and/or activation is in progress. + Attaching { + vcr: Box, + job_id: Option, + }, + + /// Volume is constructed and (at least at one point) active. + Attached(Arc), +} + /// Pantry stores opened Volumes in-memory pub struct Pantry { pub log: Logger, @@ -573,7 +584,7 @@ pub struct Pantry { /// Store a Volume Construction Request and Volume, indexed by id. Use this /// Mutex -> Arc structure in order for multiple requests to act on /// multiple PantryEntry objects at the same time. - entries: Mutex>>, + entries: Mutex>, /// Pantry can run background jobs on Volumes, and currently running jobs /// are stored here. @@ -605,83 +616,167 @@ impl Pantry { volume_id: String, volume_construction_request: VolumeConstructionRequest, ) -> Result<(), CrucibleError> { - let mut entries = self.entries.lock().await; - if let Some(entry) = entries.get(&volume_id) { - // This function must be idempotent for the same inputs. If an entry - // at this ID exists already, compare the existing volume - // construction request, and return either Ok or conflict - - let inner = entry.inner.lock().await; - - if let Some(job_id) = &inner.activation_job_id { - crucible_bail!( - Unsupported, - "existing entry for {} with activation job id {}", - volume_id, - job_id, - ); - } + // This function must be idempotent for the same inputs. + // Check existing state, and if no entry exists, insert + // an Attaching marker before releasing the lock so that + // concurrent callers see this volume is being worked on. + { + let mut entries = self.entries.lock().await; + match entries.get(&volume_id) { + Some(PantryEntryState::Attached(entry)) => { + let inner = entry.inner.lock().await; + + if let Some(job_id) = &inner.activation_job_id { + crucible_bail!( + Unsupported, + "existing entry for {volume_id} \ + with activation job id {job_id}", + ); + } - if inner.volume_construction_request == volume_construction_request - { - info!( - self.log, - "volume {} already an entry, and has same volume \ - construction request, returning OK", - volume_id, - ); + if inner.volume_construction_request + == volume_construction_request + { + info!( + self.log, + "volume {volume_id} already an \ + entry, and has same volume \ + construction request, \ + returning OK", + ); + return Ok(()); + } else { + error!( + self.log, + "volume {volume_id} already an \ + entry, but has different volume \ + construction request, bailing!", + ); + crucible_bail!( + Unsupported, + "Existing entry for {volume_id} \ + with different volume \ + construction request!", + ); + } + } - return Ok(()); - } else { - error!( - self.log, - "volume {} already an entry, but has different volume \ - construction request, bailing!", - volume_id, - ); + // Another task is already constructing / + // activating this volume. Return an error so + // the caller can retry — on retry they will + // either hit the Attached idempotency check + // (if the first attach succeeded) or the None + // branch (if it failed and was cleaned up). + Some(PantryEntryState::Attaching { + vcr, + job_id: existing_job_id, + }) => { + if **vcr == volume_construction_request { + crucible_bail!( + Unsupported, + "volume {volume_id} attach \ + already in progress \ + (job_id: {existing_job_id:?})", + ); + } else { + crucible_bail!( + Unsupported, + "volume {volume_id} is being \ + attached with a different VCR \ + (job_id: {existing_job_id:?})", + ); + } + } - crucible_bail!( - Unsupported, - "Existing entry for {} with different volume construction \ - request!", - volume_id, - ); + // Reserve this volume_id with an Attaching + // marker so concurrent requests see that + // construction is in progress. + None => { + entries.insert( + volume_id.clone(), + PantryEntryState::Attaching { + vcr: Box::new(volume_construction_request.clone()), + job_id: None, + }, + ); + } } } - // If no entry exists, then add one info!( self.log, - "no entry exists for volume {}, constructing...", volume_id + "no entry exists for volume {volume_id}, \ + constructing...", ); + // Construct and activate without holding the entries + // lock. On failure, clean up the Attaching marker. let volume = Volume::construct( volume_construction_request.clone(), None, self.log.clone(), ) - .await?; + .await; - info!(self.log, "volume {} constructed ok", volume_id); + let volume = match volume { + Ok(v) => v, + Err(e) => { + error!( + self.log, + "volume {volume_id} construct \ + failed: {e}", + ); + self.entries.lock().await.remove(&volume_id); + return Err(e.into()); + } + }; - volume.activate().await?; + info!(self.log, "volume {volume_id} constructed ok"); - info!(self.log, "volume {} activated ok", volume_id); + if let Err(e) = volume.activate().await { + error!( + self.log, + "volume {volume_id} activate \ + failed: {e}", + ); + self.entries.lock().await.remove(&volume_id); + return Err(e); + } - entries.insert( - volume_id.clone(), - Arc::new(PantryEntry { - log: self.log.new(o!("volume" => volume_id.clone())), - volume, - inner: Mutex::new(PantryEntryInner { - volume_construction_request, - active_observation: ActiveObservation::SawActive, - activation_job_id: None, - }), - }), - ); + info!(self.log, "volume {volume_id} activated ok"); + + // Verify our Attaching marker is still present. Nothing + // should remove it (detach rejects Attaching entries + // and waits for attach to finish or fail first), but + // check defensively in case the code evolves. + let mut entries = self.entries.lock().await; + match entries.get(&volume_id) { + Some(PantryEntryState::Attaching { .. }) => { + entries.insert( + volume_id.clone(), + PantryEntryState::Attached(Arc::new(PantryEntry { + log: self.log.new(o!("volume" => volume_id.clone())), + volume, + inner: Mutex::new(PantryEntryInner { + volume_construction_request, + active_observation: ActiveObservation::SawActive, + activation_job_id: None, + }), + })), + ); + } + _ => { + drop(entries); + let _ = volume.deactivate().await; + crucible_bail!( + Unsupported, + "volume {volume_id} was removed \ + during attach", + ); + } + } - info!(self.log, "volume {} constructed and inserted ok", volume_id); + info!(self.log, "volume {volume_id} constructed and inserted ok",); Ok(()) } @@ -693,87 +788,138 @@ impl Pantry { job_id: String, volume_construction_request: VolumeConstructionRequest, ) -> Result<(), CrucibleError> { - let mut entries = self.entries.lock().await; - if let Some(entry) = entries.get(&volume_id) { - // This function must be idempotent for the same inputs. If an entry - // at this ID exists already, compare the existing volume - // construction request, and return either Ok or conflict + // This function must be idempotent for the same inputs. + // Check existing state, and if no entry exists, insert + // an Attaching marker before releasing the lock. + { + let mut entries = self.entries.lock().await; + match entries.get(&volume_id) { + Some(PantryEntryState::Attached(entry)) => { + let inner = entry.inner.lock().await; + + match &inner.activation_job_id { + Some(entry_job_id) => { + if *entry_job_id != job_id { + crucible_bail!( + Unsupported, + "existing entry for \ + {volume_id} with \ + different activation \ + job id {job_id}", + ); + } + } + + None => { + crucible_bail!( + Unsupported, + "existing entry for \ + {volume_id} with no \ + activation job id", + ); + } + } - let inner = entry.inner.lock().await; + if inner.volume_construction_request + == volume_construction_request + { + info!( + self.log, + "volume {volume_id} already an \ + entry, and has same volume \ + construction request, \ + returning OK", + ); + return Ok(()); + } else { + error!( + self.log, + "volume {volume_id} already an \ + entry, but has different volume \ + construction request, bailing!", + ); + crucible_bail!( + Unsupported, + "Existing entry for {volume_id} \ + with different volume \ + construction request!", + ); + } + } - match &inner.activation_job_id { - Some(entry_job_id) => { - if *entry_job_id != job_id { + Some(PantryEntryState::Attaching { + vcr, + job_id: existing_job_id, + }) => { + if **vcr == volume_construction_request { crucible_bail!( Unsupported, - "existing entry for {} with different activation job id {}", - volume_id, - job_id, + "volume {volume_id} attach \ + already in progress \ + (job_id: {existing_job_id:?})", + ); + } else { + crucible_bail!( + Unsupported, + "volume {volume_id} is being \ + attached with a different VCR \ + (job_id: {existing_job_id:?})", ); } } None => { - // volume was attached with `attach`, not with this - // function. return an error! - - crucible_bail!( - Unsupported, - "existing entry for {} with no activation job id", - volume_id, + entries.insert( + volume_id.clone(), + PantryEntryState::Attaching { + vcr: Box::new(volume_construction_request.clone()), + job_id: Some(job_id.clone()), + }, ); } } - - if inner.volume_construction_request == volume_construction_request - { - error!( - self.log, - "volume {} already an entry, and has same volume \ - construction request, returning OK", - volume_id, - ); - - return Ok(()); - } else { - error!( - self.log, - "volume {} already an entry, but has different volume \ - construction request, bailing!", - volume_id, - ); - - crucible_bail!( - Unsupported, - "Existing entry for {} with different volume construction \ - request!", - volume_id, - ); - } } - // To make this function idempotent, the user must supply the job id. If - // that job id already exists, then bail out. - let mut jobs = self.jobs.lock().await; - - if jobs.contains_job(&job_id) { - crucible_bail!(Unsupported, "Existing job id {}", job_id,); + // To make this function idempotent, the user must + // supply the job id. If that job id already exists, + // clean up the Attaching marker and bail out. + { + let jobs = self.jobs.lock().await; + if jobs.contains_job(&job_id) { + self.entries.lock().await.remove(&volume_id); + crucible_bail!(Unsupported, "Existing job id {job_id}",); + } } - // If no entry exists, then add one info!( self.log, - "no entry exists for volume {}, constructing...", volume_id + "no entry exists for volume {volume_id}, \ + constructing...", ); + // Construct without holding any locks. On failure, + // clean up the Attaching marker. let volume = Volume::construct( volume_construction_request.clone(), None, self.log.clone(), ) - .await?; + .await; - info!(self.log, "volume {} constructed ok", volume_id); + let volume = match volume { + Ok(v) => v, + Err(e) => { + error!( + self.log, + "volume {volume_id} construct \ + failed: {e}", + ); + self.entries.lock().await.remove(&volume_id); + return Err(e.into()); + } + }; + + info!(self.log, "volume {volume_id} constructed ok"); let entry = Arc::new(PantryEntry { log: self.log.new(o!("volume" => volume_id.clone())), @@ -785,35 +931,70 @@ impl Pantry { }), }); - entries.insert(volume_id.clone(), entry.clone()); + // Replace the Attaching marker with the Attached + // entry and register the background activation job. + { + let mut entries = self.entries.lock().await; + match entries.get(&volume_id) { + Some(PantryEntryState::Attaching { .. }) => { + entries.insert( + volume_id.clone(), + PantryEntryState::Attached(entry.clone()), + ); + } + _ => { + crucible_bail!( + Unsupported, + "volume {volume_id} was removed \ + during attach", + ); + } + } + } - info!(self.log, "volume {} constructed and inserted ok", volume_id); + info!(self.log, "volume {volume_id} constructed and inserted ok",); let join_handle = tokio::spawn(async move { entry.activate().await }); - info!(self.log, "volume {} activating in background", volume_id); + info!(self.log, "volume {volume_id} activating in background",); - jobs.insert(volume_id, job_id.clone(), join_handle); - drop(jobs); + let mut jobs = self.jobs.lock().await; + jobs.insert(volume_id, job_id, join_handle); Ok(()) } - /// Return a PantryEntry if it's in the map, or a 404. + /// Return a PantryEntry if it's in the map and attached, a + /// 503 if it's still being attached, or a 404. pub(crate) async fn entry_get( &self, volume_id: String, ) -> Result, HttpError> { let entries = self.entries.lock().await; match entries.get(&volume_id) { - Some(entry) => { + Some(PantryEntryState::Attached(entry)) => { let entry = entry.clone(); drop(entries); Ok(entry) } + Some(PantryEntryState::Attaching { .. }) => { + info!( + self.log, + "volume {volume_id} is still being \ + attached", + ); + Err(HttpError::for_unavail( + None, + format!( + "volume {volume_id} attach in \ + progress", + ), + )) + } + None => { - error!(self.log, "volume {} not in pantry", volume_id); + error!(self.log, "volume {volume_id} not in pantry",); Err(HttpError::for_not_found(None, volume_id)) } } @@ -827,39 +1008,33 @@ impl Pantry { ) -> Result, HttpError> { let entry = self.entry_get(volume_id.clone()).await?; - let inner = entry.inner.lock().await; - match &inner.active_observation { - ActiveObservation::NeverSawActive => { - // Return the entry so that it can receive commands before it is - // active. - drop(inner); - Ok(entry) - } + let saw_active = { + let inner = entry.inner.lock().await; + matches!(inner.active_observation, ActiveObservation::SawActive,) + }; - ActiveObservation::SawActive => { - // Before returning the entry, check if something else activated - // the volume. - if !entry.volume.query_is_active().await? { - // If it's not active, then return "410 Gone". If this - // volume is no longer active then it's likely a Propolis - // has activated and taken over from the Pantry. Do not - // return 503 in this case, no operation will be retryable - // if inactive. - - Err(HttpError::for_client_error( - Some(format!( - "volume {} is no longer active!", - volume_id - )), - dropshot::ClientErrorStatusCode::GONE, - format!("volume {} is no longer active!", volume_id), - )) - } else { - drop(inner); - Ok(entry) - } + if saw_active { + // Check if something else activated the volume. + // If it's not active, return "410 Gone" — likely a + // Propolis has activated and taken over from the + // Pantry. Do not return 503 in this case, no + // operation will be retryable if inactive. + if !entry.volume.query_is_active().await? { + return Err(HttpError::for_client_error( + Some(format!( + "volume {volume_id} is no longer \ + active!", + )), + dropshot::ClientErrorStatusCode::GONE, + format!( + "volume {volume_id} is no longer \ + active!", + ), + )); } } + + Ok(entry) } pub async fn volume_status( @@ -917,33 +1092,29 @@ impl Pantry { &self, job_id: String, ) -> Result, HttpError> { - let mut jobs = self.jobs.lock().await; - - // Remove the job from the list of jobs, then await on the join handle. - // If this errors, then the job has failed in some way, so don't leave - // it in the list of jobs. - match jobs.remove(&job_id) { - Some(join_handle) => { - let result: Result<(), CrucibleError> = - join_handle.await.map_err(|e| { - HttpError::for_internal_error(e.to_string()) - })?; - - jobs.remove(&job_id); - - if let Err(e) = &result { - error!(self.log, "job {} failed with {}", job_id, e); + let join_handle = { + let mut jobs = self.jobs.lock().await; + match jobs.remove(&job_id) { + Some(handle) => handle, + None => { + error!(self.log, "job {job_id} not a pantry job",); + return Err(HttpError::for_not_found( + None, + job_id.to_string(), + )); } - - Ok(result.map_err(|e| e.into())) } + }; - None => { - error!(self.log, "job {} not a pantry job", job_id); + let result: Result<(), CrucibleError> = join_handle + .await + .map_err(|e| HttpError::for_internal_error(e.to_string()))?; - Err(HttpError::for_not_found(None, job_id.to_string())) - } + if let Err(e) = &result { + error!(self.log, "job {job_id} failed with {e}",); } + + Ok(result.map_err(|e| e.into())) } pub async fn import_from_url( @@ -1031,21 +1202,46 @@ impl Pantry { /// Remove an entry from the pantry, and detach it. If detach fails, the /// entry is still gone but this function will return an error. pub async fn detach(&self, volume_id: String) -> Result<(), CrucibleError> { - let mut entries = self.entries.lock().await; + let entry = { + let mut entries = self.entries.lock().await; + info!( + self.log, + "detach removing entry for volume \ + {volume_id}", + ); + match entries.get(&volume_id) { + Some(PantryEntryState::Attaching { .. }) => { + // Can't detach until attach finishes or + // fails. + crucible_bail!( + Unsupported, + "volume {volume_id} attach in \ + progress, cannot detach", + ); + } + + Some(PantryEntryState::Attached(_)) => { + match entries.remove(&volume_id) { + Some(PantryEntryState::Attached(entry)) => Some(entry), + _ => unreachable!(), + } + } - info!(self.log, "detach removing entry for volume {}", volume_id); + None => None, + } + }; - match entries.remove(&volume_id) { + match entry { Some(entry) => { - info!(self.log, "detaching volume {}", volume_id); + info!(self.log, "detaching volume {volume_id}",); entry.detach().await?; - drop(entry); } None => { info!( self.log, - "detach did nothing, no entry for volume {}", volume_id + "detach did nothing, no entry for \ + volume {volume_id}", ); } } @@ -1053,3 +1249,241 @@ impl Pantry { Ok(()) } } + +#[cfg(test)] +mod tests { + use super::*; + use slog::Drain; + use std::io::Write; + use tempfile::NamedTempFile; + + fn test_logger() -> Logger { + let plain = slog_term::PlainSyncDecorator::new(std::io::stdout()); + Logger::root(slog_term::FullFormat::new(plain).build().fuse(), o!()) + } + + fn file_vcr(path: &str) -> VolumeConstructionRequest { + VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), + block_size: 512, + sub_volumes: vec![], + read_only_parent: Some(Box::new(VolumeConstructionRequest::File { + id: Uuid::new_v4(), + block_size: 512, + path: path.to_string(), + })), + } + } + + fn temp_disk() -> NamedTempFile { + let mut f = NamedTempFile::new().unwrap(); + f.write_all(&vec![0u8; 512]).unwrap(); + f + } + + #[tokio::test] + async fn attach_idempotent_same_vcr() { + let pantry = Pantry::new(test_logger()).unwrap(); + let disk = temp_disk(); + let path = disk.path().to_str().unwrap(); + + let id = Uuid::new_v4(); + let vcr = file_vcr(path); + + pantry.attach(id.to_string(), vcr.clone()).await.unwrap(); + + // Same VCR again should succeed (idempotent). + pantry.attach(id.to_string(), vcr).await.unwrap(); + } + + #[tokio::test] + async fn attach_rejects_different_vcr() { + let pantry = Pantry::new(test_logger()).unwrap(); + let disk = temp_disk(); + let path = disk.path().to_str().unwrap(); + + let id = Uuid::new_v4(); + let vcr1 = file_vcr(path); + + // Use a different UUID to make a different VCR pointing + // at the same file. + let vcr2 = VolumeConstructionRequest::Volume { + id: Uuid::new_v4(), + block_size: 512, + sub_volumes: vec![], + read_only_parent: Some(Box::new(VolumeConstructionRequest::File { + id: Uuid::new_v4(), + block_size: 512, + path: path.to_string(), + })), + }; + + pantry.attach(id.to_string(), vcr1).await.unwrap(); + + let err = pantry.attach(id.to_string(), vcr2).await.unwrap_err(); + + assert!( + err.to_string().contains("different"), + "expected 'different' error, got: {err}", + ); + } + + #[tokio::test] + async fn construct_failure_cleans_up_attaching() { + let pantry = Pantry::new(test_logger()).unwrap(); + + // Use a path that does not exist so construct fails. + let vcr = file_vcr("/no/such/file"); + let id = Uuid::new_v4(); + + let err = pantry.attach(id.to_string(), vcr).await; + assert!(err.is_err()); + + // The Attaching marker must have been cleaned up. + // A fresh attach to the same volume_id should work — + // the map entry should be gone. + let entries = pantry.entries.lock().await; + assert!( + !entries.contains_key(&id.to_string()), + "Attaching marker was not cleaned up", + ); + } + + #[tokio::test] + async fn entry_get_returns_503_for_attaching() { + let pantry = Pantry::new(test_logger()).unwrap(); + let id = Uuid::new_v4().to_string(); + + // Manually insert an Attaching marker. + { + let mut entries = pantry.entries.lock().await; + entries.insert( + id.clone(), + PantryEntryState::Attaching { + vcr: Box::new(file_vcr("/fake")), + job_id: None, + }, + ); + } + + let Err(err) = pantry.entry_get(id).await else { + panic!("expected error for Attaching entry"); + }; + assert_eq!(err.status_code, http::StatusCode::SERVICE_UNAVAILABLE,); + } + + #[tokio::test] + async fn entry_get_returns_404_for_missing() { + let pantry = Pantry::new(test_logger()).unwrap(); + let id = Uuid::new_v4().to_string(); + + let Err(err) = pantry.entry_get(id).await else { + panic!("expected error for missing entry"); + }; + assert_eq!(err.status_code, http::StatusCode::NOT_FOUND); + } + + #[tokio::test] + async fn detach_rejects_attaching() { + let pantry = Pantry::new(test_logger()).unwrap(); + let id = Uuid::new_v4().to_string(); + + { + let mut entries = pantry.entries.lock().await; + entries.insert( + id.clone(), + PantryEntryState::Attaching { + vcr: Box::new(file_vcr("/fake")), + job_id: None, + }, + ); + } + + let err = pantry.detach(id).await.unwrap_err(); + assert!( + err.to_string().contains("cannot detach"), + "expected 'cannot detach' error, got: {err}", + ); + } + + #[tokio::test] + async fn detach_after_attach() { + let pantry = Pantry::new(test_logger()).unwrap(); + let disk = temp_disk(); + let path = disk.path().to_str().unwrap(); + + let id = Uuid::new_v4().to_string(); + let vcr = file_vcr(path); + + pantry.attach(id.clone(), vcr).await.unwrap(); + pantry.detach(id.clone()).await.unwrap(); + + // After detach, entry_get should return 404. + let Err(err) = pantry.entry_get(id).await else { + panic!("expected 404 after detach"); + }; + assert_eq!(err.status_code, http::StatusCode::NOT_FOUND); + } + + #[tokio::test] + async fn status_shows_attached_volume() { + let pantry = Pantry::new(test_logger()).unwrap(); + let disk = temp_disk(); + let path = disk.path().to_str().unwrap(); + + let id = Uuid::new_v4().to_string(); + let vcr = file_vcr(path); + + pantry.attach(id.clone(), vcr).await.unwrap(); + + let status = pantry.status().await.unwrap(); + assert!(status.volumes.contains(&id)); + } + + #[tokio::test] + async fn attach_activate_background_and_get_job() { + let pantry = Pantry::new(test_logger()).unwrap(); + let disk = temp_disk(); + let path = disk.path().to_str().unwrap(); + + let volume_id = Uuid::new_v4().to_string(); + let job_id = Uuid::new_v4().to_string(); + let vcr = file_vcr(path); + + pantry + .attach_activate_background(volume_id.clone(), job_id.clone(), vcr) + .await + .unwrap(); + + // Wait for the background job to finish (File VCR + // activation is instant). + let result = pantry.get_job_result(job_id).await.unwrap(); + assert!(result.is_ok()); + } + + #[tokio::test] + async fn attach_activate_background_idempotent() { + let pantry = Pantry::new(test_logger()).unwrap(); + let disk = temp_disk(); + let path = disk.path().to_str().unwrap(); + + let volume_id = Uuid::new_v4().to_string(); + let job_id = Uuid::new_v4().to_string(); + let vcr = file_vcr(path); + + pantry + .attach_activate_background( + volume_id.clone(), + job_id.clone(), + vcr.clone(), + ) + .await + .unwrap(); + + // Same volume_id + job_id + vcr should succeed. + pantry + .attach_activate_background(volume_id, job_id, vcr) + .await + .unwrap(); + } +} diff --git a/tools/test_pantry_hang.sh b/tools/test_pantry_hang.sh new file mode 100755 index 000000000..798d001ea --- /dev/null +++ b/tools/test_pantry_hang.sh @@ -0,0 +1,451 @@ +#!/bin/bash + +# Demonstrate that the pantry blocks when attaching a RW volume with one +# downstairs stopped. +# +# The synchronous attach endpoint holds the pantry's entries lock for the +# entire duration of activation. When a downstairs is unavailable, the +# upstairs cannot activate, so the lock is held indefinitely. This blocks +# every other pantry operation that needs the entries lock (status, +# volume-status, duplicate attach, etc.). +# +# The progenitor-generated pantry client has a 15-second request timeout, +# so pantest will return an error, but the server-side handler keeps running +# (HandlerTaskMode::Detached) and continues to hold the lock. +# +# This test verifies: +# 1. The initial attach times out on the client side. +# 2. After the timeout, the pantry is still running. +# 3. A second volume (against a healthy dsc) can be attached. +# (This requires the fix -- without it, the entries lock blocks +# all other attach/status/detach operations.) +# 4. That second volume can be detached. +# 5. A status request works. +# 6. After starting the stopped downstairs, the original volume +# activates and the pantry reports it correctly. + +set -o pipefail + +SECONDS=0 + +trap ctrl_c INT +function ctrl_c() { + echo "" + echo "Stopping at your request" + cleanup + exit 1 +} + +function cleanup() { + # Kill any background pantest processes. + for pid in $pantest_pid $attach2_pid $detach_pid $status_pid; do + if [[ -n "$pid" ]] && ps -p "$pid" > /dev/null 2>&1; then + kill "$pid" 2>/dev/null + wait "$pid" 2>/dev/null + fi + done + if [[ -n "$pantry_pid" ]] && ps -p "$pantry_pid" > /dev/null 2>&1; then + echo "Stopping pantry (pid $pantry_pid)" + kill "$pantry_pid" + wait "$pantry_pid" 2>/dev/null + fi + if [[ -n "$dsc_pid" ]] && ps -p "$dsc_pid" > /dev/null 2>&1; then + echo "Shutting down dsc 1" + ${dsc} cmd shutdown 2>/dev/null + wait "$dsc_pid" 2>/dev/null + fi + if [[ -n "$dsc2_pid" ]] && ps -p "$dsc2_pid" > /dev/null 2>&1; then + echo "Shutting down dsc 2" + ${dsc} cmd --server http://127.0.0.1:9999 shutdown 2>/dev/null + wait "$dsc2_pid" 2>/dev/null + fi +} + +ROOT=$(cd "$(dirname "$0")/.." && pwd) +export BINDIR=${BINDIR:-$ROOT/target/release} +cds="$BINDIR/crucible-downstairs" +dsc="$BINDIR/dsc" +pantry="$BINDIR/crucible-pantry" +pantest_bin="$BINDIR/pantest" + +for bin in $cds $dsc $pantry $pantest_bin; do + if [[ ! -f "$bin" ]]; then + echo "Can't find binary at $bin" >&2 + exit 1 + fi +done + +if pgrep -fl -U "$(id -u)" "$cds" > /dev/null 2>&1; then + echo "Downstairs already running" >&2 + echo "Run: pkill -f -U $(id -u) $cds" >&2 + exit 1 +fi + +if pgrep -fl -U "$(id -u)" crucible-pantry > /dev/null 2>&1; then + echo "Pantry already running" >&2 + echo "Run: pkill -f -U $(id -u) crucible-pantry" >&2 + exit 1 +fi + +REGION_ROOT=${REGION_ROOT:-/var/tmp} +MY_REGION_ROOT="${REGION_ROOT}/test_pantry_hang" +if [[ -d "$MY_REGION_ROOT" ]]; then + rm -rf "$MY_REGION_ROOT" +fi +mkdir -p "$MY_REGION_ROOT" +if [[ $? -ne 0 ]]; then + echo "Failed to make region root $MY_REGION_ROOT" + exit 1 +fi + +WORK_ROOT=${WORK_ROOT:-/tmp} +TEST_ROOT="${WORK_ROOT}/test_pantry_hang" +if [[ -d "$TEST_ROOT" ]]; then + # Delete previous test data + rm -r "$TEST_ROOT" +fi +mkdir -p "$TEST_ROOT" +if [[ $? -ne 0 ]]; then + echo "Failed to make test root $TEST_ROOT" + exit 1 +fi + +dsc_log=${TEST_ROOT}/test_pantry_hang_dsc.log +dsc2_log=${TEST_ROOT}/test_pantry_hang_dsc2.log +pantry_log=${TEST_ROOT}/test_pantry_hang_pantry.log +pantest_log=${TEST_ROOT}/test_pantry_hang_pantest.log + +echo "" > "$dsc_log" +echo "" > "$dsc2_log" +echo "" > "$pantry_log" +echo "" > "$pantest_log" + +result=0 + +echo "=========================================" +echo "Test: pantry blocks on RW attach with a" +echo " stopped downstairs" +echo "=========================================" + +# Use fixed volume IDs so we can reference them later. +volume_id="$(uuidgen)" +volume_id2="$(uuidgen)" +echo "Volume 1 ID: $volume_id (will have a stopped downstairs)" +echo "Volume 2 ID: $volume_id2 (all downstairs healthy)" +echo "" + +# Step 1: Start dsc 1 (ports 8810/8820/8830, control 9998) +echo "Step 1: Create and start dsc 1 (3 downstairs)" +if ! ${dsc} create --cleanup \ + --ds-bin "$cds" \ + --region-dir "$MY_REGION_ROOT" + --extent-count 15 \ + --extent-size 100 >> "$dsc_log" 2>&1; then + echo "ERROR: Failed to create dsc 1 regions" + echo "Check $dsc_log" + exit 1 +fi + +${dsc} start --ds-bin "$cds" --region-dir "$MY_REGION_ROOT" >> "$dsc_log" 2>&1 & +dsc_pid=$! +sleep 3 + +if ! ps -p $dsc_pid > /dev/null 2>&1; then + echo "ERROR: dsc 1 failed to start" + exit 1 +fi +echo " dsc 1 running (pid $dsc_pid, control 9998)" + +if ! ${dsc} cmd all-running > /dev/null 2>&1; then + echo "ERROR: not all dsc 1 downstairs are running" + cleanup + exit 1 +fi +echo " All 3 dsc 1 downstairs running" + +# Step 2: Start dsc 2 (ports 8910/8920/8930, control 9999) +echo "" +echo "Step 2: Create and start dsc 2 (3 downstairs)" +if ! ${dsc} create --cleanup \ + --ds-bin "$cds" \ + --extent-count 15 \ + --extent-size 100 \ + --port-base 8910 \ + --output-dir /tmp/dsc2 \ + --region-dir /var/tmp/dsc2/region >> "$dsc2_log" 2>&1; then + echo "ERROR: Failed to create dsc 2 regions" + cleanup + exit 1 +fi + +${dsc} start --ds-bin "$cds" \ + --control 127.0.0.1:9999 \ + --port-base 8910 \ + --output-dir /tmp/dsc2 \ + --region-dir /var/tmp/dsc2/region >> "$dsc2_log" 2>&1 & +dsc2_pid=$! +sleep 3 + +if ! ps -p $dsc2_pid > /dev/null 2>&1; then + echo "ERROR: dsc 2 failed to start" + cleanup + exit 1 +fi +echo " dsc 2 running (pid $dsc2_pid, control 9999)" + +if ! ${dsc} cmd --server http://127.0.0.1:9999 \ + all-running > /dev/null 2>&1; then + echo "ERROR: not all dsc 2 downstairs are running" + cleanup + exit 1 +fi +echo " All 3 dsc 2 downstairs running" + +# Step 3: Start the pantry +echo "" +echo "Step 3: Start the pantry" +${pantry} run -l 127.0.0.1:17000 >> "$pantry_log" 2>&1 & +pantry_pid=$! +sleep 2 + +if ! ps -p $pantry_pid > /dev/null 2>&1; then + echo "ERROR: pantry failed to start" + cleanup + exit 1 +fi +echo " Pantry running (pid $pantry_pid)" + +if ! ${pantest_bin} -p 127.0.0.1:17000 status >> "$pantest_log" 2>&1; then + echo "ERROR: pantry status check failed" + cleanup + exit 1 +fi +echo " Pantry responding to status requests" + +# Step 4: Stop one downstairs on dsc 1, then attempt attach +echo "" +echo "Step 4: Stop downstairs 0 on dsc 1" +${dsc} cmd disable-restart --cid 0 > /dev/null 2>&1 +${dsc} cmd stop --cid 0 > /dev/null 2>&1 +sleep 2 + +state=$(${dsc} cmd state --cid 0 2>/dev/null) +echo " Downstairs 0 state: $state" + +# Step 5: Attach volume 1 +echo "" +echo "Step 5: Attach volume 1 via dsc 1 (expect client timeout)" +echo " The progenitor client has a 15s request timeout." +echo " The attach will fail on the client side, but the server" +echo " handler keeps running and holds the entries lock." +echo "" + +${pantest_bin} -p 127.0.0.1:17000 \ + attach --dsc 127.0.0.1:9998 -v "$volume_id" \ + >> "$pantest_log" 2>&1 & +pantest_pid=$! + +echo " Waiting for pantest attach to timeout..." +count=0 +while ps -p "$pantest_pid" > /dev/null 2>&1; do + sleep 2 + (( count += 2 )) + if (( count > 30 )); then + echo " pantest has not timed out after 30s (unexpected)" + break + fi +done + +wait "$pantest_pid" 2>/dev/null +attach_rc=$? +echo " pantest attach exited with rc=$attach_rc (expected non-zero)" + +if (( attach_rc == 0 )); then + echo " FAIL: attach succeeded, but should have timed out" + result=1 +else + echo " PASS: attach timed out as expected" +fi + +# Step 6: Verify the pantry is still running +echo "" +echo "Step 6: Verify pantry process is still alive" +if ! ps -p "$pantry_pid" > /dev/null 2>&1; then + echo " FAIL: pantry process exited!" + result=1 + cleanup + exit 1 +fi +echo " PASS: pantry still running (pid $pantry_pid)" + +# Step 7: Attach volume 2 via dsc 2 (all healthy) +# Without the fix, this will hang because the entries lock is held by the +# stuck volume 1 activation. With the fix, this should succeed. +echo "" +echo "Step 7: Attach volume 2 via dsc 2 (all downstairs healthy)" +echo " Without the fix this will hang (entries lock held)." +echo " With the fix this should succeed." + +${pantest_bin} -p 127.0.0.1:17000 \ + attach --dsc 127.0.0.1:9999 -v "$volume_id2" \ + >> "$pantest_log" 2>&1 & +attach2_pid=$! + +# Give it time to complete -- 20s is enough if the lock is free, but will hit +# the client timeout if the lock is held. +count=0 +while ps -p "$attach2_pid" > /dev/null 2>&1; do + sleep 2 + (( count += 2 )) + if (( count > 20 )); then + echo " Volume 2 attach still running after 20s" + echo " FAIL: attach is blocked (entries lock held by volume 1)" + kill "$attach2_pid" 2>/dev/null + wait "$attach2_pid" 2>/dev/null + attach2_pid="" + result=1 + break + fi +done + +if [[ -n "$attach2_pid" ]]; then + wait "$attach2_pid" 2>/dev/null + attach2_rc=$? + if (( attach2_rc == 0 )); then + echo " PASS: volume 2 attached successfully" + else + echo " FAIL: volume 2 attach returned rc=$attach2_rc" + result=1 + fi + attach2_pid="" +fi + +# Step 8: Detach volume 2 +# This also requires the entries lock, so without the fix it will also hang. +echo "" +echo "Step 8: Detach volume 2" + +${pantest_bin} -p 127.0.0.1:17000 \ + detach "$volume_id2" \ + >> "$pantest_log" 2>&1 & +detach_pid=$! + +count=0 +while ps -p "$detach_pid" > /dev/null 2>&1; do + sleep 2 + (( count += 2 )) + if (( count > 20 )); then + echo " Detach still running after 20s" + echo " FAIL: detach is blocked (entries lock held)" + kill "$detach_pid" 2>/dev/null + wait "$detach_pid" 2>/dev/null + detach_pid="" + result=1 + break + fi +done + +if [[ -n "$detach_pid" ]]; then + wait "$detach_pid" 2>/dev/null + detach_rc=$? + if (( detach_rc == 0 )); then + echo " PASS: volume 2 detached successfully" + else + echo " FAIL: volume 2 detach returned rc=$detach_rc" + result=1 + fi + detach_pid="" +fi + +# Step 9: Check pantry status. Without the fix, this would hang too. +echo "" +echo "Step 9: Check pantry status" + +${pantest_bin} -p 127.0.0.1:17000 status \ + >> "$pantest_log" 2>&1 & +status_pid=$! + +count=0 +while ps -p "$status_pid" > /dev/null 2>&1; do + sleep 2 + (( count += 2 )) + if (( count > 20 )); then + echo " Status still running after 20s" + echo " FAIL: status is blocked (entries lock held)" + kill "$status_pid" 2>/dev/null + wait "$status_pid" 2>/dev/null + status_pid="" + result=1 + break + fi +done + +if [[ -n "$status_pid" ]]; then + wait "$status_pid" 2>/dev/null + status_rc=$? + if (( status_rc == 0 )); then + echo " PASS: pantry status responded" + else + echo " FAIL: pantry status returned rc=$status_rc" + result=1 + fi + status_pid="" +fi + +# ----------------------------------------------------------- +# Step 10: Restart the stopped downstairs and verify the +# original volume activates. +# ----------------------------------------------------------- +echo "" +echo "Step 10: Restart downstairs 0 on dsc 1" +${dsc} cmd enable-restart --cid 0 > /dev/null 2>&1 +${dsc} cmd start --cid 0 > /dev/null 2>&1 +sleep 5 + +state=$(${dsc} cmd state --cid 0 2>/dev/null) +echo " Downstairs 0 state: $state" + +echo " Waiting for volume 1 activation to complete..." +sleep 10 + +echo "" +echo "Step 11: Verify volume 1 is now attached and active" +if timeout 15 ${pantest_bin} -p 127.0.0.1:17000 \ + volume-status "$volume_id" >> "$pantest_log" 2>&1; then + echo " PASS: volume 1 status responded after downstairs restart" +else + echo " FAIL: volume 1 status not available after restart" + result=1 +fi + +# Final status check to make sure everything is healthy. +if timeout 15 ${pantest_bin} -p 127.0.0.1:17000 status \ + >> "$pantest_log" 2>&1; then + echo " PASS: pantry status healthy" +else + echo " FAIL: pantry status not responding" + result=1 +fi + +# ----------------------------------------------------------- +# Summary +# ----------------------------------------------------------- +echo "" +echo "=========================================" +if (( result == 0 )); then + echo "ALL CHECKS PASSED" +else + echo "SOME CHECKS FAILED" +fi +echo "=========================================" +echo "" +echo "Logs:" +echo " dsc 1: $dsc_log" +echo " dsc 2: $dsc2_log" +echo " pantry: $pantry_log" +echo " pantest: $pantest_log" +echo "" + +cleanup +echo "Done (${SECONDS}s elapsed)" +exit $result