Skip to content

Commit 546b652

Browse files
authored
phd-runner option to defer guest cleanup on failure (#1088)
When `--manual-stop-on-failure` is passed, each propolis-server in failed test cases is left running (if it hadn't been shut down by the test case prior to failure explicitly), and its address is echoed to the operator such that they can e.g. connect to its serial console to investigate or debug whatever may have caused the test failure. The test suite pauses until the instances left in this state are shut down manually, then continues running further tests (unless interrupted). Aside from convenience, this can be useful vs. reproducing test failures with manually-reconstructed scenarios via a transcription of a phd-test's instance spec and steps, which may have differences due to human-scale timing of guest shell command invocations.
1 parent 7237e16 commit 546b652

4 files changed

Lines changed: 129 additions & 4 deletions

File tree

phd-tests/framework/src/lib.rs

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,8 @@ use log_config::LogConfig;
4040
use port_allocator::PortAllocator;
4141
pub use test_vm::TestVm;
4242
use test_vm::{
43-
environment::EnvironmentSpec, spec::VmSpec, VmConfig, VmLocation,
43+
environment::EnvironmentSpec, spec::VmSpec, TestVmManualStop, VmConfig,
44+
VmLocation,
4445
};
4546
use tokio::{
4647
sync::mpsc::{UnboundedReceiver, UnboundedSender},
@@ -63,6 +64,7 @@ pub(crate) mod zfs;
6364
pub struct TestCtx {
6465
pub(crate) framework: Arc<Framework>,
6566
pub(crate) output_dir: Utf8PathBuf,
67+
pub(crate) manual_stop: Option<TestVmManualStop>,
6668
}
6769

6870
/// An instance of the PHD test framework.
@@ -239,6 +241,16 @@ impl TestCtx {
239241
)
240242
.await
241243
}
244+
245+
/// When phd-runner is configured to leave instances running on failed
246+
/// tests, the watch channel whose Receiver is passed to this function is
247+
/// used to indicate to the instance cleanup task that a test *has* failed.
248+
pub fn set_cleanup_task_outcome_receiver(
249+
&mut self,
250+
manual_stop: TestVmManualStop,
251+
) {
252+
self.manual_stop = Some(manual_stop);
253+
}
242254
}
243255

244256
// The framework implementation includes some "runner-only" functions
@@ -330,7 +342,7 @@ impl Framework {
330342
pub fn test_ctx(self: &Arc<Self>, fully_qualified_name: String) -> TestCtx {
331343
let output_dir =
332344
self.tmp_directory.as_path().join(&fully_qualified_name);
333-
TestCtx { framework: self.clone(), output_dir }
345+
TestCtx { framework: self.clone(), output_dir, manual_stop: None }
334346
}
335347

336348
/// Resets the state of any stateful objects in the framework to prepare it

phd-tests/framework/src/test_vm/mod.rs

Lines changed: 88 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ use propolis_client::{
4343
use propolis_client::{Client, ResponseValue};
4444
use thiserror::Error;
4545
use tokio::{
46-
sync::{mpsc::UnboundedSender, oneshot},
46+
sync::{mpsc::UnboundedSender, oneshot, watch},
4747
task::JoinHandle,
4848
time::timeout,
4949
};
@@ -272,6 +272,10 @@ pub struct TestVm {
272272

273273
state: VmState,
274274

275+
/// If we should wait for operator intervention before terminating this
276+
/// instance, this will be populated.
277+
manual_stop: Option<TestVmManualStop>,
278+
275279
/// Sending a task handle to this channel will ensure that the task runs to
276280
/// completion as part of the post-test cleanup fixture (i.e. before any
277281
/// other tests run).
@@ -328,6 +332,7 @@ impl TestVm {
328332
environment.clone(),
329333
params,
330334
ctx.framework.cleanup_task_channel(),
335+
ctx.manual_stop.clone(),
331336
),
332337
}
333338
}
@@ -338,6 +343,7 @@ impl TestVm {
338343
environment_spec: EnvironmentSpec,
339344
mut params: ServerProcessParameters,
340345
cleanup_task_tx: UnboundedSender<JoinHandle<()>>,
346+
manual_stop: Option<TestVmManualStop>,
341347
) -> Result<Self> {
342348
let metrics = environment_spec.metrics.as_ref().map(|m| match m {
343349
MetricsLocation::Local => {
@@ -372,6 +378,7 @@ impl TestVm {
372378
guest_os,
373379
state: VmState::New,
374380
cleanup_task_tx,
381+
manual_stop,
375382
})
376383
}
377384

@@ -1134,6 +1141,9 @@ impl Drop for TestVm {
11341141

11351142
let disks: Vec<_> = self.vm_spec().disk_handles.drain(..).collect();
11361143

1144+
let manual_stop_opt = self.manual_stop.take();
1145+
let vm_name = self.vm_spec().vm_name.to_owned();
1146+
11371147
// The order in which the task destroys objects is important: the server
11381148
// can't be killed until the client has gotten a chance to shut down
11391149
// the VM, and the disks can't be destroyed until the server process has
@@ -1144,6 +1154,12 @@ impl Drop for TestVm {
11441154
// kept alive until the server process is gone.
11451155
let _disks = disks;
11461156

1157+
// Check if we should let the user access the instance of a
1158+
// failed testcase before ensuring its demolition
1159+
if let Some(manual_stop) = manual_stop_opt {
1160+
manual_stop.wait_for_stop(vm_name, &client, &server).await;
1161+
}
1162+
11471163
// Try to make sure the server's kernel VMM is cleaned up before
11481164
// killing the server process. This is best-effort; if it fails,
11491165
// the kernel VMM is leaked. This generally indicates a bug in
@@ -1233,3 +1249,74 @@ async fn try_ensure_vm_destroyed(client: &Client) {
12331249
error!(%error, "VM not destroyed after 5 seconds");
12341250
}
12351251
}
1252+
1253+
/// For waiting for instances of failed testcases to be manually shut down,
1254+
/// when phd-runner is invoked with --manual-stop-on-failure
1255+
#[derive(Clone)]
1256+
pub struct TestVmManualStop {
1257+
test_name: String,
1258+
/// If we should wait for operator intervention before terminating this
1259+
/// instance, this will be sent a `Some(false)`.
1260+
success_rx: watch::Receiver<Option<bool>>,
1261+
/// While waiting for instance shutdown, this may be sent a `true` if the
1262+
/// user sends a keyboard interrupt to indicate we should stop waiting.
1263+
sigint_rx: watch::Receiver<bool>,
1264+
}
1265+
impl TestVmManualStop {
1266+
pub fn new(
1267+
test_name: String,
1268+
success_rx: watch::Receiver<Option<bool>>,
1269+
sigint_rx: watch::Receiver<bool>,
1270+
) -> Self {
1271+
Self { test_name, success_rx, sigint_rx }
1272+
}
1273+
async fn wait_for_stop(
1274+
mut self,
1275+
vm_name: String,
1276+
client: &Client,
1277+
server: &server::PropolisServer,
1278+
) {
1279+
if self.success_rx.changed().await.is_ok() {
1280+
let success_opt = *self.success_rx.borrow();
1281+
if let Some(false) = success_opt {
1282+
let sock = server.server_addr();
1283+
let ip = sock.ip();
1284+
let port = sock.port();
1285+
let test_name = self.test_name;
1286+
let mut uninformed = true;
1287+
// States that might be worth inspecting out-of-band
1288+
while let Ok(
1289+
InstanceState::Running
1290+
| InstanceState::Migrating
1291+
| InstanceState::Rebooting
1292+
| InstanceState::Repairing,
1293+
) = client
1294+
.instance_get()
1295+
.send()
1296+
.await
1297+
.map(|inst| inst.instance.state)
1298+
{
1299+
if *self.sigint_rx.borrow() {
1300+
break;
1301+
}
1302+
if uninformed {
1303+
error!(
1304+
r#"
1305+
test {test_name:?} failed. propolis-server {vm_name:?} was left running,
1306+
with API accessible at http://{sock}
1307+
phd-runner will resume when this instance is shut down; e.g. by one of:
1308+
1309+
$ propolis-cli -s {ip} -p {port} serial
1310+
localhost:~# poweroff
1311+
1312+
$ propolis-cli -s {ip} -p {port} state stop
1313+
"#
1314+
);
1315+
uninformed = false;
1316+
}
1317+
tokio::time::sleep(Duration::from_secs(1)).await;
1318+
}
1319+
}
1320+
}
1321+
}
1322+
}

phd-tests/runner/src/config.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,16 @@ pub struct RunOptions {
202202
// number of seconds, but i'm lazy...
203203
#[clap(long, value_parser, default_value_t = 60 * 20)]
204204
pub max_buildomat_wait_secs: u64,
205+
206+
/// When a testcase fails while this is enabled, any instances started by
207+
/// the failed test are left running and phd-runner waits until they are
208+
/// manually shut down out-of-band by the operator.
209+
///
210+
/// This feature is intended to give the operator a chance to inspect the
211+
/// state of the guest(s) easily without necessarily having to reconstruct
212+
/// the scenario by hand.
213+
#[clap(long, value_parser)]
214+
pub manual_stop_on_failure: bool,
205215
}
206216

207217
#[derive(Args, Debug)]

phd-tests/runner/src/execute.rs

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
use std::sync::Arc;
66
use std::time::{Duration, Instant};
77

8+
use phd_framework::test_vm::TestVmManualStop;
89
use phd_tests::phd_testcase::{Framework, TestCase, TestOutcome};
910
use tokio::signal::unix::{signal, SignalKind};
1011
use tokio::sync::watch;
@@ -101,8 +102,18 @@ pub async fn run_tests_with_ctx(
101102
let framework = ctx.clone();
102103
let tc = execution.tc;
103104
let mut sigint_rx_task = sigint_rx.clone();
105+
let mut test_ctx = framework.test_ctx(tc.fully_qualified_name());
106+
let mut success_tx = None;
107+
if run_opts.manual_stop_on_failure {
108+
let (tx, success_rx) = watch::channel(None);
109+
test_ctx.set_cleanup_task_outcome_receiver(TestVmManualStop::new(
110+
tc.fully_qualified_name(),
111+
success_rx,
112+
sigint_rx.clone(),
113+
));
114+
success_tx = Some(tx);
115+
}
104116
let test_outcome = tokio::spawn(async move {
105-
let test_ctx = framework.test_ctx(tc.fully_qualified_name());
106117
tokio::select! {
107118
// Ensure interrupt signals are always handled instead of
108119
// continuing to run the test.
@@ -144,6 +155,11 @@ pub async fn run_tests_with_ctx(
144155
}
145156
);
146157

158+
if let Some(tx) = success_tx {
159+
let succeeded = !matches!(&test_outcome, TestOutcome::Failed(_));
160+
let _: Result<_, _> = tx.send(Some(succeeded));
161+
}
162+
147163
match test_outcome {
148164
TestOutcome::Passed => stats.tests_passed += 1,
149165
TestOutcome::Failed(_) => {

0 commit comments

Comments
 (0)