Skip to content

Commit 2761772

Browse files
committed
ephemeral: Fix run-ssh error handling
If we fail to launch, because we used `--rm` the container would exit and we couldn't get its logs. Change to removing the container image via a drop handler, so if the system fails to launch (e.g. it's missing a kernel or bwrap) we can get the logs from that. Signed-off-by: Colin Walters <walters@verbum.org>
1 parent cec4402 commit 2761772

4 files changed

Lines changed: 274 additions & 31 deletions

File tree

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Test fixture: Bootc image with kernel removed
2+
# This simulates a broken/malformed bootc image that will fail during ephemeral VM startup
3+
# Used to test error handling and cleanup in ephemeral run-ssh command
4+
5+
ARG BASE_IMAGE=quay.io/centos-bootc/centos-bootc:stream10
6+
7+
FROM ${BASE_IMAGE}
8+
9+
# Remove kernel and modules to simulate a broken bootc image
10+
RUN rm -rf /usr/lib/modules

crates/integration-tests/src/tests/run_ephemeral_ssh.rs

Lines changed: 129 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,75 @@ use color_eyre::Result;
1818
use integration_tests::{integration_test, parameterized_integration_test};
1919

2020
use std::process::Command;
21-
use std::thread;
22-
use std::time::Duration;
21+
use std::time::{Duration, Instant};
2322

2423
use crate::{get_test_image, run_bcvk, INTEGRATION_TEST_LABEL};
2524

25+
/// Poll until a container is removed or timeout is reached
26+
///
27+
/// Returns Ok(()) if container is removed within timeout, Err otherwise.
28+
/// Timeout is set to 60 seconds to account for slow CI runners.
29+
fn wait_for_container_removal(container_name: &str) -> Result<()> {
30+
let timeout = Duration::from_secs(60);
31+
let start = Instant::now();
32+
let poll_interval = Duration::from_millis(100);
33+
34+
while start.elapsed() < timeout {
35+
let output = Command::new("podman")
36+
.args(["ps", "-a", "--format", "{{.Names}}"])
37+
.output()
38+
.expect("Failed to list containers");
39+
40+
let containers = String::from_utf8_lossy(&output.stdout);
41+
if !containers.contains(container_name) {
42+
return Ok(());
43+
}
44+
45+
std::thread::sleep(poll_interval);
46+
}
47+
48+
let output = Command::new("podman")
49+
.args(["ps", "-a", "--format", "{{.Names}}"])
50+
.output()
51+
.expect("Failed to list containers");
52+
let containers = String::from_utf8_lossy(&output.stdout);
53+
54+
Err(color_eyre::eyre::eyre!(
55+
"Timeout waiting for container {} to be removed. Active containers: {}",
56+
container_name,
57+
containers
58+
))
59+
}
60+
61+
/// Build a test fixture image with the kernel removed
62+
fn build_broken_image() -> Result<String> {
63+
let fixture_path = concat!(env!("CARGO_MANIFEST_DIR"), "/fixtures/Dockerfile.no-kernel");
64+
let image_name = format!("localhost/bcvk-test-no-kernel:{}", std::process::id());
65+
66+
let output = Command::new("podman")
67+
.args([
68+
"build",
69+
"-f",
70+
fixture_path,
71+
"-t",
72+
&image_name,
73+
"--build-arg",
74+
&format!("BASE_IMAGE={}", get_test_image()),
75+
".",
76+
])
77+
.output()?;
78+
79+
if !output.status.success() {
80+
let stderr = String::from_utf8_lossy(&output.stderr);
81+
return Err(color_eyre::eyre::eyre!(
82+
"Failed to build broken test image: {}",
83+
stderr
84+
));
85+
}
86+
87+
Ok(image_name)
88+
}
89+
2690
/// Test running a non-interactive command via SSH
2791
fn test_run_ephemeral_ssh_command() -> Result<()> {
2892
let output = run_bcvk(&[
@@ -66,20 +130,9 @@ fn test_run_ephemeral_ssh_cleanup() -> Result<()> {
66130

67131
output.assert_success("ephemeral run-ssh");
68132

69-
thread::sleep(Duration::from_secs(1));
70-
71-
let check_output = Command::new("podman")
72-
.args(["ps", "-a", "--format", "{{.Names}}"])
73-
.output()
74-
.expect("Failed to list containers");
133+
// Poll for container removal with timeout
134+
wait_for_container_removal(&container_name)?;
75135

76-
let containers = String::from_utf8_lossy(&check_output.stdout);
77-
assert!(
78-
!containers.contains(&container_name),
79-
"Container {} was not cleaned up after SSH exit. Active containers: {}",
80-
container_name,
81-
containers
82-
);
83136
Ok(())
84137
}
85138
integration_test!(test_run_ephemeral_ssh_cleanup);
@@ -248,3 +301,64 @@ echo "All checks passed!"
248301
Ok(())
249302
}
250303
integration_test!(test_run_tmpfs);
304+
305+
/// Test that containers are properly cleaned up even when the image is broken
306+
///
307+
/// This test verifies that the drop handler for ContainerCleanup works correctly
308+
/// when ephemeral run-ssh fails early due to a broken image (missing kernel).
309+
/// Previously this would fail with "setns `mnt`: Bad file descriptor" when using
310+
/// podman's --rm flag. Now it should fail cleanly and remove the container.
311+
fn test_run_ephemeral_ssh_broken_image_cleanup() -> Result<()> {
312+
// Build a broken test image (bootc image with kernel removed)
313+
eprintln!("Building broken test image...");
314+
let broken_image = build_broken_image()?;
315+
eprintln!("Built broken image: {}", broken_image);
316+
317+
let container_name = format!("test-broken-cleanup-{}", std::process::id());
318+
319+
// Try to run ephemeral SSH with the broken image - this should fail
320+
let output = run_bcvk(&[
321+
"ephemeral",
322+
"run-ssh",
323+
"--name",
324+
&container_name,
325+
"--label",
326+
INTEGRATION_TEST_LABEL,
327+
&broken_image,
328+
"--",
329+
"echo",
330+
"should not reach here",
331+
])?;
332+
333+
// The command should fail (no kernel found)
334+
assert!(
335+
!output.success(),
336+
"Expected ephemeral run-ssh to fail with broken image, but it succeeded"
337+
);
338+
339+
// Verify the error message indicates the problem
340+
assert!(
341+
output
342+
.stderr
343+
.contains("Failed to read kernel modules directory")
344+
|| output
345+
.stderr
346+
.contains("Container exited before SSH became available")
347+
|| output
348+
.stderr
349+
.contains("Monitor process exited unexpectedly"),
350+
"Expected error about missing kernel or container failure, got: {}",
351+
output.stderr
352+
);
353+
354+
// Poll for container removal with timeout
355+
wait_for_container_removal(&container_name)?;
356+
357+
// Clean up the test image
358+
let _ = Command::new("podman")
359+
.args(["rmi", "-f", &broken_image])
360+
.output();
361+
362+
Ok(())
363+
}
364+
integration_test!(test_run_ephemeral_ssh_broken_image_cleanup);

crates/kit/src/run_ephemeral.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -777,7 +777,10 @@ pub(crate) async fn run_impl(opts: RunEphemeralOpts) -> Result<()> {
777777
let mut vmlinuz_path: Option<Utf8PathBuf> = None;
778778
let mut initramfs_path: Option<Utf8PathBuf> = None;
779779

780-
for entry in fs::read_dir(modules_dir)? {
780+
let entries = fs::read_dir(modules_dir)
781+
.with_context(|| format!("Failed to read kernel modules directory at {}. This container image may not be a valid bootc image.", modules_dir))?;
782+
783+
for entry in entries {
781784
let entry = entry?;
782785
let path = Utf8PathBuf::from_path_buf(entry.path())
783786
.map_err(|p| eyre!("Path is not valid UTF-8: {}", p.display()))?;

crates/kit/src/run_ephemeral_ssh.rs

Lines changed: 131 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,102 @@ use crate::run_ephemeral::{run_detached, RunEphemeralOpts};
1010
use crate::ssh;
1111
use crate::supervisor_status::{SupervisorState, SupervisorStatus};
1212

13+
/// Fetch and display container logs to help diagnose startup failures
14+
fn show_container_logs(container_name: &str) {
15+
debug!("Fetching container logs for {}", container_name);
16+
17+
// Try to get container state/exit code and error
18+
let exit_code = if let Ok(output) = Command::new("podman")
19+
.args([
20+
"inspect",
21+
container_name,
22+
"--format",
23+
"{{.State.Status}} (exit code: {{.State.ExitCode}}){{if .State.Error}} - Error: {{.State.Error}}{{end}}",
24+
])
25+
.output()
26+
{
27+
let state = String::from_utf8_lossy(&output.stdout);
28+
if !state.trim().is_empty() {
29+
eprintln!("\nContainer state: {}", state.trim());
30+
}
31+
32+
// Extract exit code for interpretation
33+
Command::new("podman")
34+
.args(["inspect", container_name, "--format", "{{.State.ExitCode}}"])
35+
.output()
36+
.ok()
37+
.and_then(|o| String::from_utf8_lossy(&o.stdout).trim().parse::<i32>().ok())
38+
} else {
39+
None
40+
};
41+
42+
// Provide helpful hints for common exit codes
43+
if let Some(code) = exit_code {
44+
match code {
45+
127 => {
46+
eprintln!("\nNote: Exit code 127 typically means 'command not found'.");
47+
eprintln!("This container image may not be a valid bootc image.");
48+
eprintln!("Bootc images must have systemd and kernel modules in /usr/lib/modules.");
49+
}
50+
126 => {
51+
eprintln!("\nNote: Exit code 126 typically means 'permission denied' or file not executable.");
52+
}
53+
_ => {}
54+
}
55+
}
56+
57+
let output = match Command::new("podman")
58+
.args(["logs", container_name])
59+
.stderr(Stdio::inherit())
60+
.output()
61+
{
62+
Ok(output) => output,
63+
Err(e) => {
64+
eprintln!("Failed to fetch container logs: {}", e);
65+
return;
66+
}
67+
};
68+
69+
let logs = String::from_utf8_lossy(&output.stdout);
70+
if !logs.trim().is_empty() {
71+
eprintln!("\nContainer logs:");
72+
eprintln!("----------------------------------------");
73+
for line in logs.lines() {
74+
eprintln!("{}", line);
75+
}
76+
eprintln!("----------------------------------------\n");
77+
} else {
78+
eprintln!("(Container produced no output)");
79+
}
80+
}
81+
82+
/// RAII guard for ephemeral container cleanup
83+
/// Ensures container is removed when dropped, even on error paths
84+
struct ContainerCleanup {
85+
container_id: String,
86+
}
87+
88+
impl ContainerCleanup {
89+
fn new(container_id: String) -> Self {
90+
Self { container_id }
91+
}
92+
}
93+
94+
impl Drop for ContainerCleanup {
95+
fn drop(&mut self) {
96+
debug!("Cleaning up ephemeral container {}", self.container_id);
97+
let result = Command::new("podman")
98+
.args(["rm", "-f", &self.container_id])
99+
.stdout(Stdio::null())
100+
.stderr(Stdio::null())
101+
.status();
102+
103+
if let Err(e) = result {
104+
tracing::warn!("Failed to remove container {}: {}", self.container_id, e);
105+
}
106+
}
107+
}
108+
13109
/// Timeout waiting for connection
14110
pub(crate) const SSH_TIMEOUT: std::time::Duration = const { Duration::from_secs(240) };
15111

@@ -23,6 +119,17 @@ pub struct RunEphemeralSshOpts {
23119
pub ssh_args: Vec<String>,
24120
}
25121

122+
/// Check if container is running
123+
fn is_container_running(container_name: &str) -> Result<bool> {
124+
let output = Command::new("podman")
125+
.args(["inspect", container_name, "--format", "{{.State.Status}}"])
126+
.output()
127+
.context("Failed to inspect container state")?;
128+
129+
let state = String::from_utf8_lossy(&output.stdout);
130+
Ok(state.trim() == "running")
131+
}
132+
26133
/// Wait for VM SSH availability using the supervisor status file
27134
///
28135
/// Monitors /run/supervisor-status.json inside the container for SSH.
@@ -40,6 +147,13 @@ pub fn wait_for_vm_ssh(
40147
timeout.as_secs()
41148
);
42149

150+
// Check if container is still running before attempting exec
151+
if !is_container_running(container_name)? {
152+
progress.finish_and_clear();
153+
show_container_logs(container_name);
154+
return Err(eyre!("Container exited before SSH became available"));
155+
}
156+
43157
// Use the new monitor-status subcommand for efficient inotify-based monitoring
44158
let mut cmd = Command::new("podman");
45159
cmd.args([
@@ -56,10 +170,14 @@ pub fn wait_for_vm_ssh(
56170
.map_err(Into::into)
57171
});
58172
}
59-
let mut child = cmd
60-
.stdout(Stdio::piped())
61-
.spawn()
62-
.context("Failed to start status monitor")?;
173+
let mut child = match cmd.stdout(Stdio::piped()).stderr(Stdio::inherit()).spawn() {
174+
Ok(child) => child,
175+
Err(e) => {
176+
progress.finish_and_clear();
177+
show_container_logs(container_name);
178+
return Err(e).context("Failed to start status monitor");
179+
}
180+
};
63181

64182
let stdout = child.stdout.take().unwrap();
65183
let reader = std::io::BufReader::new(stdout);
@@ -100,6 +218,9 @@ pub fn wait_for_vm_ssh(
100218
}
101219

102220
let status = child.wait()?;
221+
222+
progress.finish_and_clear();
223+
show_container_logs(container_name);
103224
Err(eyre!("Monitor process exited unexpectedly: {status:?}"))
104225
}
105226

@@ -147,14 +268,16 @@ pub fn wait_for_ssh_ready(
147268
pub fn run_ephemeral_ssh(opts: RunEphemeralSshOpts) -> Result<()> {
148269
// Start the ephemeral pod in detached mode with SSH enabled
149270
let mut ephemeral_opts = opts.run_opts.clone();
150-
ephemeral_opts.podman.rm = true;
151271
ephemeral_opts.podman.detach = true;
152272
ephemeral_opts.common.ssh_keygen = true; // Enable SSH key generation and access
153273

154274
debug!("Starting ephemeral VM...");
155275
let container_id = run_detached(ephemeral_opts)?;
156276
debug!("Ephemeral VM started with container ID: {}", container_id);
157277

278+
// Create cleanup guard to ensure container removal on any exit path
279+
let _cleanup = ContainerCleanup::new(container_id.clone());
280+
158281
// Use the container ID for SSH and cleanup
159282
let container_name = container_id;
160283
debug!("Using container ID: {}", container_name);
@@ -176,16 +299,9 @@ pub fn run_ephemeral_ssh(opts: RunEphemeralSshOpts) -> Result<()> {
176299
let exit_code = status.code().unwrap_or(1);
177300
debug!("SSH exit code: {}", exit_code);
178301

179-
// SSH completed, proceed with cleanup
180-
181-
// Cleanup: stop and remove the container immediately
182-
debug!("SSH session ended, cleaning up ephemeral pod...");
183-
184-
let _ = Command::new("podman")
185-
.args(["rm", "-f", &container_name])
186-
.stdout(Stdio::null())
187-
.stderr(Stdio::null())
188-
.status();
302+
// Explicitly drop the cleanup guard before exit to ensure container removal
303+
// (std::process::exit doesn't run drop handlers)
304+
drop(_cleanup);
189305

190306
// Exit with SSH client's exit code
191307
std::process::exit(exit_code);

0 commit comments

Comments
 (0)