Skip to content

Commit c8998a0

Browse files
committed
libvirt: Unify SSH polling with shared wait_for_readiness
The libvirt SSH path had its own hand-rolled retry loop inside connect_ssh() (60s timeout, 1s poll), while the ephemeral path cleanly uses the shared wait_for_readiness() utility. Worse, wait_for_ssh_ready() called run_ssh_impl() which hit the connect_ssh() retry, creating a double-retry: each "attempt" in the outer 180s loop took ~60s, yielding only 3 effective iterations instead of ~36. Unify on the same pattern the ephemeral path uses: - Extract verify_domain_running() and prepare_ssh_session() helpers so setup (domain check, config extraction, temp key) is done once. - Replace connect_ssh()'s retry loop with wait_for_readiness() inside run_ssh_impl(), matching run_ephemeral_ssh::wait_for_ssh_ready(). - Rename connect_ssh to exec_ssh_session (it no longer retries). - Simplify wait_for_ssh_ready() in run.rs to do setup once then poll with just the SSH command, instead of reconstructing everything per attempt. Assisted-by: OpenCode (Claude Opus 4) Signed-off-by: Colin Walters <walters@verbum.org>
1 parent 22a4223 commit c8998a0

2 files changed

Lines changed: 115 additions & 121 deletions

File tree

crates/kit/src/libvirt/run.rs

Lines changed: 25 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -359,9 +359,10 @@ impl LibvirtRunOpts {
359359
}
360360
}
361361

362-
/// Wait for SSH to become available on a libvirt domain
362+
/// Wait for SSH to become available on a libvirt domain.
363363
///
364-
/// Polls SSH connectivity by attempting simple commands until successful or timeout.
364+
/// Uses the same `wait_for_readiness` polling loop as the ephemeral path
365+
/// and `run_ssh_impl`, just with a longer timeout for initial VM boot.
365366
fn wait_for_ssh_ready(
366367
global_opts: &crate::libvirt::LibvirtOptions,
367368
domain_name: &str,
@@ -374,39 +375,36 @@ fn wait_for_ssh_ready(
374375
domain_name, timeout_secs
375376
);
376377

377-
// Create progress bar
378-
let pb = crate::boot_progress::create_boot_progress_bar();
379-
pb.set_message("Waiting for SSH to become available...");
380-
381-
// Clone values for closure
382-
let global_opts_clone = global_opts.clone();
383-
let domain_name_clone = domain_name.to_string();
378+
// Do expensive setup once: verify domain, extract SSH config, create temp key.
379+
let ssh_opts = crate::libvirt::ssh::LibvirtSshOpts {
380+
domain_name: domain_name.to_string(),
381+
user: "root".to_string(),
382+
command: vec![],
383+
strict_host_keys: false,
384+
timeout: 5,
385+
log_level: "ERROR".to_string(),
386+
extra_options: vec![],
387+
suppress_output: true,
388+
};
389+
ssh_opts.verify_domain_running(global_opts)?;
390+
let ssh_config = ssh_opts.extract_ssh_config(global_opts)?;
391+
let (temp_key, parsed_extra_options) = ssh_opts.prepare_ssh_session(&ssh_config)?;
384392

385-
// Use shared polling function with libvirt-specific test
393+
let pb = crate::boot_progress::create_boot_progress_bar();
386394
let (_elapsed, pb) = crate::utils::wait_for_readiness(
387395
pb,
388396
"Waiting for SSH",
389397
|| {
390-
// Create a test SSH connection with short timeout
391-
let ssh_opts = crate::libvirt::ssh::LibvirtSshOpts {
392-
domain_name: domain_name_clone.clone(),
393-
user: "root".to_string(),
394-
command: vec!["true".to_string()], // Simple command to test connectivity
395-
strict_host_keys: false,
396-
timeout: 5, // Short timeout for each attempt
397-
log_level: "ERROR".to_string(),
398-
extra_options: vec![],
399-
suppress_output: true, // Suppress error messages during connectivity testing
400-
};
401-
402-
// Try to connect
403-
match crate::libvirt::ssh::run_ssh_impl(&global_opts_clone, ssh_opts) {
404-
Ok(_) => Ok(true),
405-
Err(_) => Ok(false),
398+
let mut test_cmd =
399+
ssh_opts.build_ssh_command(&ssh_config, &temp_key, parsed_extra_options.clone());
400+
test_cmd.arg("--").arg("true");
401+
match test_cmd.output() {
402+
Ok(output) if output.status.success() => Ok(true),
403+
_ => Ok(false),
406404
}
407405
},
408406
Duration::from_secs(timeout_secs),
409-
Duration::from_secs(2), // Poll every 2 seconds
407+
Duration::from_secs(2),
410408
)?;
411409

412410
pb.finish_and_clear();

crates/kit/src/libvirt/ssh.rs

Lines changed: 90 additions & 94 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ use std::io::Write;
1515
use std::os::unix::fs::PermissionsExt as _;
1616
use std::os::unix::process::CommandExt;
1717
use std::process::Command;
18-
use std::time::{Duration, Instant};
18+
use std::time::Duration;
1919
use tempfile;
2020
use tracing::debug;
2121

@@ -60,7 +60,7 @@ pub struct LibvirtSshOpts {
6060

6161
/// SSH configuration extracted from domain metadata
6262
#[derive(Debug)]
63-
struct DomainSshConfig {
63+
pub(crate) struct DomainSshConfig {
6464
private_key_content: String,
6565
ssh_port: u16,
6666
is_generated: bool,
@@ -93,7 +93,7 @@ impl LibvirtSshOpts {
9393
}
9494

9595
/// Extract SSH configuration from domain XML metadata
96-
fn extract_ssh_config(
96+
pub(crate) fn extract_ssh_config(
9797
&self,
9898
global_opts: &crate::libvirt::LibvirtOptions,
9999
) -> Result<DomainSshConfig> {
@@ -243,7 +243,7 @@ impl LibvirtSshOpts {
243243
}
244244

245245
/// Build SSH command with configured options
246-
fn build_ssh_command(
246+
pub(crate) fn build_ssh_command(
247247
&self,
248248
ssh_config: &DomainSshConfig,
249249
temp_key: &tempfile::NamedTempFile,
@@ -269,25 +269,34 @@ impl LibvirtSshOpts {
269269
ssh_cmd
270270
}
271271

272-
/// Execute SSH connection to domain with retries
273-
fn connect_ssh(
272+
/// Verify the domain exists and is running.
273+
pub(crate) fn verify_domain_running(
274274
&self,
275-
_global_opts: &crate::libvirt::LibvirtOptions,
276-
ssh_config: &DomainSshConfig,
275+
global_opts: &crate::libvirt::LibvirtOptions,
277276
) -> Result<()> {
278-
debug!(
279-
"Connecting to domain '{}' via SSH on port {} (user: {})",
280-
self.domain_name, ssh_config.ssh_port, self.user
281-
);
282-
283-
if ssh_config.is_generated {
284-
debug!("Using ephemeral SSH key from domain metadata");
277+
if !self.check_domain_exists(global_opts)? {
278+
return Err(eyre!("Domain '{}' not found", self.domain_name));
285279
}
280+
let state = self.get_domain_state(global_opts)?;
281+
if state != "running" {
282+
return Err(eyre!(
283+
"Domain '{}' is not running (current state: {}). Start it first with: virsh start {}",
284+
self.domain_name,
285+
state,
286+
self.domain_name
287+
));
288+
}
289+
Ok(())
290+
}
286291

287-
// Create temporary SSH key file
292+
/// Create temp key file and parse extra SSH options — shared setup for
293+
/// both the retry path and single-attempt tests.
294+
pub(crate) fn prepare_ssh_session(
295+
&self,
296+
ssh_config: &DomainSshConfig,
297+
) -> Result<(tempfile::NamedTempFile, Vec<(String, String)>)> {
288298
let temp_key = self.create_temp_ssh_key(ssh_config)?;
289299

290-
// Parse extra options
291300
let mut parsed_extra_options = Vec::new();
292301
for option in &self.extra_options {
293302
if let Some((key, value)) = option.split_once('=') {
@@ -299,65 +308,31 @@ impl LibvirtSshOpts {
299308
));
300309
}
301310
}
311+
Ok((temp_key, parsed_extra_options))
312+
}
302313

303-
let start_time = Instant::now();
304-
let timeout = Duration::from_secs(SSH_RETRY_TIMEOUT_SECS);
305-
306-
// First, do connectivity check with retries (for both interactive and command)
307-
debug!("Testing SSH connectivity before session");
308-
309-
// Create progress bar for user feedback (only shown in terminals)
310-
let pb = crate::boot_progress::create_boot_progress_bar();
311-
pb.set_message("Waiting for SSH to be ready...");
312-
313-
loop {
314-
let mut test_cmd =
315-
self.build_ssh_command(ssh_config, &temp_key, parsed_extra_options.clone());
316-
test_cmd.arg("--").arg("true"); // Simple test command
317-
318-
let output = test_cmd.output().context("Failed to spawn SSH command")?;
319-
320-
if output.status.success() {
321-
debug!(
322-
"SSH connectivity confirmed after {:.1}s",
323-
start_time.elapsed().as_secs_f64()
324-
);
325-
pb.finish_and_clear();
326-
break;
327-
}
328-
329-
// Check if we've exceeded timeout
330-
if start_time.elapsed() >= timeout {
331-
pb.finish_and_clear();
332-
if !self.suppress_output {
333-
let stderr_str = String::from_utf8_lossy(&output.stderr);
334-
eprint!("{}", stderr_str);
335-
eprintln!(
336-
"\nSSH connection failed after {:.1}s. To see VM console output, run: virsh console {}",
337-
start_time.elapsed().as_secs_f64(),
338-
self.domain_name
339-
);
340-
}
341-
return Err(eyre!("SSH connection failed after timeout"));
342-
}
343-
344-
std::thread::sleep(Duration::from_secs(SSH_POLL_DELAY_SECS));
345-
}
346-
347-
// SSH is ready - now do the actual operation (oneshot)
314+
/// Execute the SSH session (interactive or command) after connectivity
315+
/// has already been confirmed by the caller.
316+
fn exec_ssh_session(
317+
&self,
318+
ssh_config: &DomainSshConfig,
319+
temp_key: &tempfile::NamedTempFile,
320+
parsed_extra_options: Vec<(String, String)>,
321+
) -> Result<()> {
348322
if self.command.is_empty() {
349-
// Interactive: exec directly
350-
debug!("SSH ready, launching interactive session");
351-
let mut ssh_cmd = self.build_ssh_command(ssh_config, &temp_key, parsed_extra_options);
323+
// Interactive: exec directly (replaces current process)
324+
debug!("Launching interactive SSH session");
325+
let mut ssh_cmd =
326+
self.build_ssh_command(ssh_config, temp_key, parsed_extra_options);
352327
let error = ssh_cmd.exec();
353328
return Err(eyre!("Failed to exec SSH command: {}", error));
354329
}
355330

356-
// Command execution: single attempt since we already confirmed connectivity
357-
debug!("SSH ready, executing command");
358-
let mut ssh_cmd = self.build_ssh_command(ssh_config, &temp_key, parsed_extra_options);
331+
// Command execution
332+
debug!("Executing SSH command");
333+
let mut ssh_cmd =
334+
self.build_ssh_command(ssh_config, temp_key, parsed_extra_options);
359335

360-
// Add command
361336
ssh_cmd.arg("--");
362337
if self.command.len() > 1 {
363338
let combined_command = crate::ssh::shell_escape_command(&self.command)
@@ -367,7 +342,6 @@ impl LibvirtSshOpts {
367342
ssh_cmd.args(&self.command);
368343
}
369344

370-
// Execute command
371345
let output = ssh_cmd
372346
.output()
373347
.map_err(|e| eyre!("Failed to execute SSH command: {}", e))?;
@@ -376,14 +350,9 @@ impl LibvirtSshOpts {
376350
if !output.stdout.is_empty() && !self.suppress_output {
377351
print!("{}", String::from_utf8_lossy(&output.stdout));
378352
}
379-
debug!(
380-
"Command completed successfully after {:.1}s total",
381-
start_time.elapsed().as_secs_f64()
382-
);
383353
return Ok(());
384354
}
385355

386-
// Command failed
387356
if !self.suppress_output {
388357
let stderr_str = String::from_utf8_lossy(&output.stderr);
389358
eprint!("{}", stderr_str);
@@ -400,36 +369,63 @@ pub fn run(global_opts: &crate::libvirt::LibvirtOptions, opts: LibvirtSshOpts) -
400369
run_ssh_impl(global_opts, opts)
401370
}
402371

403-
/// SSH implementation
372+
/// SSH implementation — waits for connectivity then runs the session.
404373
pub fn run_ssh_impl(
405374
global_opts: &crate::libvirt::LibvirtOptions,
406375
opts: LibvirtSshOpts,
407376
) -> Result<()> {
408377
debug!("Connecting to libvirt domain: {}", opts.domain_name);
409378

410-
// Check if domain exists
411-
if !opts.check_domain_exists(global_opts)? {
412-
return Err(eyre!("Domain '{}' not found", opts.domain_name));
413-
}
379+
opts.verify_domain_running(global_opts)?;
380+
381+
let ssh_config = opts.extract_ssh_config(global_opts)?;
414382

415-
// Check if domain is running
416-
let state = opts.get_domain_state(global_opts)?;
417-
if state != "running" {
418-
return Err(eyre!(
419-
"Domain '{}' is not running (current state: {}). Start it first with: virsh start {}",
420-
opts.domain_name,
421-
state,
422-
opts.domain_name
423-
));
383+
if ssh_config.is_generated {
384+
debug!("Using ephemeral SSH key from domain metadata");
424385
}
425386

426-
// Extract SSH configuration from domain metadata
427-
let ssh_config = opts.extract_ssh_config(global_opts)?;
387+
let (temp_key, parsed_extra_options) = opts.prepare_ssh_session(&ssh_config)?;
428388

429-
// Connect via SSH with retries
430-
opts.connect_ssh(global_opts, &ssh_config)?;
389+
// Wait for SSH connectivity using the shared polling loop — same
390+
// pattern as the ephemeral path in run_ephemeral_ssh::wait_for_ssh_ready.
391+
let mut last_stderr = String::new();
392+
let pb = crate::boot_progress::create_boot_progress_bar();
393+
let (_elapsed, pb) = crate::utils::wait_for_readiness(
394+
pb,
395+
"Waiting for SSH",
396+
|| {
397+
let mut test_cmd =
398+
opts.build_ssh_command(&ssh_config, &temp_key, parsed_extra_options.clone());
399+
test_cmd.arg("--").arg("true");
400+
401+
match test_cmd.output() {
402+
Ok(output) if output.status.success() => Ok(true),
403+
Ok(output) => {
404+
last_stderr = String::from_utf8_lossy(&output.stderr).into_owned();
405+
Ok(false)
406+
}
407+
Err(_) => Ok(false),
408+
}
409+
},
410+
Duration::from_secs(SSH_RETRY_TIMEOUT_SECS),
411+
Duration::from_secs(SSH_POLL_DELAY_SECS),
412+
)
413+
.map_err(|_| {
414+
if !opts.suppress_output {
415+
if !last_stderr.is_empty() {
416+
eprint!("{}", last_stderr);
417+
}
418+
eprintln!(
419+
"\nSSH connection failed. To see VM console output, run: virsh console {}",
420+
opts.domain_name
421+
);
422+
}
423+
eyre!("SSH connection failed after timeout")
424+
})?;
425+
pb.finish_and_clear();
431426

432-
Ok(())
427+
// Connectivity confirmed — run the actual session
428+
opts.exec_ssh_session(&ssh_config, &temp_key, parsed_extra_options)
433429
}
434430

435431
#[cfg(test)]

0 commit comments

Comments
 (0)