Skip to content

Commit 6915a58

Browse files
committed
fix(machine-controller): wait for a zero-DPU host's boot NIC instead of faulting
Boot configuration resolves the host's boot interface from its primary `machine_interface`. A zero-DPU host -- `NoDpu`, or a BlueField flipped to `NicMode` -- boots from a plain NIC that only takes its first HostInband lease once the host is up, so for a window after registration it has no boot interface yet. The boot steps (`CheckHostConfig`, `SetBootOrder`, `CheckBootOrder`) now treat that as a wait-and-retry rather than an error. A host with managed DPUs always has its DPU-facing primary set at promotion, so a missing boot interface there is still a genuine fault. The wait-versus-fault decision is centralized in `resolve_boot_interface`, and `SetBootOrderOutcome` gains a `Wait` variant distinct from `WaitingForReboot` so the wait reads as an unmet precondition, not a reboot. Adds unit tests for the classification: a zero-DPU host with no boot interface waits, a DPU host without one faults, and a resolved interface is used as-is. Signed-off-by: Chet Nichols III <chetn@nvidia.com>
1 parent b6d77d8 commit 6915a58

2 files changed

Lines changed: 137 additions & 21 deletions

File tree

crates/machine-controller/src/boot_interface.rs

Lines changed: 75 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -38,3 +38,78 @@ pub fn boot_interface_target(
3838
.boot_interface_mac()
3939
.map(BootInterfaceTarget::MacOnly)
4040
}
41+
42+
/// What a Redfish boot step should do with a host's boot interface.
43+
///
44+
/// Separates "not ready yet" from "broken". A zero-DPU host (`NoDpu` or
45+
/// `NicMode`) boots from a plain NIC that takes its first HostInband lease only
46+
/// after the host comes up, so until then it has no boot interface to
47+
/// resolve -- the controller should wait, not fail. A host with managed DPUs
48+
/// always has its DPU-facing primary set at promotion, so a missing boot
49+
/// interface there is a genuine fault.
50+
pub enum BootInterfaceResolution {
51+
/// The boot interface resolved; target it.
52+
Ready(BootInterfaceTarget),
53+
/// A zero-DPU host whose boot NIC has not been discovered yet -- wait.
54+
AwaitingNic,
55+
/// A host that should already have a boot interface is missing one.
56+
Missing,
57+
}
58+
59+
/// Resolve this host's boot interface for a Redfish boot step, classifying a
60+
/// missing one as either "wait for the NIC" (zero-DPU) or "fault".
61+
pub fn resolve_boot_interface(mh_snapshot: &ManagedHostStateSnapshot) -> BootInterfaceResolution {
62+
classify_boot_interface(
63+
boot_interface_target(mh_snapshot),
64+
mh_snapshot.has_managed_dpus(),
65+
)
66+
}
67+
68+
/// The decision behind [`resolve_boot_interface`], split out from the snapshot
69+
/// lookup so it can be unit-tested directly.
70+
fn classify_boot_interface(
71+
boot_interface: Option<BootInterfaceTarget>,
72+
has_managed_dpus: bool,
73+
) -> BootInterfaceResolution {
74+
match boot_interface {
75+
Some(target) => BootInterfaceResolution::Ready(target),
76+
None if !has_managed_dpus => BootInterfaceResolution::AwaitingNic,
77+
None => BootInterfaceResolution::Missing,
78+
}
79+
}
80+
81+
#[cfg(test)]
mod tests {
    use mac_address::MacAddress;

    use super::*;

    /// Octets of the boot NIC MAC used by the tests below.
    const BOOT_NIC_MAC: [u8; 6] = [0, 0, 0, 0, 0, 1];

    #[test]
    fn classify_waits_for_a_zero_dpu_host_without_a_boot_interface() {
        // The zero-DPU host's boot NIC has not taken its first lease yet: wait
        // for it instead of faulting.
        assert!(matches!(
            classify_boot_interface(None, false),
            BootInterfaceResolution::AwaitingNic
        ));
    }

    #[test]
    fn classify_faults_when_a_dpu_host_has_no_boot_interface() {
        // A host with managed DPUs always has its DPU-facing primary set at
        // promotion, so a missing boot interface is a real fault.
        assert!(matches!(
            classify_boot_interface(None, true),
            BootInterfaceResolution::Missing
        ));
    }

    #[test]
    fn classify_uses_the_resolved_interface_when_present() {
        // A resolved interface wins regardless of DPU topology, and the target
        // must come back unchanged -- checking only for `Ready(_)` would let a
        // swapped or rebuilt target slip through.
        for has_managed_dpus in [false, true] {
            let target = BootInterfaceTarget::MacOnly(MacAddress::new(BOOT_NIC_MAC));
            let resolution = classify_boot_interface(Some(target), has_managed_dpus);
            assert!(
                matches!(
                    resolution,
                    BootInterfaceResolution::Ready(BootInterfaceTarget::MacOnly(resolved))
                        if resolved == MacAddress::new(BOOT_NIC_MAC)
                ),
                "expected Ready carrying the original MAC (has_managed_dpus = {})",
                has_managed_dpus
            );
        }
    }
}

crates/machine-controller/src/handler.rs

Lines changed: 62 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -98,7 +98,7 @@ use tokio::sync::Semaphore;
9898
use tracing::instrument;
9999
use version_compare::Cmp;
100100

101-
use crate::boot_interface::boot_interface_target;
101+
use crate::boot_interface::{BootInterfaceResolution, resolve_boot_interface};
102102
use crate::config::{
103103
FirmwareGlobal, MachineStateHandlerSiteConfig, MachineValidationConfig, TimePeriod,
104104
};
@@ -4176,6 +4176,10 @@ enum SetBootOrderOutcome {
41764176
Continue(SetBootOrderInfo),
41774177
Done,
41784178
WaitingForReboot(String),
4179+
/// No boot interface to act on yet -- e.g. a zero-DPU host whose boot NIC
4180+
/// has not been discovered. Distinct from `WaitingForReboot`: nothing was
4181+
/// rebooted, the caller just waits and retries.
4182+
Wait(String),
41794183
}
41804184

41814185
/// In case machine does not come up until a specified duration, this function tries to reboot
@@ -4564,6 +4568,9 @@ async fn handle_host_boot_order_setup(
45644568
SetBootOrderOutcome::WaitingForReboot(reason) => {
45654569
return Ok(StateHandlerOutcome::wait(reason));
45664570
}
4571+
SetBootOrderOutcome::Wait(reason) => {
4572+
return Ok(StateHandlerOutcome::wait(reason));
4573+
}
45674574
}
45684575
}
45694576
None => ManagedHostState::HostInit {
@@ -10019,13 +10026,24 @@ async fn handle_instance_host_platform_config(
1001910026
//
1002010027
// For zero-DPU hosts, it's the operator-declared primary host
1002110028
// NIC (which comes from `ExpectedHostNic.primary`) *or* the
10022-
// "lowest" deterministic-fallback host NIC.
10023-
let boot_interface = boot_interface_target(mh_snapshot).ok_or_else(|| {
10024-
StateHandlerError::GenericError(eyre::eyre!(
10025-
"Missing boot interface for host: {}",
10026-
mh_snapshot.host_snapshot.id
10027-
))
10028-
})?;
10029+
// "lowest" deterministic-fallback host NIC. A zero-DPU host whose
10030+
// boot NIC has not taken its first HostInband lease yet has no boot
10031+
// interface to resolve -- wait for it rather than failing.
10032+
let boot_interface = match resolve_boot_interface(mh_snapshot) {
10033+
BootInterfaceResolution::Ready(target) => target,
10034+
BootInterfaceResolution::AwaitingNic => {
10035+
return Ok(StateHandlerOutcome::wait(format!(
10036+
"Waiting for zero-DPU host {} to discover its boot NIC before configuring boot.",
10037+
mh_snapshot.host_snapshot.id
10038+
)));
10039+
}
10040+
BootInterfaceResolution::Missing => {
10041+
return Err(StateHandlerError::GenericError(eyre::eyre!(
10042+
"Missing boot interface for host: {}",
10043+
mh_snapshot.host_snapshot.id
10044+
)));
10045+
}
10046+
};
1002910047

1003010048
let vendor = mh_snapshot.host_snapshot.bmc_vendor();
1003110049

@@ -10256,6 +10274,9 @@ async fn handle_instance_host_platform_config(
1025610274
SetBootOrderOutcome::WaitingForReboot(reason) => {
1025710275
return Ok(StateHandlerOutcome::wait(reason));
1025810276
}
10277+
SetBootOrderOutcome::Wait(reason) => {
10278+
return Ok(StateHandlerOutcome::wait(reason));
10279+
}
1025910280
}
1026010281
}
1026110282
HostPlatformConfigurationState::LockHost => {
@@ -10302,13 +10323,24 @@ async fn set_host_boot_order(
1030210323
// for verification.
1030310324
//
1030410325
// Resolve the boot NIC MAC the same way `CheckHostConfig` does,
10305-
// supporting hosts with DPU(s) and zero DPUs alike.
10306-
let boot_interface = boot_interface_target(mh_snapshot).ok_or_else(|| {
10307-
StateHandlerError::GenericError(eyre::eyre!(
10308-
"Missing boot interface for host: {}",
10309-
mh_snapshot.host_snapshot.id
10310-
))
10311-
})?;
10326+
// supporting hosts with DPU(s) and zero DPUs alike. A zero-DPU host
10327+
// whose boot NIC has not taken its first HostInband lease yet has no
10328+
// boot interface to resolve -- wait for it rather than failing.
10329+
let boot_interface = match resolve_boot_interface(mh_snapshot) {
10330+
BootInterfaceResolution::Ready(target) => target,
10331+
BootInterfaceResolution::AwaitingNic => {
10332+
return Ok(SetBootOrderOutcome::Wait(format!(
10333+
"Waiting for zero-DPU host {} to discover its boot NIC before setting boot order.",
10334+
mh_snapshot.host_snapshot.id
10335+
)));
10336+
}
10337+
BootInterfaceResolution::Missing => {
10338+
return Err(StateHandlerError::GenericError(eyre::eyre!(
10339+
"Missing boot interface for host: {}",
10340+
mh_snapshot.host_snapshot.id
10341+
)));
10342+
}
10343+
};
1031210344

1031310345
let jid = match set_boot_order_dpu_first_and_handle_no_dpu_error(
1031410346
redfish_client,
@@ -10587,12 +10619,21 @@ async fn set_host_boot_order(
1058710619

1058810620
let retry_count = set_boot_order_info.retry_count;
1058910621

10590-
let boot_interface = boot_interface_target(mh_snapshot).ok_or_else(|| {
10591-
StateHandlerError::GenericError(eyre::eyre!(
10592-
"Missing boot interface for host: {}",
10593-
mh_snapshot.host_snapshot.id
10594-
))
10595-
})?;
10622+
let boot_interface = match resolve_boot_interface(mh_snapshot) {
10623+
BootInterfaceResolution::Ready(target) => target,
10624+
BootInterfaceResolution::AwaitingNic => {
10625+
return Ok(SetBootOrderOutcome::Wait(format!(
10626+
"Waiting for zero-DPU host {} to discover its boot NIC before verifying boot order.",
10627+
mh_snapshot.host_snapshot.id
10628+
)));
10629+
}
10630+
BootInterfaceResolution::Missing => {
10631+
return Err(StateHandlerError::GenericError(eyre::eyre!(
10632+
"Missing boot interface for host: {}",
10633+
mh_snapshot.host_snapshot.id
10634+
)));
10635+
}
10636+
};
1059610637

1059710638
let boot_order_configured = boot_interface
1059810639
.run(|bi| redfish_client.is_boot_order_setup(bi))

0 commit comments

Comments
 (0)