Skip to content

Commit e8fef01

Browse files
committed
fix(machine-controller): wait for a zero-DPU host's boot NIC instead of faulting
Boot configuration resolves the host's boot interface from its primary `machine_interface`. A zero-DPU host -- `NoDpu`, or a BlueField flipped to `NicMode` -- boots from a plain NIC that only takes its first HostInband lease once the host is up, so for a window after registration it has no boot interface yet. The boot-config checks (`check_host_boot_config` and the `SetBootOrder` / `CheckBootOrder` steps) now treat that as a wait-and-retry rather than an error. A host with managed DPUs always has its DPU-facing primary set at promotion, so a missing boot interface there is still a genuine fault.

The wait-versus-fault decision is centralized in `resolve_boot_interface`: `check_host_boot_config` returns the existing `HostBootConfigDecision::Wait`, and `SetBootOrderOutcome` gains a matching `Wait` variant distinct from `WaitingForReboot` so the wait reads as an unmet precondition, not a reboot.

Adds unit tests for the classification: a zero-DPU host with no boot interface waits, a DPU host without one faults, and a resolved interface is used as-is.

Signed-off-by: Chet Nichols III <chetn@nvidia.com>
1 parent 47e42bc commit e8fef01

2 files changed

Lines changed: 139 additions & 21 deletions

File tree

crates/machine-controller/src/boot_interface.rs

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,3 +38,79 @@ pub fn boot_interface_target(
3838
.boot_interface_mac()
3939
.map(BootInterfaceTarget::MacOnly)
4040
}
41+
42+
/// What a Redfish boot step should do with a host's boot interface.
43+
///
44+
/// Separates "not ready yet" from "broken". A zero-DPU host (`NoDpu` or
45+
/// `NicMode`) boots from a plain NIC that takes its first HostInband lease only
46+
/// after the host comes up, so until then it has no boot interface to
47+
/// resolve -- the controller should wait, not fail. A host with managed DPUs
48+
/// always has its DPU-facing primary set at promotion, so a missing boot
49+
/// interface there is a genuine fault.
50+
#[derive(Debug)]
51+
pub enum BootInterfaceResolution {
52+
/// The boot interface resolved; target it.
53+
Ready(BootInterfaceTarget),
54+
/// A zero-DPU host whose boot NIC has not been discovered yet -- wait.
55+
AwaitingNic,
56+
/// A host that should already have a boot interface is missing one.
57+
Missing,
58+
}
59+
60+
/// Resolve this host's boot interface for a Redfish boot step, classifying a
61+
/// missing one as either "wait for the NIC" (zero-DPU) or "fault".
62+
pub fn resolve_boot_interface(mh_snapshot: &ManagedHostStateSnapshot) -> BootInterfaceResolution {
63+
classify_boot_interface(
64+
boot_interface_target(mh_snapshot),
65+
mh_snapshot.has_managed_dpus(),
66+
)
67+
}
68+
69+
/// The decision behind [`resolve_boot_interface`], split out from the snapshot
70+
/// lookup so it can be unit-tested directly.
71+
fn classify_boot_interface(
72+
boot_interface: Option<BootInterfaceTarget>,
73+
has_managed_dpus: bool,
74+
) -> BootInterfaceResolution {
75+
match boot_interface {
76+
Some(target) => BootInterfaceResolution::Ready(target),
77+
None if !has_managed_dpus => BootInterfaceResolution::AwaitingNic,
78+
None => BootInterfaceResolution::Missing,
79+
}
80+
}
81+
82+
#[cfg(test)]
83+
mod tests {
84+
use mac_address::MacAddress;
85+
86+
use super::*;
87+
88+
#[test]
89+
fn classify_waits_for_a_zero_dpu_host_without_a_boot_interface() {
90+
// The zero-DPU host's boot NIC has not taken its first lease yet: wait
91+
// for it instead of faulting.
92+
assert!(matches!(
93+
classify_boot_interface(None, false),
94+
BootInterfaceResolution::AwaitingNic
95+
));
96+
}
97+
98+
#[test]
99+
fn classify_faults_when_a_dpu_host_has_no_boot_interface() {
100+
// A host with managed DPUs always has its DPU-facing primary set at
101+
// promotion, so a missing boot interface is a real fault.
102+
assert!(matches!(
103+
classify_boot_interface(None, true),
104+
BootInterfaceResolution::Missing
105+
));
106+
}
107+
108+
#[test]
109+
fn classify_uses_the_resolved_interface_when_present() {
110+
let target = BootInterfaceTarget::MacOnly(MacAddress::new([0, 0, 0, 0, 0, 1]));
111+
assert!(matches!(
112+
classify_boot_interface(Some(target), false),
113+
BootInterfaceResolution::Ready(_)
114+
));
115+
}
116+
}

crates/machine-controller/src/handler.rs

Lines changed: 63 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ use tokio::sync::Semaphore;
9898
use tracing::instrument;
9999
use version_compare::Cmp;
100100

101-
use crate::boot_interface::boot_interface_target;
101+
use crate::boot_interface::{BootInterfaceResolution, resolve_boot_interface};
102102
use crate::config::{
103103
FirmwareGlobal, MachineStateHandlerSiteConfig, MachineValidationConfig, TimePeriod,
104104
};
@@ -3339,6 +3339,7 @@ async fn handle_dpu_reprovision(
33393339
SetBootOrderOutcome::WaitingForReboot(reason) => {
33403340
Ok(StateHandlerOutcome::wait(reason))
33413341
}
3342+
SetBootOrderOutcome::Wait(reason) => Ok(StateHandlerOutcome::wait(reason)),
33423343
}
33433344
}
33443345
ReprovisionState::LockHostAfterBootRepair => {
@@ -3527,13 +3528,24 @@ async fn check_host_boot_config(
35273528
));
35283529
}
35293530

3530-
// Resolve the interface whose boot option should be first in host UEFI.
3531-
let boot_interface = boot_interface_target(mh_snapshot).ok_or_else(|| {
3532-
StateHandlerError::GenericError(eyre::eyre!(
3533-
"Missing boot interface for host: {}",
3534-
mh_snapshot.host_snapshot.id
3535-
))
3536-
})?;
3531+
// Resolve the interface whose boot option should be first in host UEFI. A
3532+
// zero-DPU host whose boot NIC has not taken its first HostInband lease yet
3533+
// has no boot interface to resolve -- wait for it rather than failing.
3534+
let boot_interface = match resolve_boot_interface(mh_snapshot) {
3535+
BootInterfaceResolution::Ready(target) => target,
3536+
BootInterfaceResolution::AwaitingNic => {
3537+
return Ok(HostBootConfigDecision::Wait(format!(
3538+
"Waiting for zero-DPU host {} to discover its boot NIC before configuring boot.",
3539+
mh_snapshot.host_snapshot.id
3540+
)));
3541+
}
3542+
BootInterfaceResolution::Missing => {
3543+
return Err(StateHandlerError::GenericError(eyre::eyre!(
3544+
"Missing boot interface for host: {}",
3545+
mh_snapshot.host_snapshot.id
3546+
)));
3547+
}
3548+
};
35373549

35383550
let vendor = mh_snapshot.host_snapshot.bmc_vendor();
35393551

@@ -4835,6 +4847,10 @@ enum SetBootOrderOutcome {
48354847
Continue(SetBootOrderInfo),
48364848
Done,
48374849
WaitingForReboot(String),
4850+
/// No boot interface to act on yet -- e.g. a zero-DPU host whose boot NIC
4851+
/// has not been discovered. Distinct from `WaitingForReboot`: nothing was
4852+
/// rebooted, the caller just waits and retries.
4853+
Wait(String),
48384854
}
48394855

48404856
/// Decision from checking whether host boot repair is still required.
@@ -5237,6 +5253,9 @@ async fn handle_host_boot_order_setup(
52375253
SetBootOrderOutcome::WaitingForReboot(reason) => {
52385254
return Ok(StateHandlerOutcome::wait(reason));
52395255
}
5256+
SetBootOrderOutcome::Wait(reason) => {
5257+
return Ok(StateHandlerOutcome::wait(reason));
5258+
}
52405259
}
52415260
}
52425261
None => ManagedHostState::HostInit {
@@ -10876,6 +10895,9 @@ async fn handle_instance_host_platform_config(
1087610895
SetBootOrderOutcome::WaitingForReboot(reason) => {
1087710896
return Ok(StateHandlerOutcome::wait(reason));
1087810897
}
10898+
SetBootOrderOutcome::Wait(reason) => {
10899+
return Ok(StateHandlerOutcome::wait(reason));
10900+
}
1087910901
}
1088010902
}
1088110903
HostPlatformConfigurationState::LockHost => {
@@ -10922,13 +10944,24 @@ async fn set_host_boot_order(
1092210944
// for verification.
1092310945
//
1092410946
// Resolve the boot NIC MAC the same way `CheckHostConfig` does,
10925-
// supporting hosts with DPU(s) and zero DPUs alike.
10926-
let boot_interface = boot_interface_target(mh_snapshot).ok_or_else(|| {
10927-
StateHandlerError::GenericError(eyre::eyre!(
10928-
"Missing boot interface for host: {}",
10929-
mh_snapshot.host_snapshot.id
10930-
))
10931-
})?;
10947+
// supporting hosts with DPU(s) and zero DPUs alike. A zero-DPU host
10948+
// whose boot NIC has not taken its first HostInband lease yet has no
10949+
// boot interface to resolve -- wait for it rather than failing.
10950+
let boot_interface = match resolve_boot_interface(mh_snapshot) {
10951+
BootInterfaceResolution::Ready(target) => target,
10952+
BootInterfaceResolution::AwaitingNic => {
10953+
return Ok(SetBootOrderOutcome::Wait(format!(
10954+
"Waiting for zero-DPU host {} to discover its boot NIC before setting boot order.",
10955+
mh_snapshot.host_snapshot.id
10956+
)));
10957+
}
10958+
BootInterfaceResolution::Missing => {
10959+
return Err(StateHandlerError::GenericError(eyre::eyre!(
10960+
"Missing boot interface for host: {}",
10961+
mh_snapshot.host_snapshot.id
10962+
)));
10963+
}
10964+
};
1093210965

1093310966
let jid = match set_boot_order_dpu_first_and_handle_no_dpu_error(
1093410967
redfish_client,
@@ -11207,12 +11240,21 @@ async fn set_host_boot_order(
1120711240

1120811241
let retry_count = set_boot_order_info.retry_count;
1120911242

11210-
let boot_interface = boot_interface_target(mh_snapshot).ok_or_else(|| {
11211-
StateHandlerError::GenericError(eyre::eyre!(
11212-
"Missing boot interface for host: {}",
11213-
mh_snapshot.host_snapshot.id
11214-
))
11215-
})?;
11243+
let boot_interface = match resolve_boot_interface(mh_snapshot) {
11244+
BootInterfaceResolution::Ready(target) => target,
11245+
BootInterfaceResolution::AwaitingNic => {
11246+
return Ok(SetBootOrderOutcome::Wait(format!(
11247+
"Waiting for zero-DPU host {} to discover its boot NIC before verifying boot order.",
11248+
mh_snapshot.host_snapshot.id
11249+
)));
11250+
}
11251+
BootInterfaceResolution::Missing => {
11252+
return Err(StateHandlerError::GenericError(eyre::eyre!(
11253+
"Missing boot interface for host: {}",
11254+
mh_snapshot.host_snapshot.id
11255+
)));
11256+
}
11257+
};
1121611258

1121711259
let boot_order_configured = boot_interface
1121811260
.run(|bi| redfish_client.is_boot_order_setup(bi))

0 commit comments

Comments
 (0)