Skip to content

Commit 5c25a50

Browse files
committed
feat: honor a declared primary host NIC when selecting the boot device
ExpectedHostNic.primary is meant to say "this NIC is the host's boot interface," but ingestion ignored it: a zero-DPU or NIC-mode host's predicted interfaces always promoted as non-primary, so the declared NIC never stuck and the host booted via the lowest-MAC fallback until an operator ran set-primary-interface. The declared primary is now authoritative across the writers that own a NIC -- recorded on the prediction so it survives promotion, and honored by the DHCP path. This also settles the pre-ingestion window for multi-NIC hosts: several NICs that lease before ingestion each default to primary, and adopting the second one tripped the one_primary_interface_per_machine index and failed the host's ingestion. Ingestion now reconciles the adopted rows so exactly one interface is primary. - Record the declared primary on predicted_machine_interfaces and resolve it through one ExpectedMachineData::declared_primary_mac() the writers share; promotion and the DHCP find-or-create path both honor it. - Reconcile the adopted interfaces at zero-DPU ingestion to exactly one primary, ending the OnePrimaryInterface adoption failure. - Nothing declared -> today's automation stands: DPU takeover during ingestion, else the pick_boot_interface lowest-MAC fallback. Tests cover the declared primary surviving DHCP arrival order, promotion landing it on the real row, multi-NIC adoption no longer colliding, and a resolver unit test. Part of #2657 (epic #2660). The genuine-DPU-mode case -- a declared integrated NIC while the DPU stays managed -- is tracked separately in Signed-off-by: Chet Nichols III <chetn@nvidia.com> #2668.
1 parent 939a865 commit 5c25a50

10 files changed

Lines changed: 493 additions & 63 deletions

File tree

crates/api-core/src/dhcp/discover.rs

Lines changed: 11 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -266,20 +266,18 @@ pub async fn discover_dhcp(
266266
.await
267267
.map_err(CarbideError::from)?
268268
{
269-
// Walk the host_nics list to see if there's a matching NIC (because it
270-
// may have a static reservation need or a primary-interface need)
271-
let mut declared_primary_mac: Option<MacAddress> = None;
272-
for nic in &m.data.host_nics {
273-
if nic.primary == Some(true) {
274-
declared_primary_mac = Some(nic.mac_address);
275-
}
276-
if nic.mac_address == parsed_mac {
277-
host_nic = Some(nic.clone());
278-
}
279-
}
280-
if let Some(pmac) = declared_primary_mac {
281-
is_primary_nic = Some(pmac == parsed_mac);
269+
// The host's declared primary NIC (if any) decides whether this
270+
// MAC is its boot interface; the matched NIC also carries any
271+
// static reservation need handled below.
272+
if let Some(declared_primary_mac) = m.data.declared_primary_mac() {
273+
is_primary_nic = Some(declared_primary_mac == parsed_mac);
282274
}
275+
host_nic = m
276+
.data
277+
.host_nics
278+
.iter()
279+
.find(|nic| nic.mac_address == parsed_mac)
280+
.cloned();
283281
if let Some(ref nic) = host_nic
284282
&& let Some(fixed_ip) = nic.fixed_ip
285283
{

crates/api-core/src/handlers/bmc_endpoint_explorer.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1140,6 +1140,7 @@ mod tests {
11401140
mac_address: mac.parse().unwrap(),
11411141
expected_network_segment_type: NetworkSegmentType::HostInband,
11421142
boot_interface_id: boot_interface_id.map(String::from),
1143+
primary_interface: false,
11431144
}
11441145
}
11451146

crates/api-core/src/tests/expected_machine.rs

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2622,6 +2622,82 @@ async fn test_add_rejects_multiple_primary_host_nics(
26222622
Ok(())
26232623
}
26242624

2625+
/// The declared primary survives whichever order its NICs DHCP in: leasing the
2626+
/// non-primary NIC first, then the declared primary, still lands the declared
2627+
/// primary as `primary_interface` and the other as non-primary.
2628+
#[crate::sqlx_test]
2629+
async fn test_declared_primary_survives_dhcp_arrival_order(
2630+
pool: sqlx::PgPool,
2631+
) -> Result<(), Box<dyn std::error::Error>> {
2632+
let env = {
2633+
let mut config = get_config();
2634+
config.rack_management_enabled = true;
2635+
create_test_env_with_overrides(pool, TestEnvOverrides::with_config(config)).await
2636+
};
2637+
let bmc_mac: MacAddress = "9A:9B:9C:9D:9F:10".parse().unwrap();
2638+
let primary_mac: MacAddress = "9A:9B:9C:9D:9F:11".parse().unwrap();
2639+
let other_mac: MacAddress = "9A:9B:9C:9D:9F:12".parse().unwrap();
2640+
2641+
env.api
2642+
.add_expected_machine(tonic::Request::new(rpc::forge::ExpectedMachine {
2643+
id: None,
2644+
bmc_mac_address: bmc_mac.to_string(),
2645+
bmc_username: "ADMIN".into(),
2646+
bmc_password: "PASS".into(),
2647+
chassis_serial_number: "EM-PRIMARY-003".into(),
2648+
host_nics: vec![
2649+
rpc::forge::ExpectedHostNic {
2650+
mac_address: primary_mac.to_string(),
2651+
nic_type: Some("onboard".into()),
2652+
fixed_ip: None,
2653+
fixed_mask: None,
2654+
fixed_gateway: None,
2655+
primary: Some(true),
2656+
},
2657+
rpc::forge::ExpectedHostNic {
2658+
mac_address: other_mac.to_string(),
2659+
nic_type: Some("onboard".into()),
2660+
fixed_ip: None,
2661+
fixed_mask: None,
2662+
fixed_gateway: None,
2663+
primary: None,
2664+
},
2665+
],
2666+
..Default::default()
2667+
}))
2668+
.await?;
2669+
2670+
// The non-primary NIC leases first, then the declared primary.
2671+
for mac in [other_mac, primary_mac] {
2672+
let mac_str = mac.to_string();
2673+
env.api
2674+
.discover_dhcp(
2675+
common::rpc_builder::DhcpDiscovery::builder(
2676+
&mac_str,
2677+
common::api_fixtures::FIXTURE_DHCP_RELAY_ADDRESS,
2678+
)
2679+
.tonic_request(),
2680+
)
2681+
.await?;
2682+
}
2683+
2684+
let mut txn = env.pool.begin().await?;
2685+
let primary = db::machine_interface::find_by_mac_address(&mut *txn, primary_mac).await?;
2686+
let other = db::machine_interface::find_by_mac_address(&mut *txn, other_mac).await?;
2687+
assert_eq!(primary.len(), 1);
2688+
assert_eq!(other.len(), 1);
2689+
assert!(
2690+
primary[0].primary_interface,
2691+
"the declared primary NIC should be primary even when it leases last"
2692+
);
2693+
assert!(
2694+
!other[0].primary_interface,
2695+
"the non-declared NIC should not be primary"
2696+
);
2697+
2698+
Ok(())
2699+
}
2700+
26252701
/// Simple test to have some round-trip coverage for `ExpectedMachine.dpu_mode`
26262702
/// to make sure a `NicMode` setting makes it from the API to the DB and back
26272703
/// correctly. Verifies:
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
-- Carry the declared ExpectedHostNic.primary boot interface on the prediction so
2+
-- it survives promotion into machine_interfaces (the predicted row previously
3+
-- promoted as primary_interface = false unconditionally). Defaults false: a host
4+
-- that declares nothing keeps today's automation -- the boot interface is chosen
5+
-- by the pick_boot_interface fallback (lowest-MAC non-underlay) or DPU takeover.
6+
ALTER TABLE predicted_machine_interfaces
7+
ADD COLUMN primary_interface boolean NOT NULL DEFAULT false;
8+
9+
-- The machine_id foreign key has no backing index (Postgres does not create one
10+
-- for FK columns), so find_by_machine_id scans the table. Add it now that
11+
-- promotion reads predictions by machine more often.
12+
CREATE INDEX predicted_machine_interfaces_machine_id_idx
13+
ON predicted_machine_interfaces (machine_id);

crates/api-db/src/machine_interface.rs

Lines changed: 67 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -415,20 +415,12 @@ pub async fn find_one(
415415
// newly_created_interface indicates that we couldn't find a
416416
// MachineInterface, so created new one.
417417
//
418-
// `is_primary` integrates `ExpectedHostNic.primary` into machine
419-
// interface creation. If True, this NIC is declared the primary
420-
// boot NIC (which is/was the previous default behavior anyway,
421-
// meaning None does the same thing), and this is fine, because
422-
// at the end of the day, site-explorer will end up demoting it
423-
// as part of attaching a DPU.
424-
//
425-
// Now, if it's *False*, there's a different NIC on this host declared
426-
// as the boot NIC, so we actually overide the new interface and
427-
// explicitly mark it as non-primary here. We *could* bake this in
428-
// as part of validate_existing_mac_and_create, but since this is
429-
// the only call-site that cares about it, I'm making it specific
430-
// to here.
431-
// TODO(chet): ...but consider plumbing it through.
418+
// `is_primary` carries the declared `ExpectedHostNic.primary` for this MAC:
419+
// `Some(true)` -- this NIC is the host's declared boot interface, `Some(false)`
420+
// -- a different NIC is, `None` -- nothing was declared. On a newly created (and
421+
// thus still machine-less) row we make that declaration stick, promoting to or
422+
// demoting from the creation default as needed, so the boot interface is right
423+
// from the first lease. `None` keeps the creation default.
432424
//
433425
// If we're not making a new interface, then existing interfaces
434426
// are returned untouched.
@@ -452,10 +444,6 @@ pub async fn find_or_create_machine_interface(
452444
%mac_address,
453445
"Found no existing machine with mac address {mac_address} using networks with relays {relaystr}",
454446
);
455-
// validate_existing_mac_and_create hardcodes primary_interface: true
456-
// at creation. If the caller has explicitly declared a *different*
457-
// NIC as this machine's primary (i.e. is_primary == false), override the
458-
// true/default here.
459447
let mut interface = validate_existing_mac_and_create(
460448
&mut *txn,
461449
mac_address,
@@ -464,9 +452,22 @@ pub async fn find_or_create_machine_interface(
464452
retained_window,
465453
)
466454
.await?;
467-
if is_primary == Some(false) && interface.primary_interface {
468-
set_primary_interface(&interface.id, false, &mut *txn).await?;
469-
interface.primary_interface = false;
455+
// Make the declaration authoritative on this machine-less row.
456+
// `validate_existing_mac_and_create` defaults a freshly created row to
457+
// primary, so the demote covers "a different NIC is declared primary"
458+
// and the promote covers a row we *found* (rather than created) that is
459+
// the declared primary. Safe on a NULL machine_id row: the
460+
// one_primary_interface_per_machine index does not constrain it.
461+
match is_primary {
462+
Some(false) if interface.primary_interface => {
463+
set_primary_interface(&interface.id, false, &mut *txn).await?;
464+
interface.primary_interface = false;
465+
}
466+
Some(true) if !interface.primary_interface => {
467+
set_primary_interface(&interface.id, true, &mut *txn).await?;
468+
interface.primary_interface = true;
469+
}
470+
_ => {}
470471
}
471472
Ok(interface)
472473
}
@@ -1453,33 +1454,52 @@ pub async fn move_predicted_machine_interface_to_machine(
14531454
);
14541455
}
14551456

1456-
let (machine_interface_id, current_boot_interface_id, row_created_here) = match existing_row {
1457-
// This host has already DHCP'd once and created a machine_interface;
1458-
// we will migrate it below.
1459-
Some(machine_interface_snapshot) => (
1460-
machine_interface_snapshot.id,
1461-
machine_interface_snapshot.boot_interface_id,
1462-
false,
1463-
),
1464-
None => {
1465-
// This host has never DHCP'd before, create a new machine_interface for it
1466-
// (`create` recovers any retained boot interface id onto it).
1467-
let machine_interface = create(
1468-
txn,
1469-
&[network_segment],
1470-
&predicted_machine_interface.mac_address,
1457+
let (machine_interface_id, current_boot_interface_id, current_primary, row_created_here) =
1458+
match existing_row {
1459+
// This host has already DHCP'd once and created a machine_interface;
1460+
// we will migrate it below.
1461+
Some(machine_interface_snapshot) => (
1462+
machine_interface_snapshot.id,
1463+
machine_interface_snapshot.boot_interface_id,
1464+
machine_interface_snapshot.primary_interface,
14711465
false,
1472-
AddressSelectionStrategy::NextAvailableIp,
1473-
retained_window,
1474-
)
1475-
.await?;
1476-
(
1477-
machine_interface.id,
1478-
machine_interface.boot_interface_id,
1479-
true,
1480-
)
1481-
}
1482-
};
1466+
),
1467+
None => {
1468+
// This host has never DHCP'd before, create a new machine_interface for it
1469+
// (`create` recovers any retained boot interface id onto it). The promoted row
1470+
// is primary exactly when the prediction carries the declared
1471+
// `ExpectedHostNic.primary`.
1472+
let machine_interface = create(
1473+
txn,
1474+
&[network_segment],
1475+
&predicted_machine_interface.mac_address,
1476+
predicted_machine_interface.primary_interface,
1477+
AddressSelectionStrategy::NextAvailableIp,
1478+
retained_window,
1479+
)
1480+
.await?;
1481+
(
1482+
machine_interface.id,
1483+
machine_interface.boot_interface_id,
1484+
machine_interface.primary_interface,
1485+
true,
1486+
)
1487+
}
1488+
};
1489+
1490+
// Land the declared boot interface as we promote: the prediction holds the
1491+
// host's declared `ExpectedHostNic.primary`, so a promoted interface is primary
1492+
// exactly when it was declared. (An anonymous row found here keeps whatever
1493+
// flag DHCP set, so reconcile it to the declaration.) Done before association
1494+
// so a row reaches its machine already carrying the right flag.
1495+
if current_primary != predicted_machine_interface.primary_interface {
1496+
set_primary_interface(
1497+
&machine_interface_id,
1498+
predicted_machine_interface.primary_interface,
1499+
&mut *txn,
1500+
)
1501+
.await?;
1502+
}
14831503

14841504
// Take either the newly-created interface or the anonymous one we found, and associate it with
14851505
// this machine.

crates/api-db/src/predicted_machine_interface.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,12 +112,13 @@ pub async fn create(
112112
value: NewPredictedMachineInterface<'_>,
113113
txn: &mut PgConnection,
114114
) -> Result<PredictedMachineInterface, DatabaseError> {
115-
let query = "INSERT INTO predicted_machine_interfaces (machine_id, mac_address, expected_network_segment_type, boot_interface_id) VALUES ($1, $2, $3, $4) RETURNING *";
115+
let query = "INSERT INTO predicted_machine_interfaces (machine_id, mac_address, expected_network_segment_type, boot_interface_id, primary_interface) VALUES ($1, $2, $3, $4, $5) RETURNING *";
116116
sqlx::query_as(query)
117117
.bind(value.machine_id)
118118
.bind(value.mac_address)
119119
.bind(value.expected_network_segment_type)
120120
.bind(&value.boot_interface_id)
121+
.bind(value.primary_interface)
121122
.fetch_one(txn)
122123
.await
123124
.map_err(|e| DatabaseError::query(query, e))

crates/api-model/src/expected_machine.rs

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,22 @@ pub struct ExpectedMachineData {
170170
// unless you want to go update all the files in each production deployment that autoload
171171
// the expected machines on api startup
172172

173+
impl ExpectedMachineData {
174+
/// The MAC the operator declared as this host's boot interface via
175+
/// `ExpectedHostNic.primary`. This is the single source of declared boot
176+
/// intent the writers consult -- site-explorer ingestion, DHCP, and
177+
/// prediction promotion -- so they all agree on which NIC wins. The API
178+
/// enforces at most one `primary` host NIC, so the first match is the
179+
/// declaration. `None` leaves the boot interface to today's automation
180+
/// (DPU takeover during ingestion, else the `pick_boot_interface` fallback).
181+
pub fn declared_primary_mac(&self) -> Option<MacAddress> {
182+
self.host_nics
183+
.iter()
184+
.find(|nic| nic.primary == Some(true))
185+
.map(|nic| nic.mac_address)
186+
}
187+
}
188+
173189
/// Per-host lifecycle profile for settings that affect state-machine progression.
174190
/// `Option<bool>` fields support CLI patch semantics (`None` = not specified,
175191
/// keep existing DB value via `COALESCE`). Converts to the runtime `HostProfile`
@@ -427,4 +443,40 @@ mod tests {
427443
};
428444
assert!(!hlp.is_empty());
429445
}
446+
447+
/// `declared_primary_mac` returns the MAC of the one NIC flagged
448+
/// `primary: Some(true)`, and `None` when nothing is declared. `primary:
449+
/// Some(false)` is an explicit non-primary, not a declaration.
450+
#[test]
451+
fn declared_primary_mac_returns_the_flagged_nic() {
452+
let mac_a: MacAddress = "AA:BB:CC:00:00:01".parse().unwrap();
453+
let mac_b: MacAddress = "AA:BB:CC:00:00:02".parse().unwrap();
454+
455+
let nic = |mac: MacAddress, primary: Option<bool>| ExpectedHostNic {
456+
mac_address: mac,
457+
primary,
458+
..Default::default()
459+
};
460+
461+
// Nothing declared -- empty, or only explicit non-primaries.
462+
assert_eq!(ExpectedMachineData::default().declared_primary_mac(), None);
463+
assert_eq!(
464+
ExpectedMachineData {
465+
host_nics: vec![nic(mac_a, None), nic(mac_b, Some(false))],
466+
..Default::default()
467+
}
468+
.declared_primary_mac(),
469+
None
470+
);
471+
472+
// The declared NIC wins.
473+
assert_eq!(
474+
ExpectedMachineData {
475+
host_nics: vec![nic(mac_a, Some(false)), nic(mac_b, Some(true))],
476+
..Default::default()
477+
}
478+
.declared_primary_mac(),
479+
Some(mac_b)
480+
);
481+
}
430482
}

crates/api-model/src/predicted_machine_interface.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,12 @@ pub struct PredictedMachineInterface {
3232
/// MAC, handed to the `machine_interfaces` row at DHCP promotion so
3333
/// the host's boot target is a full pair from its first owned interface.
3434
pub boot_interface_id: Option<String>,
35+
/// The declared `ExpectedHostNic.primary` intent, carried so promotion into
36+
/// `machine_interfaces` lands the operator's chosen boot interface as
37+
/// `primary_interface`. `false` when nothing is declared -- promotion then
38+
/// leaves the row non-primary and the boot interface falls to the
39+
/// `pick_boot_interface` automation.
40+
pub primary_interface: bool,
3541
}
3642

3743
impl PredictedMachineInterface {
@@ -50,4 +56,6 @@ pub struct NewPredictedMachineInterface<'a> {
5056
pub mac_address: MacAddress,
5157
pub expected_network_segment_type: NetworkSegmentType,
5258
pub boot_interface_id: Option<String>,
59+
/// See [`PredictedMachineInterface::primary_interface`].
60+
pub primary_interface: bool,
5361
}

0 commit comments

Comments
 (0)