Skip to content

Commit 83ffad4

Browse files
committed
fix(site-explorer): enforce the declared DPU mode on fallback-serial matches
The per-host matching loop now runs the same NIC/DPU mode check on a DPU paired through `fallback_dpu_serial_numbers` as it does on one the host reports over PCIe: a BlueField in the wrong mode gets `set_nic_mode`, and the host is power-cycled to apply it, instead of the DPU being trusted as already configured. That reset now fires even when the host BMC never enumerated the DPU over PCIe -- the usual reason we end up on the fallback path -- so the queued flip can actually take effect.

Until now, the fallback path attached a matched DPU with no mode check. On a host that the operator declared as `nic_mode`, that DPU would then be dropped (a NIC-mode host has no managed DPUs), so the host registered as zero-DPU with the flip never issued -- the database read "NIC-mode host" while the BlueField stayed in DPU mode. Incomplete PCIe enumeration (a GB200 dropping a DPU from its inventory, say) is exactly what pushes a DPU-to-NIC migration onto this path, so this was the common case, not an edge case.

Adds a regression test -- a NIC-mode host whose DPU is paired only by fallback serial and is still reporting DPU mode -- that fails on the old code (the host registers as zero-DPU with no `set_nic_mode` issued) and passes now.

Signed-off-by: Chet Nichols III <chetn@nvidia.com>
1 parent 47e42bc commit 83ffad4

2 files changed

Lines changed: 186 additions & 30 deletions

File tree

crates/site-explorer/src/lib.rs

Lines changed: 60 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1149,7 +1149,7 @@ impl SiteExplorer {
11491149
let DpuExplorationState {
11501150
reported_total: host_reported_dpus_total,
11511151
running_as_nic_total: mut host_reported_dpus_nic_mode_total,
1152-
all_configured: all_dpus_configured_properly_in_host,
1152+
all_configured: mut all_dpus_configured_properly_in_host,
11531153
running_as_dpu: mut dpus_explored_for_host,
11541154
} = dpu_exploration;
11551155

@@ -1166,30 +1166,53 @@ impl SiteExplorer {
11661166
{
11671167
for dpu_sn in &expected_machine.data.fallback_dpu_serial_numbers {
11681168
if let Some(dpu_ep) = dpu_sn_to_endpoint.remove(dpu_sn.as_str()) {
1169-
// We do not want to attach bluefields that are in NIC mode as DPUs to the host
1170-
if is_dpu_in_nic_mode(&dpu_ep, &ep)
1171-
&& host_reported_dpus_total
1172-
.saturating_sub(host_reported_dpus_nic_mode_total)
1173-
> 0
1174-
{
1175-
host_reported_dpus_nic_mode_total += 1;
1176-
continue;
1177-
}
1169+
// Enforce the host's declared DPU mode on a fallback-serial
1170+
// match the same way the host-reported path does, rather than
1171+
// trusting it as already-configured. A DPU still in the wrong
1172+
// mode gets a `set_nic_mode` here and has to wait for the host
1173+
// reset to apply it; without this, a DPU-mode BlueField on a
1174+
// `NicMode` host would be attached and then dropped to zero-DPU
1175+
// (the `NicMode` arm further down), leaving the database reading
1176+
// "NIC-mode host" while the hardware stayed in DPU mode.
1177+
let mode_check = Some(
1178+
self.check_and_configure_dpu_mode(
1179+
&dpu_ep,
1180+
dpu_ep.report.model().unwrap_or_default(),
1181+
host_dpu_mode,
1182+
)
1183+
.await,
1184+
);
11781185

1179-
// we found at least one DPU from expected machines for this host
1180-
// assume that the expected machines is the source of truth. Clear the
1181-
// contents of dpus_explored_for_host to discard the previous results of
1182-
// iterating over the hosts pcie devices.
1183-
if !dpu_added {
1184-
dpus_explored_for_host.clear();
1186+
match classify_matched_dpu(&dpu_ep, &ep, mode_check) {
1187+
DiscoveredDpu::RunningAsDpu(dpu) => {
1188+
// The expected-machine fallback list is the source of
1189+
// truth here, so discard whatever the PCIe scan found
1190+
// on the first confirmed match.
1191+
if !dpu_added {
1192+
dpus_explored_for_host.clear();
1193+
}
1194+
dpu_added = true;
1195+
dpus_explored_for_host.push(dpu);
1196+
}
1197+
DiscoveredDpu::RunningAsNic => {
1198+
host_reported_dpus_nic_mode_total += 1;
1199+
}
1200+
DiscoveredDpu::NeedsReconfig => {
1201+
// `set_nic_mode` was just issued; the host needs a
1202+
// reset before this DPU re-reports in the new mode, so
1203+
// mark it not-yet-configured and let the reset path
1204+
// below run.
1205+
all_dpus_configured_properly_in_host = false;
1206+
}
1207+
DiscoveredDpu::ModeCheckFailed(err) => {
1208+
tracing::warn!(
1209+
dpu = %dpu_ep.address,
1210+
dpu_sn = %dpu_sn,
1211+
error = %err,
1212+
"failed to check fallback-matched DPU mode; skipping this device this pass",
1213+
);
1214+
}
11851215
}
1186-
1187-
dpu_added = true;
1188-
dpus_explored_for_host.push(ExploredDpu {
1189-
bmc_ip: dpu_ep.address,
1190-
host_pf_mac_address: get_host_pf_mac_address(&dpu_ep),
1191-
report: dpu_ep.report.into(),
1192-
});
11931216
}
11941217
}
11951218
}
@@ -1203,13 +1226,20 @@ impl SiteExplorer {
12031226
// confirmed to be running as plain NICs.
12041227
let expected_managed_dpus_total =
12051228
host_reported_dpus_total.saturating_sub(host_reported_dpus_nic_mode_total);
1206-
if expected_managed_dpus_total > 0 {
1207-
tracing::warn!(
1208-
address = %ep.address,
1209-
exploration_report = ?ep,
1210-
"cannot identify managed host because the site explorer has only discovered {} out of the {} attached DPUs (all_dpus_configured_properly_in_host={all_dpus_configured_properly_in_host}):\n{:#?}",
1211-
dpus_explored_for_host.len(), expected_managed_dpus_total, dpus_explored_for_host
1212-
);
1229+
// Enter the reset/wait path when DPUs are still expected to pair, or
1230+
// when a `set_nic_mode` was just issued -- a fallback-serial match can
1231+
// queue a flip even on a host whose BMC reports no DPU over PCIe
1232+
// (`expected_managed_dpus_total == 0`), which is the usual reason we are
1233+
// on the fallback path at all.
1234+
if expected_managed_dpus_total > 0 || !all_dpus_configured_properly_in_host {
1235+
if expected_managed_dpus_total > 0 {
1236+
tracing::warn!(
1237+
address = %ep.address,
1238+
exploration_report = ?ep,
1239+
"cannot identify managed host because the site explorer has only discovered {} out of the {} attached DPUs (all_dpus_configured_properly_in_host={all_dpus_configured_properly_in_host}):\n{:#?}",
1240+
dpus_explored_for_host.len(), expected_managed_dpus_total, dpus_explored_for_host
1241+
);
1242+
}
12131243

12141244
if !all_dpus_configured_properly_in_host {
12151245
// A queued `set_nic_mode` only takes effect after a host

crates/site-explorer/tests/site_explorer.rs

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2606,6 +2606,132 @@ async fn test_site_explorer_power_cycles_non_dell_host_to_apply_nic_mode(
26062606
Ok(())
26072607
}
26082608

2609+
/// Regression guard for the fallback-serial path (#2631): a DPU paired only
2610+
/// through `fallback_dpu_serial_numbers` must get the same NIC-mode enforcement
2611+
/// as a host-reported one. The host BMC here enumerates no DPU over PCIe -- the
2612+
/// usual reason the fallback exists (e.g. a GB200 that drops a DPU from its
2613+
/// inventory) -- so the only link is the operator-listed serial, and the DPU is
2614+
/// still reporting DPU mode against a `NicMode` host.
2615+
///
2616+
/// Before the fix the fallback path trusted the match as already-configured: it
2617+
/// attached the DPU without a mode check, then dropped it to zero-DPU, so the
2618+
/// host registered as a NIC-mode host while the BlueField stayed in DPU mode and
2619+
/// `set_nic_mode` was never issued. Now the flip is issued, the host is
2620+
/// power-cycled to apply it, and the host waits instead of settling this pass.
2621+
#[sqlx_test]
2622+
async fn test_site_explorer_enforces_nic_mode_on_fallback_serial_match(
2623+
pool: PgPool,
2624+
) -> Result<(), Box<dyn std::error::Error>> {
2625+
use model::expected_machine::{DpuMode, ExpectedMachine, ExpectedMachineData};
2626+
use model::site_explorer::NicMode;
2627+
2628+
let env = Env::new(pool).await;
2629+
2630+
const FALLBACK_DPU_SERIAL: &str = "fallback-only-dpu-serial";
2631+
// DPU reports DPU mode; the host report carries no DPU device, so the
2632+
// serial is the only thing that can pair them.
2633+
let dpu_config = DpuConfig {
2634+
nic_mode: Some(NicMode::Dpu),
2635+
serial: FALLBACK_DPU_SERIAL.to_string(),
2636+
..DpuConfig::default()
2637+
};
2638+
let mock_host = ManagedHostConfig::default();
2639+
let host_bmc_mac = mock_host.bmc_mac_address;
2640+
2641+
// Operator declares the host NIC mode and lists the DPU's serial as a
2642+
// pairing fallback.
2643+
let mut txn = env.pool.begin().await?;
2644+
db::expected_machine::create(
2645+
&mut txn,
2646+
ExpectedMachine {
2647+
id: None,
2648+
bmc_mac_address: host_bmc_mac,
2649+
data: ExpectedMachineData {
2650+
bmc_username: "ADMIN".to_string(),
2651+
bmc_password: "PASS".to_string(),
2652+
serial_number: "EM-2631-FALLBACK-NIC".to_string(),
2653+
metadata: model::metadata::Metadata::new_with_default_name(),
2654+
dpu_mode: DpuMode::NicMode,
2655+
fallback_dpu_serial_numbers: vec![FALLBACK_DPU_SERIAL.to_string()],
2656+
..Default::default()
2657+
},
2658+
},
2659+
)
2660+
.await?;
2661+
txn.commit().await?;
2662+
2663+
let mut host_bmc = env.new_machine(&host_bmc_mac.to_string(), "SomeVendor");
2664+
let mut dpu_bmc = env.new_machine(&dpu_config.bmc_mac_address.to_string(), "NVIDIA/BF/BMC");
2665+
host_bmc.discover_dhcp(env.api()).await?;
2666+
dpu_bmc.discover_dhcp(env.api()).await?;
2667+
2668+
let explorer_config = SiteExplorerConfig {
2669+
enabled: Arc::new(true.into()),
2670+
retained_boot_interface_window: None,
2671+
explorations_per_run: 10,
2672+
concurrent_explorations: 1,
2673+
run_interval: std::time::Duration::from_secs(1),
2674+
create_machines: Arc::new(true.into()),
2675+
..Default::default()
2676+
};
2677+
let explorer = env.test_site_explorer(explorer_config);
2678+
explorer.insert_endpoint_results(vec![
2679+
(dpu_bmc.ip.parse().unwrap(), Ok(dpu_config.clone().into())),
2680+
(host_bmc.ip.parse().unwrap(), Ok(mock_host.into())),
2681+
]);
2682+
2683+
// First iteration: initial endpoint exploration.
2684+
explorer.run_single_iteration().await.unwrap();
2685+
let mut txn = env.pool.begin().await?;
2686+
for ip in [host_bmc.ip.parse()?, dpu_bmc.ip.parse()?] {
2687+
db::explored_endpoints::set_preingestion_complete(ip, &mut txn).await?;
2688+
}
2689+
txn.commit().await?;
2690+
// Second iteration: per-host matching falls through to the fallback-serial
2691+
// path, which must enforce the declared NIC mode.
2692+
explorer.run_single_iteration().await.unwrap();
2693+
2694+
{
2695+
let calls = explorer
2696+
.endpoint_explorer()
2697+
.set_nic_mode_calls
2698+
.lock()
2699+
.unwrap();
2700+
assert!(
2701+
calls.iter().any(|(_, mode)| *mode == NicMode::Nic),
2702+
"fallback-matched DPU on a NicMode host should get set_nic_mode(Nic); calls so far: {calls:?}"
2703+
);
2704+
}
2705+
2706+
// The host must not settle as a zero-DPU managed host until the flip has
2707+
// applied -- otherwise the database reads "NIC-mode host" while the
2708+
// BlueField is still physically in DPU mode.
2709+
let explored_managed_hosts = db::explored_managed_host::find_all(&env.pool).await?;
2710+
assert!(
2711+
explored_managed_hosts.is_empty(),
2712+
"host should wait for the queued NIC-mode flip to apply, not register as zero-DPU this pass"
2713+
);
2714+
2715+
// The reset path fires even though the host BMC never enumerated the DPU
2716+
// over PCIe (`expected_managed_dpus_total == 0`), so the queued flip can
2717+
// actually apply.
2718+
{
2719+
let power_calls = explorer
2720+
.endpoint_explorer()
2721+
.redfish_power_control_calls
2722+
.lock()
2723+
.unwrap();
2724+
assert!(
2725+
power_calls
2726+
.iter()
2727+
.any(|(_, action)| matches!(action, libredfish::SystemPowerControl::PowerCycle)),
2728+
"host should be power-cycled to apply the queued NIC-mode flip; power calls so far: {power_calls:?}"
2729+
);
2730+
}
2731+
2732+
Ok(())
2733+
}
2734+
26092735
/// A managed host's DPU-facing `machine_interface` is created (via DHCP) with
26102736
/// just a MAC and no `boot_interface_id`. The exploration that ingests the host
26112737
/// then backfills the vendor-specific Redfish interface id onto that row, matched

0 commit comments

Comments
 (0)