diff --git a/crates/machine-controller/src/handler.rs b/crates/machine-controller/src/handler.rs index cfbd20bc45..d4ccb28b10 100644 --- a/crates/machine-controller/src/handler.rs +++ b/crates/machine-controller/src/handler.rs @@ -10386,7 +10386,9 @@ async fn set_boot_order_dpu_first_and_handle_no_dpu_error( /// declared zero-DPU (`expected_dpu_count == 0`). Other error variants and /// successful results pass through untouched. The `dpu_mode` gate in /// site-explorer is what guarantees `expected_dpu_count == 0` actually -/// means the host was configured as `NoDpu`. +/// means the host carries no managed DPU -- either `NoDpu` (no DPU hardware) +/// or `NicMode` (a DPU intentionally running as a plain NIC). Neither exposes a +/// managed DPU to answer Redfish, so a `NoDpu` error is expected, not a fault. fn handle_no_dpu_error( result: Result, RedfishError>, expected_dpu_count: usize, diff --git a/crates/site-explorer/src/lib.rs b/crates/site-explorer/src/lib.rs index e4aab801f8..eed58acc23 100644 --- a/crates/site-explorer/src/lib.rs +++ b/crates/site-explorer/src/lib.rs @@ -95,7 +95,7 @@ use carbide_redfish::libredfish::RedfishClientPool; use carbide_redfish::nv_redfish::NvRedfishClientPool; use errors::{SiteExplorerError, SiteExplorerResult}; -use self::metrics::{PairingBlockerReason, exploration_error_to_metric_label}; +use self::metrics::{DpuMigrationSignal, PairingBlockerReason, exploration_error_to_metric_label}; use crate::config::SiteExplorerExploreMode; use crate::explored_endpoint_index::ExploredEndpointIndex; @@ -1121,6 +1121,7 @@ impl SiteExplorer { host_dpu_mode, &ep, &mut dpu_exploration, + metrics, ) .await; } @@ -1138,6 +1139,7 @@ impl SiteExplorer { host_dpu_mode, &ep, &mut dpu_exploration, + metrics, ) .await; } @@ -1179,6 +1181,7 @@ impl SiteExplorer { &dpu_ep, dpu_ep.report.model().unwrap_or_default(), host_dpu_mode, + metrics, ) .await, ); @@ -1257,6 +1260,9 @@ impl SiteExplorer { "power cycling host {} to apply nic mode change for its incorrectly 
configured DPUs; time since last powercycle: {time_since_redfish_powercycle}", ep.address, ); + metrics.increment_dpu_migration_signal( + DpuMigrationSignal::ResetRequested, + ); if let Err(err) = self.redfish_powercycle(ep.address).await { tracing::warn!( @@ -1408,7 +1414,12 @@ impl SiteExplorer { // earlier on after detecting the host_dpu_mode as such, so // this shouldn't fire. let dpus = match host_dpu_mode { - DpuMode::NicMode => Vec::new(), + DpuMode::NicMode => { + metrics.increment_dpu_migration_signal( + DpuMigrationSignal::RegisteredZeroDpuForNicMode, + ); + Vec::new() + } DpuMode::DpuMode => dpus_explored_for_host, // Now that we continue/return early for NoDpu hosts, // we shouldn't actually get here. Probably could be @@ -1477,6 +1488,7 @@ impl SiteExplorer { /// `set_nic_mode` to auto-correct a mismatch -- happens here; the actual /// classification of its result lives in [`classify_matched_dpu`], which is /// unit-tested directly. Both the PCIe loop and the chassis fallback call this. + #[allow(clippy::too_many_arguments)] async fn record_host_dpu_device( &self, part_number: Option<&str>, @@ -1485,6 +1497,7 @@ impl SiteExplorer { host_dpu_mode: DpuMode, host_ep: &ExploredEndpoint, exploration: &mut DpuExplorationState, + metrics: &mut SiteExplorationMetrics, ) { // Count every DPU the host reports, independent of whether we've // discovered its BMC yet. @@ -1506,8 +1519,13 @@ impl SiteExplorer { // I/O, and may issue a `set_nic_mode` (in which case it returns `Ok(false)`). let mode_check = match part_number { Some(model) => Some( - self.check_and_configure_dpu_mode(dpu_ep, model.to_string(), host_dpu_mode) - .await, + self.check_and_configure_dpu_mode( + dpu_ep, + model.to_string(), + host_dpu_mode, + metrics, + ) + .await, ), None => None, }; @@ -2865,6 +2883,7 @@ impl SiteExplorer { dpu_ep: &ExploredEndpoint, dpu_model: String, host_dpu_mode: DpuMode, + metrics: &mut SiteExplorationMetrics, ) -> SiteExplorerResult { // Compute the target NIC mode. 
`None` means "no opinion -- don't // attempt to reconfigure" (e.g., BF2 where the heuristic doesn't @@ -2900,7 +2919,9 @@ impl SiteExplorer { ?host_dpu_mode, "site explorer found a DPU with a mode that does not match the target; will try to reconfigure" ); + metrics.increment_dpu_migration_signal(DpuMigrationSignal::ModeMismatchFound); self.set_nic_mode(dpu_ep, target_nic_mode).await?; + metrics.increment_dpu_migration_signal(DpuMigrationSignal::SetNicModeIssued); Ok(false) } None => { diff --git a/crates/site-explorer/src/metrics.rs b/crates/site-explorer/src/metrics.rs index c7ea895d99..1061893e88 100644 --- a/crates/site-explorer/src/metrics.rs +++ b/crates/site-explorer/src/metrics.rs @@ -64,6 +64,34 @@ impl Display for PairingBlockerReason { } } +/// Signals emitted while migrating a DPU's NIC mode toward its declared target. +/// Each marks a step in the flip-and-reset flow that drives a DPU into the +/// mode its host's `dpu_mode` calls for. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub enum DpuMigrationSignal { + /// Found a DPU whose actual mode differs from the target; will reconfigure. + ModeMismatchFound, + /// Issued a `set_nic_mode` flip to a DPU. + SetNicModeIssued, + /// Requested a host power-cycle to apply a queued NIC-mode change. + ResetRequested, + /// Registered a host with zero managed DPUs because its declared + /// `dpu_mode` is `NicMode` (distinct from `NoDpu`). 
+ RegisteredZeroDpuForNicMode, +} + +impl Display for DpuMigrationSignal { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + let s = match self { + Self::ModeMismatchFound => "mode_mismatch_found", + Self::SetNicModeIssued => "set_nic_mode_issued", + Self::ResetRequested => "reset_requested", + Self::RegisteredZeroDpuForNicMode => "registered_zero_dpu_for_nic_mode", + }; + write!(f, "{s}") + } +} + /// Metrics that are gathered in one site exploration run #[derive(Clone, Debug)] pub struct SiteExplorationMetrics { @@ -127,6 +155,11 @@ pub struct SiteExplorationMetrics { /// These are issues that prevent a host from being paired with its dpu(s) /// and require manual intervention. pub host_dpu_pairing_blockers: HashMap, + /// Total count of DPU NIC-mode migration signals by kind. These track the + /// flip-and-reset flow that drives a DPU into the mode its host's + /// `dpu_mode` declares (mismatch found, `set_nic_mode` issued, reset + /// requested, and zero-DPU registered for a `NicMode` host). + pub dpu_migration_signals: HashMap, } impl Default for SiteExplorationMetrics { @@ -161,6 +194,7 @@ impl SiteExplorationMetrics { endpoint_explorations_expected_power_shelves_missing_overall_count: 0, expected_machines_sku_count: HashMap::new(), host_dpu_pairing_blockers: HashMap::new(), + dpu_migration_signals: HashMap::new(), } } @@ -246,6 +280,14 @@ impl SiteExplorationMetrics { .entry(reason.to_string()) .or_default() += 1; } + + /// Increment the count of DPU NIC-mode migration signals by kind. 
+ pub fn increment_dpu_migration_signal(&mut self, signal: DpuMigrationSignal) { + *self + .dpu_migration_signals + .entry(signal.to_string()) + .or_default() += 1; + } } /// Instruments that are used by the Site Explorer @@ -590,6 +632,28 @@ impl SiteExplorerInstruments { .build(); } + { + let metrics = shared_metrics.clone(); + meter + .u64_observable_gauge("carbide_site_explorer_dpu_migration_signals_count") + .with_description( + "Count of DPU NIC-mode migration signals by kind -- mode-mismatch found, \ + set_nic_mode issued, reset requested, and zero-DPU registered for a NicMode \ + host.", + ) + .with_callback(move |observer| { + metrics.if_available(|metrics, attrs| { + for (signal, &count) in metrics.dpu_migration_signals.iter() { + observer.observe( + count as u64, + &[attrs, &[KeyValue::new("signal", signal.clone())]].concat(), + ); + } + }) + }) + .build(); + } + { let metrics = shared_metrics.clone(); meter