Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion crates/machine-controller/src/handler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10386,7 +10386,9 @@ async fn set_boot_order_dpu_first_and_handle_no_dpu_error(
/// declared zero-DPU (`expected_dpu_count == 0`). Other error variants and
/// successful results pass through untouched. The `dpu_mode` gate in
/// site-explorer is what guarantees `expected_dpu_count == 0` actually
/// means the host was configured as `NoDpu`.
/// means the host carries no managed DPU -- either `NoDpu` (no DPU hardware)
/// or `NicMode` (a DPU intentionally running as a plain NIC). Neither has a
/// DPU to answer Redfish, so a `NoDpu` error is expected, not a fault.
fn handle_no_dpu_error(
result: Result<Option<String>, RedfishError>,
expected_dpu_count: usize,
Expand Down
29 changes: 25 additions & 4 deletions crates/site-explorer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ use carbide_redfish::libredfish::RedfishClientPool;
use carbide_redfish::nv_redfish::NvRedfishClientPool;
use errors::{SiteExplorerError, SiteExplorerResult};

use self::metrics::{PairingBlockerReason, exploration_error_to_metric_label};
use self::metrics::{DpuMigrationSignal, PairingBlockerReason, exploration_error_to_metric_label};
use crate::config::SiteExplorerExploreMode;
use crate::explored_endpoint_index::ExploredEndpointIndex;

Expand Down Expand Up @@ -1121,6 +1121,7 @@ impl SiteExplorer {
host_dpu_mode,
&ep,
&mut dpu_exploration,
metrics,
)
.await;
}
Expand All @@ -1138,6 +1139,7 @@ impl SiteExplorer {
host_dpu_mode,
&ep,
&mut dpu_exploration,
metrics,
)
.await;
}
Expand Down Expand Up @@ -1179,6 +1181,7 @@ impl SiteExplorer {
&dpu_ep,
dpu_ep.report.model().unwrap_or_default(),
host_dpu_mode,
metrics,
)
.await,
);
Expand Down Expand Up @@ -1257,6 +1260,9 @@ impl SiteExplorer {
"power cycling host {} to apply nic mode change for its incorrectly configured DPUs; time since last powercycle: {time_since_redfish_powercycle}",
ep.address,
);
metrics.increment_dpu_migration_signal(
DpuMigrationSignal::ResetRequested,
);

if let Err(err) = self.redfish_powercycle(ep.address).await {
tracing::warn!(
Expand Down Expand Up @@ -1408,7 +1414,12 @@ impl SiteExplorer {
// earlier on after detecting the host_dpu_mode as such, so
// this shouldn't fire.
let dpus = match host_dpu_mode {
DpuMode::NicMode => Vec::new(),
DpuMode::NicMode => {
metrics.increment_dpu_migration_signal(
DpuMigrationSignal::RegisteredZeroDpuForNicMode,
);
Vec::new()
}
DpuMode::DpuMode => dpus_explored_for_host,
// Now that we continue/return early for NoDpu hosts,
// we shouldn't actually get here. Probably could be
Expand Down Expand Up @@ -1477,6 +1488,7 @@ impl SiteExplorer {
/// `set_nic_mode` to auto-correct a mismatch -- happens here; the actual
/// classification of its result lives in [`classify_matched_dpu`], which is
/// unit-tested directly. Both the PCIe loop and the chassis fallback call this.
#[allow(clippy::too_many_arguments)]
async fn record_host_dpu_device(
&self,
part_number: Option<&str>,
Expand All @@ -1485,6 +1497,7 @@ impl SiteExplorer {
host_dpu_mode: DpuMode,
host_ep: &ExploredEndpoint,
exploration: &mut DpuExplorationState,
metrics: &mut SiteExplorationMetrics,
) {
// Count every DPU the host reports, independent of whether we've
// discovered its BMC yet.
Expand All @@ -1506,8 +1519,13 @@ impl SiteExplorer {
// I/O, and may issue a `set_nic_mode` (in which case it returns `Ok(false)`).
let mode_check = match part_number {
Some(model) => Some(
self.check_and_configure_dpu_mode(dpu_ep, model.to_string(), host_dpu_mode)
.await,
self.check_and_configure_dpu_mode(
dpu_ep,
model.to_string(),
host_dpu_mode,
metrics,
)
.await,
),
None => None,
};
Expand Down Expand Up @@ -2865,6 +2883,7 @@ impl SiteExplorer {
dpu_ep: &ExploredEndpoint,
dpu_model: String,
host_dpu_mode: DpuMode,
metrics: &mut SiteExplorationMetrics,
) -> SiteExplorerResult<bool> {
// Compute the target NIC mode. `None` means "no opinion -- don't
// attempt to reconfigure" (e.g., BF2 where the heuristic doesn't
Expand Down Expand Up @@ -2900,7 +2919,9 @@ impl SiteExplorer {
?host_dpu_mode,
"site explorer found a DPU with a mode that does not match the target; will try to reconfigure"
);
metrics.increment_dpu_migration_signal(DpuMigrationSignal::ModeMismatchFound);
self.set_nic_mode(dpu_ep, target_nic_mode).await?;
metrics.increment_dpu_migration_signal(DpuMigrationSignal::SetNicModeIssued);
Ok(false)
}
None => {
Expand Down
64 changes: 64 additions & 0 deletions crates/site-explorer/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,34 @@ impl Display for PairingBlockerReason {
}
}

/// Checkpoints observed while site-explorer drives a DPU's NIC mode toward
/// the mode its host's `dpu_mode` declares.
///
/// Every variant corresponds to one step of the flip-and-reset flow. The
/// [`Display`] impl yields the snake_case label under which the step is counted.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DpuMigrationSignal {
    /// A DPU's actual mode differs from the target, so a reconfigure follows.
    ModeMismatchFound,
    /// A `set_nic_mode` flip was sent to a DPU.
    SetNicModeIssued,
    /// A host power-cycle was requested so a queued NIC-mode change applies.
    ResetRequested,
    /// A host was registered with zero managed DPUs because its declared
    /// `dpu_mode` is NicMode (as opposed to NoDpu).
    RegisteredZeroDpuForNicMode,
}

impl Display for DpuMigrationSignal {
    /// Writes the signal's fixed snake_case label straight to the formatter.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        f.write_str(match *self {
            Self::ModeMismatchFound => "mode_mismatch_found",
            Self::SetNicModeIssued => "set_nic_mode_issued",
            Self::ResetRequested => "reset_requested",
            Self::RegisteredZeroDpuForNicMode => "registered_zero_dpu_for_nic_mode",
        })
    }
}

/// Metrics that are gathered in one site exploration run
#[derive(Clone, Debug)]
pub struct SiteExplorationMetrics {
Expand Down Expand Up @@ -127,6 +155,11 @@ pub struct SiteExplorationMetrics {
/// These are issues that prevent a host from being paired with its dpu(s)
/// and require manual intervention.
pub host_dpu_pairing_blockers: HashMap<String, usize>,
/// Total count of DPU NIC-mode migration signals by kind. These track the
/// flip-and-reset flow that drives a DPU into the mode its host's
/// `dpu_mode` declares (mismatch found, `set_nic_mode` issued, reset
/// requested, and zero-DPU registered for a NicMode host).
pub dpu_migration_signals: HashMap<String, usize>,
}

impl Default for SiteExplorationMetrics {
Expand Down Expand Up @@ -161,6 +194,7 @@ impl SiteExplorationMetrics {
endpoint_explorations_expected_power_shelves_missing_overall_count: 0,
expected_machines_sku_count: HashMap::new(),
host_dpu_pairing_blockers: HashMap::new(),
dpu_migration_signals: HashMap::new(),
}
}

Expand Down Expand Up @@ -246,6 +280,14 @@ impl SiteExplorationMetrics {
.entry(reason.to_string())
.or_default() += 1;
}

/// Record one occurrence of `signal` in the per-kind migration counters.
///
/// The counter is keyed by the signal's `Display` label and starts at zero
/// the first time a given kind is seen.
pub fn increment_dpu_migration_signal(&mut self, signal: DpuMigrationSignal) {
    let label = signal.to_string();
    let tally = self.dpu_migration_signals.entry(label).or_insert(0);
    *tally += 1;
}
}

/// Instruments that are used by the Site Explorer
Expand Down Expand Up @@ -590,6 +632,28 @@ impl SiteExplorerInstruments {
.build();
}

// Observable gauge reporting the DPU NIC-mode migration signal counts, one
// data point per signal kind.
{
// Clone of the shared metrics handle; it is moved into the callback below.
let metrics = shared_metrics.clone();
meter
.u64_observable_gauge("carbide_site_explorer_dpu_migration_signals_count")
.with_description(
"Count of DPU NIC-mode migration signals by kind -- mode-mismatch found, \
set_nic_mode issued, reset requested, and zero-DPU registered for a NicMode \
host.",
)
.with_callback(move |observer| {
// NOTE(review): presumably `if_available` only runs the closure once a
// metrics snapshot has been published -- confirm against its definition.
metrics.if_available(|metrics, attrs| {
// Each entry of `dpu_migration_signals` becomes one observation: the map
// key is attached as the `signal` attribute, appended to the `attrs`
// supplied by `if_available`.
for (signal, &count) in metrics.dpu_migration_signals.iter() {
observer.observe(
count as u64,
&[attrs, &[KeyValue::new("signal", signal.clone())]].concat(),
);
}
})
})
.build();
}

{
let metrics = shared_metrics.clone();
meter
Expand Down
Loading