diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index c9b262cfcd5..8ea9b86b378 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -1194,6 +1194,24 @@ fn print_run_time(start_time: DateTime, elapsed: Duration, indent: usize) { ); } +fn print_start_end_time( + start: DateTime, + end: DateTime, + indent: usize, +) { + if let Ok(elapsed) = end.signed_duration_since(start).to_std() { + print_run_time(start, elapsed, indent); + } else { + println!( + "{:indent$}started at: {} (end time {} less than start time, \ + which seems weird?)", + "", + humantime::format_rfc3339_millis(start.into()), + humantime::format_rfc3339_millis(end.into()), + ); + } +} + /// Interprets the unstable, schemaless output from each particular background /// task and print a human-readable summary /// @@ -3465,7 +3483,7 @@ mod ereporter_status_fields { fn print_task_fm_analysis(details: &serde_json::Value) { use nexus_types::internal_api::background::fm_analysis::{ - AnalysisOutcome, Outcome, PreparationStatus, + AnalysisOutcome, AnalysisStatus, Outcome, PreparationStatus, }; let FmAnalysisStatus { parent_sitrep_id, inv_collection_id, outcome } = match serde_json::from_value::(details.clone()) { @@ -3484,8 +3502,8 @@ fn print_task_fm_analysis(details: &serde_json::Value) { println!(" {PARENT_SITREP_ID: { println!( " analysis was not performed, as the inventory has\n \ @@ -3500,9 +3518,18 @@ fn print_task_fm_analysis(details: &serde_json::Value) { ); return; } - Outcome::RanAnalysis { prep_status, outcome } => (prep_status, outcome), + Outcome::RanAnalysis { prep_status, analysis_status } => { + (prep_status, analysis_status) + } }; - match analysis_outcome { + + let AnalysisStatus { + start_time, + end_time, + report: analysis_report, + outcome, + } = analysis_status; + match outcome { AnalysisOutcome::Error(error) => { println!("{ERRICON} analysis failed: {error}"); } @@ -3512,9 +3539,20 @@ fn print_task_fm_analysis(details: &serde_json::Value) { parent_sitrep_id ); } - AnalysisOutcome::NotCommitted { sitrep_id, error } => { + AnalysisOutcome::NotCommitted { sitrep_id } => { + println!( + " analysis succeeded, but the sitrep was not committed" + ); println!( - " analysis succeeded, but the sitrep was not committed!" + " since the parent sitrep ({parent_sitrep_id:?}) was out \ + of date" + ); + println!(" sitrep ID: {sitrep_id:?}"); + } + AnalysisOutcome::CommitFailed { sitrep_id, error } => { + println!( + "{ERRICON} analysis succeeded, but committing the new sitrep \ + failed!" ); println!(" sitrep ID: {sitrep_id:?}"); println!(" error: {error}"); @@ -3526,17 +3564,17 @@ fn print_task_fm_analysis(details: &serde_json::Value) { } println!(); - let PreparationStatus { errors, report } = prep_status; - println!("{}", report.display_multiline(4)); + let PreparationStatus { errors, report: prep_report } = prep_status; + print!("{}", prep_report.display_multiline(4)); if !errors.is_empty() { println!("{ERRICON} errors preparing analysis inputs:"); for error in errors { println!(" > {error}") } } - - // TODO(eliza): eventually there will also be a detailed analysis report, - // print that here as well... + println!(); + print!("{}", analysis_report.display_multiline(4)); + print_start_end_time(start_time, end_time, 4); } fn print_task_fm_sitrep_loader(details: &serde_json::Value) { @@ -3643,16 +3681,7 @@ fn print_task_fm_rendezvous(details: &serde_json::Value) { println!("(i) note: this operation was not executed") } fm_rendezvous::OpResult::Executed { start, end } => { - if let Ok(elapsed) = start.signed_duration_since(end).to_std() { - print_run_time(start, elapsed, 6); - } else { - println!( - " started at: {} (end time {} less than start time, \ - which seems weird?)", - humantime::format_rfc3339_millis(start.into()), - humantime::format_rfc3339_millis(end.into()), - ); - } + print_start_end_time(start, end, 6); } } diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index 48618032902..ba66911f6e4 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -698,7 +698,7 @@ task: "fm_analysis" parent sitrep ID: None current inventory collection ID: Some(..................... (collection)) FAULT MANAGEMENT ANALYSIS SUMMARY - ===== ========== ======== ======= + ================================= /!\ analysis failed: FM analysis is not yet implemented fault management analysis inputs @@ -708,6 +708,11 @@ task: "fm_analysis" no new ereports since the parent sitrep no cases copied forward + fault management analysis report + -------------------------------- + sitrep ID: ..................... + no cases changed in this analysis step + started at (s ago) and ran for ms task: "fm_rendezvous" configured period: every m @@ -1374,7 +1379,7 @@ task: "fm_analysis" parent sitrep ID: None current inventory collection ID: Some(..................... (collection)) FAULT MANAGEMENT ANALYSIS SUMMARY - ===== ========== ======== ======= + ================================= /!\ analysis failed: FM analysis is not yet implemented fault management analysis inputs @@ -1384,6 +1389,11 @@ task: "fm_analysis" no new ereports since the parent sitrep no cases copied forward + fault management analysis report + -------------------------------- + sitrep ID: ..................... + no cases changed in this analysis step + started at (s ago) and ran for ms task: "fm_rendezvous" configured period: every m diff --git a/nexus/fm/src/diagnosis.rs b/nexus/fm/src/diagnosis.rs new file mode 100644 index 00000000000..a1bb65b31e2 --- /dev/null +++ b/nexus/fm/src/diagnosis.rs @@ -0,0 +1,13 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use crate::SitrepBuilder; +use crate::analysis_input::Input; + +pub fn analyze( + _input: &Input, + _builder: &mut SitrepBuilder<'_>, +) -> anyhow::Result<()> { + anyhow::bail!("FM analysis is not yet implemented") +} diff --git a/nexus/fm/src/lib.rs b/nexus/fm/src/lib.rs index f5ff7da5b2c..e207e1d4064 100644 --- a/nexus/fm/src/lib.rs +++ b/nexus/fm/src/lib.rs @@ -7,6 +7,7 @@ pub mod builder; pub use builder::{CaseBuilder, SitrepBuilder}; pub mod analysis_input; +pub mod diagnosis; pub use nexus_types::fm::*; diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 6059fe710d6..8f89e4856b5 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -107,7 +107,7 @@ use super::tasks::dns_propagation; use super::tasks::dns_servers; use super::tasks::ereport_ingester; use super::tasks::external_endpoints; -use super::tasks::fm_analysis::FmAnalysis; +use super::tasks::fm_analysis::{self, FmAnalysis}; use super::tasks::fm_rendezvous::FmRendezvous; use super::tasks::fm_sitrep_gc; use super::tasks::fm_sitrep_load; @@ -1142,7 +1142,11 @@ impl BackgroundTasksInitializer { datastore.clone(), sitrep_watcher.clone(), inventory_load_watcher.clone(), - task_fm_sitrep_loader.clone(), + fm_analysis::Activators { + sitrep_loader: task_fm_sitrep_loader.clone(), + sitrep_gc: task_fm_sitrep_gc.clone(), + }, + nexus_id, ); driver.register(TaskDefinition { name: "fm_analysis", diff --git a/nexus/src/app/background/tasks/fm_analysis.rs b/nexus/src/app/background/tasks/fm_analysis.rs index 4bd8067cca9..5310c224ee6 100644 --- a/nexus/src/app/background/tasks/fm_analysis.rs +++ b/nexus/src/app/background/tasks/fm_analysis.rs @@ -6,15 +6,18 @@ use crate::app::background::Activator; use crate::app::background::BackgroundTask; use crate::app::background::tasks::fm_sitrep_load::CurrentSitrep; use anyhow::Context; +use chrono::Utc; use futures::future::BoxFuture; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; +use nexus_db_queries::db::datastore; use nexus_db_queries::db::pagination::Paginator; use nexus_fm as fm; use nexus_types::internal_api::background::FmAnalysisStatus; use nexus_types::internal_api::background::fm_analysis as status; use nexus_types::inventory; use omicron_uuid_kinds::GenericUuid; +use omicron_uuid_kinds::OmicronZoneUuid; use serde_json::json; use slog_error_chain::InlineErrorChain; use std::sync::Arc; @@ -26,6 +29,15 @@ pub struct FmAnalysis { sitrep_rx: watch::Receiver>, inv_rx: watch::Receiver>>, sitrep_loader: Activator, + sitrep_gc: Activator, + nexus_id: OmicronZoneUuid, +} + +/// This is just because I don't like it when a constructor takes multiple +/// positional arguments of the same type... +pub struct Activators { + pub sitrep_loader: Activator, + pub sitrep_gc: Activator, } impl BackgroundTask for FmAnalysis { @@ -54,9 +66,18 @@ impl FmAnalysis { datastore: Arc, sitrep_rx: watch::Receiver>, inv_rx: watch::Receiver>>, - sitrep_loader: Activator, + activators: Activators, + nexus_id: OmicronZoneUuid, ) -> Self { - Self { datastore, sitrep_rx, inv_rx, sitrep_loader } + let Activators { sitrep_loader, sitrep_gc } = activators; + Self { + datastore, + sitrep_rx, + inv_rx, + sitrep_loader, + sitrep_gc, + nexus_id, + } } async fn actually_activate( @@ -112,25 +133,15 @@ impl FmAnalysis { }; // Okay, actually run analysis and generate a new sitrep. - let outcome = self - .analyze(&opctx, inputs) - .await - .unwrap_or_else(|err| { - let error = InlineErrorChain::new(&*err); - slog::error!(opctx.log, "fault management analysis failed!"; &error); - status::AnalysisOutcome::Error(error.to_string()) - }); - - if let status::AnalysisOutcome::Committed { .. } = &outcome { - // If we committed a new sitrep, we ought to go ahead and load it - // now... - self.sitrep_loader.activate(); - } + let outcome = self.analyze(&opctx, inputs).await; FmAnalysisStatus { parent_sitrep_id, inv_collection_id: Some(inv_collection_id), - outcome: status::Outcome::RanAnalysis { prep_status, outcome }, + outcome: status::Outcome::RanAnalysis { + prep_status, + analysis_status: outcome, + }, } } @@ -205,9 +216,91 @@ impl FmAnalysis { async fn analyze( &mut self, - _opctx: &OpContext, - _inputs: fm::analysis_input::Input, - ) -> anyhow::Result { - anyhow::bail!("FM analysis is not yet implemented") + opctx: &OpContext, + inputs: fm::analysis_input::Input, + ) -> status::AnalysisStatus { + let start_time = Utc::now(); + let mut sitrep_builder = fm::SitrepBuilder::new(&opctx.log, &inputs); + let result = fm::diagnosis::analyze(&inputs, &mut sitrep_builder); + let end_time = Utc::now(); + let (sitrep, report) = sitrep_builder.build(self.nexus_id, end_time); + + // Did it work? + if let Err(e) = result { + let err = InlineErrorChain::new(&*e); + slog::error!(&opctx.log, "fault management analysis failed"; "err" => %err); + return status::AnalysisStatus { + start_time, + end_time, + report, + outcome: status::AnalysisOutcome::Error(e.to_string()), + }; + } + + // TODO(eliza): diff the sitrep against the parent, and return + // `Unchanged` if it's the same. + let unchanged = true; + if unchanged { + slog::info!( + &opctx.log, + "fault management analysis produced no changes from the \ + current sitrep" + ); + return status::AnalysisStatus { + start_time, + end_time, + report, + outcome: status::AnalysisOutcome::Unchanged, + }; + } + + let sitrep_id = sitrep.id(); + match self.datastore.fm_sitrep_insert(opctx, sitrep).await { + Ok(()) => { + slog::info!(&opctx.log, "updated the current sitrep!"); + // If we committed a new sitrep, we ought to go ahead and load it + // now... + self.sitrep_loader.activate(); + status::AnalysisStatus { + start_time, + end_time, + report, + outcome: status::AnalysisOutcome::Committed { sitrep_id }, + } + } + Err(datastore::fm::InsertSitrepError::ParentNotCurrent(_)) => { + slog::info!( + &opctx.log, + "new sitrep was not committed as the parent sitrep was \ + out of date"; + ); + // We are behind, activate the sitrep loader to try and catch up! + self.sitrep_loader.activate(); + // Also, we should probably clean up after ourselves... + self.sitrep_gc.activate(); + + status::AnalysisStatus { + start_time, + end_time, + report, + outcome: status::AnalysisOutcome::NotCommitted { + sitrep_id, + }, + } + } + Err(datastore::fm::InsertSitrepError::Other(e)) => { + let err = InlineErrorChain::new(&e); + slog::error!(&opctx.log, "failed to insert sitrep"; "err" => %err); + status::AnalysisStatus { + start_time, + end_time, + report, + outcome: status::AnalysisOutcome::CommitFailed { + sitrep_id, + error: e.to_string(), + }, + } + } + } } } diff --git a/nexus/types/src/fm/analysis_reports.rs b/nexus/types/src/fm/analysis_reports.rs index 2ff93160d4e..839fb7fade8 100644 --- a/nexus/types/src/fm/analysis_reports.rs +++ b/nexus/types/src/fm/analysis_reports.rs @@ -85,10 +85,23 @@ impl AnalysisReport { writeln!(f, "{:indent$}// {comment}", "")?; } writeln!(f, "{:indent$}sitrep ID: {sitrep_id}", "")?; - writeln!(f, "{:indent$}cases:", "")?; - for case in cases { - case.display_multiline(indent + 2, Some(*sitrep_id)) - .fmt(f)?; + if cases.is_empty() { + writeln!( + f, + "{:indent$}no cases changed in this analysis step", + "" + )?; + } else { + writeln!( + f, + "{:indent$}cases ({} with activity):", + "", + cases.len() + )?; + for case in cases { + case.display_multiline(indent + 2, Some(*sitrep_id)) + .fmt(f)?; + } } Ok(()) } diff --git a/nexus/types/src/internal_api/background.rs b/nexus/types/src/internal_api/background.rs index 0dde5368c88..91959519e7d 100644 --- a/nexus/types/src/internal_api/background.rs +++ b/nexus/types/src/internal_api/background.rs @@ -939,7 +939,18 @@ pub mod fm_analysis { PreparationError(String), /// Preparation succeeded and analysis was performed. - RanAnalysis { prep_status: PreparationStatus, outcome: AnalysisOutcome }, + RanAnalysis { + prep_status: PreparationStatus, + analysis_status: AnalysisStatus, + }, + } + + #[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] + pub struct AnalysisStatus { + pub start_time: DateTime, + pub end_time: DateTime, + pub report: crate::fm::analysis_reports::AnalysisReport, + pub outcome: AnalysisOutcome, } #[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] @@ -953,7 +964,11 @@ pub mod fm_analysis { /// Analysis produced a new sitrep, but we failed to make it /// the current sitrep. - NotCommitted { sitrep_id: SitrepUuid, error: String }, + CommitFailed { sitrep_id: SitrepUuid, error: String }, + + /// Analysis produced a new sitrep, but the parent sitrep was out of + /// date, so it was not committed. + NotCommitted { sitrep_id: SitrepUuid }, /// Analysis produced a new sitrep, which was saved and made the current /// sitrep.