Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 51 additions & 22 deletions dev-tools/omdb/src/bin/omdb/nexus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1194,6 +1194,24 @@ fn print_run_time(start_time: DateTime<Utc>, elapsed: Duration, indent: usize) {
);
}

fn print_start_end_time(
start: DateTime<Utc>,
end: DateTime<Utc>,
indent: usize,
) {
if let Ok(elapsed) = end.signed_duration_since(start).to_std() {
print_run_time(start, elapsed, indent);
} else {
println!(
"{:indent$}started at: {} (end time {} less than start time, \
which seems weird?)",
"",
humantime::format_rfc3339_millis(start.into()),
humantime::format_rfc3339_millis(end.into()),
);
}
}

/// Interprets the unstable, schemaless output from each particular background
/// task and print a human-readable summary
///
Expand Down Expand Up @@ -3465,7 +3483,7 @@ mod ereporter_status_fields {

fn print_task_fm_analysis(details: &serde_json::Value) {
use nexus_types::internal_api::background::fm_analysis::{
AnalysisOutcome, Outcome, PreparationStatus,
AnalysisOutcome, AnalysisStatus, Outcome, PreparationStatus,
};
let FmAnalysisStatus { parent_sitrep_id, inv_collection_id, outcome } =
match serde_json::from_value::<FmAnalysisStatus>(details.clone()) {
Expand All @@ -3484,8 +3502,8 @@ fn print_task_fm_analysis(details: &serde_json::Value) {
println!(" {PARENT_SITREP_ID:<WIDTH$}{parent_sitrep_id:?}");
println!(" {INV_ID:<WIDTH$}{inv_collection_id:?}");
println!(" FAULT MANAGEMENT ANALYSIS SUMMARY");
println!(" ===== ========== ======== =======");
let (prep_status, analysis_outcome) = match outcome {
println!(" =================================");
let (prep_status, analysis_status) = match outcome {
Outcome::WaitingForInventory => {
println!(
" analysis was not performed, as the inventory has\n \
Expand All @@ -3500,9 +3518,18 @@ fn print_task_fm_analysis(details: &serde_json::Value) {
);
return;
}
Outcome::RanAnalysis { prep_status, outcome } => (prep_status, outcome),
Outcome::RanAnalysis { prep_status, analysis_status } => {
(prep_status, analysis_status)
}
};
match analysis_outcome {

let AnalysisStatus {
start_time,
end_time,
report: analysis_report,
outcome,
} = analysis_status;
match outcome {
AnalysisOutcome::Error(error) => {
println!("{ERRICON} analysis failed: {error}");
}
Expand All @@ -3512,9 +3539,20 @@ fn print_task_fm_analysis(details: &serde_json::Value) {
parent_sitrep_id
);
}
AnalysisOutcome::NotCommitted { sitrep_id, error } => {
AnalysisOutcome::NotCommitted { sitrep_id } => {
println!(
" analysis succeeded, but the sitrep was not committed"
);
println!(
" analysis succeeded, but the sitrep was not committed!"
" since the parent sitrep ({parent_sitrep_id:?}) was out \
of date"
);
println!(" sitrep ID: {sitrep_id:?}");
}
AnalysisOutcome::CommitFailed { sitrep_id, error } => {
println!(
"{ERRICON} analysis succeeded, but committing the new sitrep \
failed!"
);
println!(" sitrep ID: {sitrep_id:?}");
println!(" error: {error}");
Expand All @@ -3526,17 +3564,17 @@ fn print_task_fm_analysis(details: &serde_json::Value) {
}
println!();

let PreparationStatus { errors, report } = prep_status;
println!("{}", report.display_multiline(4));
let PreparationStatus { errors, report: prep_report } = prep_status;
print!("{}", prep_report.display_multiline(4));
if !errors.is_empty() {
println!("{ERRICON} errors preparing analysis inputs:");
for error in errors {
println!(" > {error}")
}
}

// TODO(eliza): eventually there will also be a detailed analysis report,
// print that here as well...
println!();
print!("{}", analysis_report.display_multiline(4));
print_start_end_time(start_time, end_time, 4);
}

fn print_task_fm_sitrep_loader(details: &serde_json::Value) {
Expand Down Expand Up @@ -3643,16 +3681,7 @@ fn print_task_fm_rendezvous(details: &serde_json::Value) {
println!("(i) note: this operation was not executed")
}
fm_rendezvous::OpResult::Executed { start, end } => {
if let Ok(elapsed) = start.signed_duration_since(end).to_std() {
print_run_time(start, elapsed, 6);
} else {
println!(
" started at: {} (end time {} less than start time, \
which seems weird?)",
humantime::format_rfc3339_millis(start.into()),
humantime::format_rfc3339_millis(end.into()),
);
}
print_start_end_time(start, end, 6);
}
}

Expand Down
14 changes: 12 additions & 2 deletions dev-tools/omdb/tests/successes.out
Original file line number Diff line number Diff line change
Expand Up @@ -698,7 +698,7 @@ task: "fm_analysis"
parent sitrep ID: None
current inventory collection ID: Some(..........<REDACTED_UUID>........... (collection))
FAULT MANAGEMENT ANALYSIS SUMMARY
===== ========== ======== =======
=================================
/!\ analysis failed: FM analysis is not yet implemented

fault management analysis inputs
Expand All @@ -708,6 +708,11 @@ task: "fm_analysis"
no new ereports since the parent sitrep
no cases copied forward

fault management analysis report
--------------------------------
sitrep ID: ..........<REDACTED_UUID>...........
no cases changed in this analysis step
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms

task: "fm_rendezvous"
configured period: every <REDACTED_DURATION>m
Expand Down Expand Up @@ -1374,7 +1379,7 @@ task: "fm_analysis"
parent sitrep ID: None
current inventory collection ID: Some(..........<REDACTED_UUID>........... (collection))
FAULT MANAGEMENT ANALYSIS SUMMARY
===== ========== ======== =======
=================================
/!\ analysis failed: FM analysis is not yet implemented

fault management analysis inputs
Expand All @@ -1384,6 +1389,11 @@ task: "fm_analysis"
no new ereports since the parent sitrep
no cases copied forward

fault management analysis report
--------------------------------
sitrep ID: ..........<REDACTED_UUID>...........
no cases changed in this analysis step
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms

task: "fm_rendezvous"
configured period: every <REDACTED_DURATION>m
Expand Down
13 changes: 13 additions & 0 deletions nexus/fm/src/diagnosis.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use crate::SitrepBuilder;
use crate::analysis_input::Input;

pub fn analyze(
_input: &Input,
_builder: &mut SitrepBuilder<'_>,
) -> anyhow::Result<()> {
anyhow::bail!("FM analysis is not yet implemented")
}
1 change: 1 addition & 0 deletions nexus/fm/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
pub mod builder;
pub use builder::{CaseBuilder, SitrepBuilder};
pub mod analysis_input;
pub mod diagnosis;

pub use nexus_types::fm::*;

Expand Down
8 changes: 6 additions & 2 deletions nexus/src/app/background/init.rs
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ use super::tasks::dns_propagation;
use super::tasks::dns_servers;
use super::tasks::ereport_ingester;
use super::tasks::external_endpoints;
use super::tasks::fm_analysis::FmAnalysis;
use super::tasks::fm_analysis::{self, FmAnalysis};
use super::tasks::fm_rendezvous::FmRendezvous;
use super::tasks::fm_sitrep_gc;
use super::tasks::fm_sitrep_load;
Expand Down Expand Up @@ -1142,7 +1142,11 @@ impl BackgroundTasksInitializer {
datastore.clone(),
sitrep_watcher.clone(),
inventory_load_watcher.clone(),
task_fm_sitrep_loader.clone(),
fm_analysis::Activators {
sitrep_loader: task_fm_sitrep_loader.clone(),
sitrep_gc: task_fm_sitrep_gc.clone(),
},
nexus_id,
);
driver.register(TaskDefinition {
name: "fm_analysis",
Expand Down
135 changes: 114 additions & 21 deletions nexus/src/app/background/tasks/fm_analysis.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,18 @@ use crate::app::background::Activator;
use crate::app::background::BackgroundTask;
use crate::app::background::tasks::fm_sitrep_load::CurrentSitrep;
use anyhow::Context;
use chrono::Utc;
use futures::future::BoxFuture;
use nexus_db_queries::context::OpContext;
use nexus_db_queries::db::DataStore;
use nexus_db_queries::db::datastore;
use nexus_db_queries::db::pagination::Paginator;
use nexus_fm as fm;
use nexus_types::internal_api::background::FmAnalysisStatus;
use nexus_types::internal_api::background::fm_analysis as status;
use nexus_types::inventory;
use omicron_uuid_kinds::GenericUuid;
use omicron_uuid_kinds::OmicronZoneUuid;
use serde_json::json;
use slog_error_chain::InlineErrorChain;
use std::sync::Arc;
Expand All @@ -26,6 +29,15 @@ pub struct FmAnalysis {
sitrep_rx: watch::Receiver<Option<CurrentSitrep>>,
inv_rx: watch::Receiver<Option<Arc<inventory::Collection>>>,
sitrep_loader: Activator,
sitrep_gc: Activator,
nexus_id: OmicronZoneUuid,
}

/// This is just because I don't like it when a constructor takes multiple
/// positional arguments of the same type...
pub struct Activators {
pub sitrep_loader: Activator,
pub sitrep_gc: Activator,
}
Comment thread
hawkw marked this conversation as resolved.

impl BackgroundTask for FmAnalysis {
Expand Down Expand Up @@ -54,9 +66,18 @@ impl FmAnalysis {
datastore: Arc<DataStore>,
sitrep_rx: watch::Receiver<Option<CurrentSitrep>>,
inv_rx: watch::Receiver<Option<Arc<inventory::Collection>>>,
sitrep_loader: Activator,
activators: Activators,
nexus_id: OmicronZoneUuid,
) -> Self {
Self { datastore, sitrep_rx, inv_rx, sitrep_loader }
let Activators { sitrep_loader, sitrep_gc } = activators;
Self {
datastore,
sitrep_rx,
inv_rx,
sitrep_loader,
sitrep_gc,
nexus_id,
}
}

async fn actually_activate(
Expand Down Expand Up @@ -112,25 +133,15 @@ impl FmAnalysis {
};

// Okay, actually run analysis and generate a new sitrep.
let outcome = self
.analyze(&opctx, inputs)
.await
.unwrap_or_else(|err| {
let error = InlineErrorChain::new(&*err);
slog::error!(opctx.log, "fault management analysis failed!"; &error);
status::AnalysisOutcome::Error(error.to_string())
});

if let status::AnalysisOutcome::Committed { .. } = &outcome {
// If we committed a new sitrep, we ought to go ahead and load it
// now...
self.sitrep_loader.activate();
}
let outcome = self.analyze(&opctx, inputs).await;

FmAnalysisStatus {
parent_sitrep_id,
inv_collection_id: Some(inv_collection_id),
outcome: status::Outcome::RanAnalysis { prep_status, outcome },
outcome: status::Outcome::RanAnalysis {
prep_status,
analysis_status: outcome,
},
}
}

Expand Down Expand Up @@ -205,9 +216,91 @@ impl FmAnalysis {

async fn analyze(
&mut self,
_opctx: &OpContext,
_inputs: fm::analysis_input::Input,
) -> anyhow::Result<status::AnalysisOutcome> {
anyhow::bail!("FM analysis is not yet implemented")
opctx: &OpContext,
inputs: fm::analysis_input::Input,
) -> status::AnalysisStatus {
let start_time = Utc::now();
let mut sitrep_builder = fm::SitrepBuilder::new(&opctx.log, &inputs);
let result = fm::diagnosis::analyze(&inputs, &mut sitrep_builder);
let end_time = Utc::now();
let (sitrep, report) = sitrep_builder.build(self.nexus_id, end_time);

// Did it work?
if let Err(e) = result {
let err = InlineErrorChain::new(&*e);
slog::error!(&opctx.log, "fault management analysis failed"; "err" => %err);
return status::AnalysisStatus {
start_time,
end_time,
report,
outcome: status::AnalysisOutcome::Error(e.to_string()),
};
}

// TODO(eliza): diff the sitrep against the parent, and return
// `Unchanged` if it's the same.
let unchanged = true;
if unchanged {
slog::info!(
&opctx.log,
"fault management analysis produced no changes from the \
current sitrep"
);
return status::AnalysisStatus {
start_time,
end_time,
report,
outcome: status::AnalysisOutcome::Unchanged,
};
}
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The compiler doesn't yell at you about code after this line being unreachable?

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, I wondered the same - feels a little sketchy to have "impossible-to-reach" code down here that's definitely untested, but also this is probably fine for now as long as the build isn't broken?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah...I couldn't really think of a better way to do this incrementally while the diffing code and the sitrep-generating code both currently don't exist. I suppose we could just not add this part at all, but I wanted to make sure to remember to put it there when the necessary stuff is added. Once we actually have enough code to start trying to produce sitreps, any tests for that will start failing due to this until the diffing code is added, which felt like the right way to make sure we remembered to do that part...

What do you guys think?

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

eh. I vote move forward for now. don't want to get hung around the wheel too much, but agree that we should not leave it in this weird intermediate state for too long.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would also like to avoid that! :)


let sitrep_id = sitrep.id();
match self.datastore.fm_sitrep_insert(opctx, sitrep).await {
Ok(()) => {
slog::info!(&opctx.log, "updated the current sitrep!");
// If we committed a new sitrep, we ought to go ahead and load it
// now...
self.sitrep_loader.activate();
status::AnalysisStatus {
start_time,
end_time,
report,
outcome: status::AnalysisOutcome::Committed { sitrep_id },
}
}
Err(datastore::fm::InsertSitrepError::ParentNotCurrent(_)) => {
slog::info!(
&opctx.log,
"new sitrep was not committed as the parent sitrep was \
out of date";
);
// We are behind, activate the sitrep loader to try and catch up!
self.sitrep_loader.activate();
// Also, we should probably clean up after ourselves...
self.sitrep_gc.activate();

status::AnalysisStatus {
start_time,
end_time,
report,
outcome: status::AnalysisOutcome::NotCommitted {
sitrep_id,
},
}
}
Err(datastore::fm::InsertSitrepError::Other(e)) => {
let err = InlineErrorChain::new(&e);
slog::error!(&opctx.log, "failed to insert sitrep"; "err" => %err);
status::AnalysisStatus {
start_time,
end_time,
report,
outcome: status::AnalysisOutcome::CommitFailed {
sitrep_id,
error: e.to_string(),
},
}
}
}
}
}
Loading
Loading