Skip to content

Commit d7aa33c

Browse files
authored
[fm] more analysis plumbing (#10258)
1 parent b3f64b4 commit d7aa33c

File tree

8 files changed

+231
-53
lines changed

8 files changed

+231
-53
lines changed

dev-tools/omdb/src/bin/omdb/nexus.rs

Lines changed: 51 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1194,6 +1194,24 @@ fn print_run_time(start_time: DateTime<Utc>, elapsed: Duration, indent: usize) {
11941194
);
11951195
}
11961196

1197+
fn print_start_end_time(
1198+
start: DateTime<Utc>,
1199+
end: DateTime<Utc>,
1200+
indent: usize,
1201+
) {
1202+
if let Ok(elapsed) = end.signed_duration_since(start).to_std() {
1203+
print_run_time(start, elapsed, indent);
1204+
} else {
1205+
println!(
1206+
"{:indent$}started at: {} (end time {} less than start time, \
1207+
which seems weird?)",
1208+
"",
1209+
humantime::format_rfc3339_millis(start.into()),
1210+
humantime::format_rfc3339_millis(end.into()),
1211+
);
1212+
}
1213+
}
1214+
11971215
/// Interprets the unstable, schemaless output from each particular background
11981216
/// task and print a human-readable summary
11991217
///
@@ -3465,7 +3483,7 @@ mod ereporter_status_fields {
34653483

34663484
fn print_task_fm_analysis(details: &serde_json::Value) {
34673485
use nexus_types::internal_api::background::fm_analysis::{
3468-
AnalysisOutcome, Outcome, PreparationStatus,
3486+
AnalysisOutcome, AnalysisStatus, Outcome, PreparationStatus,
34693487
};
34703488
let FmAnalysisStatus { parent_sitrep_id, inv_collection_id, outcome } =
34713489
match serde_json::from_value::<FmAnalysisStatus>(details.clone()) {
@@ -3484,8 +3502,8 @@ fn print_task_fm_analysis(details: &serde_json::Value) {
34843502
println!(" {PARENT_SITREP_ID:<WIDTH$}{parent_sitrep_id:?}");
34853503
println!(" {INV_ID:<WIDTH$}{inv_collection_id:?}");
34863504
println!(" FAULT MANAGEMENT ANALYSIS SUMMARY");
3487-
println!(" ===== ========== ======== =======");
3488-
let (prep_status, analysis_outcome) = match outcome {
3505+
println!(" =================================");
3506+
let (prep_status, analysis_status) = match outcome {
34893507
Outcome::WaitingForInventory => {
34903508
println!(
34913509
" analysis was not performed, as the inventory has\n \
@@ -3500,9 +3518,18 @@ fn print_task_fm_analysis(details: &serde_json::Value) {
35003518
);
35013519
return;
35023520
}
3503-
Outcome::RanAnalysis { prep_status, outcome } => (prep_status, outcome),
3521+
Outcome::RanAnalysis { prep_status, analysis_status } => {
3522+
(prep_status, analysis_status)
3523+
}
35043524
};
3505-
match analysis_outcome {
3525+
3526+
let AnalysisStatus {
3527+
start_time,
3528+
end_time,
3529+
report: analysis_report,
3530+
outcome,
3531+
} = analysis_status;
3532+
match outcome {
35063533
AnalysisOutcome::Error(error) => {
35073534
println!("{ERRICON} analysis failed: {error}");
35083535
}
@@ -3512,9 +3539,20 @@ fn print_task_fm_analysis(details: &serde_json::Value) {
35123539
parent_sitrep_id
35133540
);
35143541
}
3515-
AnalysisOutcome::NotCommitted { sitrep_id, error } => {
3542+
AnalysisOutcome::NotCommitted { sitrep_id } => {
3543+
println!(
3544+
" analysis succeeded, but the sitrep was not committed"
3545+
);
35163546
println!(
3517-
" analysis succeeded, but the sitrep was not committed!"
3547+
" since the parent sitrep ({parent_sitrep_id:?}) was out \
3548+
of date"
3549+
);
3550+
println!(" sitrep ID: {sitrep_id:?}");
3551+
}
3552+
AnalysisOutcome::CommitFailed { sitrep_id, error } => {
3553+
println!(
3554+
"{ERRICON} analysis succeeded, but committing the new sitrep \
3555+
failed!"
35183556
);
35193557
println!(" sitrep ID: {sitrep_id:?}");
35203558
println!(" error: {error}");
@@ -3526,17 +3564,17 @@ fn print_task_fm_analysis(details: &serde_json::Value) {
35263564
}
35273565
println!();
35283566

3529-
let PreparationStatus { errors, report } = prep_status;
3530-
println!("{}", report.display_multiline(4));
3567+
let PreparationStatus { errors, report: prep_report } = prep_status;
3568+
print!("{}", prep_report.display_multiline(4));
35313569
if !errors.is_empty() {
35323570
println!("{ERRICON} errors preparing analysis inputs:");
35333571
for error in errors {
35343572
println!(" > {error}")
35353573
}
35363574
}
3537-
3538-
// TODO(eliza): eventually there will also be a detailed analysis report,
3539-
// print that here as well...
3575+
println!();
3576+
print!("{}", analysis_report.display_multiline(4));
3577+
print_start_end_time(start_time, end_time, 4);
35403578
}
35413579

35423580
fn print_task_fm_sitrep_loader(details: &serde_json::Value) {
@@ -3643,16 +3681,7 @@ fn print_task_fm_rendezvous(details: &serde_json::Value) {
36433681
println!("(i) note: this operation was not executed")
36443682
}
36453683
fm_rendezvous::OpResult::Executed { start, end } => {
3646-
if let Ok(elapsed) = start.signed_duration_since(end).to_std() {
3647-
print_run_time(start, elapsed, 6);
3648-
} else {
3649-
println!(
3650-
" started at: {} (end time {} less than start time, \
3651-
which seems weird?)",
3652-
humantime::format_rfc3339_millis(start.into()),
3653-
humantime::format_rfc3339_millis(end.into()),
3654-
);
3655-
}
3684+
print_start_end_time(start, end, 6);
36563685
}
36573686
}
36583687

dev-tools/omdb/tests/successes.out

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -698,7 +698,7 @@ task: "fm_analysis"
698698
parent sitrep ID: None
699699
current inventory collection ID: Some(..........<REDACTED_UUID>........... (collection))
700700
FAULT MANAGEMENT ANALYSIS SUMMARY
701-
===== ========== ======== =======
701+
=================================
702702
/!\ analysis failed: FM analysis is not yet implemented
703703

704704
fault management analysis inputs
@@ -708,6 +708,11 @@ task: "fm_analysis"
708708
no new ereports since the parent sitrep
709709
no cases copied forward
710710

711+
fault management analysis report
712+
--------------------------------
713+
sitrep ID: ..........<REDACTED_UUID>...........
714+
no cases changed in this analysis step
715+
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
711716

712717
task: "fm_rendezvous"
713718
configured period: every <REDACTED_DURATION>m
@@ -1374,7 +1379,7 @@ task: "fm_analysis"
13741379
parent sitrep ID: None
13751380
current inventory collection ID: Some(..........<REDACTED_UUID>........... (collection))
13761381
FAULT MANAGEMENT ANALYSIS SUMMARY
1377-
===== ========== ======== =======
1382+
=================================
13781383
/!\ analysis failed: FM analysis is not yet implemented
13791384

13801385
fault management analysis inputs
@@ -1384,6 +1389,11 @@ task: "fm_analysis"
13841389
no new ereports since the parent sitrep
13851390
no cases copied forward
13861391

1392+
fault management analysis report
1393+
--------------------------------
1394+
sitrep ID: ..........<REDACTED_UUID>...........
1395+
no cases changed in this analysis step
1396+
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
13871397

13881398
task: "fm_rendezvous"
13891399
configured period: every <REDACTED_DURATION>m

nexus/fm/src/diagnosis.rs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
// This Source Code Form is subject to the terms of the Mozilla Public
2+
// License, v. 2.0. If a copy of the MPL was not distributed with this
3+
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
4+
5+
use crate::SitrepBuilder;
6+
use crate::analysis_input::Input;
7+
8+
/// Entry point for fault management diagnosis.
///
/// This is currently a stub: neither `_input` nor `_builder` is used, and
/// every call fails.
///
/// # Errors
///
/// Always returns an error stating that FM analysis is not yet implemented.
pub fn analyze(
    _input: &Input,
    _builder: &mut SitrepBuilder<'_>,
) -> anyhow::Result<()> {
    Err(anyhow::anyhow!("FM analysis is not yet implemented"))
}

nexus/fm/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
pub mod builder;
88
pub use builder::{CaseBuilder, SitrepBuilder};
99
pub mod analysis_input;
10+
pub mod diagnosis;
1011

1112
pub use nexus_types::fm::*;
1213

nexus/src/app/background/init.rs

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ use super::tasks::dns_propagation;
107107
use super::tasks::dns_servers;
108108
use super::tasks::ereport_ingester;
109109
use super::tasks::external_endpoints;
110-
use super::tasks::fm_analysis::FmAnalysis;
110+
use super::tasks::fm_analysis::{self, FmAnalysis};
111111
use super::tasks::fm_rendezvous::FmRendezvous;
112112
use super::tasks::fm_sitrep_gc;
113113
use super::tasks::fm_sitrep_load;
@@ -1142,7 +1142,11 @@ impl BackgroundTasksInitializer {
11421142
datastore.clone(),
11431143
sitrep_watcher.clone(),
11441144
inventory_load_watcher.clone(),
1145-
task_fm_sitrep_loader.clone(),
1145+
fm_analysis::Activators {
1146+
sitrep_loader: task_fm_sitrep_loader.clone(),
1147+
sitrep_gc: task_fm_sitrep_gc.clone(),
1148+
},
1149+
nexus_id,
11461150
);
11471151
driver.register(TaskDefinition {
11481152
name: "fm_analysis",

nexus/src/app/background/tasks/fm_analysis.rs

Lines changed: 114 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,18 @@ use crate::app::background::Activator;
66
use crate::app::background::BackgroundTask;
77
use crate::app::background::tasks::fm_sitrep_load::CurrentSitrep;
88
use anyhow::Context;
9+
use chrono::Utc;
910
use futures::future::BoxFuture;
1011
use nexus_db_queries::context::OpContext;
1112
use nexus_db_queries::db::DataStore;
13+
use nexus_db_queries::db::datastore;
1214
use nexus_db_queries::db::pagination::Paginator;
1315
use nexus_fm as fm;
1416
use nexus_types::internal_api::background::FmAnalysisStatus;
1517
use nexus_types::internal_api::background::fm_analysis as status;
1618
use nexus_types::inventory;
1719
use omicron_uuid_kinds::GenericUuid;
20+
use omicron_uuid_kinds::OmicronZoneUuid;
1821
use serde_json::json;
1922
use slog_error_chain::InlineErrorChain;
2023
use std::sync::Arc;
@@ -26,6 +29,15 @@ pub struct FmAnalysis {
2629
sitrep_rx: watch::Receiver<Option<CurrentSitrep>>,
2730
inv_rx: watch::Receiver<Option<Arc<inventory::Collection>>>,
2831
sitrep_loader: Activator,
32+
sitrep_gc: Activator,
33+
nexus_id: OmicronZoneUuid,
34+
}
35+
36+
/// Named bundle of the activators handed to the `FmAnalysis` constructor, so
37+
/// that it need not take multiple positional arguments of the same type.
38+
pub struct Activators {
39+
pub sitrep_loader: Activator,
40+
pub sitrep_gc: Activator,
2941
}
3042

3143
impl BackgroundTask for FmAnalysis {
@@ -54,9 +66,18 @@ impl FmAnalysis {
5466
datastore: Arc<DataStore>,
5567
sitrep_rx: watch::Receiver<Option<CurrentSitrep>>,
5668
inv_rx: watch::Receiver<Option<Arc<inventory::Collection>>>,
57-
sitrep_loader: Activator,
69+
activators: Activators,
70+
nexus_id: OmicronZoneUuid,
5871
) -> Self {
59-
Self { datastore, sitrep_rx, inv_rx, sitrep_loader }
72+
let Activators { sitrep_loader, sitrep_gc } = activators;
73+
Self {
74+
datastore,
75+
sitrep_rx,
76+
inv_rx,
77+
sitrep_loader,
78+
sitrep_gc,
79+
nexus_id,
80+
}
6081
}
6182

6283
async fn actually_activate(
@@ -112,25 +133,15 @@ impl FmAnalysis {
112133
};
113134

114135
// Okay, actually run analysis and generate a new sitrep.
115-
let outcome = self
116-
.analyze(&opctx, inputs)
117-
.await
118-
.unwrap_or_else(|err| {
119-
let error = InlineErrorChain::new(&*err);
120-
slog::error!(opctx.log, "fault management analysis failed!"; &error);
121-
status::AnalysisOutcome::Error(error.to_string())
122-
});
123-
124-
if let status::AnalysisOutcome::Committed { .. } = &outcome {
125-
// If we committed a new sitrep, we ought to go ahead and load it
126-
// now...
127-
self.sitrep_loader.activate();
128-
}
136+
let outcome = self.analyze(&opctx, inputs).await;
129137

130138
FmAnalysisStatus {
131139
parent_sitrep_id,
132140
inv_collection_id: Some(inv_collection_id),
133-
outcome: status::Outcome::RanAnalysis { prep_status, outcome },
141+
outcome: status::Outcome::RanAnalysis {
142+
prep_status,
143+
analysis_status: outcome,
144+
},
134145
}
135146
}
136147

@@ -205,9 +216,91 @@ impl FmAnalysis {
205216

206217
async fn analyze(
207218
&mut self,
208-
_opctx: &OpContext,
209-
_inputs: fm::analysis_input::Input,
210-
) -> anyhow::Result<status::AnalysisOutcome> {
211-
anyhow::bail!("FM analysis is not yet implemented")
219+
opctx: &OpContext,
220+
inputs: fm::analysis_input::Input,
221+
) -> status::AnalysisStatus {
222+
let start_time = Utc::now();
223+
let mut sitrep_builder = fm::SitrepBuilder::new(&opctx.log, &inputs);
224+
let result = fm::diagnosis::analyze(&inputs, &mut sitrep_builder);
225+
let end_time = Utc::now();
226+
let (sitrep, report) = sitrep_builder.build(self.nexus_id, end_time);
227+
228+
// Did it work?
229+
if let Err(e) = result {
230+
let err = InlineErrorChain::new(&*e);
231+
slog::error!(&opctx.log, "fault management analysis failed"; "err" => %err);
232+
return status::AnalysisStatus {
233+
start_time,
234+
end_time,
235+
report,
236+
outcome: status::AnalysisOutcome::Error(e.to_string()),
237+
};
238+
}
239+
240+
// TODO(eliza): diff the sitrep against the parent, and return
241+
// `Unchanged` if it's the same.
242+
let unchanged = true;
243+
if unchanged {
244+
slog::info!(
245+
&opctx.log,
246+
"fault management analysis produced no changes from the \
247+
current sitrep"
248+
);
249+
return status::AnalysisStatus {
250+
start_time,
251+
end_time,
252+
report,
253+
outcome: status::AnalysisOutcome::Unchanged,
254+
};
255+
}
256+
257+
let sitrep_id = sitrep.id();
258+
match self.datastore.fm_sitrep_insert(opctx, sitrep).await {
259+
Ok(()) => {
260+
slog::info!(&opctx.log, "updated the current sitrep!");
261+
// If we committed a new sitrep, we ought to go ahead and load it
262+
// now...
263+
self.sitrep_loader.activate();
264+
status::AnalysisStatus {
265+
start_time,
266+
end_time,
267+
report,
268+
outcome: status::AnalysisOutcome::Committed { sitrep_id },
269+
}
270+
}
271+
Err(datastore::fm::InsertSitrepError::ParentNotCurrent(_)) => {
272+
slog::info!(
273+
&opctx.log,
274+
"new sitrep was not committed as the parent sitrep was \
275+
out of date";
276+
);
277+
// We are behind, activate the sitrep loader to try and catch up!
278+
self.sitrep_loader.activate();
279+
// Also, we should probably clean up after ourselves...
280+
self.sitrep_gc.activate();
281+
282+
status::AnalysisStatus {
283+
start_time,
284+
end_time,
285+
report,
286+
outcome: status::AnalysisOutcome::NotCommitted {
287+
sitrep_id,
288+
},
289+
}
290+
}
291+
Err(datastore::fm::InsertSitrepError::Other(e)) => {
292+
let err = InlineErrorChain::new(&e);
293+
slog::error!(&opctx.log, "failed to insert sitrep"; "err" => %err);
294+
status::AnalysisStatus {
295+
start_time,
296+
end_time,
297+
report,
298+
outcome: status::AnalysisOutcome::CommitFailed {
299+
sitrep_id,
300+
error: e.to_string(),
301+
},
302+
}
303+
}
304+
}
212305
}
213306
}

0 commit comments

Comments
 (0)