From 428a6072291664ceed36da7ecde34a051bf227f2 Mon Sep 17 00:00:00 2001 From: karencfv Date: Mon, 5 Jan 2026 21:30:56 +1300 Subject: [PATCH 01/20] [inventory] Add svcs in maintenance to DB --- Cargo.lock | 1 + illumos-utils/src/svcs.rs | 5 +- nexus/db-model/Cargo.toml | 1 + nexus/db-model/src/inventory.rs | 46 ++++++++++++++++- nexus/db-model/src/schema_versions.rs | 3 +- .../db-queries/src/db/datastore/inventory.rs | 51 +++++++++++++++++++ nexus/db-schema/src/schema.rs | 12 +++++ schema/crdb/dbinit.sql | 25 +++++++++ .../up01.sql | 24 +++++++++ 9 files changed, 163 insertions(+), 5 deletions(-) create mode 100644 schema/crdb/health-monitor-svcs-in-maintenance/up01.sql diff --git a/Cargo.lock b/Cargo.lock index e76aed1376c..8b306e311f1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6764,6 +6764,7 @@ dependencies = [ "expectorate", "hex", "iddqd", + "illumos-utils", "ipnetwork", "itertools 0.14.0", "macaddr", diff --git a/illumos-utils/src/svcs.rs b/illumos-utils/src/svcs.rs index 17b68412379..42abf8b4bf5 100644 --- a/illumos-utils/src/svcs.rs +++ b/illumos-utils/src/svcs.rs @@ -195,12 +195,13 @@ impl From for SvcState { } } +// TODO-K: Ugh, I think this might need to be versioned and moved out of here? #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, JsonSchema)] #[serde(rename_all = "snake_case")] /// Information about an SMF service that is enabled but not running pub struct SvcInMaintenance { - fmri: String, - zone: String, + pub fmri: String, + pub zone: String, } impl SvcInMaintenance { diff --git a/nexus/db-model/Cargo.toml b/nexus/db-model/Cargo.toml index 9762bccbe76..1c30231bf53 100644 --- a/nexus/db-model/Cargo.toml +++ b/nexus/db-model/Cargo.toml @@ -21,6 +21,7 @@ derive-where.workspace = true diesel = { workspace = true, features = ["postgres", "r2d2", "chrono", "serde_json", "network-address", "uuid"] } hex.workspace = true iddqd.workspace = true +illumos-utils.workspace = true ipnetwork.workspace = true itertools.workspace = true macaddr.workspace = true diff --git a/nexus/db-model/src/inventory.rs b/nexus/db-model/src/inventory.rs index 8caf04aadcf..bc6af948e03 100644 --- a/nexus/db-model/src/inventory.rs +++ b/nexus/db-model/src/inventory.rs @@ -27,14 +27,16 @@ use diesel::pg::Pg; use diesel::serialize::ToSql; use diesel::{serialize, sql_types}; use iddqd::IdOrdMap; +use illumos_utils::svcs::SvcInMaintenance; use ipnetwork::IpNetwork; use nexus_db_schema::schema::inv_zone_manifest_non_boot; use nexus_db_schema::schema::inv_zone_manifest_zone; use nexus_db_schema::schema::{ hw_baseboard_id, inv_caboose, inv_clickhouse_keeper_membership, inv_cockroachdb_status, inv_collection, inv_collection_error, inv_dataset, - inv_host_phase_1_active_slot, inv_host_phase_1_flash_hash, - inv_internal_dns, inv_last_reconciliation_dataset_result, + inv_health_monitor_svc_in_maintenance, inv_host_phase_1_active_slot, + inv_host_phase_1_flash_hash, inv_internal_dns, + inv_last_reconciliation_dataset_result, inv_last_reconciliation_disk_result, inv_last_reconciliation_orphaned_dataset, inv_last_reconciliation_zone_result, inv_mupdate_override_non_boot, @@ -1012,6 +1014,46 @@ impl_enum_type!( Idle => b"idle" ); +// TODO-K: add docs and move type elsewhere? +#[derive(Queryable, Clone, Debug, Selectable, Insertable)] +#[diesel(table_name = inv_health_monitor_svc_in_maintenance)] +pub struct InvSvcInMaintenance { + pub inv_collection_id: DbTypedUuid, + pub sled_id: DbTypedUuid, + pub fmri: Option, + pub zone: Option, + pub error_messages: Vec, + // TODO-K: Check if this needs to be an option + pub time_of_status: Option>, +} + +impl InvSvcInMaintenance { + pub fn new( + inv_collection_id: CollectionUuid, + sled_id: SledUuid, + svc: Option, + svc_errors: Vec, + time_of_status: Option>, + // TODO-K: Does this need to be here? or is it OK to bunch up all the + // errors in one place? + //svcs_cmd_error: Option, + ) -> Self { + let (fmri, zone) = match svc { + Some(svc) => (Some(svc.fmri), Some(svc.zone)), + None => (None, None), + }; + + Self { + inv_collection_id: inv_collection_id.into(), + sled_id: sled_id.into(), + fmri, + zone, + error_messages: svc_errors, + time_of_status, + } + } +} + /// See [`sled_agent_types::inventory::ConfigReconcilerInventory`]. #[derive(Queryable, Clone, Debug, Selectable, Insertable)] #[diesel(table_name = inv_sled_config_reconciler)] diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index 293b6086c6b..4c9434e0c7a 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock}; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: Version = Version::new(217, 0, 0); +pub const SCHEMA_VERSION: Version = Version::new(218, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -28,6 +28,7 @@ static KNOWN_VERSIONS: LazyLock> = LazyLock::new(|| { // | leaving the first copy as an example for the next person. // v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(218, "health-monitor-svcs-in-maintenance"), KnownVersion::new(217, "multiple-default-ip-pools-per-silo"), KnownVersion::new(216, "add-trust-quorum"), KnownVersion::new(215, "support-up-to-12-disks"), diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index ccf453c5cd7..137662d22ad 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -60,6 +60,7 @@ use nexus_db_model::InvServiceProcessor; use nexus_db_model::InvSledAgent; use nexus_db_model::InvSledBootPartition; use nexus_db_model::InvSledConfigReconciler; +use nexus_db_model::InvSvcInMaintenance; use nexus_db_model::InvZpool; use nexus_db_model::RotImageError; use nexus_db_model::SledRole; @@ -206,6 +207,33 @@ impl DataStore { } } + // TODO-K: Clean up + // Pull services in maintenance out of all sled agents + let mut svcs_in_maintenance = vec![]; + + for sled_agent in &collection.sled_agents { + match &sled_agent.health_monitor.smf_services_in_maintenance { + Ok(svcs) => { + for svc in &svcs.services { + svcs_in_maintenance.push(InvSvcInMaintenance::new( + collection_id, + sled_agent.sled_id, + Some(svc.clone()), + svcs.errors.clone(), + svcs.time_of_status, + )); + } + } + Err(e) => svcs_in_maintenance.push(InvSvcInMaintenance::new( + collection_id, + sled_agent.sled_id, + None, + vec![e.to_string()], + None, + )), + } + } + // Pull disks out of all sled agents let disks: Vec<_> = collection .sled_agents @@ -1394,6 +1422,25 @@ impl DataStore { } } + // Insert rows for all the unhealthy services we found + { + use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance::dsl; + + let batch_size = SQL_BATCH_SIZE.get().try_into().unwrap(); + let mut svcs_in_maintenance = svcs_in_maintenance.into_iter(); + loop { + let some_svcs_in_maintenance = + svcs_in_maintenance.by_ref().take(batch_size).collect::>(); + if some_svcs_in_maintenance.is_empty() { + break; + } + let _ = diesel::insert_into(dsl::inv_health_monitor_svc_in_maintenance) + .values(some_svcs_in_maintenance) + .execute_async(&conn) + .await?; + } + } + // Insert rows for the sled agents that we found. In practice, we'd // expect these to all have baseboards (if using Oxide hardware) or // none have baseboards (if not). @@ -2223,6 +2270,8 @@ impl DataStore { .await? }; + // TODO-K: Remove rows for health monitor + Ok(NumRowsDeleted { ncollections, nsps, @@ -2299,6 +2348,7 @@ impl DataStore { "ncockroach_status" => ncockroach_status, "nntp_timesync" => nntp_timesync, "ninternal_dns" => ninternal_dns, + // TODO-K: add health monitor rows here too ); Ok(()) @@ -2587,6 +2637,7 @@ impl DataStore { disk_firmware }; + // TODO-K: Take inspiration here // Mapping of "Sled ID" -> "All disks reported by that sled" let physical_disks: BTreeMap< SledUuid, diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index de626e5c64f..c471c6e8981 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -1716,6 +1716,18 @@ table! { } } +table! { + inv_health_monitor_svc_in_maintenance (inv_collection_id, sled_id) { + inv_collection_id -> Uuid, + sled_id -> Uuid, + fmri -> Nullable, + zone -> Nullable, + + error_messages -> Nullable>, + time_of_status -> Nullable, + } +} + table! { inv_sled_boot_partition (inv_collection_id, sled_id, boot_disk_slot) { inv_collection_id -> Uuid, diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 29efbfcfb3b..31eaa0330be 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -4021,6 +4021,31 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_sled_agent ( PRIMARY KEY (inv_collection_id, sled_id) ); +CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance ( + -- where this observation came from + -- (foreign key into `inv_collection` table) + inv_collection_id UUID NOT NULL, + + -- unique id for this sled (should be foreign keys into `sled` table, though + -- it's conceivable a sled will report an id that we don't know about); + -- guaranteed to match a row in this collection's `inv_sled_agent` + sled_id UUID NOT NULL, + + -- FMRI of the SMF service in maintenance + fmri TEXT, + + -- zone the SMF service in maintenance is located in + zone TEXT, + + -- any error messages found when retrieving the SMF services in maintenance + error_messages TEXT ARRAY, + + -- time when the status was checked if applicable + valid_until TIMESTAMPTZ, + + PRIMARY KEY (inv_collection_id, sled_id) +) + -- This type name starts with "clear_" for legacy reasons. Prefer "remove" in -- the future. CREATE TYPE IF NOT EXISTS omicron.public.clear_mupdate_override_boot_success diff --git a/schema/crdb/health-monitor-svcs-in-maintenance/up01.sql b/schema/crdb/health-monitor-svcs-in-maintenance/up01.sql new file mode 100644 index 00000000000..71680604482 --- /dev/null +++ b/schema/crdb/health-monitor-svcs-in-maintenance/up01.sql @@ -0,0 +1,24 @@ +CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance ( + -- where this observation came from + -- (foreign key into `inv_collection` table) + inv_collection_id UUID NOT NULL, + + -- unique id for this sled (should be foreign keys into `sled` table, though + -- it's conceivable a sled will report an id that we don't know about); + -- guaranteed to match a row in this collection's `inv_sled_agent` + sled_id UUID NOT NULL, + + -- FMRI of the SMF service in maintenance + fmri TEXT, + + -- zone the SMF service in maintenance is located in + zone TEXT, + + -- any error messages found when retrieving the SMF services in maintenance + error_messages TEXT ARRAY, + + -- time when the status was checked if applicable + valid_until TIMESTAMPTZ, + + PRIMARY KEY (inv_collection_id, sled_id) +) \ No newline at end of file From 64852eaa5831a0a827581fa94c499960d03c01ad Mon Sep 17 00:00:00 2001 From: karencfv Date: Tue, 6 Jan 2026 14:54:36 +1300 Subject: [PATCH 02/20] insert rows --- nexus/db-model/src/inventory.rs | 17 +++++-- .../db-queries/src/db/datastore/inventory.rs | 45 ++++++++++++++++++- nexus/db-schema/src/schema.rs | 4 +- schema/crdb/dbinit.sql | 16 ++++--- .../up01.sql | 14 ++++-- sled-agent/src/long_running_tasks.rs | 3 +- sled-agent/src/sim/sled_agent.rs | 4 +- uuid-kinds/src/lib.rs | 3 ++ 8 files changed, 90 insertions(+), 16 deletions(-) diff --git a/nexus/db-model/src/inventory.rs b/nexus/db-model/src/inventory.rs index bc6af948e03..e3e2045b256 100644 --- a/nexus/db-model/src/inventory.rs +++ b/nexus/db-model/src/inventory.rs @@ -9,6 +9,7 @@ use crate::Generation; use crate::PhysicalDiskKind; use crate::omicron_zone_config::{self, OmicronZoneNic}; use crate::sled_cpu_family::SledCpuFamily; +use crate::to_db_typed_uuid; use crate::typed_uuid::DbTypedUuid; use crate::{ ByteCount, MacAddr, Name, ServiceKind, SqlU8, SqlU16, SqlU32, @@ -63,6 +64,7 @@ use omicron_common::update::OmicronInstallManifestSource; use omicron_common::zpool_name::ZpoolName; use omicron_uuid_kinds::DatasetKind; use omicron_uuid_kinds::DatasetUuid; +use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::InternalZpoolKind; use omicron_uuid_kinds::MupdateKind; use omicron_uuid_kinds::MupdateOverrideKind; @@ -72,6 +74,8 @@ use omicron_uuid_kinds::OmicronSledConfigUuid; use omicron_uuid_kinds::PhysicalDiskUuid; use omicron_uuid_kinds::SledKind; use omicron_uuid_kinds::SledUuid; +use omicron_uuid_kinds::SvcInMaintenanceKind; +use omicron_uuid_kinds::SvcInMaintenanceUuid; use omicron_uuid_kinds::ZpoolKind; use omicron_uuid_kinds::{CollectionKind, OmicronZoneKind}; use omicron_uuid_kinds::{CollectionUuid, OmicronZoneUuid}; @@ -1020,9 +1024,11 @@ impl_enum_type!( pub struct InvSvcInMaintenance { pub inv_collection_id: DbTypedUuid, pub sled_id: DbTypedUuid, + pub id: DbTypedUuid, pub fmri: Option, pub zone: Option, pub error_messages: Vec, + pub svcs_cmd_error: Option, // TODO-K: Check if this needs to be an option pub time_of_status: Option>, } @@ -1033,22 +1039,27 @@ impl InvSvcInMaintenance { sled_id: SledUuid, svc: Option, svc_errors: Vec, + svcs_cmd_error: Option, time_of_status: Option>, - // TODO-K: Does this need to be here? or is it OK to bunch up all the - // errors in one place? - //svcs_cmd_error: Option, ) -> Self { let (fmri, zone) = match svc { Some(svc) => (Some(svc.fmri), Some(svc.zone)), None => (None, None), }; + // This ID is only used as a primary key, it's fine to generate it here. + let id = to_db_typed_uuid(SvcInMaintenanceUuid::from_untyped_uuid( + Uuid::new_v4(), + )); + Self { inv_collection_id: inv_collection_id.into(), sled_id: sled_id.into(), + id, fmri, zone, error_messages: svc_errors, + svcs_cmd_error, time_of_status, } } diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index 137662d22ad..63869ffe42c 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -220,6 +220,7 @@ impl DataStore { sled_agent.sled_id, Some(svc.clone()), svcs.errors.clone(), + None, svcs.time_of_status, )); } @@ -228,7 +229,8 @@ impl DataStore { collection_id, sled_agent.sled_id, None, - vec![e.to_string()], + vec![], + Some(e.to_string()), None, )), } @@ -2692,6 +2694,7 @@ impl DataStore { disks }; + // TODO-K: get inspiration ID here // Mapping of "Sled ID" -> "All zpools reported by that sled" let zpools: BTreeMap> = { use nexus_db_schema::schema::inv_zpool::dsl; @@ -2762,6 +2765,46 @@ impl DataStore { datasets }; + // TODO-K: fix +// // Mapping of "Sled ID" -> "All SMF services in maintenance reported by +// // that sled" +// let svcs_in_maintenance: BTreeMap< +// Uuid, +// Vec, +// > = { +// use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance::dsl; +// +// let mut svcs = +// BTreeMap::>::new(); +// let mut paginator = Paginator::new( +// batch_size, +// dropshot::PaginationOrder::Ascending, +// ); +// while let Some(p) = paginator.next() { +// let batch = paginated_multicolumn( +// dsl::inv_health_monitor_svc_in_maintenance, +// (dsl::sled_id, dsl::id), +// &p.current_pagparams(), +// ) +// .filter(dsl::inv_collection_id.eq(db_id)) +// .select(InvDataset::as_select()) +// .load_async(&*conn) +// .await +// .map_err(|e| { +// public_error_from_diesel(e, ErrorHandler::Server) +// })?; +// paginator = p.found_batch(&batch, &|row| { +// (row.sled_id, row.name.clone()) +// }); +// for svc in batch { +// svcs.entry(svc.sled_id.into_untyped_uuid()) +// .or_default() +// .push(svc.into()); +// } +// } +// svcs +// }; + // Collect the unique baseboard ids referenced by SPs, RoTs, and Sled // Agents. let baseboard_id_ids: BTreeSet<_> = sps diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index c471c6e8981..8eab5de820f 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -1717,13 +1717,15 @@ table! { } table! { - inv_health_monitor_svc_in_maintenance (inv_collection_id, sled_id) { + inv_health_monitor_svc_in_maintenance (inv_collection_id, sled_id, id) { inv_collection_id -> Uuid, sled_id -> Uuid, + id -> Uuid, fmri -> Nullable, zone -> Nullable, error_messages -> Nullable>, + svcs_cmd_error -> Nullable, time_of_status -> Nullable, } } diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 31eaa0330be..08b9f3005ab 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -4031,6 +4031,9 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance -- guaranteed to match a row in this collection's `inv_sled_agent` sled_id UUID NOT NULL, + -- unique id for each row + id UUID NOT NULL, + -- FMRI of the SMF service in maintenance fmri TEXT, @@ -4038,13 +4041,16 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance zone TEXT, -- any error messages found when retrieving the SMF services in maintenance - error_messages TEXT ARRAY, + error_messages TEXT ARRAY NOT NULL, + + -- error when calling the svcs command + svcs_cmd_error TEXT, -- time when the status was checked if applicable - valid_until TIMESTAMPTZ, + time_of_status TIMESTAMPTZ, - PRIMARY KEY (inv_collection_id, sled_id) -) + PRIMARY KEY (inv_collection_id, sled_id, id) +); -- This type name starts with "clear_" for legacy reasons. Prefer "remove" in -- the future. @@ -7680,7 +7686,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '217.0.0', NULL) + (TRUE, NOW(), NOW(), '218.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/schema/crdb/health-monitor-svcs-in-maintenance/up01.sql b/schema/crdb/health-monitor-svcs-in-maintenance/up01.sql index 71680604482..bf0b956d23e 100644 --- a/schema/crdb/health-monitor-svcs-in-maintenance/up01.sql +++ b/schema/crdb/health-monitor-svcs-in-maintenance/up01.sql @@ -8,6 +8,9 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance -- guaranteed to match a row in this collection's `inv_sled_agent` sled_id UUID NOT NULL, + -- unique id for each row + id UUID NOT NULL, + -- FMRI of the SMF service in maintenance fmri TEXT, @@ -15,10 +18,13 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance zone TEXT, -- any error messages found when retrieving the SMF services in maintenance - error_messages TEXT ARRAY, + error_messages TEXT ARRAY NOT NULL, + + -- error when calling the svcs command + svcs_cmd_error TEXT, -- time when the status was checked if applicable - valid_until TIMESTAMPTZ, + time_of_status TIMESTAMPTZ, - PRIMARY KEY (inv_collection_id, sled_id) -) \ No newline at end of file + PRIMARY KEY (inv_collection_id, sled_id, id) +); \ No newline at end of file diff --git a/sled-agent/src/long_running_tasks.rs b/sled-agent/src/long_running_tasks.rs index 700d4a08f4b..d27aaa8595e 100644 --- a/sled-agent/src/long_running_tasks.rs +++ b/sled-agent/src/long_running_tasks.rs @@ -275,7 +275,8 @@ async fn spawn_bootstore_tasks( node_handle } -async fn spawn_health_monitor_tasks(log: &Logger) -> HealthMonitorHandle { +// TODO-K: Remove pub +pub async fn spawn_health_monitor_tasks(log: &Logger) -> HealthMonitorHandle { info!(log, "Starting health monitor"); let log = log.new(o!("component" => "HealthMonitor")); HealthMonitorHandle::spawn(log) diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index 075dc655a0f..42d4861d4b4 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -168,7 +168,9 @@ impl SledAgent { .await .start(&log, &config.dropshot); - let health_monitor = HealthMonitorHandle::stub(); + // TODO-K: Uncomment + // let health_monitor = HealthMonitorHandle::stub(); + let health_monitor = crate::long_running_tasks::spawn_health_monitor_tasks(&log).await; Arc::new(SledAgent { id, diff --git a/uuid-kinds/src/lib.rs b/uuid-kinds/src/lib.rs index abc8690806e..6253b460886 100644 --- a/uuid-kinds/src/lib.rs +++ b/uuid-kinds/src/lib.rs @@ -83,6 +83,9 @@ impl_typed_uuid_kinds! { Sled = {}, SpUpdate = {}, SupportBundle = {}, + // `SvcInMaintenance`s do not contain IDs themselves. These IDs exist + // for the same reason as those in `OmicronSledConfig`. + SvcInMaintenance = {}, TufArtifact = {}, TufRepo = {}, TufTrustRoot = {}, From 52f78497b27285ce0f5619392bd6ee3374958436 Mon Sep 17 00:00:00 2001 From: karencfv Date: Tue, 6 Jan 2026 19:32:49 +1300 Subject: [PATCH 03/20] display works --- nexus/db-model/src/inventory.rs | 1 + .../db-queries/src/db/datastore/inventory.rs | 151 +++++++++++++----- nexus/db-schema/src/schema.rs | 2 +- nexus/types/src/inventory/display.rs | 115 +++++++++---- sled-agent/src/sim/sled_agent.rs | 3 +- 5 files changed, 197 insertions(+), 75 deletions(-) diff --git a/nexus/db-model/src/inventory.rs b/nexus/db-model/src/inventory.rs index e3e2045b256..a6c59ee9e8e 100644 --- a/nexus/db-model/src/inventory.rs +++ b/nexus/db-model/src/inventory.rs @@ -1024,6 +1024,7 @@ impl_enum_type!( pub struct InvSvcInMaintenance { pub inv_collection_id: DbTypedUuid, pub sled_id: DbTypedUuid, + // TODO-K: Is this ID necessary? pub id: DbTypedUuid, pub fmri: Option, pub zone: Option, diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index 63869ffe42c..bba42438f2f 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -27,6 +27,8 @@ use diesel::sql_types::Nullable; use futures::FutureExt; use futures::future::BoxFuture; use iddqd::{IdOrdItem, IdOrdMap, id_upcast}; +use illumos_utils::svcs::SvcInMaintenance; +use illumos_utils::svcs::SvcsInMaintenanceResult; use nexus_db_errors::ErrorHandler; use nexus_db_errors::public_error_from_diesel; use nexus_db_errors::public_error_from_diesel_lookup; @@ -2766,44 +2768,73 @@ impl DataStore { }; // TODO-K: fix -// // Mapping of "Sled ID" -> "All SMF services in maintenance reported by -// // that sled" -// let svcs_in_maintenance: BTreeMap< -// Uuid, -// Vec, -// > = { -// use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance::dsl; -// -// let mut svcs = -// BTreeMap::>::new(); -// let mut paginator = Paginator::new( -// batch_size, -// dropshot::PaginationOrder::Ascending, -// ); -// while let Some(p) = paginator.next() { -// let batch = paginated_multicolumn( -// dsl::inv_health_monitor_svc_in_maintenance, -// (dsl::sled_id, dsl::id), -// &p.current_pagparams(), -// ) -// .filter(dsl::inv_collection_id.eq(db_id)) -// .select(InvDataset::as_select()) -// .load_async(&*conn) -// .await -// .map_err(|e| { -// public_error_from_diesel(e, ErrorHandler::Server) -// })?; -// paginator = p.found_batch(&batch, &|row| { -// (row.sled_id, row.name.clone()) -// }); -// for svc in batch { -// svcs.entry(svc.sled_id.into_untyped_uuid()) -// .or_default() -// .push(svc.into()); -// } -// } -// svcs -// }; + // Mapping of "Sled ID" -> "All SMF services in maintenance reported by + // that sled" + let mut svcs_in_maintenance_by_sled = { + use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance::dsl; + + let mut svcs = BTreeMap::>::new(); + let mut paginator = Paginator::new( + batch_size, + dropshot::PaginationOrder::Ascending, + ); + while let Some(p) = paginator.next() { + // TODO-K: Do I actually need paginated multicolumn? + let batch: Vec = paginated_multicolumn( + dsl::inv_health_monitor_svc_in_maintenance, + (dsl::sled_id, dsl::id), + &p.current_pagparams(), + ) + .filter(dsl::inv_collection_id.eq(db_id)) + .select(InvSvcInMaintenance::as_select()) + .load_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + })?; + paginator = + p.found_batch(&batch, &|row| (row.sled_id, row.id.clone())); + for svc in batch { + svcs.entry(svc.sled_id.into_untyped_uuid()) + .or_default() + .push(svc.into()); + } + } + svcs + }; + // + // TODO-K: This is wrong. We want a vector of services, not just one + // let mut svcs_in_maintenance_by_sled = { + // use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance::dsl; + // + // let mut results: BTreeMap = BTreeMap::new(); + // + // let mut paginator = Paginator::new( + // batch_size, + // dropshot::PaginationOrder::Ascending, + // ); + // while let Some(p) = paginator.next() { + // let batch = paginated( + // dsl::inv_health_monitor_svc_in_maintenance, + // dsl::sled_id, + // &p.current_pagparams(), + // ) + // .filter(dsl::inv_collection_id.eq(db_id)) + // .select(InvSvcInMaintenance::as_select()) + // .load_async(&*conn) + // .await + // .map_err(|e| { + // public_error_from_diesel(e, ErrorHandler::Server) + // })?; + // paginator = p.found_batch(&batch, &|row| row.sled_id); + // + // for row in batch { + // results.insert(row.sled_id.into(), row); + // } + // } + // + // results + // }; // Collect the unique baseboard ids referenced by SPs, RoTs, and Sled // Agents. @@ -4060,6 +4091,48 @@ impl DataStore { )) })?; + // TODO-K; Clean up + // Convert all health checks into a full `HealthMonitorInventory` + let mut health_monitor = HealthMonitorInventory::new(); + + let svcs_in_maintenance = svcs_in_maintenance_by_sled + .remove(&sled_id.into_untyped_uuid()) + .map(|svcs| { + // TODO-K: Clean up + if let Some(e) = svcs[0].svcs_cmd_error.clone() { + return Err(e); + } + let mut services = vec![]; + for svc in &svcs { + let fmri = if let Some(f) = svc.fmri.clone() { + f + } else { + "".to_string() + }; + let zone = if let Some(z) = svc.zone.clone() { + z + } else { + "".to_string() + }; + + let service = SvcInMaintenance { fmri, zone }; + services.push(service) + } + + Ok(SvcsInMaintenanceResult { + services, + errors: svcs[0].error_messages.clone(), + time_of_status: svcs[0].time_of_status, + }) + }); + + if let Some(svcs) = svcs_in_maintenance { + println!("DEBUG {svcs:?}"); + health_monitor.smf_services_in_maintenance = svcs + }; + + // TODO-K: End of clean up bit + let sled_agent = nexus_types::inventory::SledAgent { time_collected: s.time_collected, source: s.source, @@ -4098,7 +4171,7 @@ impl DataStore { zone_image_resolver, // TODO-K[omicron#9516]: Actually query the DB when there is // something there - health_monitor: HealthMonitorInventory::new(), + health_monitor, }; sled_agents .insert_unique(sled_agent) diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index 8eab5de820f..5954ceb738b 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -1724,7 +1724,7 @@ table! { fmri -> Nullable, zone -> Nullable, - error_messages -> Nullable>, + error_messages -> Array, svcs_cmd_error -> Nullable, time_of_status -> Nullable, } diff --git a/nexus/types/src/inventory/display.rs b/nexus/types/src/inventory/display.rs index 163f8744c79..f21d283cc47 100644 --- a/nexus/types/src/inventory/display.rs +++ b/nexus/types/src/inventory/display.rs @@ -14,12 +14,14 @@ use chrono::SecondsFormat; use clap::Subcommand; use gateway_types::component::SpType; use iddqd::IdOrdMap; +use illumos_utils::svcs::SvcsInMaintenanceResult; use indent_write::fmt::IndentWriter; use itertools::Itertools; use omicron_common::disk::M2Slot; use omicron_uuid_kinds::{ DatasetUuid, OmicronZoneUuid, PhysicalDiskUuid, ZpoolUuid, }; +use sled_agent_types::inventory::HealthMonitorInventory; use sled_agent_types_versions::latest::inventory::{ BootImageHeader, BootPartitionContents, BootPartitionDetails, ConfigReconcilerInventory, ConfigReconcilerInventoryResult, @@ -896,41 +898,8 @@ fn display_sleds( } } - // TODO-K[omicron#9516]: This is temporarily hidden until we add the - // health monitor types to the DB. Once those have been integrated, - // we'll show health monitor status when everything is healthy as well. - if !health_monitor.is_empty() { - writeln!(indented, "HEALTH MONITOR")?; - let mut indent2 = IndentWriter::new(" ", &mut indented); - match &health_monitor.smf_services_in_maintenance { - Ok(svcs) => { - if !svcs.is_empty() { - if let Some(time_of_status) = &svcs.time_of_status { - writeln!( - indent2, - "SMF services in maintenance at {}:", - time_of_status.to_rfc3339_opts( - SecondsFormat::Millis, - /* use_z */ true, - ) - )?; - } - let mut indent3 = IndentWriter::new(" ", &mut indent2); - for svc in &svcs.services { - writeln!(indent3, "{svc}")?; - } - } - } - Err(e) => { - writeln!( - indent2, - "failed to retrieve SMF services in maintenance: {e}" - )?; - } - } - } - f = indented.into_inner(); + display_health_monitor(health_monitor, f)?; } Ok(()) } @@ -1122,6 +1091,84 @@ fn collect_config_reconciler_errors( .collect() } +fn display_health_monitor( + health_monitor: &HealthMonitorInventory, + f: &mut dyn fmt::Write, +) -> fmt::Result { + let HealthMonitorInventory { smf_services_in_maintenance } = health_monitor; + + writeln!(f, "\nHEALTH MONITOR")?; + + let mut indented = IndentWriter::new(" ", f); + + match &smf_services_in_maintenance { + Ok(svcs) => { + if !svcs.is_empty() { + let SvcsInMaintenanceResult { + services, + errors, + time_of_status, + } = svcs; + let time = if let Some(t) = time_of_status { + t.to_rfc3339_opts( + SecondsFormat::Millis, + /* use_z */ true, + ) + } else { + "unknown time".to_string() + }; + + writeln!( + indented, + "{} SMF services in maintenance at {}", + svcs.services.len(), + time + )?; + + #[derive(Tabled)] + #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] + struct SvcRow { + fmri: String, + zone: String, + } + let rows = services.iter().map(|s| SvcRow { + fmri: s.fmri.clone(), + zone: s.zone.clone(), + }); + let table = tabled::Table::new(rows) + .with(tabled::settings::Style::empty()) + .with(tabled::settings::Padding::new(4, 1, 0, 0)) + .to_string(); + writeln!(indented, "{table}")?; + + if !errors.is_empty() { + writeln!( + indented, + "\nfound errors when retrieving services in maintenance:" + )?; + let mut indent2 = IndentWriter::new(" ", &mut indented); + for e in errors { + writeln!(indent2, "{e}")?; + } + } + } else { + writeln!( + indented, + "no data on SMF services in maintenance has been collected" + )?; + } + } + Err(e) => { + writeln!( + indented, + "failed to retrieve SMF services in maintenance: {e}" + )?; + } + }; + + Ok(()) +} + fn display_sled_config( label: &str, config: &OmicronSledConfig, diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index 42d4861d4b4..a2feba8796a 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -170,7 +170,8 @@ impl SledAgent { // TODO-K: Uncomment // let health_monitor = HealthMonitorHandle::stub(); - let health_monitor = crate::long_running_tasks::spawn_health_monitor_tasks(&log).await; + let health_monitor = + crate::long_running_tasks::spawn_health_monitor_tasks(&log).await; Arc::new(SledAgent { id, From 3b5e7799b2a7f0fe5b6117049e23de7ee1248ea9 Mon Sep 17 00:00:00 2001 From: karencfv Date: Tue, 6 Jan 2026 20:03:43 +1300 Subject: [PATCH 04/20] clean up and disable svcs in sim --- .../db-queries/src/db/datastore/inventory.rs | 38 ++----------------- nexus/types/src/inventory/display.rs | 1 + sled-agent/src/sim/sled_agent.rs | 8 ++-- 3 files changed, 8 insertions(+), 39 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index bba42438f2f..feba0be3c7d 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -2793,48 +2793,15 @@ impl DataStore { public_error_from_diesel(e, ErrorHandler::Server) })?; paginator = - p.found_batch(&batch, &|row| (row.sled_id, row.id.clone())); + p.found_batch(&batch, &|row| (row.sled_id, row.id)); for svc in batch { svcs.entry(svc.sled_id.into_untyped_uuid()) .or_default() - .push(svc.into()); + .push(svc); } } svcs }; - // - // TODO-K: This is wrong. We want a vector of services, not just one - // let mut svcs_in_maintenance_by_sled = { - // use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance::dsl; - // - // let mut results: BTreeMap = BTreeMap::new(); - // - // let mut paginator = Paginator::new( - // batch_size, - // dropshot::PaginationOrder::Ascending, - // ); - // while let Some(p) = paginator.next() { - // let batch = paginated( - // dsl::inv_health_monitor_svc_in_maintenance, - // dsl::sled_id, - // &p.current_pagparams(), - // ) - // .filter(dsl::inv_collection_id.eq(db_id)) - // .select(InvSvcInMaintenance::as_select()) - // .load_async(&*conn) - // .await - // .map_err(|e| { - // public_error_from_diesel(e, ErrorHandler::Server) - // })?; - // paginator = p.found_batch(&batch, &|row| row.sled_id); - // - // for row in batch { - // results.insert(row.sled_id.into(), row); - // } - // } - // - // results - // }; // Collect the unique baseboard ids referenced by SPs, RoTs, and Sled // Agents. @@ -4127,6 +4094,7 @@ impl DataStore { }); if let Some(svcs) = svcs_in_maintenance { + // TODO-K: removeme println!("DEBUG {svcs:?}"); health_monitor.smf_services_in_maintenance = svcs }; diff --git a/nexus/types/src/inventory/display.rs b/nexus/types/src/inventory/display.rs index f21d283cc47..05b4341599c 100644 --- a/nexus/types/src/inventory/display.rs +++ b/nexus/types/src/inventory/display.rs @@ -1152,6 +1152,7 @@ fn display_health_monitor( } } } else { + // TODO-K: Should we record time even if no svcs in maintenance were found? writeln!( indented, "no data on SMF services in maintenance has been collected" diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index a2feba8796a..fc0063096e8 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -168,10 +168,10 @@ impl SledAgent { .await .start(&log, &config.dropshot); - // TODO-K: Uncomment - // let health_monitor = HealthMonitorHandle::stub(); - let health_monitor = - crate::long_running_tasks::spawn_health_monitor_tasks(&log).await; + // TODO-K: Uncomment and remove long running task + let health_monitor = HealthMonitorHandle::stub(); + //let health_monitor = + // crate::long_running_tasks::spawn_health_monitor_tasks(&log).await; Arc::new(SledAgent { id, From c6432017cd2c705b83c4686db7c9419e801c3a03 Mon Sep 17 00:00:00 2001 From: karencfv Date: Tue, 6 Jan 2026 20:06:26 +1300 Subject: [PATCH 05/20] expectorate --- .../reconfigurator-cli/tests/output/cmds-example-stdout | 9 +++++++++ .../tests/output/cmds-mupdate-update-flow-stdout | 9 +++++++++ .../tests/output/cmds-nexus-generation-autobump-stdout | 9 +++++++++ .../tests/output/cmds-target-release-stdout | 9 +++++++++ .../tests/output/cmds-unsafe-zone-mgs-stdout | 9 +++++++++ 5 files changed, 45 insertions(+) diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout index 83aa46e3d5c..1de518982b0 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout @@ -1592,6 +1592,9 @@ LEDGERED SLED CONFIG all zones reconciled successfully reconciler task status: idle (finished at after running for s) +HEALTH MONITOR + no data on SMF services in maintenance has been collected + sled 32d8d836-4d8a-4e54-8fa9-f31d79c42646 (role = Gimlet, serial serial2) found at: from fake sled agent address: [fd00:1122:3344:103::1]:12345 @@ -1719,6 +1722,9 @@ LEDGERED SLED CONFIG all zones reconciled successfully reconciler task status: idle (finished at after running for s) +HEALTH MONITOR + no data on SMF services in maintenance has been collected + sled 89d02b1b-478c-401a-8e28-7a26f74fa41b (role = Gimlet, serial serial0) found at: from fake sled agent address: [fd00:1122:3344:101::1]:12345 @@ -1939,6 +1945,9 @@ LEDGERED SLED CONFIG all zones reconciled successfully reconciler task status: idle (finished at after running for s) +HEALTH MONITOR + no data on SMF services in maintenance has been collected + KEEPER MEMBERSHIP no membership retrieved diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout index 25242992fcd..f8a9c9a0680 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout @@ -314,6 +314,9 @@ LEDGERED SLED CONFIG all zones reconciled successfully reconciler task status: idle (finished at after running for s) +HEALTH MONITOR + no data on SMF services in maintenance has been collected + sled 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6 (role = Gimlet, serial serial0) found at: from fake sled agent address: [fd00:1122:3344:101::1]:12345 @@ -432,6 +435,9 @@ LEDGERED SLED CONFIG all zones reconciled successfully reconciler task status: idle (finished at after running for s) +HEALTH MONITOR + no data on SMF services in maintenance has been collected + sled d81c6a84-79b8-4958-ae41-ea46c9b19763 (role = Gimlet, serial serial2) found at: from fake sled agent address: [fd00:1122:3344:103::1]:12345 @@ -539,6 +545,9 @@ LEDGERED SLED CONFIG all zones reconciled successfully reconciler task status: idle (finished at after running for s) +HEALTH MONITOR + no data on SMF services in maintenance has been collected + KEEPER MEMBERSHIP no membership retrieved diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-nexus-generation-autobump-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-nexus-generation-autobump-stdout index 28743676866..d4f01859add 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-nexus-generation-autobump-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-nexus-generation-autobump-stdout @@ -702,6 +702,9 @@ LEDGERED SLED CONFIG all zones reconciled successfully reconciler task status: idle (finished at after running for s) +HEALTH MONITOR + no data on SMF services in maintenance has been collected + sled 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6 (role = Gimlet, serial serial0) found at: from fake sled agent address: [fd00:1122:3344:101::1]:12345 @@ -868,6 +871,9 @@ LEDGERED SLED CONFIG all zones reconciled successfully reconciler task status: idle (finished at after running for s) +HEALTH MONITOR + no data on SMF services in maintenance has been collected + sled d81c6a84-79b8-4958-ae41-ea46c9b19763 (role = Gimlet, serial serial2) found at: from fake sled agent address: [fd00:1122:3344:103::1]:12345 @@ -1034,6 +1040,9 @@ LEDGERED SLED CONFIG all zones reconciled successfully reconciler task status: idle (finished at after running for s) +HEALTH MONITOR + no data on SMF services in maintenance has been collected + KEEPER MEMBERSHIP no membership retrieved diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-target-release-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-target-release-stdout index 952056fb802..bfc8a125487 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-target-release-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-target-release-stdout @@ -689,6 +689,9 @@ LEDGERED SLED CONFIG all zones reconciled successfully reconciler task status: idle (finished at after running for s) +HEALTH MONITOR + no data on SMF services in maintenance has been collected + sled 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6 (role = Gimlet, serial serial0) found at: from fake sled agent address: [fd00:1122:3344:101::1]:12345 @@ -855,6 +858,9 @@ LEDGERED SLED CONFIG all zones reconciled successfully reconciler task status: idle (finished at after running for s) +HEALTH MONITOR + no data on SMF services in maintenance has been collected + sled d81c6a84-79b8-4958-ae41-ea46c9b19763 (role = Gimlet, serial serial2) found at: from fake sled agent address: [fd00:1122:3344:103::1]:12345 @@ -1021,6 +1027,9 @@ LEDGERED SLED CONFIG all zones reconciled successfully reconciler task status: idle (finished at after running for s) +HEALTH MONITOR + no data on SMF services in maintenance has been collected + KEEPER MEMBERSHIP no membership retrieved diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-unsafe-zone-mgs-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-unsafe-zone-mgs-stdout index 82562dc16a7..18efc9f2a04 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-unsafe-zone-mgs-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-unsafe-zone-mgs-stdout @@ -673,6 +673,9 @@ LEDGERED SLED CONFIG all zones reconciled successfully reconciler task status: idle (finished at after running for s) +HEALTH MONITOR + no data on SMF services in maintenance has been collected + sled 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6 (role = Gimlet, serial serial0) found at: from fake sled agent address: [fd00:1122:3344:101::1]:12345 @@ -839,6 +842,9 @@ LEDGERED SLED CONFIG all zones reconciled successfully reconciler task status: idle (finished at after running for s) +HEALTH MONITOR + no data on SMF services in maintenance has been collected + sled d81c6a84-79b8-4958-ae41-ea46c9b19763 (role = Gimlet, serial serial2) found at: from fake sled agent address: [fd00:1122:3344:103::1]:12345 @@ -1005,6 +1011,9 @@ LEDGERED SLED CONFIG all zones reconciled successfully reconciler task status: idle (finished at after running for s) +HEALTH MONITOR + no data on SMF services in maintenance has been collected + KEEPER MEMBERSHIP no membership retrieved From 9b1ac94f87db0a8fb303ca290de12c9fd83e89b3 Mon Sep 17 00:00:00 2001 From: karencfv Date: Tue, 6 Jan 2026 20:07:30 +1300 Subject: [PATCH 06/20] fmt --- nexus/db-queries/src/db/datastore/inventory.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index feba0be3c7d..7243b7fc57d 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -2792,8 +2792,7 @@ impl DataStore { .map_err(|e| { public_error_from_diesel(e, ErrorHandler::Server) })?; - paginator = - p.found_batch(&batch, &|row| (row.sled_id, row.id)); + paginator = p.found_batch(&batch, &|row| (row.sled_id, row.id)); for svc in batch { svcs.entry(svc.sled_id.into_untyped_uuid()) .or_default() From aa071e02333cc9117156f3aaeb286739c5dccb03 Mon Sep 17 00:00:00 2001 From: karencfv Date: Wed, 7 Jan 2026 20:42:00 +1300 Subject: [PATCH 07/20] fix display bug --- .../db-queries/src/db/datastore/inventory.rs | 41 ++++++++++++++----- nexus/types/src/inventory/display.rs | 36 ++++++++-------- 2 files changed, 49 insertions(+), 28 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index 7243b7fc57d..47bb56db94b 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -216,15 +216,30 @@ impl DataStore { for sled_agent in &collection.sled_agents { match &sled_agent.health_monitor.smf_services_in_maintenance { Ok(svcs) => { - for svc in &svcs.services { + // When there are no services in maintenance, we will still + // want to insert a row with the time the health check was + // made and any parsing errors we may have collected. + if svcs.services.is_empty() && svcs.time_of_status.is_some() + { svcs_in_maintenance.push(InvSvcInMaintenance::new( collection_id, sled_agent.sled_id, - Some(svc.clone()), + None, svcs.errors.clone(), None, svcs.time_of_status, )); + } else { + for svc in &svcs.services { + svcs_in_maintenance.push(InvSvcInMaintenance::new( + collection_id, + sled_agent.sled_id, + Some(svc.clone()), + svcs.errors.clone(), + None, + svcs.time_of_status, + )); + } } } Err(e) => svcs_in_maintenance.push(InvSvcInMaintenance::new( @@ -2779,7 +2794,6 @@ impl DataStore { dropshot::PaginationOrder::Ascending, ); while let Some(p) = paginator.next() { - // TODO-K: Do I actually need paginated multicolumn? let batch: Vec = paginated_multicolumn( dsl::inv_health_monitor_svc_in_maintenance, (dsl::sled_id, dsl::id), @@ -4063,13 +4077,21 @@ impl DataStore { let svcs_in_maintenance = svcs_in_maintenance_by_sled .remove(&sled_id.into_untyped_uuid()) - .map(|svcs| { + .map(|rows| { // TODO-K: Clean up - if let Some(e) = svcs[0].svcs_cmd_error.clone() { + if let Some(e) = rows[0].svcs_cmd_error.clone() { return Err(e); } + let mut services = vec![]; - for svc in &svcs { + for svc in &rows { + if svc.fmri.is_none() && svc.zone.is_none() { + continue; + } + + // All rows should have both zone and FMRI populated or + // none at all. Nevertheless, we'll handle the case of a + // partially populated row. let fmri = if let Some(f) = svc.fmri.clone() { f } else { @@ -4081,14 +4103,13 @@ impl DataStore { "".to_string() }; - let service = SvcInMaintenance { fmri, zone }; - services.push(service) + services.push(SvcInMaintenance { fmri, zone }) } Ok(SvcsInMaintenanceResult { services, - errors: svcs[0].error_messages.clone(), - time_of_status: svcs[0].time_of_status, + errors: rows[0].error_messages.clone(), + time_of_status: rows[0].time_of_status, }) }); diff --git a/nexus/types/src/inventory/display.rs b/nexus/types/src/inventory/display.rs index 05b4341599c..0bd2f6f679b 100644 --- a/nexus/types/src/inventory/display.rs +++ b/nexus/types/src/inventory/display.rs @@ -1121,26 +1121,27 @@ fn display_health_monitor( writeln!( indented, "{} SMF services in maintenance at {}", - svcs.services.len(), + services.len(), time )?; - #[derive(Tabled)] - #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] - struct SvcRow { - fmri: String, - zone: String, - } - let rows = services.iter().map(|s| SvcRow { - fmri: s.fmri.clone(), - zone: s.zone.clone(), - }); - let table = tabled::Table::new(rows) - .with(tabled::settings::Style::empty()) - .with(tabled::settings::Padding::new(4, 1, 0, 0)) - .to_string(); - writeln!(indented, "{table}")?; - + if !services.is_empty() { + #[derive(Tabled)] + #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] + struct SvcRow { + fmri: String, + zone: String, + } + let rows = services.iter().map(|s| SvcRow { + fmri: s.fmri.clone(), + zone: s.zone.clone(), + }); + let table = tabled::Table::new(rows) + .with(tabled::settings::Style::empty()) + .with(tabled::settings::Padding::new(4, 1, 0, 0)) + .to_string(); + writeln!(indented, "{table}")?; + }; if !errors.is_empty() { writeln!( indented, @@ -1152,7 +1153,6 @@ fn display_health_monitor( } } } else { - // TODO-K: Should we record time even if no svcs in maintenance were found? writeln!( indented, "no data on SMF services in maintenance has been collected" From e3f0a1ff31c0ea2de8df0de6518343d1b73d4b9c Mon Sep 17 00:00:00 2001 From: karencfv Date: Thu, 8 Jan 2026 16:56:55 +1300 Subject: [PATCH 08/20] Clean up --- illumos-utils/src/svcs.rs | 10 -- nexus/db-model/src/inventory.rs | 3 - .../db-queries/src/db/datastore/inventory.rs | 116 +++++++++--------- sled-agent/src/sim/sled_agent.rs | 6 +- 4 files changed, 58 insertions(+), 77 deletions(-) diff --git a/illumos-utils/src/svcs.rs b/illumos-utils/src/svcs.rs index 42abf8b4bf5..1ea8eac69f1 100644 --- a/illumos-utils/src/svcs.rs +++ b/illumos-utils/src/svcs.rs @@ -19,7 +19,6 @@ use serde::Deserialize; use serde::Serialize; use slog::Logger; use slog::{error, info}; -use std::fmt::Display; #[cfg(target_os = "illumos")] use tokio::process::Command; @@ -195,7 +194,6 @@ impl From for SvcState { } } -// TODO-K: Ugh, I think this might need to be versioned and moved out of here? #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, JsonSchema)] #[serde(rename_all = "snake_case")] /// Information about an SMF service that is enabled but not running @@ -211,14 +209,6 @@ impl SvcInMaintenance { } } -impl Display for SvcInMaintenance { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let SvcInMaintenance { fmri, zone } = self; - - writeln!(f, "FMRI: {} zone: {}", fmri, zone) - } -} - #[cfg(test)] mod tests { use super::*; diff --git a/nexus/db-model/src/inventory.rs b/nexus/db-model/src/inventory.rs index a6c59ee9e8e..239de843876 100644 --- a/nexus/db-model/src/inventory.rs +++ b/nexus/db-model/src/inventory.rs @@ -1018,19 +1018,16 @@ impl_enum_type!( Idle => b"idle" ); -// TODO-K: add docs and move type elsewhere? #[derive(Queryable, Clone, Debug, Selectable, Insertable)] #[diesel(table_name = inv_health_monitor_svc_in_maintenance)] pub struct InvSvcInMaintenance { pub inv_collection_id: DbTypedUuid, pub sled_id: DbTypedUuid, - // TODO-K: Is this ID necessary? pub id: DbTypedUuid, pub fmri: Option, pub zone: Option, pub error_messages: Vec, pub svcs_cmd_error: Option, - // TODO-K: Check if this needs to be an option pub time_of_status: Option>, } diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index 47bb56db94b..8fd72bc3162 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -209,49 +209,55 @@ impl DataStore { } } - // TODO-K: Clean up // Pull services in maintenance out of all sled agents - let mut svcs_in_maintenance = vec![]; - - for sled_agent in &collection.sled_agents { - match &sled_agent.health_monitor.smf_services_in_maintenance { - Ok(svcs) => { + let svcs_in_maintenance: Vec<_> = collection + .sled_agents + .iter() + .flat_map(|sled_agent| { + match &sled_agent.health_monitor.smf_services_in_maintenance { // When there are no services in maintenance, we will still // want to insert a row with the time the health check was // made and any parsing errors we may have collected. - if svcs.services.is_empty() && svcs.time_of_status.is_some() + Ok(svcs) + if svcs.services.is_empty() + && svcs.time_of_status.is_some() => { - svcs_in_maintenance.push(InvSvcInMaintenance::new( + vec![InvSvcInMaintenance::new( collection_id, sled_agent.sled_id, None, svcs.errors.clone(), None, svcs.time_of_status, - )); - } else { - for svc in &svcs.services { - svcs_in_maintenance.push(InvSvcInMaintenance::new( + )] + } + Ok(svcs) => svcs + .services + .iter() + .map(|svc| { + InvSvcInMaintenance::new( collection_id, sled_agent.sled_id, Some(svc.clone()), svcs.errors.clone(), None, svcs.time_of_status, - )); - } + ) + }) + .collect(), + Err(e) => { + vec![InvSvcInMaintenance::new( + collection_id, + sled_agent.sled_id, + None, + vec![], + Some(e.to_string()), + None, + )] } } - Err(e) => svcs_in_maintenance.push(InvSvcInMaintenance::new( - collection_id, - sled_agent.sled_id, - None, - vec![], - Some(e.to_string()), - None, - )), - } - } + }) + .collect(); // Pull disks out of all sled agents let disks: Vec<_> = collection @@ -2656,7 +2662,6 @@ impl DataStore { disk_firmware }; - // TODO-K: Take inspiration here // Mapping of "Sled ID" -> "All disks reported by that sled" let physical_disks: BTreeMap< SledUuid, @@ -2711,7 +2716,6 @@ impl DataStore { disks }; - // TODO-K: get inspiration ID here // Mapping of "Sled ID" -> "All zpools reported by that sled" let zpools: BTreeMap> = { use nexus_db_schema::schema::inv_zpool::dsl; @@ -2782,7 +2786,6 @@ impl DataStore { datasets }; - // TODO-K: fix // Mapping of "Sled ID" -> "All SMF services in maintenance reported by // that sled" let mut svcs_in_maintenance_by_sled = { @@ -4071,56 +4074,49 @@ impl DataStore { )) })?; - // TODO-K; Clean up // Convert all health checks into a full `HealthMonitorInventory` let mut health_monitor = HealthMonitorInventory::new(); let svcs_in_maintenance = svcs_in_maintenance_by_sled .remove(&sled_id.into_untyped_uuid()) .map(|rows| { - // TODO-K: Clean up - if let Some(e) = rows[0].svcs_cmd_error.clone() { - return Err(e); + // Get metadata from the first row. All rows from the same + // collection and sled will share time_of_status, + // svcs_cmd_error and error_messages. + let first_row = + rows.first().expect("rows should not be empty"); + + // Check if the svcs command itself failed first. If so, we + // can safely assume no services in maintenance have been + // reported and return an error. + if let Some(e) = &first_row.svcs_cmd_error { + return Err(e.clone()); } - let mut services = vec![]; - for svc in &rows { - if svc.fmri.is_none() && svc.zone.is_none() { - continue; - } - - // All rows should have both zone and FMRI populated or - // none at all. Nevertheless, we'll handle the case of a - // partially populated row. - let fmri = if let Some(f) = svc.fmri.clone() { - f - } else { - "".to_string() - }; - let zone = if let Some(z) = svc.zone.clone() { - z - } else { - "".to_string() - }; - - services.push(SvcInMaintenance { fmri, zone }) - } + // Convert database rows to service in maintenance entries. + // All rows should have both zone and FMRI populated or none + // at all. Nevertheless, we'll handle the case of a + // partially populated row. + let services: Vec = rows + .iter() + .filter(|svc| svc.fmri.is_some() || svc.zone.is_some()) + .map(|svc| SvcInMaintenance { + fmri: svc.fmri.clone().unwrap_or_default(), + zone: svc.zone.clone().unwrap_or_default(), + }) + .collect(); Ok(SvcsInMaintenanceResult { services, - errors: rows[0].error_messages.clone(), - time_of_status: rows[0].time_of_status, + errors: first_row.error_messages.clone(), + time_of_status: first_row.time_of_status, }) }); if let Some(svcs) = svcs_in_maintenance { - // TODO-K: removeme - println!("DEBUG {svcs:?}"); health_monitor.smf_services_in_maintenance = svcs }; - // TODO-K: End of clean up bit - let sled_agent = nexus_types::inventory::SledAgent { time_collected: s.time_collected, source: s.source, @@ -4157,8 +4153,6 @@ impl DataStore { reconciler_status, last_reconciliation, zone_image_resolver, - // TODO-K[omicron#9516]: Actually query the DB when there is - // something there health_monitor, }; sled_agents diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index fc0063096e8..d2bb3f3506c 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -169,9 +169,9 @@ impl SledAgent { .start(&log, &config.dropshot); // TODO-K: Uncomment and remove long running task - let health_monitor = HealthMonitorHandle::stub(); - //let health_monitor = - // crate::long_running_tasks::spawn_health_monitor_tasks(&log).await; + // let health_monitor = HealthMonitorHandle::stub(); + let health_monitor = + crate::long_running_tasks::spawn_health_monitor_tasks(&log).await; Arc::new(SledAgent { id, From 4b80284b715b5f6d2a7c196122e52f6303508a6f Mon Sep 17 00:00:00 2001 From: karencfv Date: Thu, 8 Jan 2026 19:05:44 +1300 Subject: [PATCH 09/20] Ability to remove rows from collection --- .../db-queries/src/db/datastore/inventory.rs | 17 ++++++++-- nexus/inventory/src/examples.rs | 32 ++++++++++++++++--- 2 files changed, 41 insertions(+), 8 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index 8fd72bc3162..a04ff5eb145 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -1950,6 +1950,7 @@ impl DataStore { nmupdate_override_non_boot: usize, nconfig_reconcilers: usize, nboot_partitions: usize, + nhealth_monitor_svc_in_maintenance: usize, nomicron_sled_configs: usize, nomicron_sled_config_disks: usize, nomicron_sled_config_datasets: usize, @@ -1984,6 +1985,7 @@ impl DataStore { nmupdate_override_non_boot, nconfig_reconcilers, nboot_partitions, + nhealth_monitor_svc_in_maintenance, nomicron_sled_configs, nomicron_sled_config_disks, nomicron_sled_config_datasets, @@ -2188,6 +2190,16 @@ impl DataStore { .await? }; + // Remove rows associated with the health monitor + let nhealth_monitor_svc_in_maintenance = { + use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance::dsl; + diesel::delete(dsl::inv_health_monitor_svc_in_maintenance.filter( + dsl::inv_collection_id.eq(db_collection_id), + )) + .execute_async(&conn) + .await? + }; + // Remove rows associated with `OmicronSledConfig`s. let nomicron_sled_configs = { use nexus_db_schema::schema::inv_omicron_sled_config::dsl; @@ -2295,8 +2307,6 @@ impl DataStore { .await? }; - // TODO-K: Remove rows for health monitor - Ok(NumRowsDeleted { ncollections, nsps, @@ -2318,6 +2328,7 @@ impl DataStore { nmupdate_override_non_boot, nconfig_reconcilers, nboot_partitions, + nhealth_monitor_svc_in_maintenance, nomicron_sled_configs, nomicron_sled_config_disks, nomicron_sled_config_datasets, @@ -2362,6 +2373,7 @@ impl DataStore { "nmupdate_override_non_boot" => nmupdate_override_non_boot, "nconfig_reconcilers" => nconfig_reconcilers, "nboot_partitions" => nboot_partitions, + "nhealth_monitor_svc_in_maintenance" => nhealth_monitor_svc_in_maintenance, "nomicron_sled_configs" => nomicron_sled_configs, "nomicron_sled_config_disks" => nomicron_sled_config_disks, "nomicron_sled_config_datasets" => nomicron_sled_config_datasets, @@ -2373,7 +2385,6 @@ impl DataStore { "ncockroach_status" => ncockroach_status, "nntp_timesync" => nntp_timesync, "ninternal_dns" => ninternal_dns, - // TODO-K: add health monitor rows here too ); Ok(()) diff --git a/nexus/inventory/src/examples.rs b/nexus/inventory/src/examples.rs index 06ccdf83571..3efa0220d9d 100644 --- a/nexus/inventory/src/examples.rs +++ b/nexus/inventory/src/examples.rs @@ -7,6 +7,7 @@ use crate::CollectionBuilder; use crate::now_db_precision; use camino::Utf8Path; +use chrono::Utc; use clickhouse_admin_types::keeper::ClickhouseKeeperClusterMembership; use clickhouse_admin_types::keeper::KeeperId; use gateway_client::types::PowerState; @@ -15,6 +16,8 @@ use gateway_client::types::SpComponentCaboose; use gateway_client::types::SpState; use gateway_types::rot::RotSlot; use iddqd::id_ord_map; +use illumos_utils::svcs::SvcInMaintenance; +use illumos_utils::svcs::SvcsInMaintenanceResult; use nexus_types::inventory::CabooseWhich; use nexus_types::inventory::InternalDnsGenerationStatus; use nexus_types::inventory::RotPage; @@ -573,6 +576,7 @@ pub fn representative() -> Representative { deserialized_zone_manifest: true, has_mupdate_override: true, }), + HealthMonitorInventory::new(), ), ) .unwrap(); @@ -605,6 +609,7 @@ pub fn representative() -> Representative { deserialized_zone_manifest: false, has_mupdate_override: false, }), + HealthMonitorInventory::new(), ), ) .unwrap(); @@ -635,13 +640,14 @@ pub fn representative() -> Representative { zone_image_resolver(ZoneImageResolverExampleKind::Mismatch { has_mupdate_override: true, }), + HealthMonitorInventory::new(), ), ) .unwrap(); // Finally, report a sled with unknown baseboard information. This should // look the same as the PC as far as inventory is concerned but let's verify - // it. + // it. Additionally, this sled will report a few SMF services in maintenance. let sled_agent_id_unknown = "5c5b4cf9-3e13-45fd-871c-f177d6537510".parse().unwrap(); @@ -660,6 +666,24 @@ pub fn representative() -> Representative { None, // Simulate an error here. zone_image_resolver(ZoneImageResolverExampleKind::Error), + HealthMonitorInventory { + smf_services_in_maintenance: Ok(SvcsInMaintenanceResult { + services: vec![ + SvcInMaintenance { + fmri: "svc:/site/fake-service:default" + .to_string(), + zone: "global".to_string(), + }, + SvcInMaintenance { + fmri: "svc:/site/fake-service2:default" + .to_string(), + zone: "global".to_string(), + }, + ], + errors: vec![], + time_of_status: Some(Utc::now()), + }), + }, ), ) .unwrap(); @@ -980,6 +1004,7 @@ pub fn sled_agent( datasets: Vec, ledgered_sled_config: Option, zone_image_resolver: ZoneImageResolverInventory, + health_monitor: HealthMonitorInventory, ) -> Inventory { // Assume the `ledgered_sled_config` was reconciled successfully. let last_reconciliation = ledgered_sled_config.clone().map(|config| { @@ -1041,9 +1066,6 @@ pub fn sled_agent( reconciler_status, last_reconciliation, zone_image_resolver, - // TODO-K: We'll want to have the functionality to add some services - // here in a future PR. This will be more useful when we add this - // information to the DB. - health_monitor: HealthMonitorInventory::new(), + health_monitor, } } From ed3ed7fee7a4db0cb6b3c348edd75bb7feb6c558 Mon Sep 17 00:00:00 2001 From: karencfv Date: Thu, 8 Jan 2026 19:19:13 +1300 Subject: [PATCH 10/20] Disbale health monitor on simulated system --- sled-agent/src/long_running_tasks.rs | 3 +-- sled-agent/src/sim/sled_agent.rs | 5 +---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/sled-agent/src/long_running_tasks.rs b/sled-agent/src/long_running_tasks.rs index d27aaa8595e..700d4a08f4b 100644 --- a/sled-agent/src/long_running_tasks.rs +++ b/sled-agent/src/long_running_tasks.rs @@ -275,8 +275,7 @@ async fn spawn_bootstore_tasks( node_handle } -// TODO-K: Remove pub -pub async fn spawn_health_monitor_tasks(log: &Logger) -> HealthMonitorHandle { +async fn spawn_health_monitor_tasks(log: &Logger) -> HealthMonitorHandle { info!(log, "Starting health monitor"); let log = log.new(o!("component" => "HealthMonitor")); HealthMonitorHandle::spawn(log) diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index d2bb3f3506c..075dc655a0f 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -168,10 +168,7 @@ impl SledAgent { .await .start(&log, &config.dropshot); - // TODO-K: Uncomment and remove long running task - // let health_monitor = HealthMonitorHandle::stub(); - let health_monitor = - crate::long_running_tasks::spawn_health_monitor_tasks(&log).await; + let health_monitor = HealthMonitorHandle::stub(); Arc::new(SledAgent { id, From 4489b986401e9cf76bb98e442d28f59c155c5456 Mon Sep 17 00:00:00 2001 From: karencfv Date: Fri, 9 Jan 2026 10:51:31 +1300 Subject: [PATCH 11/20] fix tests --- nexus/inventory/src/examples.rs | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/nexus/inventory/src/examples.rs b/nexus/inventory/src/examples.rs index 3efa0220d9d..bf19668bded 100644 --- a/nexus/inventory/src/examples.rs +++ b/nexus/inventory/src/examples.rs @@ -7,7 +7,6 @@ use crate::CollectionBuilder; use crate::now_db_precision; use camino::Utf8Path; -use chrono::Utc; use clickhouse_admin_types::keeper::ClickhouseKeeperClusterMembership; use clickhouse_admin_types::keeper::KeeperId; use gateway_client::types::PowerState; @@ -668,20 +667,14 @@ pub fn representative() -> Representative { zone_image_resolver(ZoneImageResolverExampleKind::Error), HealthMonitorInventory { smf_services_in_maintenance: Ok(SvcsInMaintenanceResult { - services: vec![ - SvcInMaintenance { - fmri: "svc:/site/fake-service:default" - .to_string(), - zone: "global".to_string(), - }, - SvcInMaintenance { - fmri: "svc:/site/fake-service2:default" - .to_string(), - zone: "global".to_string(), - }, - ], + services: vec![SvcInMaintenance { + fmri: "svc:/site/fake-service:default".to_string(), + zone: "global".to_string(), + }], errors: vec![], - time_of_status: Some(Utc::now()), + time_of_status: Some( + "2026-01-01T00:00:00Z".parse().unwrap(), + ), }), }, ), From 335832e38854a5101512a9606455ba92bba2328e Mon Sep 17 00:00:00 2001 From: karencfv Date: Fri, 9 Jan 2026 12:12:19 +1300 Subject: [PATCH 12/20] fmt --- nexus/db-model/src/inventory.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nexus/db-model/src/inventory.rs b/nexus/db-model/src/inventory.rs index b54002c8a9c..aaffca12703 100644 --- a/nexus/db-model/src/inventory.rs +++ b/nexus/db-model/src/inventory.rs @@ -38,8 +38,8 @@ use nexus_db_schema::schema::{ inv_health_monitor_svc_in_maintenance, inv_host_phase_1_active_slot, inv_host_phase_1_flash_hash, inv_internal_dns, inv_last_reconciliation_dataset_result, - inv_last_reconciliation_disk_result, - inv_last_reconciliation_orphaned_dataset, inv_last_reconciliation_measurements, + inv_last_reconciliation_disk_result, inv_last_reconciliation_measurements, + inv_last_reconciliation_orphaned_dataset, inv_last_reconciliation_zone_result, inv_measurement_manifest_non_boot, inv_mupdate_override_non_boot, inv_ntp_timesync, inv_nvme_disk_firmware, inv_omicron_sled_config, inv_omicron_sled_config_dataset, From 9d0c632fd308da7cf6b2a29ed238b103562f35e5 Mon Sep 17 00:00:00 2001 From: karencfv Date: Wed, 14 Jan 2026 18:08:58 +1300 Subject: [PATCH 13/20] fixes after merge --- nexus/db-model/src/schema_versions.rs | 2 +- schema/crdb/dbinit.sql | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index a77ba1abe1f..d32dcd4a5a4 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock}; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: Version = Version::new(220, 0, 0); +pub const SCHEMA_VERSION: Version = Version::new(221, 0, 0); /// List of all past database schema versions, in *reverse* order /// diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 873f475a249..88a4d59c73d 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -7822,7 +7822,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '220.0.0', NULL) + (TRUE, NOW(), NOW(), '221.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; From b30c3b3bed5ae68d67f4640b25d70f7a973665a6 Mon Sep 17 00:00:00 2001 From: karencfv Date: Thu, 15 Jan 2026 10:42:15 +1300 Subject: [PATCH 14/20] new tables --- nexus/db-model/src/inventory.rs | 1 + .../db-queries/src/db/datastore/inventory.rs | 2 + nexus/db-schema/src/schema.rs | 27 ++++++++++ schema/crdb/dbinit.sql | 54 +++++++++++++++++++ .../up01.sql | 16 ++---- .../up02.sql | 16 ++++++ .../up03.sql | 13 +++++ .../up04.sql | 31 +++++++++++ 8 files changed, 148 insertions(+), 12 deletions(-) create mode 100644 schema/crdb/health-monitor-svcs-in-maintenance/up02.sql create mode 100644 schema/crdb/health-monitor-svcs-in-maintenance/up03.sql create mode 100644 schema/crdb/health-monitor-svcs-in-maintenance/up04.sql diff --git a/nexus/db-model/src/inventory.rs b/nexus/db-model/src/inventory.rs index aaffca12703..e2658927db3 100644 --- a/nexus/db-model/src/inventory.rs +++ b/nexus/db-model/src/inventory.rs @@ -1022,6 +1022,7 @@ impl_enum_type!( Idle => b"idle" ); +// TODO-K: Update here #[derive(Queryable, Clone, Debug, Selectable, Insertable)] #[diesel(table_name = inv_health_monitor_svc_in_maintenance)] pub struct InvSvcInMaintenance { diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index 04931f53054..a12dd85fe27 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -213,6 +213,7 @@ impl DataStore { } } + // TODO-K: Update here // Pull services in maintenance out of all sled agents let svcs_in_maintenance: Vec<_> = collection .sled_agents @@ -2946,6 +2947,7 @@ impl DataStore { datasets }; + // TODO-K: Update here // Mapping of "Sled ID" -> "All SMF services in maintenance reported by // that sled" let mut svcs_in_maintenance_by_sled = { diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index fd655d37e5b..1596800e9e5 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -1735,6 +1735,33 @@ table! { } } +table! { + inv_health_monitor_svc_in_maintenance2 (inv_collection_id, sled_id, svcs_in_maintenance_id) { + inv_collection_id -> Uuid, + sled_id -> Uuid, + svcs_in_maintenance_id -> Uuid, + svcs_cmd_error -> Nullable, + time_of_status -> Nullable, + } +} + +table! { + inv_health_monitor_svc_in_maintenance_service (svcs_in_maintenance_id, id) { + svcs_in_maintenance_id -> Uuid, + id -> Uuid, + fmri -> Text, + zone -> Text, + } +} + +table! { + inv_health_monitor_svc_in_maintenance_error (svcs_in_maintenance_id, id) { + svcs_in_maintenance_id -> Uuid, + id -> Uuid, + error_message -> Text, + } +} + table! { inv_sled_boot_partition (inv_collection_id, sled_id, boot_disk_slot) { inv_collection_id -> Uuid, diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 88a4d59c73d..98f769bc782 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -4091,6 +4091,60 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance PRIMARY KEY (inv_collection_id, sled_id, id) ); +-- TODO-K: rename table +CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance2 ( + -- where this observation came from + -- (foreign key into `inv_collection` table) + inv_collection_id UUID NOT NULL, + + -- unique id for this sled (should be foreign keys into `sled` table, though + -- it's conceivable a sled will report an id that we don't know about); + -- guaranteed to match a row in this collection's `inv_sled_agent` + sled_id UUID NOT NULL, + + -- unique id for each row + svcs_in_maintenance_id UUID NOT NULL, + + -- error when calling the svcs command + svcs_cmd_error TEXT, + + -- time when the status was checked if applicable + time_of_status TIMESTAMPTZ, + + PRIMARY KEY (inv_collection_id, sled_id, svcs_in_maintenance_id) +); + +CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance_service ( + -- where this observation came from + -- (foreign key into `inv_health_monitor_svc_in_maintenance` table) + svcs_in_maintenance_id UUID NOT NULL, + + -- unique id for each row + id UUID NOT NULL, + + -- FMRI of the SMF service in maintenance + fmri TEXT, + + -- zone the SMF service in maintenance is located in + zone TEXT, + + PRIMARY KEY (svcs_in_maintenance_id, id) +); + +CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance_error ( + -- where this observation came from + -- (foreign key into `inv_health_monitor_svc_in_maintenance` table) + svcs_in_maintenance_id UUID NOT NULL, + + -- unique id for each row + id UUID NOT NULL, + + -- an error message found when retrieving the SMF services in maintenance + error_message TEXT, + + PRIMARY KEY (svcs_in_maintenance_id, id) +); + -- This type name starts with "clear_" for legacy reasons. Prefer "remove" in -- the future. CREATE TYPE IF NOT EXISTS omicron.public.clear_mupdate_override_boot_success diff --git a/schema/crdb/health-monitor-svcs-in-maintenance/up01.sql b/schema/crdb/health-monitor-svcs-in-maintenance/up01.sql index bf0b956d23e..579c19a0050 100644 --- a/schema/crdb/health-monitor-svcs-in-maintenance/up01.sql +++ b/schema/crdb/health-monitor-svcs-in-maintenance/up01.sql @@ -1,4 +1,5 @@ -CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance ( +-- TODO-K: rename table +CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance2 ( -- where this observation came from -- (foreign key into `inv_collection` table) inv_collection_id UUID NOT NULL, @@ -9,16 +10,7 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance sled_id UUID NOT NULL, -- unique id for each row - id UUID NOT NULL, - - -- FMRI of the SMF service in maintenance - fmri TEXT, - - -- zone the SMF service in maintenance is located in - zone TEXT, - - -- any error messages found when retrieving the SMF services in maintenance - error_messages TEXT ARRAY NOT NULL, + svcs_in_maintenance_id UUID NOT NULL, -- error when calling the svcs command svcs_cmd_error TEXT, @@ -26,5 +18,5 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance -- time when the status was checked if applicable time_of_status TIMESTAMPTZ, - PRIMARY KEY (inv_collection_id, sled_id, id) + PRIMARY KEY (inv_collection_id, sled_id, svcs_in_maintenance_id) ); \ No newline at end of file diff --git a/schema/crdb/health-monitor-svcs-in-maintenance/up02.sql b/schema/crdb/health-monitor-svcs-in-maintenance/up02.sql new file mode 100644 index 00000000000..1d408357ab5 --- /dev/null +++ b/schema/crdb/health-monitor-svcs-in-maintenance/up02.sql @@ -0,0 +1,16 @@ +CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance_service ( + -- where this observation came from + -- (foreign key into `inv_health_monitor_svc_in_maintenance` table) + svcs_in_maintenance_id UUID NOT NULL, + + -- unique id for each row + id UUID NOT NULL, + + -- FMRI of the SMF service in maintenance + fmri TEXT, + + -- zone the SMF service in maintenance is located in + zone TEXT, + + PRIMARY KEY (svcs_in_maintenance_id, id) +); \ No newline at end of file diff --git a/schema/crdb/health-monitor-svcs-in-maintenance/up03.sql b/schema/crdb/health-monitor-svcs-in-maintenance/up03.sql new file mode 100644 index 00000000000..076033ea586 --- /dev/null +++ b/schema/crdb/health-monitor-svcs-in-maintenance/up03.sql @@ -0,0 +1,13 @@ +CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance_error ( + -- where this observation came from + -- (foreign key into `inv_health_monitor_svc_in_maintenance` table) + svcs_in_maintenance_id UUID NOT NULL, + + -- unique id for each row + id UUID NOT NULL, + + -- an error message found when retrieving the SMF services in maintenance + error_message TEXT, + + PRIMARY KEY (svcs_in_maintenance_id, id) +); \ No newline at end of file diff --git a/schema/crdb/health-monitor-svcs-in-maintenance/up04.sql b/schema/crdb/health-monitor-svcs-in-maintenance/up04.sql new file mode 100644 index 00000000000..f25f130e913 --- /dev/null +++ b/schema/crdb/health-monitor-svcs-in-maintenance/up04.sql @@ -0,0 +1,31 @@ +-- TODO-K: Delete this file +CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance ( + -- where this observation came from + -- (foreign key into `inv_collection` table) + inv_collection_id UUID NOT NULL, + + -- unique id for this sled (should be foreign keys into `sled` table, though + -- it's conceivable a sled will report an id that we don't know about); + -- guaranteed to match a row in this collection's `inv_sled_agent` + sled_id UUID NOT NULL, + + -- unique id for each row + id UUID NOT NULL, + + -- FMRI of the SMF service in maintenance + fmri TEXT, + + -- zone the SMF service in maintenance is located in + zone TEXT, + + -- any error messages found when retrieving the SMF services in maintenance + error_messages TEXT ARRAY NOT NULL, + + -- error when calling the svcs command + svcs_cmd_error TEXT, + + -- time when the status was checked if applicable + time_of_status TIMESTAMPTZ, + + PRIMARY KEY (inv_collection_id, sled_id, id) +); \ No newline at end of file From 7d706c79247b50bdfdbeb309be7ccc7c60addbad Mon Sep 17 00:00:00 2001 From: karencfv Date: Tue, 20 Jan 2026 15:47:21 +1300 Subject: [PATCH 15/20] insert to new tables --- nexus/db-model/src/inventory.rs | 97 ++++++++++- .../db-queries/src/db/datastore/inventory.rs | 159 ++++++++++++++++++ nexus/db-schema/src/schema.rs | 2 + schema/crdb/dbinit.sql | 1 + .../up01.sql | 1 + sled-agent/src/long_running_tasks.rs | 3 +- sled-agent/src/sim/sled_agent.rs | 5 +- 7 files changed, 265 insertions(+), 3 deletions(-) diff --git a/nexus/db-model/src/inventory.rs b/nexus/db-model/src/inventory.rs index e2658927db3..96acf95ae9e 100644 --- a/nexus/db-model/src/inventory.rs +++ b/nexus/db-model/src/inventory.rs @@ -35,7 +35,10 @@ use nexus_db_schema::schema::inv_zone_manifest_zone; use nexus_db_schema::schema::{ hw_baseboard_id, inv_caboose, inv_clickhouse_keeper_membership, inv_cockroachdb_status, inv_collection, inv_collection_error, inv_dataset, - inv_health_monitor_svc_in_maintenance, inv_host_phase_1_active_slot, + inv_health_monitor_svc_in_maintenance, + inv_health_monitor_svc_in_maintenance_error, + inv_health_monitor_svc_in_maintenance_service, + inv_health_monitor_svc_in_maintenance2, inv_host_phase_1_active_slot, inv_host_phase_1_flash_hash, inv_internal_dns, inv_last_reconciliation_dataset_result, inv_last_reconciliation_disk_result, inv_last_reconciliation_measurements, @@ -1068,6 +1071,98 @@ impl InvSvcInMaintenance { } } +#[derive(Queryable, Clone, Debug, Selectable, Insertable)] +#[diesel(table_name = inv_health_monitor_svc_in_maintenance2)] +pub struct InvSvcInMaintenance2 { + pub inv_collection_id: DbTypedUuid, + pub sled_id: DbTypedUuid, + pub svcs_in_maintenance_id: DbTypedUuid, + pub svcs_cmd_error: Option, + // TODO-K: This will change to not nullable with omicron#9615 + pub time_of_status: Option>, +} + +impl InvSvcInMaintenance2 { + pub fn new( + inv_collection_id: CollectionUuid, + sled_id: SledUuid, + svcs_cmd_error: Option, + time_of_status: Option>, + ) -> Self { + // This ID is only used as a primary key, it's fine to generate it here. + // TODO-K: Is it? + let svcs_in_maintenance_id = to_db_typed_uuid( + SvcInMaintenanceUuid::from_untyped_uuid(Uuid::new_v4()), + ); + + Self { + inv_collection_id: inv_collection_id.into(), + sled_id: sled_id.into(), + svcs_in_maintenance_id, + svcs_cmd_error, + time_of_status, + } + } +} + +#[derive(Queryable, Clone, Debug, Selectable, Insertable)] +#[diesel(table_name = inv_health_monitor_svc_in_maintenance_service)] +pub struct InvSvcInMaintenanceService { + pub svcs_in_maintenance_id: DbTypedUuid, + // TODO-K: Change the UUID kind + pub id: DbTypedUuid, + pub fmri: String, + pub zone: String, +} + +impl InvSvcInMaintenanceService { + pub fn new( + svcs_in_maintenance_id: SvcInMaintenanceUuid, + svc: SvcInMaintenance, + ) -> Self { + let SvcInMaintenance { fmri, zone } = svc; + + // This ID is only used as a primary key, it's fine to generate it here. + let id = to_db_typed_uuid(SvcInMaintenanceUuid::from_untyped_uuid( + Uuid::new_v4(), + )); + + Self { + svcs_in_maintenance_id: svcs_in_maintenance_id.into(), + id, + fmri, + zone, + } + } +} + +#[derive(Queryable, Clone, Debug, Selectable, Insertable)] +#[diesel(table_name = inv_health_monitor_svc_in_maintenance_error)] +pub struct InvSvcInMaintenanceError { + pub svcs_in_maintenance_id: DbTypedUuid, + // TODO-K: Change the UUID kind + pub id: DbTypedUuid, + pub error_message: String, +} + +impl InvSvcInMaintenanceError { + pub fn new( + svcs_in_maintenance_id: SvcInMaintenanceUuid, + error_message: String, + ) -> Self { + // This ID is only used as a primary key, it's fine to generate it here. + let id = to_db_typed_uuid(SvcInMaintenanceUuid::from_untyped_uuid( + Uuid::new_v4(), + )); + + Self { + svcs_in_maintenance_id: svcs_in_maintenance_id.into(), + id, + error_message, + } + } +} + /// See [`sled_agent_types::inventory::ConfigReconcilerInventory`]. #[derive(Queryable, Clone, Debug, Selectable, Insertable)] #[diesel(table_name = inv_sled_config_reconciler)] diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index a12dd85fe27..14850902208 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -65,6 +65,9 @@ use nexus_db_model::InvSledAgent; use nexus_db_model::InvSledBootPartition; use nexus_db_model::InvSledConfigReconciler; use nexus_db_model::InvSvcInMaintenance; +use nexus_db_model::InvSvcInMaintenance2; +use nexus_db_model::InvSvcInMaintenanceError; +use nexus_db_model::InvSvcInMaintenanceService; use nexus_db_model::InvZpool; use nexus_db_model::RotImageError; use nexus_db_model::SledRole; @@ -110,6 +113,7 @@ use omicron_uuid_kinds::OmicronSledConfigUuid; use omicron_uuid_kinds::OmicronZoneUuid; use omicron_uuid_kinds::PhysicalDiskUuid; use omicron_uuid_kinds::SledUuid; +use omicron_uuid_kinds::SvcInMaintenanceUuid; use sled_agent_types::inventory::BootPartitionContents; use sled_agent_types::inventory::BootPartitionDetails; use sled_agent_types::inventory::ConfigReconcilerInventory; @@ -264,6 +268,104 @@ impl DataStore { }) .collect(); + // TODO-K: Add a comment here and change the variable name to + // svcs_in_maintenance + let svcs_in_maintenance2: Vec<_> = collection + .sled_agents + .iter() + .flat_map(|sled_agent| { + match &sled_agent.health_monitor.smf_services_in_maintenance { + Ok(svcs) => { + vec![InvSvcInMaintenance2::new( + collection_id, + sled_agent.sled_id, + None, + svcs.time_of_status, + )] + } + Err(e) => { + vec![InvSvcInMaintenance2::new( + collection_id, + sled_agent.sled_id, + Some(e.to_string()), + // TODO-K: This will change to not nullable with omicron#9615 + // TODO-K: I'm guessing in this case I'll set a time + // at this point or something? + None, + )] + } + } + }) + .collect(); + + // TODO-K: Add a comment here + let svcs_in_maintenance_services: Vec<_> = collection + .sled_agents + .iter() + .flat_map(|sled_agent| { + match &sled_agent.health_monitor.smf_services_in_maintenance { + Ok(svcs) => { + // TODO-K: get the ID from the other table somehow + // TODO-K: Should I just use the collection ID and the + // sled ID instead? + let temp_id = to_db_typed_uuid( + SvcInMaintenanceUuid::from_untyped_uuid( + Uuid::new_v4(), + ), + ); + + svcs.services + .iter() + .map(|svc| { + InvSvcInMaintenanceService::new( + temp_id.into(), + svc.clone(), + ) + }) + .collect() + } + // If there is an error we've already captured it above + Err(_) => { + vec![] + } + } + }) + .collect(); + + // TODO-K: Add a comment here + let svcs_in_maintenance_errors: Vec<_> = collection + .sled_agents + .iter() + .flat_map(|sled_agent| { + match &sled_agent.health_monitor.smf_services_in_maintenance { + Ok(svcs) => { + // TODO-K: get the ID from the other table somehow + // TODO-K: Should I just use the collection ID and the + // sled ID instead? + let temp_id = to_db_typed_uuid( + SvcInMaintenanceUuid::from_untyped_uuid( + Uuid::new_v4(), + ), + ); + + svcs.errors + .iter() + .map(|e| { + InvSvcInMaintenanceError::new( + temp_id.into(), + e.clone(), + ) + }) + .collect() + } + // If there is an error we've already captured it above + Err(_) => { + vec![] + } + } + }) + .collect(); + // Pull disks out of all sled agents let disks: Vec<_> = collection .sled_agents @@ -1580,6 +1682,63 @@ impl DataStore { } } + // TODO-K: Add comment + { + use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance2::dsl; + + let batch_size = SQL_BATCH_SIZE.get().try_into().unwrap(); + let mut svcs_in_maintenance = svcs_in_maintenance2.into_iter(); + loop { + let some_svcs_in_maintenance = + svcs_in_maintenance.by_ref().take(batch_size).collect::>(); + if some_svcs_in_maintenance.is_empty() { + break; + } + let _ = diesel::insert_into(dsl::inv_health_monitor_svc_in_maintenance2) + .values(some_svcs_in_maintenance) + .execute_async(&conn) + .await?; + } + } + + // TODO-K: Add comment + { + use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance_service::dsl; + + let batch_size = SQL_BATCH_SIZE.get().try_into().unwrap(); + let mut svcs_in_maintenance_services = svcs_in_maintenance_services.into_iter(); + loop { + let some_svcs_in_maintenance_services = + svcs_in_maintenance_services.by_ref().take(batch_size).collect::>(); + if some_svcs_in_maintenance_services.is_empty() { + break; + } + let _ = diesel::insert_into(dsl::inv_health_monitor_svc_in_maintenance_service) + .values(some_svcs_in_maintenance_services) + .execute_async(&conn) + .await?; + } + } + + // TODO-K: Add comment + { + use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance_error::dsl; + + let batch_size = SQL_BATCH_SIZE.get().try_into().unwrap(); + let mut svcs_in_maintenance_errors = svcs_in_maintenance_errors.into_iter(); + loop { + let some_svcs_in_maintenance_errors = + svcs_in_maintenance_errors.by_ref().take(batch_size).collect::>(); + if some_svcs_in_maintenance_errors.is_empty() { + break; + } + let _ = diesel::insert_into(dsl::inv_health_monitor_svc_in_maintenance_error) + .values(some_svcs_in_maintenance_errors) + .execute_async(&conn) + .await?; + } + } + // Insert rows for the sled agents that we found. In practice, we'd // expect these to all have baseboards (if using Oxide hardware) or // none have baseboards (if not). diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index 6aec309bdb9..23fcbdc9a3f 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -1721,6 +1721,7 @@ table! { } } +// TODO-K: Make these table names plural? inv_health_monitor_svcs_in_maintenance table! { inv_health_monitor_svc_in_maintenance (inv_collection_id, sled_id, id) { inv_collection_id -> Uuid, @@ -1741,6 +1742,7 @@ table! { sled_id -> Uuid, svcs_in_maintenance_id -> Uuid, svcs_cmd_error -> Nullable, + // TODO-K: This will change to not nullable with omicron#9615 time_of_status -> Nullable, } } diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 11746a3c66d..c118825f08b 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -4109,6 +4109,7 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance2 svcs_cmd_error TEXT, -- time when the status was checked if applicable + -- TODO-K: This will change to not null with omicron#9615 time_of_status TIMESTAMPTZ, PRIMARY KEY (inv_collection_id, sled_id, svcs_in_maintenance_id) diff --git a/schema/crdb/health-monitor-svcs-in-maintenance/up01.sql b/schema/crdb/health-monitor-svcs-in-maintenance/up01.sql index 579c19a0050..f3049f411c5 100644 --- a/schema/crdb/health-monitor-svcs-in-maintenance/up01.sql +++ b/schema/crdb/health-monitor-svcs-in-maintenance/up01.sql @@ -16,6 +16,7 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance2 svcs_cmd_error TEXT, -- time when the status was checked if applicable + -- TODO-K: This will change to not null with omicron#9615 time_of_status TIMESTAMPTZ, PRIMARY KEY (inv_collection_id, sled_id, svcs_in_maintenance_id) diff --git a/sled-agent/src/long_running_tasks.rs b/sled-agent/src/long_running_tasks.rs index e42d3c1b0dd..994d390ad82 100644 --- a/sled-agent/src/long_running_tasks.rs +++ b/sled-agent/src/long_running_tasks.rs @@ -284,7 +284,8 @@ async fn spawn_bootstore_tasks( node_handle } -async fn spawn_health_monitor_tasks(log: &Logger) -> HealthMonitorHandle { +// TODO-K: Remove pub +pub async fn spawn_health_monitor_tasks(log: &Logger) -> HealthMonitorHandle { info!(log, "Starting health monitor"); let log = log.new(o!("component" => "HealthMonitor")); HealthMonitorHandle::spawn(log) diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index bb6e9c028e8..42baf924087 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -168,7 +168,10 @@ impl SledAgent { .await .start(&log, &config.dropshot); - let health_monitor = HealthMonitorHandle::stub(); + // TODO-K: Uncomment and remove + // let health_monitor = HealthMonitorHandle::stub(); + let health_monitor = + crate::long_running_tasks::spawn_health_monitor_tasks(&log).await; Arc::new(SledAgent { id, From 8b8393406d8cf8952a6c71321f24f94151022429 Mon Sep 17 00:00:00 2001 From: karencfv Date: Tue, 20 Jan 2026 16:34:40 +1300 Subject: [PATCH 16/20] change ids on the tables --- nexus/db-model/src/inventory.rs | 31 +++++---- .../db-queries/src/db/datastore/inventory.rs | 66 +++++++------------ nexus/db-schema/src/schema.rs | 14 ++-- schema/crdb/dbinit.sql | 26 +++++--- .../up01.sql | 4 +- .../up02.sql | 11 +++- .../up03.sql | 11 +++- 7 files changed, 87 insertions(+), 76 deletions(-) diff --git a/nexus/db-model/src/inventory.rs b/nexus/db-model/src/inventory.rs index 96acf95ae9e..1481271e412 100644 --- a/nexus/db-model/src/inventory.rs +++ b/nexus/db-model/src/inventory.rs @@ -1076,7 +1076,7 @@ impl InvSvcInMaintenance { pub struct InvSvcInMaintenance2 { pub inv_collection_id: DbTypedUuid, pub sled_id: DbTypedUuid, - pub svcs_in_maintenance_id: DbTypedUuid, + pub id: DbTypedUuid, pub svcs_cmd_error: Option, // TODO-K: This will change to not nullable with omicron#9615 pub time_of_status: Option>, @@ -1090,15 +1090,14 @@ impl InvSvcInMaintenance2 { time_of_status: Option>, ) -> Self { // This ID is only used as a primary key, it's fine to generate it here. - // TODO-K: Is it? - let svcs_in_maintenance_id = to_db_typed_uuid( - SvcInMaintenanceUuid::from_untyped_uuid(Uuid::new_v4()), - ); + let id = to_db_typed_uuid(SvcInMaintenanceUuid::from_untyped_uuid( + Uuid::new_v4(), + )); Self { inv_collection_id: inv_collection_id.into(), sled_id: sled_id.into(), - svcs_in_maintenance_id, + id, svcs_cmd_error, time_of_status, } @@ -1108,8 +1107,9 @@ impl InvSvcInMaintenance2 { #[derive(Queryable, Clone, Debug, Selectable, Insertable)] #[diesel(table_name = inv_health_monitor_svc_in_maintenance_service)] pub struct InvSvcInMaintenanceService { - pub svcs_in_maintenance_id: DbTypedUuid, - // TODO-K: Change the UUID kind + pub inv_collection_id: DbTypedUuid, + pub sled_id: DbTypedUuid, + // TODO-K: Change the UUID kind? pub id: DbTypedUuid, pub fmri: String, pub zone: String, @@ -1117,7 +1117,8 @@ pub struct InvSvcInMaintenanceService { impl InvSvcInMaintenanceService { pub fn new( - svcs_in_maintenance_id: SvcInMaintenanceUuid, + inv_collection_id: CollectionUuid, + sled_id: SledUuid, svc: SvcInMaintenance, ) -> Self { let SvcInMaintenance { fmri, zone } = svc; @@ -1128,7 +1129,8 @@ impl InvSvcInMaintenanceService { )); Self { - svcs_in_maintenance_id: svcs_in_maintenance_id.into(), + inv_collection_id: inv_collection_id.into(), + sled_id: sled_id.into(), id, fmri, zone, @@ -1139,7 +1141,8 @@ impl InvSvcInMaintenanceService { #[derive(Queryable, Clone, Debug, Selectable, Insertable)] #[diesel(table_name = inv_health_monitor_svc_in_maintenance_error)] pub struct InvSvcInMaintenanceError { - pub svcs_in_maintenance_id: DbTypedUuid, + pub inv_collection_id: DbTypedUuid, + pub sled_id: DbTypedUuid, // TODO-K: Change the UUID kind pub id: DbTypedUuid, pub error_message: String, @@ -1147,7 +1150,8 @@ pub struct InvSvcInMaintenanceError { impl InvSvcInMaintenanceError { pub fn new( - svcs_in_maintenance_id: SvcInMaintenanceUuid, + inv_collection_id: CollectionUuid, + sled_id: SledUuid, error_message: String, ) -> Self { // This ID is only used as a primary key, it's fine to generate it here. @@ -1156,7 +1160,8 @@ impl InvSvcInMaintenanceError { )); Self { - svcs_in_maintenance_id: svcs_in_maintenance_id.into(), + inv_collection_id: inv_collection_id.into(), + sled_id: sled_id.into(), id, error_message, } diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index 14850902208..bf5475228b2 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -113,7 +113,6 @@ use omicron_uuid_kinds::OmicronSledConfigUuid; use omicron_uuid_kinds::OmicronZoneUuid; use omicron_uuid_kinds::PhysicalDiskUuid; use omicron_uuid_kinds::SledUuid; -use omicron_uuid_kinds::SvcInMaintenanceUuid; use sled_agent_types::inventory::BootPartitionContents; use sled_agent_types::inventory::BootPartitionDetails; use sled_agent_types::inventory::ConfigReconcilerInventory; @@ -304,26 +303,17 @@ impl DataStore { .iter() .flat_map(|sled_agent| { match &sled_agent.health_monitor.smf_services_in_maintenance { - Ok(svcs) => { - // TODO-K: get the ID from the other table somehow - // TODO-K: Should I just use the collection ID and the - // sled ID instead? - let temp_id = to_db_typed_uuid( - SvcInMaintenanceUuid::from_untyped_uuid( - Uuid::new_v4(), - ), - ); - - svcs.services - .iter() - .map(|svc| { - InvSvcInMaintenanceService::new( - temp_id.into(), - svc.clone(), - ) - }) - .collect() - } + Ok(svcs) => svcs + .services + .iter() + .map(|svc| { + InvSvcInMaintenanceService::new( + collection_id, + sled_agent.sled_id, + svc.clone(), + ) + }) + .collect(), // If there is an error we've already captured it above Err(_) => { vec![] @@ -338,26 +328,17 @@ impl DataStore { .iter() .flat_map(|sled_agent| { match &sled_agent.health_monitor.smf_services_in_maintenance { - Ok(svcs) => { - // TODO-K: get the ID from the other table somehow - // TODO-K: Should I just use the collection ID and the - // sled ID instead? - let temp_id = to_db_typed_uuid( - SvcInMaintenanceUuid::from_untyped_uuid( - Uuid::new_v4(), - ), - ); - - svcs.errors - .iter() - .map(|e| { - InvSvcInMaintenanceError::new( - temp_id.into(), - e.clone(), - ) - }) - .collect() - } + Ok(svcs) => svcs + .errors + .iter() + .map(|e| { + InvSvcInMaintenanceError::new( + collection_id, + sled_agent.sled_id, + e.clone(), + ) + }) + .collect(), // If there is an error we've already captured it above Err(_) => { vec![] @@ -2522,6 +2503,8 @@ impl DataStore { .await? }; + // TODO-K: Delete rows in the other health tables as well + // Remove rows associated with `OmicronSledConfig`s. let nomicron_sled_configs = { use nexus_db_schema::schema::inv_omicron_sled_config::dsl; @@ -4531,6 +4514,7 @@ impl DataStore { // Convert all health checks into a full `HealthMonitorInventory` let mut health_monitor = HealthMonitorInventory::new(); + // TODO-K: Update here let svcs_in_maintenance = svcs_in_maintenance_by_sled .remove(&sled_id.into_untyped_uuid()) .map(|rows| { diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index 23fcbdc9a3f..e06e614d722 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -1737,10 +1737,10 @@ table! { } table! { - inv_health_monitor_svc_in_maintenance2 (inv_collection_id, sled_id, svcs_in_maintenance_id) { + inv_health_monitor_svc_in_maintenance2 (inv_collection_id, sled_id, id) { inv_collection_id -> Uuid, sled_id -> Uuid, - svcs_in_maintenance_id -> Uuid, + id -> Uuid, svcs_cmd_error -> Nullable, // TODO-K: This will change to not nullable with omicron#9615 time_of_status -> Nullable, @@ -1748,8 +1748,9 @@ table! { } table! { - inv_health_monitor_svc_in_maintenance_service (svcs_in_maintenance_id, id) { - svcs_in_maintenance_id -> Uuid, + inv_health_monitor_svc_in_maintenance_service (inv_collection_id, sled_id, id) { + inv_collection_id -> Uuid, + sled_id -> Uuid, id -> Uuid, fmri -> Text, zone -> Text, @@ -1757,8 +1758,9 @@ table! { } table! { - inv_health_monitor_svc_in_maintenance_error (svcs_in_maintenance_id, id) { - svcs_in_maintenance_id -> Uuid, + inv_health_monitor_svc_in_maintenance_error (inv_collection_id, sled_id, id) { + inv_collection_id -> Uuid, + sled_id -> Uuid, id -> Uuid, error_message -> Text, } diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index c118825f08b..23c5ad6c98c 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -4103,7 +4103,7 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance2 sled_id UUID NOT NULL, -- unique id for each row - svcs_in_maintenance_id UUID NOT NULL, + id UUID NOT NULL, -- error when calling the svcs command svcs_cmd_error TEXT, @@ -4112,13 +4112,18 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance2 -- TODO-K: This will change to not null with omicron#9615 time_of_status TIMESTAMPTZ, - PRIMARY KEY (inv_collection_id, sled_id, svcs_in_maintenance_id) + PRIMARY KEY (inv_collection_id, sled_id, id) ); CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance_service ( -- where this observation came from - -- (foreign key into `inv_health_monitor_svc_in_maintenance` table) - svcs_in_maintenance_id UUID NOT NULL, + -- (foreign key into `inv_collection` table) + inv_collection_id UUID NOT NULL, + + -- unique id for this sled (should be foreign keys into `sled` table, though + -- it's conceivable a sled will report an id that we don't know about); + -- guaranteed to match a row in this collection's `inv_sled_agent` + sled_id UUID NOT NULL, -- unique id for each row id UUID NOT NULL, @@ -4129,13 +4134,18 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance_ -- zone the SMF service in maintenance is located in zone TEXT, - PRIMARY KEY (svcs_in_maintenance_id, id) + PRIMARY KEY (inv_collection_id, sled_id, id) ); CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance_error ( -- where this observation came from - -- (foreign key into `inv_health_monitor_svc_in_maintenance` table) - svcs_in_maintenance_id UUID NOT NULL, + -- (foreign key into `inv_collection` table) + inv_collection_id UUID NOT NULL, + + -- unique id for this sled (should be foreign keys into `sled` table, though + -- it's conceivable a sled will report an id that we don't know about); + -- guaranteed to match a row in this collection's `inv_sled_agent` + sled_id UUID NOT NULL, -- unique id for each row id UUID NOT NULL, @@ -4143,7 +4153,7 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance_ -- an error message found when retrieving the SMF services in maintenance error_message TEXT, - PRIMARY KEY (svcs_in_maintenance_id, id) + PRIMARY KEY (inv_collection_id, sled_id, id) ); -- This type name starts with "clear_" for legacy reasons. Prefer "remove" in diff --git a/schema/crdb/health-monitor-svcs-in-maintenance/up01.sql b/schema/crdb/health-monitor-svcs-in-maintenance/up01.sql index f3049f411c5..0e079b864c5 100644 --- a/schema/crdb/health-monitor-svcs-in-maintenance/up01.sql +++ b/schema/crdb/health-monitor-svcs-in-maintenance/up01.sql @@ -10,7 +10,7 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance2 sled_id UUID NOT NULL, -- unique id for each row - svcs_in_maintenance_id UUID NOT NULL, + id UUID NOT NULL, -- error when calling the svcs command svcs_cmd_error TEXT, @@ -19,5 +19,5 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance2 -- TODO-K: This will change to not null with omicron#9615 time_of_status TIMESTAMPTZ, - PRIMARY KEY (inv_collection_id, sled_id, svcs_in_maintenance_id) + PRIMARY KEY (inv_collection_id, sled_id, id) ); \ No newline at end of file diff --git a/schema/crdb/health-monitor-svcs-in-maintenance/up02.sql b/schema/crdb/health-monitor-svcs-in-maintenance/up02.sql index 1d408357ab5..5f1bd55b010 100644 --- a/schema/crdb/health-monitor-svcs-in-maintenance/up02.sql +++ b/schema/crdb/health-monitor-svcs-in-maintenance/up02.sql @@ -1,7 +1,12 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance_service ( -- where this observation came from - -- (foreign key into `inv_health_monitor_svc_in_maintenance` table) - svcs_in_maintenance_id UUID NOT NULL, + -- (foreign key into `inv_collection` table) + inv_collection_id UUID NOT NULL, + + -- unique id for this sled (should be foreign keys into `sled` table, though + -- it's conceivable a sled will report an id that we don't know about); + -- guaranteed to match a row in this collection's `inv_sled_agent` + sled_id UUID NOT NULL, -- unique id for each row id UUID NOT NULL, @@ -12,5 +17,5 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance_ -- zone the SMF service in maintenance is located in zone TEXT, - PRIMARY KEY (svcs_in_maintenance_id, id) + PRIMARY KEY (inv_collection_id, sled_id, id) ); \ No newline at end of file diff --git a/schema/crdb/health-monitor-svcs-in-maintenance/up03.sql b/schema/crdb/health-monitor-svcs-in-maintenance/up03.sql index 076033ea586..c3b2c4ae5f6 100644 --- a/schema/crdb/health-monitor-svcs-in-maintenance/up03.sql +++ b/schema/crdb/health-monitor-svcs-in-maintenance/up03.sql @@ -1,7 +1,12 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance_error ( -- where this observation came from - -- (foreign key into `inv_health_monitor_svc_in_maintenance` table) - svcs_in_maintenance_id UUID NOT NULL, + -- (foreign key into `inv_collection` table) + inv_collection_id UUID NOT NULL, + + -- unique id for this sled (should be foreign keys into `sled` table, though + -- it's conceivable a sled will report an id that we don't know about); + -- guaranteed to match a row in this collection's `inv_sled_agent` + sled_id UUID NOT NULL, -- unique id for each row id UUID NOT NULL, @@ -9,5 +14,5 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance_ -- an error message found when retrieving the SMF services in maintenance error_message TEXT, - PRIMARY KEY (svcs_in_maintenance_id, id) + PRIMARY KEY (inv_collection_id, sled_id, id) ); \ No newline at end of file From f8603e6bd0c3e36a4ee22039fa7393dc3b44b8ab Mon Sep 17 00:00:00 2001 From: karencfv Date: Tue, 20 Jan 2026 20:33:00 +1300 Subject: [PATCH 17/20] insert and read from new tables --- nexus/db-model/src/inventory.rs | 4 +- .../db-queries/src/db/datastore/inventory.rs | 371 ++++++++++++------ nexus/db-schema/src/schema.rs | 2 +- 3 files changed, 261 insertions(+), 116 deletions(-) diff --git a/nexus/db-model/src/inventory.rs b/nexus/db-model/src/inventory.rs index 1481271e412..b6ecb866b47 100644 --- a/nexus/db-model/src/inventory.rs +++ b/nexus/db-model/src/inventory.rs @@ -1078,7 +1078,7 @@ pub struct InvSvcInMaintenance2 { pub sled_id: DbTypedUuid, pub id: DbTypedUuid, pub svcs_cmd_error: Option, - // TODO-K: This will change to not nullable with omicron#9615 + // TODO-K: This might change to not nullable with omicron#9615 pub time_of_status: Option>, } @@ -1143,7 +1143,7 @@ impl InvSvcInMaintenanceService { pub struct InvSvcInMaintenanceError { pub inv_collection_id: DbTypedUuid, pub sled_id: DbTypedUuid, - // TODO-K: Change the UUID kind + // TODO-K: Change the UUID kind? pub id: DbTypedUuid, pub error_message: String, } diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index bf5475228b2..2f044b0f0cd 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -64,7 +64,6 @@ use nexus_db_model::InvServiceProcessor; use nexus_db_model::InvSledAgent; use nexus_db_model::InvSledBootPartition; use nexus_db_model::InvSledConfigReconciler; -use nexus_db_model::InvSvcInMaintenance; use nexus_db_model::InvSvcInMaintenance2; use nexus_db_model::InvSvcInMaintenanceError; use nexus_db_model::InvSvcInMaintenanceService; @@ -216,59 +215,59 @@ impl DataStore { } } - // TODO-K: Update here - // Pull services in maintenance out of all sled agents - let svcs_in_maintenance: Vec<_> = collection - .sled_agents - .iter() - .flat_map(|sled_agent| { - match &sled_agent.health_monitor.smf_services_in_maintenance { - // When there are no services in maintenance, we will still - // want to insert a row with the time the health check was - // made and any parsing errors we may have collected. - Ok(svcs) - if svcs.services.is_empty() - && svcs.time_of_status.is_some() => - { - vec![InvSvcInMaintenance::new( - collection_id, - sled_agent.sled_id, - None, - svcs.errors.clone(), - None, - svcs.time_of_status, - )] - } - Ok(svcs) => svcs - .services - .iter() - .map(|svc| { - InvSvcInMaintenance::new( - collection_id, - sled_agent.sled_id, - Some(svc.clone()), - svcs.errors.clone(), - None, - svcs.time_of_status, - ) - }) - .collect(), - Err(e) => { - vec![InvSvcInMaintenance::new( - collection_id, - sled_agent.sled_id, - None, - vec![], - Some(e.to_string()), - None, - )] - } - } - }) - .collect(); - - // TODO-K: Add a comment here and change the variable name to - // svcs_in_maintenance +// // TODO-K: Update here +// // Pull services in maintenance out of all sled agents +// let svcs_in_maintenance: Vec<_> = collection +// .sled_agents +// .iter() +// .flat_map(|sled_agent| { +// match &sled_agent.health_monitor.smf_services_in_maintenance { +// // When there are no services in maintenance, we will still +// // want to insert a row with the time the health check was +// // made and any parsing errors we may have collected. +// Ok(svcs) +// if svcs.services.is_empty() +// && svcs.time_of_status.is_some() => +// { +// vec![InvSvcInMaintenance::new( +// collection_id, +// sled_agent.sled_id, +// None, +// svcs.errors.clone(), +// None, +// svcs.time_of_status, +// )] +// } +// Ok(svcs) => svcs +// .services +// .iter() +// .map(|svc| { +// InvSvcInMaintenance::new( +// collection_id, +// sled_agent.sled_id, +// Some(svc.clone()), +// svcs.errors.clone(), +// None, +// svcs.time_of_status, +// ) +// }) +// .collect(), +// Err(e) => { +// vec![InvSvcInMaintenance::new( +// collection_id, +// sled_agent.sled_id, +// None, +// vec![], +// Some(e.to_string()), +// None, +// )] +// } +// } +// }) +// .collect(); + + // Pull services in maintenance result out of all sled agents + // TODO-K: change the variable name to svcs_in_maintenance let svcs_in_maintenance2: Vec<_> = collection .sled_agents .iter() @@ -287,7 +286,7 @@ impl DataStore { collection_id, sled_agent.sled_id, Some(e.to_string()), - // TODO-K: This will change to not nullable with omicron#9615 + // TODO-K: This might change to not nullable with omicron#9615 // TODO-K: I'm guessing in this case I'll set a time // at this point or something? None, @@ -297,7 +296,7 @@ impl DataStore { }) .collect(); - // TODO-K: Add a comment here + // Pull services in maintenance details out of all sled agents let svcs_in_maintenance_services: Vec<_> = collection .sled_agents .iter() @@ -322,7 +321,7 @@ impl DataStore { }) .collect(); - // TODO-K: Add a comment here + // Pull services in maintenance errors out of all sled agents let svcs_in_maintenance_errors: Vec<_> = collection .sled_agents .iter() @@ -1644,24 +1643,24 @@ impl DataStore { } } - // Insert rows for all the unhealthy services we found - { - use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance::dsl; - - let batch_size = SQL_BATCH_SIZE.get().try_into().unwrap(); - let mut svcs_in_maintenance = svcs_in_maintenance.into_iter(); - loop { - let some_svcs_in_maintenance = - svcs_in_maintenance.by_ref().take(batch_size).collect::>(); - if some_svcs_in_maintenance.is_empty() { - break; - } - let _ = diesel::insert_into(dsl::inv_health_monitor_svc_in_maintenance) - .values(some_svcs_in_maintenance) - .execute_async(&conn) - .await?; - } - } +// // Insert rows for all the unhealthy services we found +// { +// use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance::dsl; +// +// let batch_size = SQL_BATCH_SIZE.get().try_into().unwrap(); +// let mut svcs_in_maintenance = svcs_in_maintenance.into_iter(); +// loop { +// let some_svcs_in_maintenance = +// svcs_in_maintenance.by_ref().take(batch_size).collect::>(); +// if some_svcs_in_maintenance.is_empty() { +// break; +// } +// let _ = diesel::insert_into(dsl::inv_health_monitor_svc_in_maintenance) +// .values(some_svcs_in_maintenance) +// .execute_async(&conn) +// .await?; +// } +// } // TODO-K: Add comment { @@ -2686,6 +2685,7 @@ impl DataStore { "nconfig_reconcilers" => nconfig_reconcilers, "nboot_partitions" => nboot_partitions, "nhealth_monitor_svc_in_maintenance" => nhealth_monitor_svc_in_maintenance, + // TODO-K: Add for the new tables here "nomicron_sled_configs" => nomicron_sled_configs, "nomicron_sled_config_disks" => nomicron_sled_config_disks, "nomicron_sled_config_datasets" => nomicron_sled_config_datasets, @@ -3089,25 +3089,58 @@ impl DataStore { datasets }; - // TODO-K: Update here - // Mapping of "Sled ID" -> "All SMF services in maintenance reported by - // that sled" - let mut svcs_in_maintenance_by_sled = { - use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance::dsl; - - let mut svcs = BTreeMap::>::new(); +// // TODO-K: Update here +// // Mapping of "Sled ID" -> "All SMF services in maintenance reported by +// // that sled" +// let mut svcs_in_maintenance_by_sled = { +// use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance::dsl; +// +// let mut svcs = BTreeMap::>::new(); +// let mut paginator = Paginator::new( +// batch_size, +// dropshot::PaginationOrder::Ascending, +// ); +// while let Some(p) = paginator.next() { +// let batch: Vec = paginated_multicolumn( +// dsl::inv_health_monitor_svc_in_maintenance, +// (dsl::sled_id, dsl::id), +// &p.current_pagparams(), +// ) +// .filter(dsl::inv_collection_id.eq(db_id)) +// .select(InvSvcInMaintenance::as_select()) +// .load_async(&*conn) +// .await +// .map_err(|e| { +// public_error_from_diesel(e, ErrorHandler::Server) +// })?; +// paginator = p.found_batch(&batch, &|row| (row.sled_id, row.id)); +// for svc in batch { +// svcs.entry(svc.sled_id.into_untyped_uuid()) +// .or_default() +// .push(svc); +// } +// } +// svcs +// }; + + // Mapping of "Sled ID" -> "The result of SMF services in maintenance + // reported by that sled" + let mut svcs_in_maintenance2_by_sled = { + use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance2::dsl; + + let mut svcs = BTreeMap::>::new(); let mut paginator = Paginator::new( batch_size, dropshot::PaginationOrder::Ascending, ); while let Some(p) = paginator.next() { - let batch: Vec = paginated_multicolumn( - dsl::inv_health_monitor_svc_in_maintenance, + let batch: Vec = paginated_multicolumn( + dsl::inv_health_monitor_svc_in_maintenance2, (dsl::sled_id, dsl::id), &p.current_pagparams(), ) .filter(dsl::inv_collection_id.eq(db_id)) - .select(InvSvcInMaintenance::as_select()) + .select(InvSvcInMaintenance2::as_select()) .load_async(&*conn) .await .map_err(|e| { @@ -3123,6 +3156,76 @@ impl DataStore { svcs }; + // Mapping of "Sled ID" -> "All SMF services in maintenance reported by + // that sled" + let mut svcs_in_maintenance_services_by_sled = { + use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance_service::dsl; + + let mut svcs = + BTreeMap::>::new(); + let mut paginator = Paginator::new( + batch_size, + dropshot::PaginationOrder::Ascending, + ); + while let Some(p) = paginator.next() { + let batch: Vec = + paginated_multicolumn( + dsl::inv_health_monitor_svc_in_maintenance_service, + (dsl::sled_id, dsl::id), + &p.current_pagparams(), + ) + .filter(dsl::inv_collection_id.eq(db_id)) + .select(InvSvcInMaintenanceService::as_select()) + .load_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + })?; + paginator = p.found_batch(&batch, &|row| (row.sled_id, row.id)); + for svc in batch { + svcs.entry(svc.sled_id.into_untyped_uuid()) + .or_default() + .push(svc); + } + } + svcs + }; + + // Mapping of "Sled ID" -> "All SMF services in maintenance errors reported by + // that sled" + let mut svcs_in_maintenance_errors_by_sled = { + use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance_error::dsl; + + let mut svcs = + BTreeMap::>::new(); + let mut paginator = Paginator::new( + batch_size, + dropshot::PaginationOrder::Ascending, + ); + while let Some(p) = paginator.next() { + let batch: Vec = + paginated_multicolumn( + dsl::inv_health_monitor_svc_in_maintenance_error, + (dsl::sled_id, dsl::id), + &p.current_pagparams(), + ) + .filter(dsl::inv_collection_id.eq(db_id)) + .select(InvSvcInMaintenanceError::as_select()) + .load_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + })?; + paginator = p.found_batch(&batch, &|row| (row.sled_id, row.id)); + for svc in batch { + svcs.entry(svc.sled_id.into_untyped_uuid()) + .or_default() + .push(svc); + } + } + svcs + }; + // Collect the unique baseboard ids referenced by SPs, RoTs, and Sled // Agents. let baseboard_id_ids: BTreeSet<_> = sps @@ -4514,44 +4617,86 @@ impl DataStore { // Convert all health checks into a full `HealthMonitorInventory` let mut health_monitor = HealthMonitorInventory::new(); - // TODO-K: Update here - let svcs_in_maintenance = svcs_in_maintenance_by_sled + // // TODO-K: Update here + // let svcs_in_maintenance = svcs_in_maintenance_by_sled + // .remove(&sled_id.into_untyped_uuid()) + // .map(|rows| { + // // Get metadata from the first row. All rows from the same + // // collection and sled will share time_of_status, + // // svcs_cmd_error and error_messages. + // let first_row = + // rows.first().expect("rows should not be empty"); + // + // // Check if the svcs command itself failed first. If so, we + // // can safely assume no services in maintenance have been + // // reported and return an error. + // if let Some(e) = &first_row.svcs_cmd_error { + // return Err(e.clone()); + // } + // + // // Convert database rows to service in maintenance entries. + // // All rows should have both zone and FMRI populated or none + // // at all. Nevertheless, we'll handle the case of a + // // partially populated row. + // let services: Vec = rows + // .iter() + // .filter(|svc| svc.fmri.is_some() || svc.zone.is_some()) + // .map(|svc| SvcInMaintenance { + // fmri: svc.fmri.clone().unwrap_or_default(), + // zone: svc.zone.clone().unwrap_or_default(), + // }) + // .collect(); + // + // Ok(SvcsInMaintenanceResult { + // services, + // errors: first_row.error_messages.clone(), + // time_of_status: first_row.time_of_status, + // }) + // }); + + let svcs_in_maintenance2 = svcs_in_maintenance2_by_sled .remove(&sled_id.into_untyped_uuid()) .map(|rows| { - // Get metadata from the first row. All rows from the same - // collection and sled will share time_of_status, - // svcs_cmd_error and error_messages. - let first_row = - rows.first().expect("rows should not be empty"); - - // Check if the svcs command itself failed first. If so, we - // can safely assume no services in maintenance have been - // reported and return an error. - if let Some(e) = &first_row.svcs_cmd_error { - return Err(e.clone()); - } + // There should only be one row per collection per sled + let first_row = rows.first().ok_or_else(|| { + format!( + "missing SMF services in maintenance details \ + for sled {sled_id} that should have been \ + fetched" + ) + })?; - // Convert database rows to service in maintenance entries. - // All rows should have both zone and FMRI populated or none - // at all. Nevertheless, we'll handle the case of a - // partially populated row. - let services: Vec = rows - .iter() - .filter(|svc| svc.fmri.is_some() || svc.zone.is_some()) - .map(|svc| SvcInMaintenance { - fmri: svc.fmri.clone().unwrap_or_default(), - zone: svc.zone.clone().unwrap_or_default(), - }) - .collect(); + // Collect all services from svcs_in_maintenance_services_by_sled + // for this sled. + let services: Vec = + svcs_in_maintenance_services_by_sled + .remove(&sled_id.into_untyped_uuid()) + .unwrap_or_default() + .into_iter() + .map(|svc| SvcInMaintenance { + fmri: svc.fmri, + zone: svc.zone, + }) + .collect(); + + // Collect all errors from svcs_in_maintenance_errors_by_sled + // for this sled. + let errors: Vec = + svcs_in_maintenance_errors_by_sled + .remove(&sled_id.into_untyped_uuid()) + .unwrap_or_default() + .into_iter() + .map(|err| err.error_message) + .collect(); Ok(SvcsInMaintenanceResult { services, - errors: first_row.error_messages.clone(), + errors, time_of_status: first_row.time_of_status, }) }); - if let Some(svcs) = svcs_in_maintenance { + if let Some(svcs) = svcs_in_maintenance2 { health_monitor.smf_services_in_maintenance = svcs }; diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index e06e614d722..2d836cf621a 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -1742,7 +1742,7 @@ table! { sled_id -> Uuid, id -> Uuid, svcs_cmd_error -> Nullable, - // TODO-K: This will change to not nullable with omicron#9615 + // TODO-K: This might change to not nullable with omicron#9615 time_of_status -> Nullable, } } From a407943eea10da14a9c2649c648ac1ed6c874303 Mon Sep 17 00:00:00 2001 From: karencfv Date: Tue, 20 Jan 2026 20:52:17 +1300 Subject: [PATCH 18/20] remove old table --- nexus/db-model/src/inventory.rs | 47 ------ .../db-queries/src/db/datastore/inventory.rs | 141 ++++-------------- nexus/db-schema/src/schema.rs | 15 -- schema/crdb/dbinit.sql | 31 ---- .../up04.sql | 31 ---- 5 files changed, 30 insertions(+), 235 deletions(-) delete mode 100644 schema/crdb/health-monitor-svcs-in-maintenance/up04.sql diff --git a/nexus/db-model/src/inventory.rs b/nexus/db-model/src/inventory.rs index b6ecb866b47..125ff197af3 100644 --- a/nexus/db-model/src/inventory.rs +++ b/nexus/db-model/src/inventory.rs @@ -35,7 +35,6 @@ use nexus_db_schema::schema::inv_zone_manifest_zone; use nexus_db_schema::schema::{ hw_baseboard_id, inv_caboose, inv_clickhouse_keeper_membership, inv_cockroachdb_status, inv_collection, inv_collection_error, inv_dataset, - inv_health_monitor_svc_in_maintenance, inv_health_monitor_svc_in_maintenance_error, inv_health_monitor_svc_in_maintenance_service, inv_health_monitor_svc_in_maintenance2, inv_host_phase_1_active_slot, @@ -1025,52 +1024,6 @@ impl_enum_type!( Idle => b"idle" ); -// TODO-K: Update here -#[derive(Queryable, Clone, Debug, Selectable, Insertable)] -#[diesel(table_name = inv_health_monitor_svc_in_maintenance)] -pub struct InvSvcInMaintenance { - pub inv_collection_id: DbTypedUuid, - pub sled_id: DbTypedUuid, - pub id: DbTypedUuid, - pub fmri: Option, - pub zone: Option, - pub error_messages: Vec, - pub svcs_cmd_error: Option, - pub time_of_status: Option>, -} - -impl InvSvcInMaintenance { - pub fn new( - inv_collection_id: CollectionUuid, - sled_id: SledUuid, - svc: Option, - svc_errors: Vec, - svcs_cmd_error: Option, - time_of_status: Option>, - ) -> Self { - let (fmri, zone) = match svc { - Some(svc) => (Some(svc.fmri), Some(svc.zone)), - None => (None, None), - }; - - // This ID is only used as a primary key, it's fine to generate it here. - let id = to_db_typed_uuid(SvcInMaintenanceUuid::from_untyped_uuid( - Uuid::new_v4(), - )); - - Self { - inv_collection_id: inv_collection_id.into(), - sled_id: sled_id.into(), - id, - fmri, - zone, - error_messages: svc_errors, - svcs_cmd_error, - time_of_status, - } - } -} - #[derive(Queryable, Clone, Debug, Selectable, Insertable)] #[diesel(table_name = inv_health_monitor_svc_in_maintenance2)] pub struct InvSvcInMaintenance2 { diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index 2f044b0f0cd..8f40102002b 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -215,57 +215,6 @@ impl DataStore { } } -// // TODO-K: Update here -// // Pull services in maintenance out of all sled agents -// let svcs_in_maintenance: Vec<_> = collection -// .sled_agents -// .iter() -// .flat_map(|sled_agent| { -// match &sled_agent.health_monitor.smf_services_in_maintenance { -// // When there are no services in maintenance, we will still -// // want to insert a row with the time the health check was -// // made and any parsing errors we may have collected. -// Ok(svcs) -// if svcs.services.is_empty() -// && svcs.time_of_status.is_some() => -// { -// vec![InvSvcInMaintenance::new( -// collection_id, -// sled_agent.sled_id, -// None, -// svcs.errors.clone(), -// None, -// svcs.time_of_status, -// )] -// } -// Ok(svcs) => svcs -// .services -// .iter() -// .map(|svc| { -// InvSvcInMaintenance::new( -// collection_id, -// sled_agent.sled_id, -// Some(svc.clone()), -// svcs.errors.clone(), -// None, -// svcs.time_of_status, -// ) -// }) -// .collect(), -// Err(e) => { -// vec![InvSvcInMaintenance::new( -// collection_id, -// sled_agent.sled_id, -// None, -// vec![], -// Some(e.to_string()), -// None, -// )] -// } -// } -// }) -// .collect(); - // Pull services in maintenance result out of all sled agents // TODO-K: change the variable name to svcs_in_maintenance let svcs_in_maintenance2: Vec<_> = collection @@ -1643,26 +1592,7 @@ impl DataStore { } } -// // Insert rows for all the unhealthy services we found -// { -// use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance::dsl; -// -// let batch_size = SQL_BATCH_SIZE.get().try_into().unwrap(); -// let mut svcs_in_maintenance = svcs_in_maintenance.into_iter(); -// loop { -// let some_svcs_in_maintenance = -// svcs_in_maintenance.by_ref().take(batch_size).collect::>(); -// if some_svcs_in_maintenance.is_empty() { -// break; -// } -// let _ = diesel::insert_into(dsl::inv_health_monitor_svc_in_maintenance) -// .values(some_svcs_in_maintenance) -// .execute_async(&conn) -// .await?; -// } -// } - - // TODO-K: Add comment + // Insert rows for all the SMF services in maintenance results we found { use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance2::dsl; @@ -1681,7 +1611,7 @@ impl DataStore { } } - // TODO-K: Add comment + // Insert rows for all the SMF services in maintenance we found { use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance_service::dsl; @@ -1700,7 +1630,7 @@ impl DataStore { } } - // TODO-K: Add comment + // Insert rows for all the SMF services in maintenance errors we found { use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance_error::dsl; @@ -2224,6 +2154,8 @@ impl DataStore { nconfig_reconcilers: usize, nboot_partitions: usize, nhealth_monitor_svc_in_maintenance: usize, + nhealth_monitor_svc_in_maintenance_service: usize, + nhealth_monitor_svc_in_maintenance_error: usize, nomicron_sled_configs: usize, nomicron_sled_config_disks: usize, nomicron_sled_config_datasets: usize, @@ -2262,6 +2194,8 @@ impl DataStore { nconfig_reconcilers, nboot_partitions, nhealth_monitor_svc_in_maintenance, + nhealth_monitor_svc_in_maintenance_service, + nhealth_monitor_svc_in_maintenance_error, nomicron_sled_configs, nomicron_sled_config_disks, nomicron_sled_config_datasets, @@ -2494,15 +2428,31 @@ impl DataStore { // Remove rows associated with the health monitor let nhealth_monitor_svc_in_maintenance = { - use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance::dsl; - diesel::delete(dsl::inv_health_monitor_svc_in_maintenance.filter( + use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance2::dsl; + diesel::delete(dsl::inv_health_monitor_svc_in_maintenance2.filter( dsl::inv_collection_id.eq(db_collection_id), )) .execute_async(&conn) .await? }; - // TODO-K: Delete rows in the other health tables as well + let nhealth_monitor_svc_in_maintenance_service = { + use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance_service::dsl; + diesel::delete(dsl::inv_health_monitor_svc_in_maintenance_service.filter( + dsl::inv_collection_id.eq(db_collection_id), + )) + .execute_async(&conn) + .await? + }; + + let nhealth_monitor_svc_in_maintenance_error = { + use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance_error::dsl; + diesel::delete(dsl::inv_health_monitor_svc_in_maintenance_error.filter( + dsl::inv_collection_id.eq(db_collection_id), + )) + .execute_async(&conn) + .await? + }; // Remove rows associated with `OmicronSledConfig`s. let nomicron_sled_configs = { @@ -2636,6 +2586,8 @@ impl DataStore { nconfig_reconcilers, nboot_partitions, nhealth_monitor_svc_in_maintenance, + nhealth_monitor_svc_in_maintenance_service, + nhealth_monitor_svc_in_maintenance_error, nomicron_sled_configs, nomicron_sled_config_disks, nomicron_sled_config_datasets, @@ -2685,7 +2637,8 @@ impl DataStore { "nconfig_reconcilers" => nconfig_reconcilers, "nboot_partitions" => nboot_partitions, "nhealth_monitor_svc_in_maintenance" => nhealth_monitor_svc_in_maintenance, - // TODO-K: Add for the new tables here + "nhealth_monitor_svc_in_maintenance_service" => nhealth_monitor_svc_in_maintenance_service, + "nhealth_monitor_svc_in_maintenance_error" => nhealth_monitor_svc_in_maintenance_error, "nomicron_sled_configs" => nomicron_sled_configs, "nomicron_sled_config_disks" => nomicron_sled_config_disks, "nomicron_sled_config_datasets" => nomicron_sled_config_datasets, @@ -3089,40 +3042,6 @@ impl DataStore { datasets }; -// // TODO-K: Update here -// // Mapping of "Sled ID" -> "All SMF services in maintenance reported by -// // that sled" -// let mut svcs_in_maintenance_by_sled = { -// use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance::dsl; -// -// let mut svcs = BTreeMap::>::new(); -// let mut paginator = Paginator::new( -// batch_size, -// dropshot::PaginationOrder::Ascending, -// ); -// while let Some(p) = paginator.next() { -// let batch: Vec = paginated_multicolumn( -// dsl::inv_health_monitor_svc_in_maintenance, -// (dsl::sled_id, dsl::id), -// &p.current_pagparams(), -// ) -// .filter(dsl::inv_collection_id.eq(db_id)) -// .select(InvSvcInMaintenance::as_select()) -// .load_async(&*conn) -// .await -// .map_err(|e| { -// public_error_from_diesel(e, ErrorHandler::Server) -// })?; -// paginator = p.found_batch(&batch, &|row| (row.sled_id, row.id)); -// for svc in batch { -// svcs.entry(svc.sled_id.into_untyped_uuid()) -// .or_default() -// .push(svc); -// } -// } -// svcs -// }; - // Mapping of "Sled ID" -> "The result of SMF services in maintenance // reported by that sled" let mut svcs_in_maintenance2_by_sled = { diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index 2d836cf621a..b9f294d4f6a 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -1721,21 +1721,6 @@ table! { } } -// TODO-K: Make these table names plural? inv_health_monitor_svcs_in_maintenance -table! { - inv_health_monitor_svc_in_maintenance (inv_collection_id, sled_id, id) { - inv_collection_id -> Uuid, - sled_id -> Uuid, - id -> Uuid, - fmri -> Nullable, - zone -> Nullable, - - error_messages -> Array, - svcs_cmd_error -> Nullable, - time_of_status -> Nullable, - } -} - table! { inv_health_monitor_svc_in_maintenance2 (inv_collection_id, sled_id, id) { inv_collection_id -> Uuid, diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 23c5ad6c98c..4ac933b87fb 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -4060,37 +4060,6 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_sled_agent ( PRIMARY KEY (inv_collection_id, sled_id) ); -CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance ( - -- where this observation came from - -- (foreign key into `inv_collection` table) - inv_collection_id UUID NOT NULL, - - -- unique id for this sled (should be foreign keys into `sled` table, though - -- it's conceivable a sled will report an id that we don't know about); - -- guaranteed to match a row in this collection's `inv_sled_agent` - sled_id UUID NOT NULL, - - -- unique id for each row - id UUID NOT NULL, - - -- FMRI of the SMF service in maintenance - fmri TEXT, - - -- zone the SMF service in maintenance is located in - zone TEXT, - - -- any error messages found when retrieving the SMF services in maintenance - error_messages TEXT ARRAY NOT NULL, - - -- error when calling the svcs command - svcs_cmd_error TEXT, - - -- time when the status was checked if applicable - time_of_status TIMESTAMPTZ, - - PRIMARY KEY (inv_collection_id, sled_id, id) -); - -- TODO-K: rename table CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance2 ( -- where this observation came from diff --git a/schema/crdb/health-monitor-svcs-in-maintenance/up04.sql b/schema/crdb/health-monitor-svcs-in-maintenance/up04.sql deleted file mode 100644 index f25f130e913..00000000000 --- a/schema/crdb/health-monitor-svcs-in-maintenance/up04.sql +++ /dev/null @@ -1,31 +0,0 @@ --- TODO-K: Delete this file -CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance ( - -- where this observation came from - -- (foreign key into `inv_collection` table) - inv_collection_id UUID NOT NULL, - - -- unique id for this sled (should be foreign keys into `sled` table, though - -- it's conceivable a sled will report an id that we don't know about); - -- guaranteed to match a row in this collection's `inv_sled_agent` - sled_id UUID NOT NULL, - - -- unique id for each row - id UUID NOT NULL, - - -- FMRI of the SMF service in maintenance - fmri TEXT, - - -- zone the SMF service in maintenance is located in - zone TEXT, - - -- any error messages found when retrieving the SMF services in maintenance - error_messages TEXT ARRAY NOT NULL, - - -- error when calling the svcs command - svcs_cmd_error TEXT, - - -- time when the status was checked if applicable - time_of_status TIMESTAMPTZ, - - PRIMARY KEY (inv_collection_id, sled_id, id) -); \ No newline at end of file From a5cffb303e3e4a65254a0f3a13702a9eedd70243 Mon Sep 17 00:00:00 2001 From: karencfv Date: Tue, 20 Jan 2026 21:44:57 +1300 Subject: [PATCH 19/20] rename tables and fix read --- nexus/db-model/src/inventory.rs | 12 +-- .../db-queries/src/db/datastore/inventory.rs | 78 ++++++------------- nexus/db-schema/src/schema.rs | 2 +- schema/crdb/dbinit.sql | 3 +- .../up01.sql | 3 +- sled-agent/src/sim/sled_agent.rs | 6 +- 6 files changed, 36 insertions(+), 68 deletions(-) diff --git a/nexus/db-model/src/inventory.rs b/nexus/db-model/src/inventory.rs index 125ff197af3..a996634bf5d 100644 --- a/nexus/db-model/src/inventory.rs +++ b/nexus/db-model/src/inventory.rs @@ -35,11 +35,11 @@ use nexus_db_schema::schema::inv_zone_manifest_zone; use nexus_db_schema::schema::{ hw_baseboard_id, inv_caboose, inv_clickhouse_keeper_membership, inv_cockroachdb_status, inv_collection, inv_collection_error, inv_dataset, + inv_health_monitor_svc_in_maintenance, inv_health_monitor_svc_in_maintenance_error, inv_health_monitor_svc_in_maintenance_service, - inv_health_monitor_svc_in_maintenance2, inv_host_phase_1_active_slot, - inv_host_phase_1_flash_hash, inv_internal_dns, - inv_last_reconciliation_dataset_result, + inv_host_phase_1_active_slot, inv_host_phase_1_flash_hash, + inv_internal_dns, inv_last_reconciliation_dataset_result, inv_last_reconciliation_disk_result, inv_last_reconciliation_measurements, inv_last_reconciliation_orphaned_dataset, inv_last_reconciliation_zone_result, inv_measurement_manifest_non_boot, @@ -1025,8 +1025,8 @@ impl_enum_type!( ); #[derive(Queryable, Clone, Debug, Selectable, Insertable)] -#[diesel(table_name = inv_health_monitor_svc_in_maintenance2)] -pub struct InvSvcInMaintenance2 { +#[diesel(table_name = inv_health_monitor_svc_in_maintenance)] +pub struct InvSvcInMaintenance { pub inv_collection_id: DbTypedUuid, pub sled_id: DbTypedUuid, pub id: DbTypedUuid, @@ -1035,7 +1035,7 @@ pub struct InvSvcInMaintenance2 { pub time_of_status: Option>, } -impl InvSvcInMaintenance2 { +impl InvSvcInMaintenance { pub fn new( inv_collection_id: CollectionUuid, sled_id: SledUuid, diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index 8f40102002b..f6a973ba916 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -64,7 +64,7 @@ use nexus_db_model::InvServiceProcessor; use nexus_db_model::InvSledAgent; use nexus_db_model::InvSledBootPartition; use nexus_db_model::InvSledConfigReconciler; -use nexus_db_model::InvSvcInMaintenance2; +use nexus_db_model::InvSvcInMaintenance; use nexus_db_model::InvSvcInMaintenanceError; use nexus_db_model::InvSvcInMaintenanceService; use nexus_db_model::InvZpool; @@ -217,13 +217,13 @@ impl DataStore { // Pull services in maintenance result out of all sled agents // TODO-K: change the variable name to svcs_in_maintenance - let svcs_in_maintenance2: Vec<_> = collection + let svcs_in_maintenance: Vec<_> = collection .sled_agents .iter() .flat_map(|sled_agent| { match &sled_agent.health_monitor.smf_services_in_maintenance { Ok(svcs) => { - vec![InvSvcInMaintenance2::new( + vec![InvSvcInMaintenance::new( collection_id, sled_agent.sled_id, None, @@ -231,7 +231,7 @@ impl DataStore { )] } Err(e) => { - vec![InvSvcInMaintenance2::new( + vec![InvSvcInMaintenance::new( collection_id, sled_agent.sled_id, Some(e.to_string()), @@ -1594,17 +1594,17 @@ impl DataStore { // Insert rows for all the SMF services in maintenance results we found { - use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance2::dsl; + use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance::dsl; let batch_size = SQL_BATCH_SIZE.get().try_into().unwrap(); - let mut svcs_in_maintenance = svcs_in_maintenance2.into_iter(); + let mut svcs_in_maintenance = svcs_in_maintenance.into_iter(); loop { let some_svcs_in_maintenance = svcs_in_maintenance.by_ref().take(batch_size).collect::>(); if some_svcs_in_maintenance.is_empty() { break; } - let _ = diesel::insert_into(dsl::inv_health_monitor_svc_in_maintenance2) + let _ = diesel::insert_into(dsl::inv_health_monitor_svc_in_maintenance) .values(some_svcs_in_maintenance) .execute_async(&conn) .await?; @@ -2428,8 +2428,8 @@ impl DataStore { // Remove rows associated with the health monitor let nhealth_monitor_svc_in_maintenance = { - use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance2::dsl; - diesel::delete(dsl::inv_health_monitor_svc_in_maintenance2.filter( + use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance::dsl; + diesel::delete(dsl::inv_health_monitor_svc_in_maintenance.filter( dsl::inv_collection_id.eq(db_collection_id), )) .execute_async(&conn) @@ -3044,22 +3044,22 @@ impl DataStore { // Mapping of "Sled ID" -> "The result of SMF services in maintenance // reported by that sled" - let mut svcs_in_maintenance2_by_sled = { - use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance2::dsl; + let mut svcs_in_maintenance_by_sled = { + use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance::dsl; - let mut svcs = BTreeMap::>::new(); + let mut svcs = BTreeMap::>::new(); let mut paginator = Paginator::new( batch_size, dropshot::PaginationOrder::Ascending, ); while let Some(p) = paginator.next() { - let batch: Vec = paginated_multicolumn( - dsl::inv_health_monitor_svc_in_maintenance2, + let batch: Vec = paginated_multicolumn( + dsl::inv_health_monitor_svc_in_maintenance, (dsl::sled_id, dsl::id), &p.current_pagparams(), ) .filter(dsl::inv_collection_id.eq(db_id)) - .select(InvSvcInMaintenance2::as_select()) + .select(InvSvcInMaintenance::as_select()) .load_async(&*conn) .await .map_err(|e| { @@ -4536,44 +4536,7 @@ impl DataStore { // Convert all health checks into a full `HealthMonitorInventory` let mut health_monitor = HealthMonitorInventory::new(); - // // TODO-K: Update here - // let svcs_in_maintenance = svcs_in_maintenance_by_sled - // .remove(&sled_id.into_untyped_uuid()) - // .map(|rows| { - // // Get metadata from the first row. All rows from the same - // // collection and sled will share time_of_status, - // // svcs_cmd_error and error_messages. - // let first_row = - // rows.first().expect("rows should not be empty"); - // - // // Check if the svcs command itself failed first. If so, we - // // can safely assume no services in maintenance have been - // // reported and return an error. - // if let Some(e) = &first_row.svcs_cmd_error { - // return Err(e.clone()); - // } - // - // // Convert database rows to service in maintenance entries. - // // All rows should have both zone and FMRI populated or none - // // at all. Nevertheless, we'll handle the case of a - // // partially populated row. - // let services: Vec = rows - // .iter() - // .filter(|svc| svc.fmri.is_some() || svc.zone.is_some()) - // .map(|svc| SvcInMaintenance { - // fmri: svc.fmri.clone().unwrap_or_default(), - // zone: svc.zone.clone().unwrap_or_default(), - // }) - // .collect(); - // - // Ok(SvcsInMaintenanceResult { - // services, - // errors: first_row.error_messages.clone(), - // time_of_status: first_row.time_of_status, - // }) - // }); - - let svcs_in_maintenance2 = svcs_in_maintenance2_by_sled + let svcs_in_maintenance = svcs_in_maintenance_by_sled .remove(&sled_id.into_untyped_uuid()) .map(|rows| { // There should only be one row per collection per sled @@ -4585,6 +4548,13 @@ impl DataStore { ) })?; + // Check if the svcs command itself failed first. If so, we + // can safely assume no services in maintenance have been + // reported and return an error. + if let Some(e) = &first_row.svcs_cmd_error { + return Err(e.clone()); + } + // Collect all services from svcs_in_maintenance_services_by_sled // for this sled. let services: Vec = @@ -4615,7 +4585,7 @@ impl DataStore { }) }); - if let Some(svcs) = svcs_in_maintenance2 { + if let Some(svcs) = svcs_in_maintenance { health_monitor.smf_services_in_maintenance = svcs }; diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index b9f294d4f6a..c11cc771867 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -1722,7 +1722,7 @@ table! { } table! { - inv_health_monitor_svc_in_maintenance2 (inv_collection_id, sled_id, id) { + inv_health_monitor_svc_in_maintenance (inv_collection_id, sled_id, id) { inv_collection_id -> Uuid, sled_id -> Uuid, id -> Uuid, diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 4ac933b87fb..865028b50f3 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -4060,8 +4060,7 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_sled_agent ( PRIMARY KEY (inv_collection_id, sled_id) ); --- TODO-K: rename table -CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance2 ( +CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance ( -- where this observation came from -- (foreign key into `inv_collection` table) inv_collection_id UUID NOT NULL, diff --git a/schema/crdb/health-monitor-svcs-in-maintenance/up01.sql b/schema/crdb/health-monitor-svcs-in-maintenance/up01.sql index 0e079b864c5..d53a568a5f3 100644 --- a/schema/crdb/health-monitor-svcs-in-maintenance/up01.sql +++ b/schema/crdb/health-monitor-svcs-in-maintenance/up01.sql @@ -1,5 +1,4 @@ --- TODO-K: rename table -CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance2 ( +CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance ( -- where this observation came from -- (foreign key into `inv_collection` table) inv_collection_id UUID NOT NULL, diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index 42baf924087..eb88511a5d0 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -169,9 +169,9 @@ impl SledAgent { .start(&log, &config.dropshot); // TODO-K: Uncomment and remove - // let health_monitor = HealthMonitorHandle::stub(); - let health_monitor = - crate::long_running_tasks::spawn_health_monitor_tasks(&log).await; + let health_monitor = HealthMonitorHandle::stub(); + //let health_monitor = + // crate::long_running_tasks::spawn_health_monitor_tasks(&log).await; Arc::new(SledAgent { id, From 4e8a4cab0b9910787be61898a1385e78f8737209 Mon Sep 17 00:00:00 2001 From: karencfv Date: Wed, 21 Jan 2026 09:51:50 +1300 Subject: [PATCH 20/20] fix tests --- nexus/inventory/src/examples.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/inventory/src/examples.rs b/nexus/inventory/src/examples.rs index 210f3756a53..0bbd55de4a2 100644 --- a/nexus/inventory/src/examples.rs +++ b/nexus/inventory/src/examples.rs @@ -685,7 +685,7 @@ pub fn representative() -> Representative { fmri: "svc:/site/fake-service:default".to_string(), zone: "global".to_string(), }], - errors: vec![], + errors: vec!["an unimportant error".to_string()], time_of_status: Some( "2026-01-01T00:00:00Z".parse().unwrap(), ),