diff --git a/Cargo.lock b/Cargo.lock index 25617fa03a6..1db6045fbdc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6764,6 +6764,7 @@ dependencies = [ "expectorate", "hex", "iddqd", + "illumos-utils", "ipnetwork", "itertools 0.14.0", "macaddr", diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout index 7839ade793e..753dafa69df 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout @@ -1608,6 +1608,9 @@ LEDGERED SLED CONFIG (measurement set is empty) reconciler task status: idle (finished at after running for s) +HEALTH MONITOR + no data on SMF services in maintenance has been collected + sled 32d8d836-4d8a-4e54-8fa9-f31d79c42646 (role = Gimlet, serial serial2) found at: from fake sled agent address: [fd00:1122:3344:103::1]:12345 @@ -1744,6 +1747,9 @@ LEDGERED SLED CONFIG (measurement set is empty) reconciler task status: idle (finished at after running for s) +HEALTH MONITOR + no data on SMF services in maintenance has been collected + sled 89d02b1b-478c-401a-8e28-7a26f74fa41b (role = Gimlet, serial serial0) found at: from fake sled agent address: [fd00:1122:3344:101::1]:12345 @@ -1973,6 +1979,9 @@ LEDGERED SLED CONFIG (measurement set is empty) reconciler task status: idle (finished at after running for s) +HEALTH MONITOR + no data on SMF services in maintenance has been collected + KEEPER MEMBERSHIP no membership retrieved diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout index 4ccceeef6ce..b16dd0685fa 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout @@ -323,6 +323,9 @@ LEDGERED SLED CONFIG (measurement set is empty) reconciler task status: idle (finished at after running for s) +HEALTH MONITOR + no data on SMF services in maintenance has been collected + sled 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6 (role = Gimlet, serial serial0) found at: from fake sled agent address: [fd00:1122:3344:101::1]:12345 @@ -450,6 +453,9 @@ LEDGERED SLED CONFIG (measurement set is empty) reconciler task status: idle (finished at after running for s) +HEALTH MONITOR + no data on SMF services in maintenance has been collected + sled d81c6a84-79b8-4958-ae41-ea46c9b19763 (role = Gimlet, serial serial2) found at: from fake sled agent address: [fd00:1122:3344:103::1]:12345 @@ -566,6 +572,9 @@ LEDGERED SLED CONFIG (measurement set is empty) reconciler task status: idle (finished at after running for s) +HEALTH MONITOR + no data on SMF services in maintenance has been collected + KEEPER MEMBERSHIP no membership retrieved diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-nexus-generation-autobump-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-nexus-generation-autobump-stdout index 061d0d5a3e4..99f26160f7c 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-nexus-generation-autobump-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-nexus-generation-autobump-stdout @@ -711,6 +711,9 @@ LEDGERED SLED CONFIG (measurement set is empty) reconciler task status: idle (finished at after running for s) +HEALTH MONITOR + no data on SMF services in maintenance has been collected + sled 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6 (role = Gimlet, serial serial0) found at: from fake sled agent address: [fd00:1122:3344:101::1]:12345 @@ -886,6 +889,9 @@ LEDGERED SLED CONFIG (measurement set is empty) reconciler task status: idle (finished at after running for s) +HEALTH MONITOR + no data on SMF services in maintenance has been collected + sled d81c6a84-79b8-4958-ae41-ea46c9b19763 (role = Gimlet, serial serial2) found at: from fake sled agent address: [fd00:1122:3344:103::1]:12345 @@ -1061,6 +1067,9 @@ LEDGERED SLED CONFIG (measurement set is empty) reconciler task status: idle (finished at after running for s) +HEALTH MONITOR + no data on SMF services in maintenance has been collected + KEEPER MEMBERSHIP no membership retrieved diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-target-release-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-target-release-stdout index d55487fa680..a75f65c2727 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-target-release-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-target-release-stdout @@ -698,6 +698,9 @@ LEDGERED SLED CONFIG (measurement set is empty) reconciler task status: idle (finished at after running for s) +HEALTH MONITOR + no data on SMF services in maintenance has been collected + sled 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6 (role = Gimlet, serial serial0) found at: from fake sled agent address: [fd00:1122:3344:101::1]:12345 @@ -873,6 +876,9 @@ LEDGERED SLED CONFIG (measurement set is empty) reconciler task status: idle (finished at after running for s) +HEALTH MONITOR + no data on SMF services in maintenance has been collected + sled d81c6a84-79b8-4958-ae41-ea46c9b19763 (role = Gimlet, serial serial2) found at: from fake sled agent address: [fd00:1122:3344:103::1]:12345 @@ -1048,6 +1054,9 @@ LEDGERED SLED CONFIG (measurement set is empty) reconciler task status: idle (finished at after running for s) +HEALTH MONITOR + no data on SMF services in maintenance has been collected + KEEPER MEMBERSHIP no membership retrieved diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-unsafe-zone-mgs-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-unsafe-zone-mgs-stdout index 2a92774cfdc..e1295c23220 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-unsafe-zone-mgs-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-unsafe-zone-mgs-stdout @@ -682,6 +682,9 @@ LEDGERED SLED CONFIG (measurement set is empty) reconciler task status: idle (finished at after running for s) +HEALTH MONITOR + no data on SMF services in maintenance has been collected + sled 98e6b7c2-2efa-41ca-b20a-0a4d61102fe6 (role = Gimlet, serial serial0) found at: from fake sled agent address: [fd00:1122:3344:101::1]:12345 @@ -857,6 +860,9 @@ LEDGERED SLED CONFIG (measurement set is empty) reconciler task status: idle (finished at after running for s) +HEALTH MONITOR + no data on SMF services in maintenance has been collected + sled d81c6a84-79b8-4958-ae41-ea46c9b19763 (role = Gimlet, serial serial2) found at: from fake sled agent address: [fd00:1122:3344:103::1]:12345 @@ -1032,6 +1038,9 @@ LEDGERED SLED CONFIG (measurement set is empty) reconciler task status: idle (finished at after running for s) +HEALTH MONITOR + no data on SMF services in maintenance has been collected + KEEPER MEMBERSHIP no membership retrieved diff --git a/illumos-utils/src/svcs.rs b/illumos-utils/src/svcs.rs index 17b68412379..1ea8eac69f1 100644 --- a/illumos-utils/src/svcs.rs +++ b/illumos-utils/src/svcs.rs @@ -19,7 +19,6 @@ use serde::Deserialize; use serde::Serialize; use slog::Logger; use slog::{error, info}; -use std::fmt::Display; #[cfg(target_os = "illumos")] use tokio::process::Command; @@ -199,8 +198,8 @@ impl From for SvcState { #[serde(rename_all = "snake_case")] /// Information about an SMF service that is enabled but not running pub struct SvcInMaintenance { - fmri: String, - zone: String, + pub fmri: String, + pub zone: String, } impl SvcInMaintenance { @@ -210,14 +209,6 @@ impl SvcInMaintenance { } } -impl Display for SvcInMaintenance { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let SvcInMaintenance { fmri, zone } = self; - - writeln!(f, "FMRI: {} zone: {}", fmri, zone) - } -} - #[cfg(test)] mod tests { use super::*; diff --git a/nexus/db-model/Cargo.toml b/nexus/db-model/Cargo.toml index 9762bccbe76..1c30231bf53 100644 --- a/nexus/db-model/Cargo.toml +++ b/nexus/db-model/Cargo.toml @@ -21,6 +21,7 @@ derive-where.workspace = true diesel = { workspace = true, features = ["postgres", "r2d2", "chrono", "serde_json", "network-address", "uuid"] } hex.workspace = true iddqd.workspace = true +illumos-utils.workspace = true ipnetwork.workspace = true itertools.workspace = true macaddr.workspace = true diff --git a/nexus/db-model/src/inventory.rs b/nexus/db-model/src/inventory.rs index 6f580f1b600..a996634bf5d 100644 --- a/nexus/db-model/src/inventory.rs +++ b/nexus/db-model/src/inventory.rs @@ -9,6 +9,7 @@ use crate::Generation; use crate::PhysicalDiskKind; use crate::omicron_zone_config::{self, OmicronZoneNic}; use crate::sled_cpu_family::SledCpuFamily; +use crate::to_db_typed_uuid; use crate::typed_uuid::DbTypedUuid; use crate::{ ByteCount, MacAddr, Name, ServiceKind, SqlU8, SqlU16, SqlU32, @@ -27,12 +28,16 @@ use diesel::pg::Pg; use diesel::serialize::ToSql; use diesel::{serialize, sql_types}; use iddqd::IdOrdMap; +use illumos_utils::svcs::SvcInMaintenance; use ipnetwork::IpNetwork; use nexus_db_schema::schema::inv_zone_manifest_non_boot; use nexus_db_schema::schema::inv_zone_manifest_zone; use nexus_db_schema::schema::{ hw_baseboard_id, inv_caboose, inv_clickhouse_keeper_membership, inv_cockroachdb_status, inv_collection, inv_collection_error, inv_dataset, + inv_health_monitor_svc_in_maintenance, + inv_health_monitor_svc_in_maintenance_error, + inv_health_monitor_svc_in_maintenance_service, inv_host_phase_1_active_slot, inv_host_phase_1_flash_hash, inv_internal_dns, inv_last_reconciliation_dataset_result, inv_last_reconciliation_disk_result, inv_last_reconciliation_measurements, @@ -63,6 +68,7 @@ use omicron_common::update::OmicronInstallManifestSource; use omicron_common::zpool_name::ZpoolName; use omicron_uuid_kinds::DatasetKind; use omicron_uuid_kinds::DatasetUuid; +use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::InternalZpoolKind; use omicron_uuid_kinds::MupdateKind; use omicron_uuid_kinds::MupdateOverrideKind; @@ -72,6 +78,8 @@ use omicron_uuid_kinds::OmicronSledConfigUuid; use omicron_uuid_kinds::PhysicalDiskUuid; use omicron_uuid_kinds::SledKind; use omicron_uuid_kinds::SledUuid; +use omicron_uuid_kinds::SvcInMaintenanceKind; +use omicron_uuid_kinds::SvcInMaintenanceUuid; use omicron_uuid_kinds::ZpoolKind; use omicron_uuid_kinds::{CollectionKind, OmicronZoneKind}; use omicron_uuid_kinds::{CollectionUuid, OmicronZoneUuid}; @@ -1016,6 +1024,103 @@ impl_enum_type!( Idle => b"idle" ); +#[derive(Queryable, Clone, Debug, Selectable, Insertable)] +#[diesel(table_name = inv_health_monitor_svc_in_maintenance)] +pub struct InvSvcInMaintenance { + pub inv_collection_id: DbTypedUuid, + pub sled_id: DbTypedUuid, + pub id: DbTypedUuid, + pub svcs_cmd_error: Option, + // TODO-K: This might change to not nullable with omicron#9615 + pub time_of_status: Option>, +} + +impl InvSvcInMaintenance { + pub fn new( + inv_collection_id: CollectionUuid, + sled_id: SledUuid, + svcs_cmd_error: Option, + time_of_status: Option>, + ) -> Self { + // This ID is only used as a primary key, it's fine to generate it here. + let id = to_db_typed_uuid(SvcInMaintenanceUuid::from_untyped_uuid( + Uuid::new_v4(), + )); + + Self { + inv_collection_id: inv_collection_id.into(), + sled_id: sled_id.into(), + id, + svcs_cmd_error, + time_of_status, + } + } +} + +#[derive(Queryable, Clone, Debug, Selectable, Insertable)] +#[diesel(table_name = inv_health_monitor_svc_in_maintenance_service)] +pub struct InvSvcInMaintenanceService { + pub inv_collection_id: DbTypedUuid, + pub sled_id: DbTypedUuid, + // TODO-K: Change the UUID kind? + pub id: DbTypedUuid, + pub fmri: String, + pub zone: String, +} + +impl InvSvcInMaintenanceService { + pub fn new( + inv_collection_id: CollectionUuid, + sled_id: SledUuid, + svc: SvcInMaintenance, + ) -> Self { + let SvcInMaintenance { fmri, zone } = svc; + + // This ID is only used as a primary key, it's fine to generate it here. + let id = to_db_typed_uuid(SvcInMaintenanceUuid::from_untyped_uuid( + Uuid::new_v4(), + )); + + Self { + inv_collection_id: inv_collection_id.into(), + sled_id: sled_id.into(), + id, + fmri, + zone, + } + } +} + +#[derive(Queryable, Clone, Debug, Selectable, Insertable)] +#[diesel(table_name = inv_health_monitor_svc_in_maintenance_error)] +pub struct InvSvcInMaintenanceError { + pub inv_collection_id: DbTypedUuid, + pub sled_id: DbTypedUuid, + // TODO-K: Change the UUID kind? + pub id: DbTypedUuid, + pub error_message: String, +} + +impl InvSvcInMaintenanceError { + pub fn new( + inv_collection_id: CollectionUuid, + sled_id: SledUuid, + error_message: String, + ) -> Self { + // This ID is only used as a primary key, it's fine to generate it here. + let id = to_db_typed_uuid(SvcInMaintenanceUuid::from_untyped_uuid( + Uuid::new_v4(), + )); + + Self { + inv_collection_id: inv_collection_id.into(), + sled_id: sled_id.into(), + id, + error_message, + } + } +} + /// See [`sled_agent_types::inventory::ConfigReconcilerInventory`]. #[derive(Queryable, Clone, Debug, Selectable, Insertable)] #[diesel(table_name = inv_sled_config_reconciler)] diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index d3bd9935cdb..e4947889b61 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock}; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: Version = Version::new(222, 0, 0); +pub const SCHEMA_VERSION: Version = Version::new(223, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -28,6 +28,7 @@ static KNOWN_VERSIONS: LazyLock> = LazyLock::new(|| { // | leaving the first copy as an example for the next person. // v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(223, "health-monitor-svcs-in-maintenance"), KnownVersion::new(222, "audit-log-credential-id"), KnownVersion::new(221, "audit-log-auth-method-enum"), KnownVersion::new(220, "multicast-implicit-lifecycle"), diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index de7346ae98f..f6a973ba916 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -27,6 +27,8 @@ use diesel::sql_types::Nullable; use futures::FutureExt; use futures::future::BoxFuture; use iddqd::{IdOrdItem, IdOrdMap, id_upcast}; +use illumos_utils::svcs::SvcInMaintenance; +use illumos_utils::svcs::SvcsInMaintenanceResult; use nexus_db_errors::ErrorHandler; use nexus_db_errors::public_error_from_diesel; use nexus_db_errors::public_error_from_diesel_lookup; @@ -62,6 +64,9 @@ use nexus_db_model::InvServiceProcessor; use nexus_db_model::InvSledAgent; use nexus_db_model::InvSledBootPartition; use nexus_db_model::InvSledConfigReconciler; +use nexus_db_model::InvSvcInMaintenance; +use nexus_db_model::InvSvcInMaintenanceError; +use nexus_db_model::InvSvcInMaintenanceService; use nexus_db_model::InvZpool; use nexus_db_model::RotImageError; use nexus_db_model::SledRole; @@ -210,6 +215,86 @@ impl DataStore { } } + // Pull services in maintenance result out of all sled agents + // TODO-K: change the variable name to svcs_in_maintenance + let svcs_in_maintenance: Vec<_> = collection + .sled_agents + .iter() + .flat_map(|sled_agent| { + match &sled_agent.health_monitor.smf_services_in_maintenance { + Ok(svcs) => { + vec![InvSvcInMaintenance::new( + collection_id, + sled_agent.sled_id, + None, + svcs.time_of_status, + )] + } + Err(e) => { + vec![InvSvcInMaintenance::new( + collection_id, + sled_agent.sled_id, + Some(e.to_string()), + // TODO-K: This might change to not nullable with omicron#9615 + // TODO-K: I'm guessing in this case I'll set a time + // at this point or something? + None, + )] + } + } + }) + .collect(); + + // Pull services in maintenance details out of all sled agents + let svcs_in_maintenance_services: Vec<_> = collection + .sled_agents + .iter() + .flat_map(|sled_agent| { + match &sled_agent.health_monitor.smf_services_in_maintenance { + Ok(svcs) => svcs + .services + .iter() + .map(|svc| { + InvSvcInMaintenanceService::new( + collection_id, + sled_agent.sled_id, + svc.clone(), + ) + }) + .collect(), + // If there is an error we've already captured it above + Err(_) => { + vec![] + } + } + }) + .collect(); + + // Pull services in maintenance errors out of all sled agents + let svcs_in_maintenance_errors: Vec<_> = collection + .sled_agents + .iter() + .flat_map(|sled_agent| { + match &sled_agent.health_monitor.smf_services_in_maintenance { + Ok(svcs) => svcs + .errors + .iter() + .map(|e| { + InvSvcInMaintenanceError::new( + collection_id, + sled_agent.sled_id, + e.clone(), + ) + }) + .collect(), + // If there is an error we've already captured it above + Err(_) => { + vec![] + } + } + }) + .collect(); + // Pull disks out of all sled agents let disks: Vec<_> = collection .sled_agents @@ -1507,6 +1592,63 @@ impl DataStore { } } + // Insert rows for all the SMF services in maintenance results we found + { + use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance::dsl; + + let batch_size = SQL_BATCH_SIZE.get().try_into().unwrap(); + let mut svcs_in_maintenance = svcs_in_maintenance.into_iter(); + loop { + let some_svcs_in_maintenance = + svcs_in_maintenance.by_ref().take(batch_size).collect::>(); + if some_svcs_in_maintenance.is_empty() { + break; + } + let _ = diesel::insert_into(dsl::inv_health_monitor_svc_in_maintenance) + .values(some_svcs_in_maintenance) + .execute_async(&conn) + .await?; + } + } + + // Insert rows for all the SMF services in maintenance we found + { + use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance_service::dsl; + + let batch_size = SQL_BATCH_SIZE.get().try_into().unwrap(); + let mut svcs_in_maintenance_services = svcs_in_maintenance_services.into_iter(); + loop { + let some_svcs_in_maintenance_services = + svcs_in_maintenance_services.by_ref().take(batch_size).collect::>(); + if some_svcs_in_maintenance_services.is_empty() { + break; + } + let _ = diesel::insert_into(dsl::inv_health_monitor_svc_in_maintenance_service) + .values(some_svcs_in_maintenance_services) + .execute_async(&conn) + .await?; + } + } + + // Insert rows for all the SMF services in maintenance errors we found + { + use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance_error::dsl; + + let batch_size = SQL_BATCH_SIZE.get().try_into().unwrap(); + let mut svcs_in_maintenance_errors = svcs_in_maintenance_errors.into_iter(); + loop { + let some_svcs_in_maintenance_errors = + svcs_in_maintenance_errors.by_ref().take(batch_size).collect::>(); + if some_svcs_in_maintenance_errors.is_empty() { + break; + } + let _ = diesel::insert_into(dsl::inv_health_monitor_svc_in_maintenance_error) + .values(some_svcs_in_maintenance_errors) + .execute_async(&conn) + .await?; + } + } + // Insert rows for the sled agents that we found. In practice, we'd // expect these to all have baseboards (if using Oxide hardware) or // none have baseboards (if not). @@ -2011,6 +2153,9 @@ impl DataStore { nmupdate_override_non_boot: usize, nconfig_reconcilers: usize, nboot_partitions: usize, + nhealth_monitor_svc_in_maintenance: usize, + nhealth_monitor_svc_in_maintenance_service: usize, + nhealth_monitor_svc_in_maintenance_error: usize, nomicron_sled_configs: usize, nomicron_sled_config_disks: usize, nomicron_sled_config_datasets: usize, @@ -2048,6 +2193,9 @@ impl DataStore { nmupdate_override_non_boot, nconfig_reconcilers, nboot_partitions, + nhealth_monitor_svc_in_maintenance, + nhealth_monitor_svc_in_maintenance_service, + nhealth_monitor_svc_in_maintenance_error, nomicron_sled_configs, nomicron_sled_config_disks, nomicron_sled_config_datasets, @@ -2278,6 +2426,34 @@ impl DataStore { .await? }; + // Remove rows associated with the health monitor + let nhealth_monitor_svc_in_maintenance = { + use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance::dsl; + diesel::delete(dsl::inv_health_monitor_svc_in_maintenance.filter( + dsl::inv_collection_id.eq(db_collection_id), + )) + .execute_async(&conn) + .await? + }; + + let nhealth_monitor_svc_in_maintenance_service = { + use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance_service::dsl; + diesel::delete(dsl::inv_health_monitor_svc_in_maintenance_service.filter( + dsl::inv_collection_id.eq(db_collection_id), + )) + .execute_async(&conn) + .await? + }; + + let nhealth_monitor_svc_in_maintenance_error = { + use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance_error::dsl; + diesel::delete(dsl::inv_health_monitor_svc_in_maintenance_error.filter( + dsl::inv_collection_id.eq(db_collection_id), + )) + .execute_async(&conn) + .await? + }; + // Remove rows associated with `OmicronSledConfig`s. let nomicron_sled_configs = { use nexus_db_schema::schema::inv_omicron_sled_config::dsl; @@ -2409,6 +2585,9 @@ impl DataStore { nmupdate_override_non_boot, nconfig_reconcilers, nboot_partitions, + nhealth_monitor_svc_in_maintenance, + nhealth_monitor_svc_in_maintenance_service, + nhealth_monitor_svc_in_maintenance_error, nomicron_sled_configs, nomicron_sled_config_disks, nomicron_sled_config_datasets, @@ -2457,6 +2636,9 @@ impl DataStore { "nmupdate_override_non_boot" => nmupdate_override_non_boot, "nconfig_reconcilers" => nconfig_reconcilers, "nboot_partitions" => nboot_partitions, + "nhealth_monitor_svc_in_maintenance" => nhealth_monitor_svc_in_maintenance, + "nhealth_monitor_svc_in_maintenance_service" => nhealth_monitor_svc_in_maintenance_service, + "nhealth_monitor_svc_in_maintenance_error" => nhealth_monitor_svc_in_maintenance_error, "nomicron_sled_configs" => nomicron_sled_configs, "nomicron_sled_config_disks" => nomicron_sled_config_disks, "nomicron_sled_config_datasets" => nomicron_sled_config_datasets, @@ -2860,6 +3042,109 @@ impl DataStore { datasets }; + // Mapping of "Sled ID" -> "The result of SMF services in maintenance + // reported by that sled" + let mut svcs_in_maintenance_by_sled = { + use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance::dsl; + + let mut svcs = BTreeMap::>::new(); + let mut paginator = Paginator::new( + batch_size, + dropshot::PaginationOrder::Ascending, + ); + while let Some(p) = paginator.next() { + let batch: Vec = paginated_multicolumn( + dsl::inv_health_monitor_svc_in_maintenance, + (dsl::sled_id, dsl::id), + &p.current_pagparams(), + ) + .filter(dsl::inv_collection_id.eq(db_id)) + .select(InvSvcInMaintenance::as_select()) + .load_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + })?; + paginator = p.found_batch(&batch, &|row| (row.sled_id, row.id)); + for svc in batch { + svcs.entry(svc.sled_id.into_untyped_uuid()) + .or_default() + .push(svc); + } + } + svcs + }; + + // Mapping of "Sled ID" -> "All SMF services in maintenance reported by + // that sled" + let mut svcs_in_maintenance_services_by_sled = { + use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance_service::dsl; + + let mut svcs = + BTreeMap::>::new(); + let mut paginator = Paginator::new( + batch_size, + dropshot::PaginationOrder::Ascending, + ); + while let Some(p) = paginator.next() { + let batch: Vec = + paginated_multicolumn( + dsl::inv_health_monitor_svc_in_maintenance_service, + (dsl::sled_id, dsl::id), + &p.current_pagparams(), + ) + .filter(dsl::inv_collection_id.eq(db_id)) + .select(InvSvcInMaintenanceService::as_select()) + .load_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + })?; + paginator = p.found_batch(&batch, &|row| (row.sled_id, row.id)); + for svc in batch { + svcs.entry(svc.sled_id.into_untyped_uuid()) + .or_default() + .push(svc); + } + } + svcs + }; + + // Mapping of "Sled ID" -> "All SMF services in maintenance errors reported by + // that sled" + let mut svcs_in_maintenance_errors_by_sled = { + use nexus_db_schema::schema::inv_health_monitor_svc_in_maintenance_error::dsl; + + let mut svcs = + BTreeMap::>::new(); + let mut paginator = Paginator::new( + batch_size, + dropshot::PaginationOrder::Ascending, + ); + while let Some(p) = paginator.next() { + let batch: Vec = + paginated_multicolumn( + dsl::inv_health_monitor_svc_in_maintenance_error, + (dsl::sled_id, dsl::id), + &p.current_pagparams(), + ) + .filter(dsl::inv_collection_id.eq(db_id)) + .select(InvSvcInMaintenanceError::as_select()) + .load_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + })?; + paginator = p.found_batch(&batch, &|row| (row.sled_id, row.id)); + for svc in batch { + svcs.entry(svc.sled_id.into_untyped_uuid()) + .or_default() + .push(svc); + } + } + svcs + }; + // Collect the unique baseboard ids referenced by SPs, RoTs, and Sled // Agents. let baseboard_id_ids: BTreeSet<_> = sps @@ -4248,6 +4533,62 @@ impl DataStore { )) })?; + // Convert all health checks into a full `HealthMonitorInventory` + let mut health_monitor = HealthMonitorInventory::new(); + + let svcs_in_maintenance = svcs_in_maintenance_by_sled + .remove(&sled_id.into_untyped_uuid()) + .map(|rows| { + // There should only be one row per collection per sled + let first_row = rows.first().ok_or_else(|| { + format!( + "missing SMF services in maintenance details \ + for sled {sled_id} that should have been \ + fetched" + ) + })?; + + // Check if the svcs command itself failed first. If so, we + // can safely assume no services in maintenance have been + // reported and return an error. + if let Some(e) = &first_row.svcs_cmd_error { + return Err(e.clone()); + } + + // Collect all services from svcs_in_maintenance_services_by_sled + // for this sled. + let services: Vec = + svcs_in_maintenance_services_by_sled + .remove(&sled_id.into_untyped_uuid()) + .unwrap_or_default() + .into_iter() + .map(|svc| SvcInMaintenance { + fmri: svc.fmri, + zone: svc.zone, + }) + .collect(); + + // Collect all errors from svcs_in_maintenance_errors_by_sled + // for this sled. + let errors: Vec = + svcs_in_maintenance_errors_by_sled + .remove(&sled_id.into_untyped_uuid()) + .unwrap_or_default() + .into_iter() + .map(|err| err.error_message) + .collect(); + + Ok(SvcsInMaintenanceResult { + services, + errors, + time_of_status: first_row.time_of_status, + }) + }); + + if let Some(svcs) = svcs_in_maintenance { + health_monitor.smf_services_in_maintenance = svcs + }; + let sled_agent = nexus_types::inventory::SledAgent { time_collected: s.time_collected, source: s.source, @@ -4284,9 +4625,7 @@ impl DataStore { reconciler_status, last_reconciliation, file_source_resolver, - // TODO-K[omicron#9516]: Actually query the DB when there is - // something there - health_monitor: HealthMonitorInventory::new(), + health_monitor, }; sled_agents .insert_unique(sled_agent) diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index 46658e98c19..c11cc771867 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -1721,6 +1721,36 @@ table! { } } +table! { + inv_health_monitor_svc_in_maintenance (inv_collection_id, sled_id, id) { + inv_collection_id -> Uuid, + sled_id -> Uuid, + id -> Uuid, + svcs_cmd_error -> Nullable, + // TODO-K: This might change to not nullable with omicron#9615 + time_of_status -> Nullable, + } +} + +table! { + inv_health_monitor_svc_in_maintenance_service (inv_collection_id, sled_id, id) { + inv_collection_id -> Uuid, + sled_id -> Uuid, + id -> Uuid, + fmri -> Text, + zone -> Text, + } +} + +table! { + inv_health_monitor_svc_in_maintenance_error (inv_collection_id, sled_id, id) { + inv_collection_id -> Uuid, + sled_id -> Uuid, + id -> Uuid, + error_message -> Text, + } +} + table! { inv_sled_boot_partition (inv_collection_id, sled_id, boot_disk_slot) { inv_collection_id -> Uuid, diff --git a/nexus/inventory/src/examples.rs b/nexus/inventory/src/examples.rs index bc415c5fb28..0bbd55de4a2 100644 --- a/nexus/inventory/src/examples.rs +++ b/nexus/inventory/src/examples.rs @@ -16,6 +16,8 @@ use gateway_client::types::SpComponentCaboose; use gateway_client::types::SpState; use gateway_types::rot::RotSlot; use iddqd::id_ord_map; +use illumos_utils::svcs::SvcInMaintenance; +use illumos_utils::svcs::SvcsInMaintenanceResult; use nexus_types::inventory::CabooseWhich; use nexus_types::inventory::InternalDnsGenerationStatus; use nexus_types::inventory::RotPage; @@ -581,6 +583,7 @@ pub fn representative() -> Representative { has_mupdate_override: true, }, ), + HealthMonitorInventory::new(), ), ) .unwrap(); @@ -615,6 +618,7 @@ pub fn representative() -> Representative { has_mupdate_override: false, }, ), + HealthMonitorInventory::new(), ), ) .unwrap(); @@ -647,13 +651,14 @@ pub fn representative() -> Representative { has_mupdate_override: true, }, ), + HealthMonitorInventory::new(), ), ) .unwrap(); // Finally, report a sled with unknown baseboard information. This should // look the same as the PC as far as inventory is concerned but let's verify - // it. + // it. Additionally, this sled will report a few SMF services in maintenance. let sled_agent_id_unknown = "5c5b4cf9-3e13-45fd-871c-f177d6537510".parse().unwrap(); @@ -674,6 +679,18 @@ pub fn representative() -> Representative { file_source_resolver( OmicronFileSourceResolverExampleKind::Error, ), + HealthMonitorInventory { + smf_services_in_maintenance: Ok(SvcsInMaintenanceResult { + services: vec![SvcInMaintenance { + fmri: "svc:/site/fake-service:default".to_string(), + zone: "global".to_string(), + }], + errors: vec!["an unimportant error".to_string()], + time_of_status: Some( + "2026-01-01T00:00:00Z".parse().unwrap(), + ), + }), + }, ), ) .unwrap(); @@ -1015,6 +1032,7 @@ pub fn sled_agent( datasets: Vec, ledgered_sled_config: Option, file_source_resolver: OmicronFileSourceResolverInventory, + health_monitor: HealthMonitorInventory, ) -> Inventory { // Assume the `ledgered_sled_config` was reconciled successfully. let last_reconciliation = ledgered_sled_config.clone().map(|config| { @@ -1087,9 +1105,6 @@ pub fn sled_agent( reconciler_status, last_reconciliation, file_source_resolver, - // TODO-K: We'll want to have the functionality to add some services - // here in a future PR. This will be more useful when we add this - // information to the DB. - health_monitor: HealthMonitorInventory::new(), + health_monitor, } } diff --git a/nexus/types/src/inventory/display.rs b/nexus/types/src/inventory/display.rs index 9416d4c9f8d..71e84beffcb 100644 --- a/nexus/types/src/inventory/display.rs +++ b/nexus/types/src/inventory/display.rs @@ -14,12 +14,14 @@ use chrono::SecondsFormat; use clap::Subcommand; use gateway_types::component::SpType; use iddqd::IdOrdMap; +use illumos_utils::svcs::SvcsInMaintenanceResult; use indent_write::fmt::IndentWriter; use itertools::Itertools; use omicron_common::disk::M2Slot; use omicron_uuid_kinds::{ DatasetUuid, OmicronZoneUuid, PhysicalDiskUuid, ZpoolUuid, }; +use sled_agent_types::inventory::HealthMonitorInventory; use sled_agent_types_versions::latest::inventory::{ BootImageHeader, BootPartitionContents, BootPartitionDetails, ConfigReconcilerInventory, ConfigReconcilerInventoryResult, @@ -907,41 +909,8 @@ fn display_sleds( } } - // TODO-K[omicron#9516]: This is temporarily hidden until we add the - // health monitor types to the DB. Once those have been integrated, - // we'll show health monitor status when everything is healthy as well. - if !health_monitor.is_empty() { - writeln!(indented, "HEALTH MONITOR")?; - let mut indent2 = IndentWriter::new(" ", &mut indented); - match &health_monitor.smf_services_in_maintenance { - Ok(svcs) => { - if !svcs.is_empty() { - if let Some(time_of_status) = &svcs.time_of_status { - writeln!( - indent2, - "SMF services in maintenance at {}:", - time_of_status.to_rfc3339_opts( - SecondsFormat::Millis, - /* use_z */ true, - ) - )?; - } - let mut indent3 = IndentWriter::new(" ", &mut indent2); - for svc in &svcs.services { - writeln!(indent3, "{svc}")?; - } - } - } - Err(e) => { - writeln!( - indent2, - "failed to retrieve SMF services in maintenance: {e}" - )?; - } - } - } - f = indented.into_inner(); + display_health_monitor(health_monitor, f)?; } Ok(()) } @@ -1133,6 +1102,85 @@ fn collect_config_reconciler_errors( .collect() } +fn display_health_monitor( + health_monitor: &HealthMonitorInventory, + f: &mut dyn fmt::Write, +) -> fmt::Result { + let HealthMonitorInventory { smf_services_in_maintenance } = health_monitor; + + writeln!(f, "\nHEALTH MONITOR")?; + + let mut indented = IndentWriter::new(" ", f); + + match &smf_services_in_maintenance { + Ok(svcs) => { + if !svcs.is_empty() { + let SvcsInMaintenanceResult { + services, + errors, + time_of_status, + } = svcs; + let time = if let Some(t) = time_of_status { + t.to_rfc3339_opts( + SecondsFormat::Millis, + /* use_z */ true, + ) + } else { + "unknown time".to_string() + }; + + writeln!( + indented, + "{} SMF services in maintenance at {}", + services.len(), + time + )?; + + if !services.is_empty() { + #[derive(Tabled)] + #[tabled(rename_all = "SCREAMING_SNAKE_CASE")] + struct SvcRow { + fmri: String, + zone: String, + } + let rows = services.iter().map(|s| SvcRow { + fmri: s.fmri.clone(), + zone: s.zone.clone(), + }); + let table = tabled::Table::new(rows) + .with(tabled::settings::Style::empty()) + .with(tabled::settings::Padding::new(4, 1, 0, 0)) + .to_string(); + writeln!(indented, "{table}")?; + }; + if !errors.is_empty() { + writeln!( + indented, + "\nfound errors when retrieving services in maintenance:" + )?; + let mut indent2 = IndentWriter::new(" ", &mut indented); + for e in errors { + writeln!(indent2, "{e}")?; + } + } + } else { + writeln!( + indented, + "no data on SMF services in maintenance has been collected" + )?; + } + } + Err(e) => { + writeln!( + indented, + "failed to retrieve SMF services in maintenance: {e}" + )?; + } + }; + + Ok(()) +} + fn display_sled_config( label: &str, config: &OmicronSledConfig, diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 52f9d115d76..865028b50f3 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -4060,6 +4060,70 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_sled_agent ( PRIMARY KEY (inv_collection_id, sled_id) ); +CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance ( + -- where this observation came from + -- (foreign key into `inv_collection` table) + inv_collection_id UUID NOT NULL, + + -- unique id for this sled (should be foreign keys into `sled` table, though + -- it's conceivable a sled will report an id that we don't know about); + -- guaranteed to match a row in this collection's `inv_sled_agent` + sled_id UUID NOT NULL, + + -- unique id for each row + id UUID NOT NULL, + + -- error when calling the svcs command + svcs_cmd_error TEXT, + + -- time when the status was checked if applicable + -- TODO-K: This will change to not null with omicron#9615 + time_of_status TIMESTAMPTZ, + + PRIMARY KEY (inv_collection_id, sled_id, id) +); + +CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance_service ( + -- where this observation came from + -- (foreign key into `inv_collection` table) + inv_collection_id UUID NOT NULL, + + -- unique id for this sled (should be foreign keys into `sled` table, though + -- it's conceivable a sled will report an id that we don't know about); + -- guaranteed to match a row in this collection's `inv_sled_agent` + sled_id UUID NOT NULL, + + -- unique id for each row + id UUID NOT NULL, + + -- FMRI of the SMF service in maintenance + fmri TEXT, + + -- zone the SMF service in maintenance is located in + zone TEXT, + + PRIMARY KEY (inv_collection_id, sled_id, id) +); + +CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance_error ( + -- where this observation came from + -- (foreign key into `inv_collection` table) + inv_collection_id UUID NOT NULL, + + -- unique id for this sled (should be foreign keys into `sled` table, though + -- it's conceivable a sled will report an id that we don't know about); + -- guaranteed to match a row in this collection's `inv_sled_agent` + sled_id UUID NOT NULL, + + -- unique id for each row + id UUID NOT NULL, + + -- an error message found when retrieving the SMF services in maintenance + error_message TEXT, + + PRIMARY KEY (inv_collection_id, sled_id, id) +); + -- This type name starts with "clear_" for legacy reasons. Prefer "remove" in -- the future. CREATE TYPE IF NOT EXISTS omicron.public.clear_mupdate_override_boot_success @@ -7818,7 +7882,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '222.0.0', NULL) + (TRUE, NOW(), NOW(), '223.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/schema/crdb/health-monitor-svcs-in-maintenance/up01.sql b/schema/crdb/health-monitor-svcs-in-maintenance/up01.sql new file mode 100644 index 00000000000..d53a568a5f3 --- /dev/null +++ b/schema/crdb/health-monitor-svcs-in-maintenance/up01.sql @@ -0,0 +1,22 @@ +CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance ( + -- where this observation came from + -- (foreign key into `inv_collection` table) + inv_collection_id UUID NOT NULL, + + -- unique id for this sled (should be foreign keys into `sled` table, though + -- it's conceivable a sled will report an id that we don't know about); + -- guaranteed to match a row in this collection's `inv_sled_agent` + sled_id UUID NOT NULL, + + -- unique id for each row + id UUID NOT NULL, + + -- error when calling the svcs command + svcs_cmd_error TEXT, + + -- time when the status was checked if applicable + -- TODO-K: This will change to not null with omicron#9615 + time_of_status TIMESTAMPTZ, + + PRIMARY KEY (inv_collection_id, sled_id, id) +); \ No newline at end of file diff --git a/schema/crdb/health-monitor-svcs-in-maintenance/up02.sql b/schema/crdb/health-monitor-svcs-in-maintenance/up02.sql new file mode 100644 index 00000000000..5f1bd55b010 --- /dev/null +++ b/schema/crdb/health-monitor-svcs-in-maintenance/up02.sql @@ -0,0 +1,21 @@ +CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance_service ( + -- where this observation came from + -- (foreign key into `inv_collection` table) + inv_collection_id UUID NOT NULL, + + -- unique id for this sled (should be foreign keys into `sled` table, though + -- it's conceivable a sled will report an id that we don't know about); + -- guaranteed to match a row in this collection's `inv_sled_agent` + sled_id UUID NOT NULL, + + -- unique id for each row + id UUID NOT NULL, + + -- FMRI of the SMF service in maintenance + fmri TEXT, + + -- zone the SMF service in maintenance is located in + zone TEXT, + + PRIMARY KEY (inv_collection_id, sled_id, id) +); \ No newline at end of file diff --git a/schema/crdb/health-monitor-svcs-in-maintenance/up03.sql b/schema/crdb/health-monitor-svcs-in-maintenance/up03.sql new file mode 100644 index 00000000000..c3b2c4ae5f6 --- /dev/null +++ b/schema/crdb/health-monitor-svcs-in-maintenance/up03.sql @@ -0,0 +1,18 @@ +CREATE TABLE IF NOT EXISTS omicron.public.inv_health_monitor_svc_in_maintenance_error ( + -- where this observation came from + -- (foreign key into `inv_collection` table) + inv_collection_id UUID NOT NULL, + + -- unique id for this sled (should be foreign keys into `sled` table, though + -- it's conceivable a sled will report an id that we don't know about); + -- guaranteed to match a row in this collection's `inv_sled_agent` + sled_id UUID NOT NULL, + + -- unique id for each row + id UUID NOT NULL, + + -- an error message found when retrieving the SMF services in maintenance + error_message TEXT, + + PRIMARY KEY (inv_collection_id, sled_id, id) +); \ No newline at end of file diff --git a/sled-agent/src/long_running_tasks.rs b/sled-agent/src/long_running_tasks.rs index e42d3c1b0dd..994d390ad82 100644 --- a/sled-agent/src/long_running_tasks.rs +++ b/sled-agent/src/long_running_tasks.rs @@ -284,7 +284,8 @@ async fn spawn_bootstore_tasks( node_handle } -async fn spawn_health_monitor_tasks(log: &Logger) -> HealthMonitorHandle { +// TODO-K: Remove pub +pub async fn spawn_health_monitor_tasks(log: &Logger) -> HealthMonitorHandle { info!(log, "Starting health monitor"); let log = log.new(o!("component" => "HealthMonitor")); HealthMonitorHandle::spawn(log) diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index bb6e9c028e8..eb88511a5d0 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -168,7 +168,10 @@ impl SledAgent { .await .start(&log, &config.dropshot); + // TODO-K: Uncomment and remove let health_monitor = HealthMonitorHandle::stub(); + //let health_monitor = + // crate::long_running_tasks::spawn_health_monitor_tasks(&log).await; Arc::new(SledAgent { id, diff --git a/uuid-kinds/src/lib.rs b/uuid-kinds/src/lib.rs index 2eb483166e9..a5dbedf77be 100644 --- a/uuid-kinds/src/lib.rs +++ b/uuid-kinds/src/lib.rs @@ -84,6 +84,9 @@ impl_typed_uuid_kinds! { Sled = {}, SpUpdate = {}, SupportBundle = {}, + // `SvcInMaintenance`s do not contain IDs themselves. These IDs exist + // for the same reason as those in `OmicronSledConfig`. + SvcInMaintenance = {}, TufArtifact = {}, TufRepo = {}, TufTrustRoot = {},