From 322d5b3fb70967e1dd64da3db11e619ad8fdad2b Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 16 Apr 2026 12:44:54 -0700 Subject: [PATCH 01/24] Add FMD fault inventory to sled-agent API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Exposes illumos Fault Management Daemon (FMD) data through the sled-agent inventory endpoint. This lets the control plane see diagnosed hardware/software faults on each sled. New API version 35 adds an `fmd: Option<FmdInventory>` field to `Inventory`. When present, it contains: - Cases: diagnosed faults with UUID, diagnostic code, URL, and the full event nvlist serialized as JSON - Resources: affected components with FMRI, fault status flags On illumos, sled-agent queries FMD on each inventory request. On non-illumos (sim, tests), the field is None. Database storage is not included — that's a follow-up. --- Cargo.lock | 50 ++++ Cargo.toml | 2 + clients/sled-agent-client/src/lib.rs | 3 + cockroach-admin/Cargo.toml | 4 + dev-tools/omdb/Cargo.toml | 4 + dev-tools/omicron-dev/Cargo.toml | 4 + end-to-end-tests/Cargo.toml | 8 + end-to-end-tests/build.rs | 10 + nexus/Cargo.toml | 4 + nexus/db-queries/Cargo.toml | 4 + .../src/db/datastore/physical_disk.rs | 1 + nexus/inventory/Cargo.toml | 8 + nexus/inventory/build.rs | 10 + nexus/inventory/src/examples.rs | 1 + nexus/metrics-producer-gc/Cargo.toml | 4 + .../src/test_util/host_phase_2_test_state.rs | 1 + .../cli-integration-tests/Cargo.toml | 4 + nexus/reconfigurator/execution/Cargo.toml | 4 + .../planning/src/mgs_updates/test_helpers.rs | 1 + nexus/reconfigurator/planning/src/system.rs | 2 + nexus/saga-recovery/Cargo.toml | 4 + nexus/test-utils/Cargo.toml | 4 + ntp-admin/Cargo.toml | 4 + .../sled-agent-34.0.0-37fbac.json.gitstub | 1 + ...bac.json => sled-agent-35.0.0-d9a875.json} | 147 ++++++++++- openapi/sled-agent/sled-agent-latest.json | 2 +- rpaths/src/lib.rs | 113 ++++---- sled-agent/Cargo.toml | 8 + sled-agent/api/src/lib.rs | 22 +- sled-agent/build.rs | 
10 + sled-agent/src/fmd.rs | 246 ++++++++++++++++++ sled-agent/src/lib.rs | 1 + sled-agent/src/rack_setup/plan/service.rs | 1 + sled-agent/src/rack_setup/service.rs | 1 + sled-agent/src/sim/sled_agent.rs | 1 + sled-agent/src/sled_agent.rs | 3 + .../src/add_fmd_to_inventory/inventory.rs | 135 ++++++++++ .../versions/src/add_fmd_to_inventory/mod.rs | 10 + sled-agent/types/versions/src/latest.rs | 5 +- sled-agent/types/versions/src/lib.rs | 2 + 40 files changed, 787 insertions(+), 62 deletions(-) create mode 100644 end-to-end-tests/build.rs create mode 100644 nexus/inventory/build.rs create mode 100644 openapi/sled-agent/sled-agent-34.0.0-37fbac.json.gitstub rename openapi/sled-agent/{sled-agent-34.0.0-37fbac.json => sled-agent-35.0.0-d9a875.json} (98%) create mode 100644 sled-agent/build.rs create mode 100644 sled-agent/src/fmd.rs create mode 100644 sled-agent/types/versions/src/add_fmd_to_inventory/inventory.rs create mode 100644 sled-agent/types/versions/src/add_fmd_to_inventory/mod.rs diff --git a/Cargo.lock b/Cargo.lock index e07f4749e48..da6cbf42ec5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3510,6 +3510,7 @@ dependencies = [ "clap", "colored 2.2.0", "dhcproto", + "fmd-adm-sys", "futures", "hickory-resolver 0.25.2", "http", @@ -3520,6 +3521,7 @@ dependencies = [ "ispf", "macaddr", "nexus-lockstep-client", + "omicron-rpaths", "omicron-sled-agent", "omicron-test-utils", "omicron-uuid-kinds", @@ -3810,6 +3812,23 @@ dependencies = [ "spin", ] +[[package]] +name = "fmd-adm" +version = "0.3.0" +source = "git+https://github.com/oxidecomputer/fmd-adm?rev=fffb52212fb1e073e9f1b16761b3614af8b38063#fffb52212fb1e073e9f1b16761b3614af8b38063" +dependencies = [ + "fmd-adm-sys", + "illumos-nvpair", + "libc", + "thiserror 2.0.18", + "uuid", +] + +[[package]] +name = "fmd-adm-sys" +version = "0.4.1" +source = "git+https://github.com/oxidecomputer/fmd-adm?rev=fffb52212fb1e073e9f1b16761b3614af8b38063#fffb52212fb1e073e9f1b16761b3614af8b38063" + [[package]] name = "fnv" version = 
"1.0.7" @@ -5278,6 +5297,21 @@ dependencies = [ "num_enum 0.5.11", ] +[[package]] +name = "illumos-nvpair" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66fe4718e52c3654c4e6c6501ce8387d334acd0910b7b7bb3a554aad0262d09f" +dependencies = [ + "illumos-nvpair-sys", +] + +[[package]] +name = "illumos-nvpair-sys" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4b8219c9c8c2c844dfd5772ec0bda5cd2a81d78c4579aba97f699721d46ab24" + [[package]] name = "illumos-sys-hdrs" version = "0.1.0" @@ -6967,6 +7001,7 @@ dependencies = [ "dropshot 0.16.7", "ereport-types", "expectorate", + "fmd-adm-sys", "futures", "gateway-client", "gateway-types", @@ -7156,6 +7191,7 @@ dependencies = [ "cockroach-admin-types", "dns-service-client", "expectorate", + "fmd-adm-sys", "futures", "gateway-client", "gateway-messages", @@ -7170,6 +7206,7 @@ dependencies = [ "ntp-admin-client", "omicron-cockroach-metrics", "omicron-common", + "omicron-rpaths", "omicron-sled-agent", "omicron-uuid-kinds", "omicron-workspace-hack", @@ -7254,6 +7291,7 @@ dependencies = [ "async-bb8-diesel", "chrono", "diesel", + "fmd-adm-sys", "futures", "httptest", "ipnetwork", @@ -7368,6 +7406,7 @@ dependencies = [ "camino", "camino-tempfile", "clap", + "fmd-adm-sys", "nexus-db-queries", "nexus-lockstep-client", "nexus-reconfigurator-preparation", @@ -7404,6 +7443,7 @@ dependencies = [ "clickhouse-admin-types", "cockroach-admin-client", "diesel", + "fmd-adm-sys", "futures", "httptest", "iddqd", @@ -7586,6 +7626,7 @@ name = "nexus-saga-recovery" version = "0.1.0" dependencies = [ "chrono", + "fmd-adm-sys", "futures", "nexus-auth", "nexus-db-model", @@ -7642,6 +7683,7 @@ dependencies = [ "dns-service-client", "dpd-client 0.1.0 (git+https://github.com/oxidecomputer/dendrite?rev=44a949c9bedf4fcd4d280337fa1965b4293c88d1)", "dropshot 0.16.7", + "fmd-adm-sys", "futures", "gateway-messages", "gateway-test-utils", @@ -8243,6 +8285,7 @@ 
dependencies = [ "csv", "dropshot 0.16.7", "expectorate", + "fmd-adm-sys", "http", "illumos-utils", "nexus-test-utils", @@ -8375,6 +8418,7 @@ dependencies = [ "clap", "dropshot 0.16.7", "expectorate", + "fmd-adm-sys", "futures", "gateway-client", "gateway-test-utils", @@ -8602,6 +8646,7 @@ dependencies = [ "ereport-types", "expectorate", "fatfs", + "fmd-adm-sys", "futures", "gateway-client", "gateway-messages", @@ -8760,6 +8805,7 @@ dependencies = [ "clap", "dropshot 0.16.7", "expectorate", + "fmd-adm-sys", "http", "nexus-test-utils", "ntp-admin-api", @@ -8815,6 +8861,7 @@ dependencies = [ "dyn-clone", "ereport-types", "expectorate", + "fmd-adm-sys", "futures", "gateway-client", "gateway-messages", @@ -9097,6 +9144,8 @@ dependencies = [ "expectorate", "flate2", "flume", + "fmd-adm", + "fmd-adm-sys", "futures", "gateway-client", "glob", @@ -9129,6 +9178,7 @@ dependencies = [ "omicron-common", "omicron-ddm-admin-client", "omicron-ledger", + "omicron-rpaths", "omicron-test-utils", "omicron-uuid-kinds", "omicron-workspace-hack", diff --git a/Cargo.toml b/Cargo.toml index add10a0aa73..22b3d77f9a8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -504,6 +504,8 @@ filetime = "0.2.26" flate2 = "1.1.2" float-ord = "0.3.2" flume = "0.11.1" +fmd-adm = { git = "https://github.com/oxidecomputer/fmd-adm", rev = "fffb52212fb1e073e9f1b16761b3614af8b38063" } +fmd-adm-sys = { git = "https://github.com/oxidecomputer/fmd-adm", rev = "fffb52212fb1e073e9f1b16761b3614af8b38063" } foreign-types = "0.3.2" fs-err = "3.1.1" futures = "0.3.31" diff --git a/clients/sled-agent-client/src/lib.rs b/clients/sled-agent-client/src/lib.rs index 0bc41e434bd..8769decd0ea 100644 --- a/clients/sled-agent-client/src/lib.rs +++ b/clients/sled-agent-client/src/lib.rs @@ -60,6 +60,9 @@ progenitor::generate_api!( ExternalIpConfig = omicron_common::api::internal::shared::ExternalIpConfig, ExternalIpv4Config = omicron_common::api::internal::shared::ExternalIpv4Config, ExternalIpv6Config = 
omicron_common::api::internal::shared::ExternalIpv6Config, + FmdCase = sled_agent_types_versions::latest::inventory::FmdCase, + FmdInventory = sled_agent_types_versions::latest::inventory::FmdInventory, + FmdResource = sled_agent_types_versions::latest::inventory::FmdResource, Generation = omicron_common::api::external::Generation, Hostname = omicron_common::api::external::Hostname, ImportExportPolicy = sled_agent_types_versions::latest::early_networking::ImportExportPolicy, diff --git a/cockroach-admin/Cargo.toml b/cockroach-admin/Cargo.toml index d3a7fca7713..903968981b5 100644 --- a/cockroach-admin/Cargo.toml +++ b/cockroach-admin/Cargo.toml @@ -39,6 +39,10 @@ toml.workspace = true omicron-workspace-hack.workspace = true +[target.'cfg(target_os = "illumos")'.dependencies] +# See omicron-rpaths for more about the "fmd-adm-sys" dependency. +fmd-adm-sys.workspace = true + [dev-dependencies] expectorate.workspace = true nexus-test-utils.workspace = true diff --git a/dev-tools/omdb/Cargo.toml b/dev-tools/omdb/Cargo.toml index 424f7f3100a..d5468a3439a 100644 --- a/dev-tools/omdb/Cargo.toml +++ b/dev-tools/omdb/Cargo.toml @@ -94,6 +94,10 @@ update-engine.workspace = true url.workspace = true uuid.workspace = true +[target.'cfg(target_os = "illumos")'.dependencies] +# See omicron-rpaths for more about the "fmd-adm-sys" dependency. +fmd-adm-sys.workspace = true + [dev-dependencies] camino-tempfile.workspace = true expectorate.workspace = true diff --git a/dev-tools/omicron-dev/Cargo.toml b/dev-tools/omicron-dev/Cargo.toml index 46c44e484ad..a21ed7e90a6 100644 --- a/dev-tools/omicron-dev/Cargo.toml +++ b/dev-tools/omicron-dev/Cargo.toml @@ -31,6 +31,10 @@ signal-hook-tokio.workspace = true tokio.workspace = true toml.workspace = true +[target.'cfg(target_os = "illumos")'.dependencies] +# See omicron-rpaths for more about the "fmd-adm-sys" dependency. 
+fmd-adm-sys.workspace = true + [dev-dependencies] expectorate.workspace = true omicron-dev-lib.workspace = true diff --git a/end-to-end-tests/Cargo.toml b/end-to-end-tests/Cargo.toml index 630be83c57c..cc6ee0c30a4 100644 --- a/end-to-end-tests/Cargo.toml +++ b/end-to-end-tests/Cargo.toml @@ -3,10 +3,14 @@ name = "end-to-end-tests" version = "0.1.0" edition.workspace = true license = "MPL-2.0" +build = "build.rs" [lints] workspace = true +[build-dependencies] +omicron-rpaths.workspace = true + [dependencies] anstyle.workspace = true anyhow = { workspace = true, features = ["backtrace"] } @@ -50,3 +54,7 @@ thiserror.workspace = true tokio = { workspace = true, features = ["macros", "rt-multi-thread"] } toml.workspace = true uuid.workspace = true + +[target.'cfg(target_os = "illumos")'.dependencies] +# See omicron-rpaths for more about the "fmd-adm-sys" dependency. +fmd-adm-sys.workspace = true diff --git a/end-to-end-tests/build.rs b/end-to-end-tests/build.rs new file mode 100644 index 00000000000..1ba9acd41c9 --- /dev/null +++ b/end-to-end-tests/build.rs @@ -0,0 +1,10 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +// See omicron-rpaths for documentation. +// NOTE: This file MUST be kept in sync with the other build.rs files in this +// repository. +fn main() { + omicron_rpaths::configure_default_omicron_rpaths(); +} diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml index 060dcabdd93..f70261a832e 100644 --- a/nexus/Cargo.toml +++ b/nexus/Cargo.toml @@ -164,6 +164,10 @@ omicron-workspace-hack.workspace = true omicron-uuid-kinds.workspace = true zip = { workspace = true, features = ["jiff-02"] } +[target.'cfg(target_os = "illumos")'.dependencies] +# See omicron-rpaths for more about the "fmd-adm-sys" dependency. 
+fmd-adm-sys.workspace = true + [dev-dependencies] async-bb8-diesel.workspace = true camino-tempfile.workspace = true diff --git a/nexus/db-queries/Cargo.toml b/nexus/db-queries/Cargo.toml index 1009e73ae92..476c2cec224 100644 --- a/nexus/db-queries/Cargo.toml +++ b/nexus/db-queries/Cargo.toml @@ -84,6 +84,10 @@ omicron-workspace-hack.workspace = true # only enabled during tests or via the `testing` feature omicron-test-utils = { workspace = true, optional = true } +[target.'cfg(target_os = "illumos")'.dependencies] +# See omicron-rpaths for more about the "fmd-adm-sys" dependency. +fmd-adm-sys.workspace = true + [features] # Enable to export `TestDatabase` testing = ["omicron-test-utils"] diff --git a/nexus/db-queries/src/db/datastore/physical_disk.rs b/nexus/db-queries/src/db/datastore/physical_disk.rs index e9958d18498..78d73c34b02 100644 --- a/nexus/db-queries/src/db/datastore/physical_disk.rs +++ b/nexus/db-queries/src/db/datastore/physical_disk.rs @@ -711,6 +711,7 @@ mod test { smf_services_enabled_not_online: SvcsEnabledNotOnlineResult::DataUnavailable, reference_measurements: IdOrdMap::new(), + fmd: None, }, ) .unwrap(); diff --git a/nexus/inventory/Cargo.toml b/nexus/inventory/Cargo.toml index bb00753173d..19ac5173231 100644 --- a/nexus/inventory/Cargo.toml +++ b/nexus/inventory/Cargo.toml @@ -3,10 +3,14 @@ name = "nexus-inventory" version = "0.1.0" edition.workspace = true license = "MPL-2.0" +build = "build.rs" [lints] workspace = true +[build-dependencies] +omicron-rpaths.workspace = true + [dependencies] anyhow.workspace = true base64.workspace = true @@ -46,6 +50,10 @@ uuid.workspace = true omicron-cockroach-metrics.workspace = true omicron-workspace-hack.workspace = true +[target.'cfg(target_os = "illumos")'.dependencies] +# See omicron-rpaths for more about the "fmd-adm-sys" dependency. 
+fmd-adm-sys.workspace = true + [dev-dependencies] expectorate.workspace = true gateway-test-utils.workspace = true diff --git a/nexus/inventory/build.rs b/nexus/inventory/build.rs new file mode 100644 index 00000000000..1ba9acd41c9 --- /dev/null +++ b/nexus/inventory/build.rs @@ -0,0 +1,10 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +// See omicron-rpaths for documentation. +// NOTE: This file MUST be kept in sync with the other build.rs files in this +// repository. +fn main() { + omicron_rpaths::configure_default_omicron_rpaths(); +} diff --git a/nexus/inventory/src/examples.rs b/nexus/inventory/src/examples.rs index 289ffc99eb4..2ad9df01c89 100644 --- a/nexus/inventory/src/examples.rs +++ b/nexus/inventory/src/examples.rs @@ -1115,5 +1115,6 @@ pub fn sled_agent( file_source_resolver, smf_services_enabled_not_online, reference_measurements, + fmd: None, } } diff --git a/nexus/metrics-producer-gc/Cargo.toml b/nexus/metrics-producer-gc/Cargo.toml index b211ea685d4..5e6519d521c 100644 --- a/nexus/metrics-producer-gc/Cargo.toml +++ b/nexus/metrics-producer-gc/Cargo.toml @@ -27,6 +27,10 @@ pq-sys = "*" omicron-workspace-hack.workspace = true +[target.'cfg(target_os = "illumos")'.dependencies] +# See omicron-rpaths for more about the "fmd-adm-sys" dependency. 
+fmd-adm-sys.workspace = true + [dev-dependencies] async-bb8-diesel.workspace = true diesel.workspace = true diff --git a/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs b/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs index 70db49d37bb..f8a2e5d1852 100644 --- a/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs +++ b/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs @@ -376,6 +376,7 @@ mod api_impl { remove_mupdate_override: None, boot_partitions, }), + fmd: None, file_source_resolver: OmicronFileSourceResolverInventory { zone_manifest: ManifestInventory { boot_disk_path: Utf8PathBuf::new(), diff --git a/nexus/reconfigurator/cli-integration-tests/Cargo.toml b/nexus/reconfigurator/cli-integration-tests/Cargo.toml index 960cf94c953..6ce589023c3 100644 --- a/nexus/reconfigurator/cli-integration-tests/Cargo.toml +++ b/nexus/reconfigurator/cli-integration-tests/Cargo.toml @@ -19,6 +19,10 @@ omicron-workspace-hack.workspace = true pq-sys = "*" reconfigurator-cli.workspace = true +[target.'cfg(target_os = "illumos")'.dependencies] +# See omicron-rpaths for more about the "fmd-adm-sys" dependency. +fmd-adm-sys.workspace = true + [dev-dependencies] camino.workspace = true camino-tempfile.workspace = true diff --git a/nexus/reconfigurator/execution/Cargo.toml b/nexus/reconfigurator/execution/Cargo.toml index 30339ca61b4..430e4664b9a 100644 --- a/nexus/reconfigurator/execution/Cargo.toml +++ b/nexus/reconfigurator/execution/Cargo.toml @@ -49,6 +49,10 @@ pq-sys = "*" omicron-workspace-hack.workspace = true +[target.'cfg(target_os = "illumos")'.dependencies] +# See omicron-rpaths for more about the "fmd-adm-sys" dependency. 
+fmd-adm-sys.workspace = true + [dev-dependencies] async-bb8-diesel.workspace = true diesel.workspace = true diff --git a/nexus/reconfigurator/planning/src/mgs_updates/test_helpers.rs b/nexus/reconfigurator/planning/src/mgs_updates/test_helpers.rs index 02664e598d4..ca88902e431 100644 --- a/nexus/reconfigurator/planning/src/mgs_updates/test_helpers.rs +++ b/nexus/reconfigurator/planning/src/mgs_updates/test_helpers.rs @@ -1374,6 +1374,7 @@ impl<'a> TestBoardCollectionBuilder<'a> { smf_services_enabled_not_online: SvcsEnabledNotOnlineResult::DataUnavailable, reference_measurements: IdOrdMap::new(), + fmd: None, }, ) .unwrap(); diff --git a/nexus/reconfigurator/planning/src/system.rs b/nexus/reconfigurator/planning/src/system.rs index 163a87e80e1..b5481107507 100644 --- a/nexus/reconfigurator/planning/src/system.rs +++ b/nexus/reconfigurator/planning/src/system.rs @@ -1505,6 +1505,7 @@ impl Sled { smf_services_enabled_not_online: SvcsEnabledNotOnlineResult::DataUnavailable, reference_measurements: iddqd::IdOrdMap::new(), + fmd: None, } }; @@ -1689,6 +1690,7 @@ impl Sled { reference_measurements: inv_sled_agent .reference_measurements .clone(), + fmd: None, }; Sled { diff --git a/nexus/saga-recovery/Cargo.toml b/nexus/saga-recovery/Cargo.toml index 7154496580e..978407c111f 100644 --- a/nexus/saga-recovery/Cargo.toml +++ b/nexus/saga-recovery/Cargo.toml @@ -26,6 +26,10 @@ tokio.workspace = true omicron-workspace-hack.workspace = true +[target.'cfg(target_os = "illumos")'.dependencies] +# See omicron-rpaths for more about the "fmd-adm-sys" dependency. 
+fmd-adm-sys.workspace = true + [dev-dependencies] nexus-auth.workspace = true nexus-db-queries.workspace = true diff --git a/nexus/test-utils/Cargo.toml b/nexus/test-utils/Cargo.toml index 6382048b13e..4cd399aa910 100644 --- a/nexus/test-utils/Cargo.toml +++ b/nexus/test-utils/Cargo.toml @@ -66,5 +66,9 @@ tokio-util.workspace = true transient-dns-server.workspace = true uuid.workspace = true +[target.'cfg(target_os = "illumos")'.dependencies] +# See omicron-rpaths for more about the "fmd-adm-sys" dependency. +fmd-adm-sys.workspace = true + [features] omicron-dev = ["omicron-test-utils/seed-gen"] diff --git a/ntp-admin/Cargo.toml b/ntp-admin/Cargo.toml index 3d0a4aa3bd7..33545bb3790 100644 --- a/ntp-admin/Cargo.toml +++ b/ntp-admin/Cargo.toml @@ -35,6 +35,10 @@ toml.workspace = true omicron-workspace-hack.workspace = true +[target.'cfg(target_os = "illumos")'.dependencies] +# See omicron-rpaths for more about the "fmd-adm-sys" dependency. +fmd-adm-sys.workspace = true + [dev-dependencies] expectorate.workspace = true nexus-test-utils.workspace = true diff --git a/openapi/sled-agent/sled-agent-34.0.0-37fbac.json.gitstub b/openapi/sled-agent/sled-agent-34.0.0-37fbac.json.gitstub new file mode 100644 index 00000000000..0717447fb6e --- /dev/null +++ b/openapi/sled-agent/sled-agent-34.0.0-37fbac.json.gitstub @@ -0,0 +1 @@ +b4b3b160749848f735fe0c8956cac311d83fe154:openapi/sled-agent/sled-agent-34.0.0-37fbac.json diff --git a/openapi/sled-agent/sled-agent-34.0.0-37fbac.json b/openapi/sled-agent/sled-agent-35.0.0-d9a875.json similarity index 98% rename from openapi/sled-agent/sled-agent-34.0.0-37fbac.json rename to openapi/sled-agent/sled-agent-35.0.0-d9a875.json index 2e18a91f6d6..a7f75102be9 100644 --- a/openapi/sled-agent/sled-agent-34.0.0-37fbac.json +++ b/openapi/sled-agent/sled-agent-35.0.0-d9a875.json @@ -7,7 +7,7 @@ "url": "https://oxide.computer", "email": "api@oxide.computer" }, - "version": "34.0.0" + "version": "35.0.0" }, "paths": { "/artifacts": { @@ -5366,6 
+5366,143 @@ ], "additionalProperties": false }, + "FmdCase": { + "description": "A diagnosed fault case from the illumos Fault Management Daemon.", + "type": "object", + "properties": { + "code": { + "description": "Diagnostic code (e.g. \"PCIEX-8000-DJ\").", + "type": "string" + }, + "event": { + "nullable": true, + "description": "Full fault event payload as JSON, if present. Contains the fault-list with classes, certainties, affected FMRIs, and other diagnostic detail." + }, + "url": { + "description": "URL for human-readable information about this fault (e.g. \"http://illumos.org/msg/PCIEX-8000-DJ\").", + "type": "string" + }, + "uuid": { + "description": "Unique identifier for this case.", + "type": "string", + "format": "uuid" + } + }, + "required": [ + "code", + "url", + "uuid" + ] + }, + "FmdInventory": { + "description": "Result of querying FMD for fault information.", + "oneOf": [ + { + "description": "FMD data was successfully collected.", + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "available" + ] + }, + "value": { + "type": "object", + "properties": { + "cases": { + "type": "array", + "items": { + "$ref": "#/components/schemas/FmdCase" + } + }, + "resources": { + "type": "array", + "items": { + "$ref": "#/components/schemas/FmdResource" + } + } + }, + "required": [ + "cases", + "resources" + ] + } + }, + "required": [ + "type", + "value" + ] + }, + { + "description": "FMD data collection failed.", + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "error" + ] + }, + "value": { + "type": "object", + "properties": { + "error": { + "type": "string" + } + }, + "required": [ + "error" + ] + } + }, + "required": [ + "type", + "value" + ] + } + ] + }, + "FmdResource": { + "description": "A resource affected by a diagnosed fault.", + "type": "object", + "properties": { + "case_id": { + "description": "UUID of the case that diagnosed this fault.", + "type": "string", + "format": "uuid" + 
}, + "faulty": { + "description": "Whether the resource is marked faulty.", + "type": "boolean" + }, + "fmri": { + "description": "Fault Management Resource Identifier (e.g. \"dev:////pci@af,0/pci1022,1483@3,5\").", + "type": "string" + }, + "invisible": { + "description": "Whether the resource is marked invisible.", + "type": "boolean" + }, + "unusable": { + "description": "Whether the resource is marked unusable.", + "type": "boolean" + }, + "uuid": { + "description": "Unique identifier for this resource entry.", + "type": "string", + "format": "uuid" + } + }, + "required": [ + "case_id", + "faulty", + "fmri", + "invisible", + "unusable", + "uuid" + ] + }, "Generation": { "description": "Generation numbers stored in the database, used for optimistic concurrency control", "type": "integer", @@ -6002,6 +6139,14 @@ "file_source_resolver": { "$ref": "#/components/schemas/OmicronFileSourceResolverInventory" }, + "fmd": { + "nullable": true, + "allOf": [ + { + "$ref": "#/components/schemas/FmdInventory" + } + ] + }, "last_reconciliation": { "nullable": true, "allOf": [ diff --git a/openapi/sled-agent/sled-agent-latest.json b/openapi/sled-agent/sled-agent-latest.json index c247c14b384..0869bf61a7d 120000 --- a/openapi/sled-agent/sled-agent-latest.json +++ b/openapi/sled-agent/sled-agent-latest.json @@ -1 +1 @@ -sled-agent-34.0.0-37fbac.json \ No newline at end of file +sled-agent-35.0.0-d9a875.json \ No newline at end of file diff --git a/rpaths/src/lib.rs b/rpaths/src/lib.rs index 381b0e370e2..94719d6c7d1 100644 --- a/rpaths/src/lib.rs +++ b/rpaths/src/lib.rs @@ -7,30 +7,40 @@ //! ## The least you need to know //! //! This build-time crate is used by several top-level Omicron crates to set -//! RPATH so that libpq can be found at runtime. This is necessary because these -//! crates depend on "diesel", which depends on "pq-sys", which links in "libpq". -//! But Cargo/Rust have no built-in way to set the RPATH so that libpq can -//! actually be found at runtime. 
(See below.) So we've developed the pattern -//! here instead. It works like this: +//! RPATH so that native libraries linked via *-sys crates can be found at +//! runtime. Currently we do this for two libraries: //! -//! 1. Any crate that depends on pq-sys, directly or not, needs to follow these -//! instructions. Generally, we depend on pq-sys _indirectly_, by virtue of -//! depending on Diesel. +//! - **libpq** (via pq-sys, pulled in by diesel) +//! - **libfmd_adm** (via fmd-adm-sys, pulled in by fmd-adm in sled-agent) +//! +//! Cargo/Rust have no built-in way to set the RPATH for a transitively-linked +//! native library. (See below.) So we've developed the pattern here instead. +//! It works like this: +//! +//! 1. Any crate that depends (directly or transitively) on a -sys crate from +//! the list above needs to follow these instructions. Often the dep is +//! indirect — pq-sys arrives via diesel, fmd-adm-sys via omicron-sled-agent. //! 2. Affected crates (e.g., omicron-nexus) have a build.rs that just calls //! `omicron_rpath::configure_default_omicron_rpaths()`. -//! 3. These crates must also add a dependency on "pq-sys", usually version "*". -//! (This dependency is unfortunate but necessary in order for us to get the -//! metadata emitted by pq-sys that tells it where it found libpq. Since we -//! don't directly use pq-sys in the crate, we don't care what version it is. -//! We specify "*" so that when Cargo dedups our dependency with the one in -//! Diesel, we pick up whatever would be picked up anyway, and we'll get its -//! metadata.) -//! 4. At the top level of Omicron (in the workspace Cargo.toml), we use a -//! patched version of pq-sys that emits metadata that's used by -//! `configure_default_omicron_rpaths()`. +//! 3. These crates must also add a *direct* dependency on the corresponding +//! -sys crate(s), usually version "*". This is unfortunate but necessary +//! so that Cargo exposes the `DEP_*_LIBDIRS` env var to our build.rs. +//! 
Since we don't actually use the -sys crate in the parent crate, we +//! don't care what version it is, and "*" lets Cargo dedup with whatever +//! the transitive dep already pulled in. Use a target-gated dep +//! (`[target.'cfg(target_os = "illumos")'.dependencies]`) when the +//! library only exists on illumos (e.g. fmd-adm-sys). +//! 4. The metadata that drives this comes from the -sys crate's build.rs: +//! - For pq-sys, we maintain a fork (see `[patch.crates-io.pq-sys]` +//! in the workspace Cargo.toml) that emits `cargo:LIBDIRS=...`. +//! - For fmd-adm-sys, the upstream crate emits the metadata directly, +//! so no patch is needed. //! -//! This crate is factored (over-engineered, really) so that we can extend this -//! pattern to other native libraries in the future. +//! `configure_default_omicron_rpaths()` scans for every `DEP_*_LIBDIRS` env +//! var in `RPATH_ENV_VARS`. Each crate's build.rs makes the same call — +//! only the env vars Cargo actually sets (corresponding to that crate's +//! direct deps) contribute RPATH entries, so callers don't have to know +//! which libraries they pull in. //! //! ## More details //! @@ -57,9 +67,10 @@ //! to include RPATH entries in the binary instead. //! //! As of 1.56, Cargo supports the "cargo:rustc-link-arg" instruction for use by -//! [Build Scripts][3] to pass arbitrary options to the linker. We use that here -//! to tell the linker to include the correct RPATH entry for our one native -//! dependency that's affected by this (libpq, exposed via the pq-sys package). +//! [Build Scripts][3] to pass arbitrary options to the linker. We use that +//! here to tell the linker to include the correct RPATH entries for the +//! native dependencies affected by this (currently libpq via pq-sys and +//! libfmd_adm via fmd-adm-sys). //! //! A subtle but critical point here is that the RPATH is knowable only by the //! system that's building the top-level executable binary. 
This mechanism can't @@ -94,10 +105,11 @@ /// Tells Cargo to pass linker arguments that specify the right RPATH for Omicron /// binaries -// This currently assumes that all Omicron binaries link to the same set of -// native libraries. As a result, we use a fixed list of libraries. In the -// future, if they depend on different combinations, we can accept different -// arguments here that specify exactly which ones are expected to be found. +// +// We scan a fixed set of `DEP_*_LIBDIRS` env vars (see `RPATH_ENV_VARS`). +// Only those that are actually set contribute to the RPATH — so each crate +// just calls this once, and only the libraries that crate actually depends +// on get configured. No per-caller customization needed. pub fn configure_default_omicron_rpaths() { internal::configure_default_omicron_rpaths(); // If no 'rerun-if-*' directives are emitted, cargo conservatively [1] @@ -145,39 +157,27 @@ mod internal { /// variables may itself look like a path, not just a directory. That is, /// these are colon-separated lists of directories. /// - /// Currently, we only do this for libpq ("pq-sys" package), but this pattern - /// could be generalized for other native libraries. - pub static RPATH_ENV_VARS: &'static [&'static str] = &["DEP_PQ_LIBDIRS"]; + /// We scan all of these on every build.rs call. Only env vars that are + /// actually set contribute RPATH entries — a crate that doesn't depend on + /// (say) fmd-adm-sys simply won't have `DEP_FMD_ADM_LIBDIRS` set, and we + /// skip it silently. + pub static RPATH_ENV_VARS: &'static [&'static str] = + &["DEP_PQ_LIBDIRS", "DEP_FMD_ADM_LIBDIRS"]; /// Tells Cargo to pass linker arguments that specify RPATHs from the - /// environment variable `env_var_name` + /// environment variable `env_var_name`, if it is set. /// - /// Panics if the environment variable is not set or contains non-UTF8 data. - /// This might be surprising, since environment variables are optional in - /// most build-time mechanisms. 
We opt for strictness here because in fact - /// we _do_ expect these to always be set, and if they're not, it's most - /// likely that somebody has forgotten to include a required dependency. We - /// want to tell them that rather than silently produce unrunnable binaries. + /// If the env var is unset, this does nothing: the corresponding + /// dependency is not present in the current crate's dep tree. If the + /// crate *does* need that library at runtime, the missing dep will + /// surface as a build-time or runtime link failure later. pub fn configure_rpaths_from_env_var( rpaths: &mut Vec, env_var_name: &OsStr, ) { - // If you see this message, that means that the build script for some - // Omicron crate is trying to configure RPATHs for a native library, but - // the environment variable that's supposed to contain the RPATH - // information for that library is unset. That most likely means that - // the crate you're building is lacking a direct dependency on the - // '*-sys' crate, or else that the '*-sys' crate's build script failed - // to set this metadata. - let env_var_value = - std::env::var_os(env_var_name).unwrap_or_else(|| { - panic!( - "omicron-rpaths: expected {:?} to be set in the \ - environment, but found it unset. (Is the current \ - crate missing a dependency on a *-sys crate?)", - env_var_name, - ) - }); + let Some(env_var_value) = std::env::var_os(env_var_name) else { + return; + }; configure_rpaths_from_path(rpaths, &env_var_value).unwrap_or_else( |error| { @@ -214,16 +214,17 @@ mod internal { use std::os::unix::ffi::OsStrExt; #[test] - #[should_panic = "omicron-rpaths: expected \"SHOULD_NOT_EXIST\" \ - to be set in the environment, but found it unset"] - fn test_configure_rpaths_from_bad_envvar() { + fn test_configure_rpaths_from_unset_envvar() { use super::configure_rpaths_from_env_var; + // Unset env vars are silently ignored: the dependency that + // would set them simply isn't in this crate's dep tree. 
let mut v = Vec::new(); configure_rpaths_from_env_var( &mut v, &OsString::from("SHOULD_NOT_EXIST"), ); + assert!(v.is_empty()); } #[test] diff --git a/sled-agent/Cargo.toml b/sled-agent/Cargo.toml index 5f36581aa8a..38f50956bb2 100644 --- a/sled-agent/Cargo.toml +++ b/sled-agent/Cargo.toml @@ -4,10 +4,14 @@ description = "Services for managing sled-local resources" version = "0.1.0" edition.workspace = true license = "MPL-2.0" +build = "build.rs" [lints] workspace = true +[build-dependencies] +omicron-rpaths.workspace = true + [dependencies] anyhow.workspace = true async-trait.workspace = true @@ -137,6 +141,10 @@ zip.workspace = true zone.workspace = true [target.'cfg(target_os = "illumos")'.dependencies] +fmd-adm.workspace = true +# Direct dep on fmd-adm-sys so this crate sees DEP_FMD_ADM_LIBDIRS in build.rs +# and can configure RPATH via omicron-rpaths. +fmd-adm-sys.workspace = true opte-ioctl.workspace = true [dev-dependencies] diff --git a/sled-agent/api/src/lib.rs b/sled-agent/api/src/lib.rs index 98df41cf6c1..49469b5dda3 100644 --- a/sled-agent/api/src/lib.rs +++ b/sled-agent/api/src/lib.rs @@ -21,7 +21,7 @@ use omicron_common::api::internal::{ }; use sled_agent_types_versions::{ latest, v1, v4, v6, v7, v9, v10, v11, v12, v14, v16, v17, v18, v20, v22, - v24, v25, v26, v28, v29, v30, v31, v33, + v24, v25, v26, v28, v29, v30, v31, v33, v34, }; use sled_diagnostics::SledDiagnosticsQueryOutput; use slog_error_chain::InlineErrorChain; @@ -38,6 +38,7 @@ api_versions!([ // | example for the next person. 
// v // (next_int, IDENT), + (35, ADD_FMD_TO_INVENTORY), (34, MODIFY_SVCS_TYPES), (33, BOOTSTORE_SERVICE_NAT), (32, MAKE_ALL_EXTERNAL_IP_FIELDS_OPTIONAL), @@ -1019,12 +1020,27 @@ pub trait SledAgentApi { #[endpoint { method = GET, path = "/inventory", - versions = VERSION_MODIFY_SVCS_TYPES.., + versions = VERSION_ADD_FMD_TO_INVENTORY.., }] async fn inventory( rqctx: RequestContext, ) -> Result, HttpError>; + /// Fetch basic information about this sled + #[endpoint { + operation_id = "inventory", + method = GET, + path = "/inventory", + versions = VERSION_MODIFY_SVCS_TYPES..VERSION_ADD_FMD_TO_INVENTORY, + }] + async fn inventory_v34( + rqctx: RequestContext, + ) -> Result, HttpError> { + Self::inventory(rqctx).await.map(|HttpResponseOk(inv)| { + HttpResponseOk(v34::inventory::Inventory::from(inv)) + }) + } + /// Fetch basic information about this sled #[endpoint { operation_id = "inventory", @@ -1035,7 +1051,7 @@ pub trait SledAgentApi { async fn inventory_v28( rqctx: RequestContext, ) -> Result, HttpError> { - Self::inventory(rqctx).await.map(|HttpResponseOk(inv)| { + Self::inventory_v34(rqctx).await.map(|HttpResponseOk(inv)| { HttpResponseOk(v28::inventory::Inventory::from(inv)) }) } diff --git a/sled-agent/build.rs b/sled-agent/build.rs new file mode 100644 index 00000000000..1ba9acd41c9 --- /dev/null +++ b/sled-agent/build.rs @@ -0,0 +1,10 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +// See omicron-rpaths for documentation. +// NOTE: This file MUST be kept in sync with the other build.rs files in this +// repository. 
+fn main() { + omicron_rpaths::configure_default_omicron_rpaths(); +} diff --git a/sled-agent/src/fmd.rs b/sled-agent/src/fmd.rs new file mode 100644 index 00000000000..8f328dc4086 --- /dev/null +++ b/sled-agent/src/fmd.rs @@ -0,0 +1,246 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Collects fault information from the illumos Fault Management Daemon (FMD). + +use sled_agent_types::inventory::FmdInventory; + +#[cfg(target_os = "illumos")] +mod illumos { + use fmd_adm::{FmdAdm, NvList, NvValue}; + use sled_agent_types::inventory::{FmdCase, FmdInventory, FmdResource}; + + pub(super) fn nvvalue_to_json(value: &NvValue) -> serde_json::Value { + match value { + NvValue::Boolean => serde_json::Value::Bool(true), + NvValue::BooleanValue(b) => serde_json::Value::Bool(*b), + NvValue::Byte(n) => serde_json::json!(*n), + NvValue::Int8(n) => serde_json::json!(*n), + NvValue::UInt8(n) => serde_json::json!(*n), + NvValue::Int16(n) => serde_json::json!(*n), + NvValue::UInt16(n) => serde_json::json!(*n), + NvValue::Int32(n) => serde_json::json!(*n), + NvValue::UInt32(n) => serde_json::json!(*n), + NvValue::Int64(n) => serde_json::json!(*n), + NvValue::UInt64(n) => serde_json::json!(*n), + NvValue::Double(f) => serde_json::json!(*f), + NvValue::String(s) => serde_json::Value::String(s.clone()), + NvValue::Hrtime(n) => serde_json::json!(*n), + NvValue::NvList(nvl) => nvlist_to_json(nvl), + NvValue::BooleanArray(arr) => serde_json::json!(arr), + NvValue::ByteArray(arr) => serde_json::json!(arr), + NvValue::Int8Array(arr) => serde_json::json!(arr), + NvValue::UInt8Array(arr) => serde_json::json!(arr), + NvValue::Int16Array(arr) => serde_json::json!(arr), + NvValue::UInt16Array(arr) => serde_json::json!(arr), + NvValue::Int32Array(arr) => serde_json::json!(arr), + NvValue::UInt32Array(arr) => serde_json::json!(arr), + 
NvValue::Int64Array(arr) => serde_json::json!(arr), + NvValue::UInt64Array(arr) => serde_json::json!(arr), + NvValue::StringArray(arr) => serde_json::json!(arr), + NvValue::NvListArray(arr) => { + let items: Vec = + arr.iter().map(nvlist_to_json).collect(); + serde_json::Value::Array(items) + } + NvValue::Unknown { type_code } => { + serde_json::json!({ + "_unknown_type": format!("{type_code:?}") + }) + } + } + } + + pub(super) fn nvlist_to_json(nvl: &NvList) -> serde_json::Value { + let mut map = serde_json::Map::new(); + for (name, value) in nvl { + map.insert(name.to_string(), nvvalue_to_json(value)); + } + serde_json::Value::Object(map) + } + + pub(super) fn collect() -> FmdInventory { + let adm = match FmdAdm::open() { + Ok(adm) => adm, + Err(e) => { + return FmdInventory::Error { + error: format!("failed to open fmd: {e}"), + }; + } + }; + + let cases = match adm.cases(None) { + Ok(cases) => cases + .into_iter() + .map(|c| FmdCase { + uuid: c.uuid, + code: c.code, + url: c.url, + event: c.event.as_ref().map(nvlist_to_json), + }) + .collect(), + Err(e) => { + return FmdInventory::Error { + error: format!("failed to list fmd cases: {e}"), + }; + } + }; + + let resources = match adm.resources(true) { + Ok(resources) => resources + .into_iter() + .map(|r| FmdResource { + fmri: r.fmri, + uuid: r.uuid, + case_id: r.case, + faulty: r.faulty, + unusable: r.unusable, + invisible: r.invisible, + }) + .collect(), + Err(e) => { + return FmdInventory::Error { + error: format!("failed to list fmd resources: {e}"), + }; + } + }; + + FmdInventory::Available { cases, resources } + } +} + +pub(crate) fn collect_fmd_inventory() -> Option { + #[cfg(target_os = "illumos")] + { + Some(illumos::collect()) + } + #[cfg(not(target_os = "illumos"))] + { + None + } +} + +#[cfg(test)] +#[cfg(target_os = "illumos")] +mod tests { + use super::illumos::nvvalue_to_json; + use fmd_adm::NvValue; + + #[test] + fn boolean_presence() { + assert_eq!(nvvalue_to_json(&NvValue::Boolean), 
serde_json::json!(true)); + } + + #[test] + fn boolean_value() { + assert_eq!( + nvvalue_to_json(&NvValue::BooleanValue(false)), + serde_json::json!(false), + ); + assert_eq!( + nvvalue_to_json(&NvValue::BooleanValue(true)), + serde_json::json!(true), + ); + } + + #[test] + fn integers() { + assert_eq!(nvvalue_to_json(&NvValue::Byte(42)), serde_json::json!(42)); + assert_eq!(nvvalue_to_json(&NvValue::Int8(-1)), serde_json::json!(-1)); + assert_eq!( + nvvalue_to_json(&NvValue::UInt8(255)), + serde_json::json!(255) + ); + assert_eq!( + nvvalue_to_json(&NvValue::Int16(-32000)), + serde_json::json!(-32000), + ); + assert_eq!( + nvvalue_to_json(&NvValue::UInt16(65535)), + serde_json::json!(65535), + ); + assert_eq!( + nvvalue_to_json(&NvValue::Int32(-100_000)), + serde_json::json!(-100_000), + ); + assert_eq!( + nvvalue_to_json(&NvValue::UInt32(4_000_000_000)), + serde_json::json!(4_000_000_000u64), + ); + assert_eq!( + nvvalue_to_json(&NvValue::Int64(i64::MIN)), + serde_json::json!(i64::MIN), + ); + assert_eq!( + nvvalue_to_json(&NvValue::UInt64(u64::MAX)), + serde_json::json!(u64::MAX), + ); + } + + #[test] + fn double() { + assert_eq!( + nvvalue_to_json(&NvValue::Double(4.2069)), + serde_json::json!(4.2069), + ); + } + + #[test] + fn string() { + assert_eq!( + nvvalue_to_json(&NvValue::String("hello".to_string())), + serde_json::json!("hello"), + ); + } + + #[test] + fn hrtime() { + assert_eq!( + nvvalue_to_json(&NvValue::Hrtime(1_000_000_000)), + serde_json::json!(1_000_000_000i64), + ); + } + + #[test] + fn integer_arrays() { + assert_eq!( + nvvalue_to_json(&NvValue::Int32Array(vec![1, 2, 3])), + serde_json::json!([1, 2, 3]), + ); + assert_eq!( + nvvalue_to_json(&NvValue::UInt8Array(vec![0, 128, 255])), + serde_json::json!([0, 128, 255]), + ); + } + + #[test] + fn boolean_array() { + assert_eq!( + nvvalue_to_json(&NvValue::BooleanArray(vec![true, false, true])), + serde_json::json!([true, false, true]), + ); + } + + #[test] + fn string_array() { + assert_eq!( + 
nvvalue_to_json(&NvValue::StringArray(vec![ + "a".to_string(), + "b".to_string(), + ])), + serde_json::json!(["a", "b"]), + ); + } + + #[test] + fn unknown_type() { + // The type_code is a data_type_t from the illumos nvpair FFI. + // We just format it via Debug. + let val = NvValue::Unknown { type_code: 0 }; + let json = nvvalue_to_json(&val); + // Should be an object with a single "_unknown_type" key. + assert!(json.is_object()); + assert!(json.get("_unknown_type").unwrap().is_string()); + } +} diff --git a/sled-agent/src/lib.rs b/sled-agent/src/lib.rs index 41d86e86bf9..641ef5c9d4c 100644 --- a/sled-agent/src/lib.rs +++ b/sled-agent/src/lib.rs @@ -20,6 +20,7 @@ mod backing_fs; pub mod bootstrap; pub mod config; mod ddm_reconciler; +mod fmd; pub(crate) mod hardware_monitor; mod http_entrypoints; mod instance; diff --git a/sled-agent/src/rack_setup/plan/service.rs b/sled-agent/src/rack_setup/plan/service.rs index f5b51b91b82..601e0ca0016 100644 --- a/sled-agent/src/rack_setup/plan/service.rs +++ b/sled-agent/src/rack_setup/plan/service.rs @@ -1530,6 +1530,7 @@ mod tests { smf_services_enabled_not_online: SvcsEnabledNotOnlineResult::DataUnavailable, reference_measurements: IdOrdMap::new(), + fmd: None, }, is_scrimlet, ) diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index fff8ed7247c..be010ea526b 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -1947,6 +1947,7 @@ mod test { smf_services_enabled_not_online: SvcsEnabledNotOnlineResult::DataUnavailable, reference_measurements: IdOrdMap::new(), + fmd: None, }, true, ) diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index 4b5b5e7441b..b91462a6bbe 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -995,6 +995,7 @@ impl SledAgent { ), smf_services_enabled_not_online, reference_measurements, + fmd: None, }) } diff --git a/sled-agent/src/sled_agent.rs 
b/sled-agent/src/sled_agent.rs index 11271a514b6..d1859d4fe54 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -1273,6 +1273,8 @@ impl SledAgent { let smf_services_enabled_not_online = self.inner.health_monitor.to_inventory(); + let fmd = crate::fmd::collect_fmd_inventory(); + let ReconcilerInventory { disks, zpools, @@ -1300,6 +1302,7 @@ impl SledAgent { file_source_resolver, smf_services_enabled_not_online, reference_measurements: self.inner.measurements.to_inventory(), + fmd, }) } diff --git a/sled-agent/types/versions/src/add_fmd_to_inventory/inventory.rs b/sled-agent/types/versions/src/add_fmd_to_inventory/inventory.rs new file mode 100644 index 00000000000..98870b80cc2 --- /dev/null +++ b/sled-agent/types/versions/src/add_fmd_to_inventory/inventory.rs @@ -0,0 +1,135 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use iddqd::IdOrdMap; +use omicron_common::api::external::ByteCount; +use omicron_uuid_kinds::SledUuid; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; +use sled_hardware_types::{Baseboard, SledCpuFamily}; +use std::net::SocketAddrV6; +use uuid::Uuid; + +use crate::v1::inventory::InventoryDataset; +use crate::v1::inventory::InventoryDisk; +use crate::v1::inventory::SledRole; +use crate::v14::inventory::ConfigReconcilerInventoryStatus; +use crate::v14::inventory::OmicronFileSourceResolverInventory; +use crate::v14::inventory::OmicronSledConfig; +use crate::v16::inventory::ConfigReconcilerInventory; +use crate::v16::inventory::SingleMeasurementInventory; +use crate::v24::inventory::InventoryZpool; +use crate::v34; + +/// A diagnosed fault case from the illumos Fault Management Daemon. +#[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize, JsonSchema)] +pub struct FmdCase { + /// Unique identifier for this case. 
+ pub uuid: Uuid, + /// Diagnostic code (e.g. "PCIEX-8000-DJ"). + pub code: String, + /// URL for human-readable information about this fault + /// (e.g. `http://illumos.org/msg/PCIEX-8000-DJ`). + pub url: String, + /// Full fault event payload as JSON, if present. Contains the + /// fault-list with classes, certainties, affected FMRIs, and other + /// diagnostic detail. + pub event: Option, +} + +/// A resource affected by a diagnosed fault. +#[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize, JsonSchema)] +pub struct FmdResource { + /// Fault Management Resource Identifier + /// (e.g. "dev:////pci@af,0/pci1022,1483@3,5"). + pub fmri: String, + /// Unique identifier for this resource entry. + pub uuid: Uuid, + /// UUID of the case that diagnosed this fault. + pub case_id: Uuid, + /// Whether the resource is marked faulty. + pub faulty: bool, + /// Whether the resource is marked unusable. + pub unusable: bool, + /// Whether the resource is marked invisible. + pub invisible: bool, +} + +/// Result of querying FMD for fault information. +#[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize, JsonSchema)] +#[serde(tag = "type", content = "value", rename_all = "snake_case")] +pub enum FmdInventory { + /// FMD data was successfully collected. + Available { cases: Vec, resources: Vec }, + /// FMD data collection failed. 
+ Error { error: String }, +} + +/// Identity and basic status information about this sled agent +#[derive(Clone, Debug, Deserialize, JsonSchema, Serialize)] +pub struct Inventory { + pub sled_id: SledUuid, + pub sled_agent_address: SocketAddrV6, + pub sled_role: SledRole, + pub baseboard: Baseboard, + pub usable_hardware_threads: u32, + pub usable_physical_ram: ByteCount, + pub cpu_family: SledCpuFamily, + pub reservoir_size: ByteCount, + pub disks: Vec, + pub zpools: Vec, + pub datasets: Vec, + pub ledgered_sled_config: Option, + pub reconciler_status: ConfigReconcilerInventoryStatus, + pub last_reconciliation: Option, + pub file_source_resolver: OmicronFileSourceResolverInventory, + pub smf_services_enabled_not_online: + v34::inventory::SvcsEnabledNotOnlineResult, + pub reference_measurements: IdOrdMap, + pub fmd: Option, +} + +impl From for v34::inventory::Inventory { + fn from(value: Inventory) -> Self { + let Inventory { + sled_id, + sled_agent_address, + sled_role, + baseboard, + usable_hardware_threads, + usable_physical_ram, + cpu_family, + reservoir_size, + disks, + zpools, + datasets, + ledgered_sled_config, + reconciler_status, + last_reconciliation, + file_source_resolver, + smf_services_enabled_not_online, + reference_measurements, + fmd: _, + } = value; + Self { + sled_id, + sled_agent_address, + sled_role, + baseboard, + usable_hardware_threads, + usable_physical_ram, + cpu_family, + reservoir_size, + disks, + zpools, + datasets, + ledgered_sled_config, + reconciler_status, + last_reconciliation, + file_source_resolver, + smf_services_enabled_not_online, + reference_measurements, + } + } +} diff --git a/sled-agent/types/versions/src/add_fmd_to_inventory/mod.rs b/sled-agent/types/versions/src/add_fmd_to_inventory/mod.rs new file mode 100644 index 00000000000..ef475fd6e32 --- /dev/null +++ b/sled-agent/types/versions/src/add_fmd_to_inventory/mod.rs @@ -0,0 +1,10 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 
2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Version `ADD_FMD_TO_INVENTORY` of the Sled Agent API. +//! +//! This version adds FMD (Fault Management Daemon) data to the sled inventory +//! response, exposing diagnosed faults and affected resources. + +pub mod inventory; diff --git a/sled-agent/types/versions/src/latest.rs b/sled-agent/types/versions/src/latest.rs index 757f88f19fb..99dc3d08dff 100644 --- a/sled-agent/types/versions/src/latest.rs +++ b/sled-agent/types/versions/src/latest.rs @@ -157,7 +157,6 @@ pub mod inventory { pub use crate::v24::inventory::InventoryZpool; pub use crate::v24::inventory::ZpoolHealth; - pub use crate::v34::inventory::Inventory; pub use crate::v34::inventory::Svc; pub use crate::v34::inventory::SvcEnabledNotOnline; pub use crate::v34::inventory::SvcEnabledNotOnlineState; @@ -165,6 +164,10 @@ pub mod inventory { pub use crate::v34::inventory::SvcsEnabledNotOnline; pub use crate::v34::inventory::SvcsEnabledNotOnlineResult; pub use crate::v34::inventory::SvcsError; + pub use crate::v35::inventory::FmdCase; + pub use crate::v35::inventory::FmdInventory; + pub use crate::v35::inventory::FmdResource; + pub use crate::v35::inventory::Inventory; pub use crate::impls::inventory::ManifestBootInventoryDisplay; pub use crate::impls::inventory::ManifestInventoryDisplay; diff --git a/sled-agent/types/versions/src/lib.rs b/sled-agent/types/versions/src/lib.rs index 69a6e70fcd9..127be764b8a 100644 --- a/sled-agent/types/versions/src/lib.rs +++ b/sled-agent/types/versions/src/lib.rs @@ -79,6 +79,8 @@ pub mod v32; pub mod v33; #[path = "modify_svcs_types/mod.rs"] pub mod v34; +#[path = "add_fmd_to_inventory/mod.rs"] +pub mod v35; #[path = "add_nexus_lockstep_port_to_inventory/mod.rs"] pub mod v4; #[path = "add_probe_put_endpoint/mod.rs"] From ab9399bd7c9cb01df3247dcb4d0439a5a73ce218 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 17 Apr 2026 14:48:15 -0700 Subject: 
[PATCH 02/24] Run FMD collection on a blocking thread The inventory endpoint is async, but FMD queries go through door calls to fmd(1M) that can stall the calling thread. Move the work onto spawn_blocking so it doesn't occupy a Tokio worker; surface any JoinError as FmdInventory::Error. --- sled-agent/src/fmd.rs | 11 +++++++++-- sled-agent/src/sled_agent.rs | 2 +- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/sled-agent/src/fmd.rs b/sled-agent/src/fmd.rs index 8f328dc4086..d7b011fcfa0 100644 --- a/sled-agent/src/fmd.rs +++ b/sled-agent/src/fmd.rs @@ -110,10 +110,17 @@ mod illumos { } } -pub(crate) fn collect_fmd_inventory() -> Option { +pub(crate) async fn collect_fmd_inventory() -> Option { #[cfg(target_os = "illumos")] { - Some(illumos::collect()) + // FMD queries go through door calls to fmd(1M) and can block, so run + // them on a blocking-friendly thread rather than stalling the runtime. + match tokio::task::spawn_blocking(illumos::collect).await { + Ok(inv) => Some(inv), + Err(e) => Some(FmdInventory::Error { + error: format!("fmd collection task failed: {e}"), + }), + } } #[cfg(not(target_os = "illumos"))] { diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index d1859d4fe54..cf07ef98a20 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -1273,7 +1273,7 @@ impl SledAgent { let smf_services_enabled_not_online = self.inner.health_monitor.to_inventory(); - let fmd = crate::fmd::collect_fmd_inventory(); + let fmd = crate::fmd::collect_fmd_inventory().await; let ReconcilerInventory { disks, From 32c5872cc3832d70eadd608d77a1eb9a8aff847c Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 17 Apr 2026 14:50:52 -0700 Subject: [PATCH 03/24] Regenerate sled-agent OpenAPI spec The FmdCase.url docstring now uses backticks instead of quotes around the example URL, which changes the schema description and thus the spec hash. 
--- ...d-agent-35.0.0-d9a875.json => sled-agent-35.0.0-6344c3.json} | 2 +- openapi/sled-agent/sled-agent-latest.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) rename openapi/sled-agent/{sled-agent-35.0.0-d9a875.json => sled-agent-35.0.0-6344c3.json} (99%) diff --git a/openapi/sled-agent/sled-agent-35.0.0-d9a875.json b/openapi/sled-agent/sled-agent-35.0.0-6344c3.json similarity index 99% rename from openapi/sled-agent/sled-agent-35.0.0-d9a875.json rename to openapi/sled-agent/sled-agent-35.0.0-6344c3.json index a7f75102be9..fc4bed8f063 100644 --- a/openapi/sled-agent/sled-agent-35.0.0-d9a875.json +++ b/openapi/sled-agent/sled-agent-35.0.0-6344c3.json @@ -5379,7 +5379,7 @@ "description": "Full fault event payload as JSON, if present. Contains the fault-list with classes, certainties, affected FMRIs, and other diagnostic detail." }, "url": { - "description": "URL for human-readable information about this fault (e.g. \"http://illumos.org/msg/PCIEX-8000-DJ\").", + "description": "URL for human-readable information about this fault (e.g. `http://illumos.org/msg/PCIEX-8000-DJ`).", "type": "string" }, "uuid": { diff --git a/openapi/sled-agent/sled-agent-latest.json b/openapi/sled-agent/sled-agent-latest.json index 0869bf61a7d..e68ac9ca1c3 120000 --- a/openapi/sled-agent/sled-agent-latest.json +++ b/openapi/sled-agent/sled-agent-latest.json @@ -1 +1 @@ -sled-agent-35.0.0-d9a875.json \ No newline at end of file +sled-agent-35.0.0-6344c3.json \ No newline at end of file From 887f61a7e4acbb721e943029590d5503377b9a47 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 17 Apr 2026 14:50:52 -0700 Subject: [PATCH 04/24] xtask: allowlist libfmd_adm.so.1 for sled-agent binaries The verify-libraries xtask checks that binaries don't link against unexpected libraries. Add libfmd_adm.so.1 to the allowlist for the binaries that legitimately need it (sled-agent, sled-agent-sim, and omicron-dev which spins up sled-agent for tests). 
--- .cargo/xtask.toml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.cargo/xtask.toml b/.cargo/xtask.toml index 0a5c6960a0b..76d7b342d94 100644 --- a/.cargo/xtask.toml +++ b/.cargo/xtask.toml @@ -50,3 +50,12 @@ binary_allow_list = [ "sled-agent", "sled-agent-sim", ] + +# libfmd_adm is the illumos Fault Management Daemon admin library, used by +# sled-agent to collect FMD case/resource information for inventory. +[libraries."libfmd_adm.so.1"] +binary_allow_list = [ + "omicron-dev", + "sled-agent", + "sled-agent-sim", +] From 026bd3ffdcbb3430f8cb8299d96fc9047890b471 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 20 Apr 2026 14:46:59 -0700 Subject: [PATCH 05/24] InventoryResult, less optional --- clients/sled-agent-client/src/lib.rs | 1 + .../src/db/datastore/physical_disk.rs | 10 ++-- nexus/inventory/src/examples.rs | 5 +- .../src/test_util/host_phase_2_test_state.rs | 5 +- .../planning/src/mgs_updates/test_helpers.rs | 5 +- nexus/reconfigurator/planning/src/system.rs | 9 +++- ...114.json => sled-agent-36.0.0-b95af0.json} | 52 +++++++++---------- openapi/sled-agent/sled-agent-latest.json | 2 +- sled-agent/src/fmd.rs | 28 +++++----- sled-agent/src/rack_setup/plan/service.rs | 5 +- sled-agent/src/rack_setup/service.rs | 11 ++-- sled-agent/src/sim/sled_agent.rs | 12 +++-- .../src/add_fmd_to_inventory/inventory.rs | 15 ++++-- sled-agent/types/versions/src/latest.rs | 1 + 14 files changed, 99 insertions(+), 62 deletions(-) rename openapi/sled-agent/{sled-agent-36.0.0-809114.json => sled-agent-36.0.0-b95af0.json} (99%) diff --git a/clients/sled-agent-client/src/lib.rs b/clients/sled-agent-client/src/lib.rs index 8769decd0ea..94cd8237055 100644 --- a/clients/sled-agent-client/src/lib.rs +++ b/clients/sled-agent-client/src/lib.rs @@ -62,6 +62,7 @@ progenitor::generate_api!( ExternalIpv6Config = omicron_common::api::internal::shared::ExternalIpv6Config, FmdCase = sled_agent_types_versions::latest::inventory::FmdCase, FmdInventory = 
sled_agent_types_versions::latest::inventory::FmdInventory, + FmdInventoryResult = sled_agent_types_versions::latest::inventory::FmdInventoryResult, FmdResource = sled_agent_types_versions::latest::inventory::FmdResource, Generation = omicron_common::api::external::Generation, Hostname = omicron_common::api::external::Hostname, diff --git a/nexus/db-queries/src/db/datastore/physical_disk.rs b/nexus/db-queries/src/db/datastore/physical_disk.rs index 78d73c34b02..dc15aacf4c0 100644 --- a/nexus/db-queries/src/db/datastore/physical_disk.rs +++ b/nexus/db-queries/src/db/datastore/physical_disk.rs @@ -347,9 +347,9 @@ mod test { use omicron_test_utils::dev; use omicron_uuid_kinds::ZpoolUuid; use sled_agent_types::inventory::{ - Baseboard, ConfigReconcilerInventoryStatus, Inventory, InventoryDisk, - OmicronFileSourceResolverInventory, SledCpuFamily, SledRole, - SvcsEnabledNotOnlineResult, + Baseboard, ConfigReconcilerInventoryStatus, FmdInventoryResult, + Inventory, InventoryDisk, OmicronFileSourceResolverInventory, + SledCpuFamily, SledRole, SvcsEnabledNotOnlineResult, }; use std::num::NonZeroU32; @@ -711,7 +711,9 @@ mod test { smf_services_enabled_not_online: SvcsEnabledNotOnlineResult::DataUnavailable, reference_measurements: IdOrdMap::new(), - fmd: None, + fmd: FmdInventoryResult::Error { + error: "(testing) FMD unavailable".to_string(), + }, }, ) .unwrap(); diff --git a/nexus/inventory/src/examples.rs b/nexus/inventory/src/examples.rs index 2ad9df01c89..d224d667a6c 100644 --- a/nexus/inventory/src/examples.rs +++ b/nexus/inventory/src/examples.rs @@ -51,6 +51,7 @@ use sled_agent_types::inventory::BootPartitionDetails; use sled_agent_types::inventory::ConfigReconcilerInventory; use sled_agent_types::inventory::ConfigReconcilerInventoryResult; use sled_agent_types::inventory::ConfigReconcilerInventoryStatus; +use sled_agent_types::inventory::FmdInventoryResult; use sled_agent_types::inventory::HostPhase2DesiredSlots; use sled_agent_types::inventory::Inventory; use 
sled_agent_types::inventory::InventoryDataset; @@ -1115,6 +1116,8 @@ pub fn sled_agent( file_source_resolver, smf_services_enabled_not_online, reference_measurements, - fmd: None, + fmd: FmdInventoryResult::Error { + error: "(testing) FMD unavailable".to_string(), + }, } } diff --git a/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs b/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs index f8a2e5d1852..9e637a5e4cc 100644 --- a/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs +++ b/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs @@ -239,6 +239,7 @@ mod api_impl { use sled_agent_types::inventory::BootPartitionDetails; use sled_agent_types::inventory::ConfigReconcilerInventory; use sled_agent_types::inventory::ConfigReconcilerInventoryStatus; + use sled_agent_types::inventory::FmdInventoryResult; use sled_agent_types::inventory::HostPhase2DesiredContents; use sled_agent_types::inventory::HostPhase2DesiredSlots; use sled_agent_types::inventory::Inventory; @@ -376,7 +377,9 @@ mod api_impl { remove_mupdate_override: None, boot_partitions, }), - fmd: None, + fmd: FmdInventoryResult::Error { + error: "(testing) FMD unavailable".to_string(), + }, file_source_resolver: OmicronFileSourceResolverInventory { zone_manifest: ManifestInventory { boot_disk_path: Utf8PathBuf::new(), diff --git a/nexus/reconfigurator/planning/src/mgs_updates/test_helpers.rs b/nexus/reconfigurator/planning/src/mgs_updates/test_helpers.rs index ca88902e431..af1d748ec0e 100644 --- a/nexus/reconfigurator/planning/src/mgs_updates/test_helpers.rs +++ b/nexus/reconfigurator/planning/src/mgs_updates/test_helpers.rs @@ -39,6 +39,7 @@ use sled_agent_types::inventory::BootPartitionContents; use sled_agent_types::inventory::BootPartitionDetails; use sled_agent_types::inventory::ConfigReconcilerInventory; use sled_agent_types::inventory::ConfigReconcilerInventoryStatus; +use sled_agent_types::inventory::FmdInventoryResult; use 
sled_agent_types::inventory::HostPhase2DesiredSlots; use sled_agent_types::inventory::Inventory; use sled_agent_types::inventory::OmicronFileSourceResolverInventory; @@ -1374,7 +1375,9 @@ impl<'a> TestBoardCollectionBuilder<'a> { smf_services_enabled_not_online: SvcsEnabledNotOnlineResult::DataUnavailable, reference_measurements: IdOrdMap::new(), - fmd: None, + fmd: FmdInventoryResult::Error { + error: "(testing) FMD unavailable".to_string(), + }, }, ) .unwrap(); diff --git a/nexus/reconfigurator/planning/src/system.rs b/nexus/reconfigurator/planning/src/system.rs index b5481107507..0c17f0ce270 100644 --- a/nexus/reconfigurator/planning/src/system.rs +++ b/nexus/reconfigurator/planning/src/system.rs @@ -61,6 +61,7 @@ use omicron_uuid_kinds::ZpoolUuid; use sled_agent_types::inventory::Baseboard; use sled_agent_types::inventory::ConfigReconcilerInventory; use sled_agent_types::inventory::ConfigReconcilerInventoryStatus; +use sled_agent_types::inventory::FmdInventoryResult; use sled_agent_types::inventory::Inventory; use sled_agent_types::inventory::InventoryDataset; use sled_agent_types::inventory::InventoryDisk; @@ -1505,7 +1506,9 @@ impl Sled { smf_services_enabled_not_online: SvcsEnabledNotOnlineResult::DataUnavailable, reference_measurements: iddqd::IdOrdMap::new(), - fmd: None, + fmd: FmdInventoryResult::Error { + error: "(testing) FMD unavailable".to_string(), + }, } }; @@ -1690,7 +1693,9 @@ impl Sled { reference_measurements: inv_sled_agent .reference_measurements .clone(), - fmd: None, + fmd: FmdInventoryResult::Error { + error: "(testing) FMD unavailable".to_string(), + }, }; Sled { diff --git a/openapi/sled-agent/sled-agent-36.0.0-809114.json b/openapi/sled-agent/sled-agent-36.0.0-b95af0.json similarity index 99% rename from openapi/sled-agent/sled-agent-36.0.0-809114.json rename to openapi/sled-agent/sled-agent-36.0.0-b95af0.json index 8aa3a63e8c8..1fef212c79e 100644 --- a/openapi/sled-agent/sled-agent-36.0.0-809114.json +++ 
b/openapi/sled-agent/sled-agent-36.0.0-b95af0.json @@ -5395,6 +5395,28 @@ ] }, "FmdInventory": { + "description": "Successfully collected FMD fault data.", + "type": "object", + "properties": { + "cases": { + "type": "array", + "items": { + "$ref": "#/components/schemas/FmdCase" + } + }, + "resources": { + "type": "array", + "items": { + "$ref": "#/components/schemas/FmdResource" + } + } + }, + "required": [ + "cases", + "resources" + ] + }, + "FmdInventoryResult": { "description": "Result of querying FMD for fault information.", "oneOf": [ { @@ -5408,25 +5430,7 @@ ] }, "value": { - "type": "object", - "properties": { - "cases": { - "type": "array", - "items": { - "$ref": "#/components/schemas/FmdCase" - } - }, - "resources": { - "type": "array", - "items": { - "$ref": "#/components/schemas/FmdResource" - } - } - }, - "required": [ - "cases", - "resources" - ] + "$ref": "#/components/schemas/FmdInventory" } }, "required": [ @@ -5435,7 +5439,7 @@ ] }, { - "description": "FMD data collection failed.", + "description": "FMD data collection failed or is not available on this platform.", "type": "object", "properties": { "type": { @@ -6140,12 +6144,7 @@ "$ref": "#/components/schemas/OmicronFileSourceResolverInventory" }, "fmd": { - "nullable": true, - "allOf": [ - { - "$ref": "#/components/schemas/FmdInventory" - } - ] + "$ref": "#/components/schemas/FmdInventoryResult" }, "last_reconciliation": { "nullable": true, @@ -6220,6 +6219,7 @@ "datasets", "disks", "file_source_resolver", + "fmd", "reconciler_status", "reference_measurements", "reservoir_size", diff --git a/openapi/sled-agent/sled-agent-latest.json b/openapi/sled-agent/sled-agent-latest.json index 7e3b0a3f40e..021cc493261 120000 --- a/openapi/sled-agent/sled-agent-latest.json +++ b/openapi/sled-agent/sled-agent-latest.json @@ -1 +1 @@ -sled-agent-36.0.0-809114.json \ No newline at end of file +sled-agent-36.0.0-b95af0.json \ No newline at end of file diff --git a/sled-agent/src/fmd.rs b/sled-agent/src/fmd.rs 
index d7b011fcfa0..4d675b13396 100644 --- a/sled-agent/src/fmd.rs +++ b/sled-agent/src/fmd.rs @@ -4,12 +4,14 @@ //! Collects fault information from the illumos Fault Management Daemon (FMD). -use sled_agent_types::inventory::FmdInventory; +use sled_agent_types::inventory::FmdInventoryResult; #[cfg(target_os = "illumos")] mod illumos { use fmd_adm::{FmdAdm, NvList, NvValue}; - use sled_agent_types::inventory::{FmdCase, FmdInventory, FmdResource}; + use sled_agent_types::inventory::{ + FmdCase, FmdInventory, FmdInventoryResult, FmdResource, + }; pub(super) fn nvvalue_to_json(value: &NvValue) -> serde_json::Value { match value { @@ -60,11 +62,11 @@ mod illumos { serde_json::Value::Object(map) } - pub(super) fn collect() -> FmdInventory { + pub(super) fn collect() -> FmdInventoryResult { let adm = match FmdAdm::open() { Ok(adm) => adm, Err(e) => { - return FmdInventory::Error { + return FmdInventoryResult::Error { error: format!("failed to open fmd: {e}"), }; } @@ -81,7 +83,7 @@ mod illumos { }) .collect(), Err(e) => { - return FmdInventory::Error { + return FmdInventoryResult::Error { error: format!("failed to list fmd cases: {e}"), }; } @@ -100,31 +102,33 @@ mod illumos { }) .collect(), Err(e) => { - return FmdInventory::Error { + return FmdInventoryResult::Error { error: format!("failed to list fmd resources: {e}"), }; } }; - FmdInventory::Available { cases, resources } + FmdInventoryResult::Available(FmdInventory { cases, resources }) } } -pub(crate) async fn collect_fmd_inventory() -> Option { +pub(crate) async fn collect_fmd_inventory() -> FmdInventoryResult { #[cfg(target_os = "illumos")] { // FMD queries go through door calls to fmd(1M) and can block, so run // them on a blocking-friendly thread rather than stalling the runtime. 
match tokio::task::spawn_blocking(illumos::collect).await { - Ok(inv) => Some(inv), - Err(e) => Some(FmdInventory::Error { + Ok(inv) => inv, + Err(e) => FmdInventoryResult::Error { error: format!("fmd collection task failed: {e}"), - }), + }, } } #[cfg(not(target_os = "illumos"))] { - None + FmdInventoryResult::Error { + error: "fmd not supported on this platform".to_string(), + } } } diff --git a/sled-agent/src/rack_setup/plan/service.rs b/sled-agent/src/rack_setup/plan/service.rs index 601e0ca0016..03713e0c0fc 100644 --- a/sled-agent/src/rack_setup/plan/service.rs +++ b/sled-agent/src/rack_setup/plan/service.rs @@ -1358,6 +1358,7 @@ mod tests { use oxnet::Ipv6Net; use sled_agent_types::early_networking::RackNetworkConfig; use sled_agent_types::inventory::ConfigReconcilerInventoryStatus; + use sled_agent_types::inventory::FmdInventoryResult; use sled_agent_types::inventory::OmicronFileSourceResolverInventory; use sled_agent_types::inventory::SledCpuFamily; use sled_agent_types::inventory::SvcsEnabledNotOnlineResult; @@ -1530,7 +1531,9 @@ mod tests { smf_services_enabled_not_online: SvcsEnabledNotOnlineResult::DataUnavailable, reference_measurements: IdOrdMap::new(), - fmd: None, + fmd: FmdInventoryResult::Error { + error: "fmd not collected during rack setup".to_string(), + }, }, is_scrimlet, ) diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index be010ea526b..051c84cff9e 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -1884,9 +1884,10 @@ mod test { use sled_agent_types::{ early_networking::RackNetworkConfig, inventory::{ - Baseboard, ConfigReconcilerInventoryStatus, Inventory, - InventoryDisk, OmicronFileSourceResolverInventory, OmicronZoneType, - SledCpuFamily, SledRole, SvcsEnabledNotOnlineResult, + Baseboard, ConfigReconcilerInventoryStatus, FmdInventoryResult, + Inventory, InventoryDisk, OmicronFileSourceResolverInventory, + OmicronZoneType, SledCpuFamily, SledRole, + 
SvcsEnabledNotOnlineResult, }, }; use std::net::{IpAddr, Ipv4Addr, Ipv6Addr}; @@ -1947,7 +1948,9 @@ mod test { smf_services_enabled_not_online: SvcsEnabledNotOnlineResult::DataUnavailable, reference_measurements: IdOrdMap::new(), - fmd: None, + fmd: FmdInventoryResult::Error { + error: "fmd not collected during rack setup".to_string(), + }, }, true, ) diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index b91462a6bbe..dc3a9109222 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -62,10 +62,10 @@ use sled_agent_types::instance::{ }; use sled_agent_types::inventory::{ ConfigReconcilerInventory, ConfigReconcilerInventoryResult, - ConfigReconcilerInventoryStatus, HostPhase2DesiredSlots, Inventory, - InventoryDataset, InventoryDisk, InventoryZpool, - OmicronFileSourceResolverInventory, OmicronSledConfig, OmicronZonesConfig, - SingleMeasurementInventory, SledRole, ZpoolHealth, + ConfigReconcilerInventoryStatus, FmdInventoryResult, + HostPhase2DesiredSlots, Inventory, InventoryDataset, InventoryDisk, + InventoryZpool, OmicronFileSourceResolverInventory, OmicronSledConfig, + OmicronZonesConfig, SingleMeasurementInventory, SledRole, ZpoolHealth, }; use sled_agent_types::support_bundle::SupportBundleMetadata; use sled_agent_types::system_networking::SystemNetworkingConfig; @@ -995,7 +995,9 @@ impl SledAgent { ), smf_services_enabled_not_online, reference_measurements, - fmd: None, + fmd: FmdInventoryResult::Error { + error: "fmd not collected in simulator".to_string(), + }, }) } diff --git a/sled-agent/types/versions/src/add_fmd_to_inventory/inventory.rs b/sled-agent/types/versions/src/add_fmd_to_inventory/inventory.rs index 98870b80cc2..7625e62d423 100644 --- a/sled-agent/types/versions/src/add_fmd_to_inventory/inventory.rs +++ b/sled-agent/types/versions/src/add_fmd_to_inventory/inventory.rs @@ -59,13 +59,20 @@ pub struct FmdResource { /// Result of querying FMD for fault information. 
#[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize, JsonSchema)] #[serde(tag = "type", content = "value", rename_all = "snake_case")] -pub enum FmdInventory { +pub enum FmdInventoryResult { /// FMD data was successfully collected. - Available { cases: Vec, resources: Vec }, - /// FMD data collection failed. + Available(FmdInventory), + /// FMD data collection failed or is not available on this platform. Error { error: String }, } +/// Successfully collected FMD fault data. +#[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize, JsonSchema)] +pub struct FmdInventory { + pub cases: Vec, + pub resources: Vec, +} + /// Identity and basic status information about this sled agent #[derive(Clone, Debug, Deserialize, JsonSchema, Serialize)] pub struct Inventory { @@ -87,7 +94,7 @@ pub struct Inventory { pub smf_services_enabled_not_online: v34::inventory::SvcsEnabledNotOnlineResult, pub reference_measurements: IdOrdMap, - pub fmd: Option, + pub fmd: FmdInventoryResult, } impl From for v34::inventory::Inventory { diff --git a/sled-agent/types/versions/src/latest.rs b/sled-agent/types/versions/src/latest.rs index e32d3851066..c3104d417ae 100644 --- a/sled-agent/types/versions/src/latest.rs +++ b/sled-agent/types/versions/src/latest.rs @@ -166,6 +166,7 @@ pub mod inventory { pub use crate::v34::inventory::SvcsError; pub use crate::v36::inventory::FmdCase; pub use crate::v36::inventory::FmdInventory; + pub use crate::v36::inventory::FmdInventoryResult; pub use crate::v36::inventory::FmdResource; pub use crate::v36::inventory::Inventory; From bcb288d3e36a7297d1ee7e932d451c97f8390d1d Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 22 Apr 2026 14:43:28 -0700 Subject: [PATCH 06/24] Convert 'FMD unavailable' to a default impl that's empty --- nexus/db-queries/src/db/datastore/physical_disk.rs | 11 +++++------ nexus/inventory/src/examples.rs | 5 ++--- .../src/test_util/host_phase_2_test_state.rs | 5 ++--- .../planning/src/mgs_updates/test_helpers.rs | 7 ++++--- 
nexus/reconfigurator/planning/src/system.rs | 9 +++------ sled-agent/src/rack_setup/plan/service.rs | 5 ++--- sled-agent/src/rack_setup/service.rs | 12 +++++------- sled-agent/src/sim/sled_agent.rs | 6 ++---- .../versions/src/add_fmd_to_inventory/inventory.rs | 4 +++- 9 files changed, 28 insertions(+), 36 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/physical_disk.rs b/nexus/db-queries/src/db/datastore/physical_disk.rs index dc15aacf4c0..48c55c0c4be 100644 --- a/nexus/db-queries/src/db/datastore/physical_disk.rs +++ b/nexus/db-queries/src/db/datastore/physical_disk.rs @@ -347,9 +347,10 @@ mod test { use omicron_test_utils::dev; use omicron_uuid_kinds::ZpoolUuid; use sled_agent_types::inventory::{ - Baseboard, ConfigReconcilerInventoryStatus, FmdInventoryResult, - Inventory, InventoryDisk, OmicronFileSourceResolverInventory, - SledCpuFamily, SledRole, SvcsEnabledNotOnlineResult, + Baseboard, ConfigReconcilerInventoryStatus, FmdInventory, + FmdInventoryResult, Inventory, InventoryDisk, + OmicronFileSourceResolverInventory, SledCpuFamily, SledRole, + SvcsEnabledNotOnlineResult, }; use std::num::NonZeroU32; @@ -711,9 +712,7 @@ mod test { smf_services_enabled_not_online: SvcsEnabledNotOnlineResult::DataUnavailable, reference_measurements: IdOrdMap::new(), - fmd: FmdInventoryResult::Error { - error: "(testing) FMD unavailable".to_string(), - }, + fmd: FmdInventoryResult::Available(FmdInventory::default()), }, ) .unwrap(); diff --git a/nexus/inventory/src/examples.rs b/nexus/inventory/src/examples.rs index d224d667a6c..06cbb4950d1 100644 --- a/nexus/inventory/src/examples.rs +++ b/nexus/inventory/src/examples.rs @@ -51,6 +51,7 @@ use sled_agent_types::inventory::BootPartitionDetails; use sled_agent_types::inventory::ConfigReconcilerInventory; use sled_agent_types::inventory::ConfigReconcilerInventoryResult; use sled_agent_types::inventory::ConfigReconcilerInventoryStatus; +use sled_agent_types::inventory::FmdInventory; use 
sled_agent_types::inventory::FmdInventoryResult; use sled_agent_types::inventory::HostPhase2DesiredSlots; use sled_agent_types::inventory::Inventory; @@ -1116,8 +1117,6 @@ pub fn sled_agent( file_source_resolver, smf_services_enabled_not_online, reference_measurements, - fmd: FmdInventoryResult::Error { - error: "(testing) FMD unavailable".to_string(), - }, + fmd: FmdInventoryResult::Available(FmdInventory::default()), } } diff --git a/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs b/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs index 9e637a5e4cc..b837d4e57f3 100644 --- a/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs +++ b/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs @@ -239,6 +239,7 @@ mod api_impl { use sled_agent_types::inventory::BootPartitionDetails; use sled_agent_types::inventory::ConfigReconcilerInventory; use sled_agent_types::inventory::ConfigReconcilerInventoryStatus; + use sled_agent_types::inventory::FmdInventory; use sled_agent_types::inventory::FmdInventoryResult; use sled_agent_types::inventory::HostPhase2DesiredContents; use sled_agent_types::inventory::HostPhase2DesiredSlots; @@ -377,9 +378,7 @@ mod api_impl { remove_mupdate_override: None, boot_partitions, }), - fmd: FmdInventoryResult::Error { - error: "(testing) FMD unavailable".to_string(), - }, + fmd: FmdInventoryResult::Available(FmdInventory::default()), file_source_resolver: OmicronFileSourceResolverInventory { zone_manifest: ManifestInventory { boot_disk_path: Utf8PathBuf::new(), diff --git a/nexus/reconfigurator/planning/src/mgs_updates/test_helpers.rs b/nexus/reconfigurator/planning/src/mgs_updates/test_helpers.rs index af1d748ec0e..d438be6795c 100644 --- a/nexus/reconfigurator/planning/src/mgs_updates/test_helpers.rs +++ b/nexus/reconfigurator/planning/src/mgs_updates/test_helpers.rs @@ -39,6 +39,7 @@ use sled_agent_types::inventory::BootPartitionContents; use sled_agent_types::inventory::BootPartitionDetails; use 
sled_agent_types::inventory::ConfigReconcilerInventory; use sled_agent_types::inventory::ConfigReconcilerInventoryStatus; +use sled_agent_types::inventory::FmdInventory; use sled_agent_types::inventory::FmdInventoryResult; use sled_agent_types::inventory::HostPhase2DesiredSlots; use sled_agent_types::inventory::Inventory; @@ -1375,9 +1376,9 @@ impl<'a> TestBoardCollectionBuilder<'a> { smf_services_enabled_not_online: SvcsEnabledNotOnlineResult::DataUnavailable, reference_measurements: IdOrdMap::new(), - fmd: FmdInventoryResult::Error { - error: "(testing) FMD unavailable".to_string(), - }, + fmd: FmdInventoryResult::Available( + FmdInventory::default(), + ), }, ) .unwrap(); diff --git a/nexus/reconfigurator/planning/src/system.rs b/nexus/reconfigurator/planning/src/system.rs index 0c17f0ce270..ad3e102afb9 100644 --- a/nexus/reconfigurator/planning/src/system.rs +++ b/nexus/reconfigurator/planning/src/system.rs @@ -61,6 +61,7 @@ use omicron_uuid_kinds::ZpoolUuid; use sled_agent_types::inventory::Baseboard; use sled_agent_types::inventory::ConfigReconcilerInventory; use sled_agent_types::inventory::ConfigReconcilerInventoryStatus; +use sled_agent_types::inventory::FmdInventory; use sled_agent_types::inventory::FmdInventoryResult; use sled_agent_types::inventory::Inventory; use sled_agent_types::inventory::InventoryDataset; @@ -1506,9 +1507,7 @@ impl Sled { smf_services_enabled_not_online: SvcsEnabledNotOnlineResult::DataUnavailable, reference_measurements: iddqd::IdOrdMap::new(), - fmd: FmdInventoryResult::Error { - error: "(testing) FMD unavailable".to_string(), - }, + fmd: FmdInventoryResult::Available(FmdInventory::default()), } }; @@ -1693,9 +1692,7 @@ impl Sled { reference_measurements: inv_sled_agent .reference_measurements .clone(), - fmd: FmdInventoryResult::Error { - error: "(testing) FMD unavailable".to_string(), - }, + fmd: FmdInventoryResult::Available(FmdInventory::default()), }; Sled { diff --git a/sled-agent/src/rack_setup/plan/service.rs 
b/sled-agent/src/rack_setup/plan/service.rs index 03713e0c0fc..7341ea83a8f 100644 --- a/sled-agent/src/rack_setup/plan/service.rs +++ b/sled-agent/src/rack_setup/plan/service.rs @@ -1358,6 +1358,7 @@ mod tests { use oxnet::Ipv6Net; use sled_agent_types::early_networking::RackNetworkConfig; use sled_agent_types::inventory::ConfigReconcilerInventoryStatus; + use sled_agent_types::inventory::FmdInventory; use sled_agent_types::inventory::FmdInventoryResult; use sled_agent_types::inventory::OmicronFileSourceResolverInventory; use sled_agent_types::inventory::SledCpuFamily; @@ -1531,9 +1532,7 @@ mod tests { smf_services_enabled_not_online: SvcsEnabledNotOnlineResult::DataUnavailable, reference_measurements: IdOrdMap::new(), - fmd: FmdInventoryResult::Error { - error: "fmd not collected during rack setup".to_string(), - }, + fmd: FmdInventoryResult::Available(FmdInventory::default()), }, is_scrimlet, ) diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index 051c84cff9e..8818e9c7ac0 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -1884,10 +1884,10 @@ mod test { use sled_agent_types::{ early_networking::RackNetworkConfig, inventory::{ - Baseboard, ConfigReconcilerInventoryStatus, FmdInventoryResult, - Inventory, InventoryDisk, OmicronFileSourceResolverInventory, - OmicronZoneType, SledCpuFamily, SledRole, - SvcsEnabledNotOnlineResult, + Baseboard, ConfigReconcilerInventoryStatus, FmdInventory, + FmdInventoryResult, Inventory, InventoryDisk, + OmicronFileSourceResolverInventory, OmicronZoneType, SledCpuFamily, + SledRole, SvcsEnabledNotOnlineResult, }, }; use std::net::{IpAddr, Ipv4Addr, Ipv6Addr}; @@ -1948,9 +1948,7 @@ mod test { smf_services_enabled_not_online: SvcsEnabledNotOnlineResult::DataUnavailable, reference_measurements: IdOrdMap::new(), - fmd: FmdInventoryResult::Error { - error: "fmd not collected during rack setup".to_string(), - }, + fmd: 
FmdInventoryResult::Available(FmdInventory::default()), }, true, ) diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index dc3a9109222..644e1435550 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -62,7 +62,7 @@ use sled_agent_types::instance::{ }; use sled_agent_types::inventory::{ ConfigReconcilerInventory, ConfigReconcilerInventoryResult, - ConfigReconcilerInventoryStatus, FmdInventoryResult, + ConfigReconcilerInventoryStatus, FmdInventory, FmdInventoryResult, HostPhase2DesiredSlots, Inventory, InventoryDataset, InventoryDisk, InventoryZpool, OmicronFileSourceResolverInventory, OmicronSledConfig, OmicronZonesConfig, SingleMeasurementInventory, SledRole, ZpoolHealth, @@ -995,9 +995,7 @@ impl SledAgent { ), smf_services_enabled_not_online, reference_measurements, - fmd: FmdInventoryResult::Error { - error: "fmd not collected in simulator".to_string(), - }, + fmd: FmdInventoryResult::Available(FmdInventory::default()), }) } diff --git a/sled-agent/types/versions/src/add_fmd_to_inventory/inventory.rs b/sled-agent/types/versions/src/add_fmd_to_inventory/inventory.rs index 7625e62d423..61c924f1c30 100644 --- a/sled-agent/types/versions/src/add_fmd_to_inventory/inventory.rs +++ b/sled-agent/types/versions/src/add_fmd_to_inventory/inventory.rs @@ -67,7 +67,9 @@ pub enum FmdInventoryResult { } /// Successfully collected FMD fault data. 
-#[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize, JsonSchema)] +#[derive( + Clone, Debug, Default, PartialEq, Eq, Deserialize, Serialize, JsonSchema, +)] pub struct FmdInventory { pub cases: Vec, pub resources: Vec, From 778afd5e383742537f29087fe27dde5e42e7a66e Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 23 Apr 2026 10:10:09 -0700 Subject: [PATCH 07/24] Destructuring --- sled-agent/src/fmd.rs | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/sled-agent/src/fmd.rs b/sled-agent/src/fmd.rs index 4d675b13396..5ec00e7007b 100644 --- a/sled-agent/src/fmd.rs +++ b/sled-agent/src/fmd.rs @@ -75,11 +75,14 @@ mod illumos { let cases = match adm.cases(None) { Ok(cases) => cases .into_iter() - .map(|c| FmdCase { - uuid: c.uuid, - code: c.code, - url: c.url, - event: c.event.as_ref().map(nvlist_to_json), + .map(|c| { + let fmd_adm::CaseInfo { uuid, code, url, event } = c; + FmdCase { + uuid, + code, + url, + event: event.as_ref().map(nvlist_to_json), + } }) .collect(), Err(e) => { @@ -92,13 +95,23 @@ mod illumos { let resources = match adm.resources(true) { Ok(resources) => resources .into_iter() - .map(|r| FmdResource { - fmri: r.fmri, - uuid: r.uuid, - case_id: r.case, - faulty: r.faulty, - unusable: r.unusable, - invisible: r.invisible, + .map(|r| { + let fmd_adm::ResourceInfo { + fmri, + uuid, + case, + faulty, + unusable, + invisible, + } = r; + FmdResource { + fmri, + uuid, + case_id: case, + faulty, + unusable, + invisible, + } }) .collect(), Err(e) => { From 34884bad98e56ace4c704265704f791e758bf6cd Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 23 Apr 2026 10:29:09 -0700 Subject: [PATCH 08/24] review feedback --- clients/sled-agent-client/src/lib.rs | 2 +- ...af0.json => sled-agent-36.0.0-9a4f77.json} | 73 ++++++++++++++++--- openapi/sled-agent/sled-agent-latest.json | 2 +- sled-agent/src/fmd.rs | 38 +++++++--- sled-agent/src/sled_agent.rs | 2 +- .../src/add_fmd_to_inventory/inventory.rs 
| 39 +++++++--- sled-agent/types/versions/src/latest.rs | 2 +- uuid-kinds/src/lib.rs | 2 + 8 files changed, 124 insertions(+), 36 deletions(-) rename openapi/sled-agent/{sled-agent-36.0.0-b95af0.json => sled-agent-36.0.0-9a4f77.json} (99%) diff --git a/clients/sled-agent-client/src/lib.rs b/clients/sled-agent-client/src/lib.rs index 94cd8237055..b1449add66a 100644 --- a/clients/sled-agent-client/src/lib.rs +++ b/clients/sled-agent-client/src/lib.rs @@ -60,7 +60,7 @@ progenitor::generate_api!( ExternalIpConfig = omicron_common::api::internal::shared::ExternalIpConfig, ExternalIpv4Config = omicron_common::api::internal::shared::ExternalIpv4Config, ExternalIpv6Config = omicron_common::api::internal::shared::ExternalIpv6Config, - FmdCase = sled_agent_types_versions::latest::inventory::FmdCase, + FmdHostCase = sled_agent_types_versions::latest::inventory::FmdHostCase, FmdInventory = sled_agent_types_versions::latest::inventory::FmdInventory, FmdInventoryResult = sled_agent_types_versions::latest::inventory::FmdInventoryResult, FmdResource = sled_agent_types_versions::latest::inventory::FmdResource, diff --git a/openapi/sled-agent/sled-agent-36.0.0-b95af0.json b/openapi/sled-agent/sled-agent-36.0.0-9a4f77.json similarity index 99% rename from openapi/sled-agent/sled-agent-36.0.0-b95af0.json rename to openapi/sled-agent/sled-agent-36.0.0-9a4f77.json index 1fef212c79e..06e03c3eb0e 100644 --- a/openapi/sled-agent/sled-agent-36.0.0-b95af0.json +++ b/openapi/sled-agent/sled-agent-36.0.0-9a4f77.json @@ -5366,8 +5366,8 @@ ], "additionalProperties": false }, - "FmdCase": { - "description": "A diagnosed fault case from the illumos Fault Management Daemon.", + "FmdHostCase": { + "description": "A diagnosed fault case from the illumos Fault Management Daemon on a sled.", "type": "object", "properties": { "code": { @@ -5384,8 +5384,11 @@ }, "uuid": { "description": "Unique identifier for this case.", - "type": "string", - "format": "uuid" + "allOf": [ + { + "$ref": 
"#/components/schemas/FmdHostCaseUuid" + } + ] } }, "required": [ @@ -5394,21 +5397,54 @@ "uuid" ] }, + "FmdHostCaseUuid": { + "x-rust-type": { + "crate": "omicron-uuid-kinds", + "path": "omicron_uuid_kinds::FmdHostCaseUuid", + "version": "*" + }, + "type": "string", + "format": "uuid" + }, "FmdInventory": { "description": "Successfully collected FMD fault data.", "type": "object", "properties": { "cases": { + "title": "IdOrdMap", + "x-rust-type": { + "crate": "iddqd", + "parameters": [ + { + "$ref": "#/components/schemas/FmdHostCase" + } + ], + "path": "iddqd::IdOrdMap", + "version": "*" + }, "type": "array", "items": { - "$ref": "#/components/schemas/FmdCase" - } + "$ref": "#/components/schemas/FmdHostCase" + }, + "uniqueItems": true }, "resources": { + "title": "IdOrdMap", + "x-rust-type": { + "crate": "iddqd", + "parameters": [ + { + "$ref": "#/components/schemas/FmdResource" + } + ], + "path": "iddqd::IdOrdMap", + "version": "*" + }, "type": "array", "items": { "$ref": "#/components/schemas/FmdResource" - } + }, + "uniqueItems": true } }, "required": [ @@ -5473,8 +5509,11 @@ "properties": { "case_id": { "description": "UUID of the case that diagnosed this fault.", - "type": "string", - "format": "uuid" + "allOf": [ + { + "$ref": "#/components/schemas/FmdHostCaseUuid" + } + ] }, "faulty": { "description": "Whether the resource is marked faulty.", @@ -5494,8 +5533,11 @@ }, "uuid": { "description": "Unique identifier for this resource entry.", - "type": "string", - "format": "uuid" + "allOf": [ + { + "$ref": "#/components/schemas/FmdResourceUuid" + } + ] } }, "required": [ @@ -5507,6 +5549,15 @@ "uuid" ] }, + "FmdResourceUuid": { + "x-rust-type": { + "crate": "omicron-uuid-kinds", + "path": "omicron_uuid_kinds::FmdResourceUuid", + "version": "*" + }, + "type": "string", + "format": "uuid" + }, "Generation": { "description": "Generation numbers stored in the database, used for optimistic concurrency control", "type": "integer", diff --git 
a/openapi/sled-agent/sled-agent-latest.json b/openapi/sled-agent/sled-agent-latest.json index 021cc493261..d4a04698168 120000 --- a/openapi/sled-agent/sled-agent-latest.json +++ b/openapi/sled-agent/sled-agent-latest.json @@ -1 +1 @@ -sled-agent-36.0.0-b95af0.json \ No newline at end of file +sled-agent-36.0.0-9a4f77.json \ No newline at end of file diff --git a/sled-agent/src/fmd.rs b/sled-agent/src/fmd.rs index 5ec00e7007b..6a19647a478 100644 --- a/sled-agent/src/fmd.rs +++ b/sled-agent/src/fmd.rs @@ -5,13 +5,19 @@ //! Collects fault information from the illumos Fault Management Daemon (FMD). use sled_agent_types::inventory::FmdInventoryResult; +use slog::Logger; +#[cfg(target_os = "illumos")] +use slog::warn; #[cfg(target_os = "illumos")] mod illumos { use fmd_adm::{FmdAdm, NvList, NvValue}; + use omicron_uuid_kinds::{FmdHostCaseUuid, FmdResourceUuid}; use sled_agent_types::inventory::{ - FmdCase, FmdInventory, FmdInventoryResult, FmdResource, + FmdHostCase, FmdInventory, FmdInventoryResult, FmdResource, }; + use slog::Logger; + use slog::warn; pub(super) fn nvvalue_to_json(value: &NvValue) -> serde_json::Value { match value { @@ -62,10 +68,11 @@ mod illumos { serde_json::Value::Object(map) } - pub(super) fn collect() -> FmdInventoryResult { + pub(super) fn collect(log: Logger) -> FmdInventoryResult { let adm = match FmdAdm::open() { Ok(adm) => adm, Err(e) => { + warn!(log, "failed to open fmd"; "error" => %e); return FmdInventoryResult::Error { error: format!("failed to open fmd: {e}"), }; @@ -77,8 +84,8 @@ mod illumos { .into_iter() .map(|c| { let fmd_adm::CaseInfo { uuid, code, url, event } = c; - FmdCase { - uuid, + FmdHostCase { + uuid: FmdHostCaseUuid::from_untyped_uuid(uuid), code, url, event: event.as_ref().map(nvlist_to_json), @@ -86,6 +93,7 @@ mod illumos { }) .collect(), Err(e) => { + warn!(log, "failed to list fmd cases"; "error" => %e); return FmdInventoryResult::Error { error: format!("failed to list fmd cases: {e}"), }; @@ -106,8 +114,8 @@ mod 
illumos { } = r; FmdResource { fmri, - uuid, - case_id: case, + uuid: FmdResourceUuid::from_untyped_uuid(uuid), + case_id: FmdHostCaseUuid::from_untyped_uuid(case), faulty, unusable, invisible, @@ -115,6 +123,7 @@ mod illumos { }) .collect(), Err(e) => { + warn!(log, "failed to list fmd resources"; "error" => %e); return FmdInventoryResult::Error { error: format!("failed to list fmd resources: {e}"), }; @@ -125,20 +134,27 @@ mod illumos { } } -pub(crate) async fn collect_fmd_inventory() -> FmdInventoryResult { +pub(crate) async fn collect_fmd_inventory(log: &Logger) -> FmdInventoryResult { #[cfg(target_os = "illumos")] { // FMD queries go through door calls to fmd(1M) and can block, so run // them on a blocking-friendly thread rather than stalling the runtime. - match tokio::task::spawn_blocking(illumos::collect).await { + let task_log = log.clone(); + match tokio::task::spawn_blocking(move || illumos::collect(task_log)) + .await + { Ok(inv) => inv, - Err(e) => FmdInventoryResult::Error { - error: format!("fmd collection task failed: {e}"), - }, + Err(e) => { + warn!(log, "fmd collection task failed"; "error" => %e); + FmdInventoryResult::Error { + error: format!("fmd collection task failed: {e}"), + } + } } } #[cfg(not(target_os = "illumos"))] { + let _ = log; FmdInventoryResult::Error { error: "fmd not supported on this platform".to_string(), } diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index cf07ef98a20..77fa793c0e9 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -1273,7 +1273,7 @@ impl SledAgent { let smf_services_enabled_not_online = self.inner.health_monitor.to_inventory(); - let fmd = crate::fmd::collect_fmd_inventory().await; + let fmd = crate::fmd::collect_fmd_inventory(&self.log).await; let ReconcilerInventory { disks, diff --git a/sled-agent/types/versions/src/add_fmd_to_inventory/inventory.rs b/sled-agent/types/versions/src/add_fmd_to_inventory/inventory.rs index 61c924f1c30..37b2b30164e 
100644 --- a/sled-agent/types/versions/src/add_fmd_to_inventory/inventory.rs +++ b/sled-agent/types/versions/src/add_fmd_to_inventory/inventory.rs @@ -2,14 +2,13 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -use iddqd::IdOrdMap; +use iddqd::{IdOrdItem, IdOrdMap, id_upcast}; use omicron_common::api::external::ByteCount; -use omicron_uuid_kinds::SledUuid; +use omicron_uuid_kinds::{FmdHostCaseUuid, FmdResourceUuid, SledUuid}; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use sled_hardware_types::{Baseboard, SledCpuFamily}; use std::net::SocketAddrV6; -use uuid::Uuid; use crate::v1::inventory::InventoryDataset; use crate::v1::inventory::InventoryDisk; @@ -22,11 +21,11 @@ use crate::v16::inventory::SingleMeasurementInventory; use crate::v24::inventory::InventoryZpool; use crate::v34; -/// A diagnosed fault case from the illumos Fault Management Daemon. +/// A diagnosed fault case from the illumos Fault Management Daemon on a sled. #[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize, JsonSchema)] -pub struct FmdCase { +pub struct FmdHostCase { /// Unique identifier for this case. - pub uuid: Uuid, + pub uuid: FmdHostCaseUuid, /// Diagnostic code (e.g. "PCIEX-8000-DJ"). pub code: String, /// URL for human-readable information about this fault @@ -38,6 +37,16 @@ pub struct FmdCase { pub event: Option, } +impl IdOrdItem for FmdHostCase { + type Key<'a> = FmdHostCaseUuid; + + fn key(&self) -> Self::Key<'_> { + self.uuid + } + + id_upcast!(); +} + /// A resource affected by a diagnosed fault. #[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize, JsonSchema)] pub struct FmdResource { @@ -45,9 +54,9 @@ pub struct FmdResource { /// (e.g. "dev:////pci@af,0/pci1022,1483@3,5"). pub fmri: String, /// Unique identifier for this resource entry. - pub uuid: Uuid, + pub uuid: FmdResourceUuid, /// UUID of the case that diagnosed this fault. 
- pub case_id: Uuid, + pub case_id: FmdHostCaseUuid, /// Whether the resource is marked faulty. pub faulty: bool, /// Whether the resource is marked unusable. @@ -56,6 +65,16 @@ pub struct FmdResource { pub invisible: bool, } +impl IdOrdItem for FmdResource { + type Key<'a> = FmdResourceUuid; + + fn key(&self) -> Self::Key<'_> { + self.uuid + } + + id_upcast!(); +} + /// Result of querying FMD for fault information. #[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize, JsonSchema)] #[serde(tag = "type", content = "value", rename_all = "snake_case")] @@ -71,8 +90,8 @@ pub enum FmdInventoryResult { Clone, Debug, Default, PartialEq, Eq, Deserialize, Serialize, JsonSchema, )] pub struct FmdInventory { - pub cases: Vec, - pub resources: Vec, + pub cases: IdOrdMap, + pub resources: IdOrdMap, } /// Identity and basic status information about this sled agent diff --git a/sled-agent/types/versions/src/latest.rs b/sled-agent/types/versions/src/latest.rs index c3104d417ae..414fff267dc 100644 --- a/sled-agent/types/versions/src/latest.rs +++ b/sled-agent/types/versions/src/latest.rs @@ -164,7 +164,7 @@ pub mod inventory { pub use crate::v34::inventory::SvcsEnabledNotOnline; pub use crate::v34::inventory::SvcsEnabledNotOnlineResult; pub use crate::v34::inventory::SvcsError; - pub use crate::v36::inventory::FmdCase; + pub use crate::v36::inventory::FmdHostCase; pub use crate::v36::inventory::FmdInventory; pub use crate::v36::inventory::FmdInventoryResult; pub use crate::v36::inventory::FmdResource; diff --git a/uuid-kinds/src/lib.rs b/uuid-kinds/src/lib.rs index 93c01d312dc..554896cfd84 100644 --- a/uuid-kinds/src/lib.rs +++ b/uuid-kinds/src/lib.rs @@ -57,6 +57,8 @@ impl_typed_uuid_kinds! 
{ ExternalIp = {}, ExternalSubnet = {}, ExternalZpool = {}, + FmdHostCase = {}, + FmdResource = {}, Instance = {}, InternalZpool = {}, LoopbackAddress = {}, From a65910701040395927b69b5e7375565c606355b5 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 27 Apr 2026 17:16:20 -0700 Subject: [PATCH 09/24] Bump fmd-adm rev; use InvisibleResources::Included oxidecomputer/fmd-adm#2 replaced the bool argument on `FmdAdm::resources()` with an `InvisibleResources` enum to make callsites self-documenting (per hawk's review feedback). Bump to the post-merge rev and update the call in sled-agent. --- Cargo.lock | 4 ++-- Cargo.toml | 4 ++-- sled-agent/src/fmd.rs | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 78357718a0d..faa1b904836 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3815,7 +3815,7 @@ dependencies = [ [[package]] name = "fmd-adm" version = "0.3.0" -source = "git+https://github.com/oxidecomputer/fmd-adm?rev=fffb52212fb1e073e9f1b16761b3614af8b38063#fffb52212fb1e073e9f1b16761b3614af8b38063" +source = "git+https://github.com/oxidecomputer/fmd-adm?rev=846361bf0a698a8c7efefd97b2828b9aa74858c4#846361bf0a698a8c7efefd97b2828b9aa74858c4" dependencies = [ "fmd-adm-sys", "illumos-nvpair", @@ -3827,7 +3827,7 @@ dependencies = [ [[package]] name = "fmd-adm-sys" version = "0.4.1" -source = "git+https://github.com/oxidecomputer/fmd-adm?rev=fffb52212fb1e073e9f1b16761b3614af8b38063#fffb52212fb1e073e9f1b16761b3614af8b38063" +source = "git+https://github.com/oxidecomputer/fmd-adm?rev=846361bf0a698a8c7efefd97b2828b9aa74858c4#846361bf0a698a8c7efefd97b2828b9aa74858c4" [[package]] name = "fnv" diff --git a/Cargo.toml b/Cargo.toml index 7756c88d847..1b212439c7e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -506,8 +506,8 @@ filetime = "0.2.26" flate2 = "1.1.2" float-ord = "0.3.2" flume = "0.11.1" -fmd-adm = { git = "https://github.com/oxidecomputer/fmd-adm", rev = "fffb52212fb1e073e9f1b16761b3614af8b38063" } -fmd-adm-sys = { git = 
"https://github.com/oxidecomputer/fmd-adm", rev = "fffb52212fb1e073e9f1b16761b3614af8b38063" } +fmd-adm = { git = "https://github.com/oxidecomputer/fmd-adm", rev = "846361bf0a698a8c7efefd97b2828b9aa74858c4" } +fmd-adm-sys = { git = "https://github.com/oxidecomputer/fmd-adm", rev = "846361bf0a698a8c7efefd97b2828b9aa74858c4" } foreign-types = "0.3.2" fs-err = "3.1.1" futures = "0.3.31" diff --git a/sled-agent/src/fmd.rs b/sled-agent/src/fmd.rs index 6a19647a478..c3817fde996 100644 --- a/sled-agent/src/fmd.rs +++ b/sled-agent/src/fmd.rs @@ -11,7 +11,7 @@ use slog::warn; #[cfg(target_os = "illumos")] mod illumos { - use fmd_adm::{FmdAdm, NvList, NvValue}; + use fmd_adm::{FmdAdm, InvisibleResources, NvList, NvValue}; use omicron_uuid_kinds::{FmdHostCaseUuid, FmdResourceUuid}; use sled_agent_types::inventory::{ FmdHostCase, FmdInventory, FmdInventoryResult, FmdResource, @@ -100,7 +100,7 @@ mod illumos { } }; - let resources = match adm.resources(true) { + let resources = match adm.resources(InvisibleResources::Included) { Ok(resources) => resources .into_iter() .map(|r| { From 0c2676098ba92890f7bdacdc91581949d0abe40c Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 27 Apr 2026 17:21:46 -0700 Subject: [PATCH 10/24] fmd: replace match on spawn_blocking JoinError with .expect() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Err arm was dead code: omicron compiles with panic="abort", so a panic inside the blocking task aborts the process before the JoinHandle can return Err. Switch to .expect with a descriptive message — if the invariant ever changes, the panic will say what happened. Addresses the take-it-or-leave-it nit at oxidecomputer/omicron#10283 (comment r3112399887). 
--- sled-agent/src/fmd.rs | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/sled-agent/src/fmd.rs b/sled-agent/src/fmd.rs index c3817fde996..ea970682da2 100644 --- a/sled-agent/src/fmd.rs +++ b/sled-agent/src/fmd.rs @@ -6,8 +6,6 @@ use sled_agent_types::inventory::FmdInventoryResult; use slog::Logger; -#[cfg(target_os = "illumos")] -use slog::warn; #[cfg(target_os = "illumos")] mod illumos { @@ -139,18 +137,13 @@ pub(crate) async fn collect_fmd_inventory(log: &Logger) -> FmdInventoryResult { { // FMD queries go through door calls to fmd(1M) and can block, so run // them on a blocking-friendly thread rather than stalling the runtime. - let task_log = log.clone(); - match tokio::task::spawn_blocking(move || illumos::collect(task_log)) + // The expect is safe: omicron compiles with `panic = "abort"`, so a + // panic inside the blocking task aborts the whole process and the + // `JoinHandle` should not be able to return `Err`. + let log = log.clone(); + tokio::task::spawn_blocking(move || illumos::collect(log)) .await - { - Ok(inv) => inv, - Err(e) => { - warn!(log, "fmd collection task failed"; "error" => %e); - FmdInventoryResult::Error { - error: format!("fmd collection task failed: {e}"), - } - } - } + .expect("fmd collection task panicked") } #[cfg(not(target_os = "illumos"))] { From 67ec15acc209318d3630bba0ffd5cb6614234f91 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 28 Apr 2026 16:50:47 -0700 Subject: [PATCH 11/24] fmd: import GenericUuid so from_untyped_uuid resolves on illumos MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The illumos module is cfg-gated, so cargo check on Linux skips it entirely — the missing trait import wasn't visible locally. CI on helios caught it: three E0599 errors for from_untyped_uuid on FmdHostCaseUuid and FmdResourceUuid. Verified fix on atrium. 
--- sled-agent/src/fmd.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sled-agent/src/fmd.rs b/sled-agent/src/fmd.rs index ea970682da2..af2f5182766 100644 --- a/sled-agent/src/fmd.rs +++ b/sled-agent/src/fmd.rs @@ -10,7 +10,7 @@ use slog::Logger; #[cfg(target_os = "illumos")] mod illumos { use fmd_adm::{FmdAdm, InvisibleResources, NvList, NvValue}; - use omicron_uuid_kinds::{FmdHostCaseUuid, FmdResourceUuid}; + use omicron_uuid_kinds::{FmdHostCaseUuid, FmdResourceUuid, GenericUuid}; use sled_agent_types::inventory::{ FmdHostCase, FmdInventory, FmdInventoryResult, FmdResource, }; From 829bd2f00b265b71f49ebae1bb94623efb044ea8 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 29 Apr 2026 10:14:34 -0700 Subject: [PATCH 12/24] Add CRDB schema + DB model for FMD inventory tables MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds three tables for persisting per-sled FMD data: inv_fmd_status — per-sled outcome of FMD collection inv_fmd_host_case — diagnosed cases (event payload as JSONB) inv_fmd_resource — resources affected by cases Bumps SCHEMA_VERSION to 254 with directory schema/crdb/inv-fmd. Adds diesel table! entries, db-model structs, and From impls for the read path. No callers yet — write/read/display follow in subsequent commits. 
--- nexus/db-model/src/inventory.rs | 120 ++++++++++++++++++++++++++ nexus/db-model/src/schema_versions.rs | 3 +- nexus/db-schema/src/schema.rs | 32 +++++++ schema/crdb/dbinit.sql | 44 +++++++++- schema/crdb/inv-fmd/up01.sql | 10 +++ schema/crdb/inv-fmd/up02.sql | 13 +++ schema/crdb/inv-fmd/up03.sql | 16 ++++ 7 files changed, 236 insertions(+), 2 deletions(-) create mode 100644 schema/crdb/inv-fmd/up01.sql create mode 100644 schema/crdb/inv-fmd/up02.sql create mode 100644 schema/crdb/inv-fmd/up03.sql diff --git a/nexus/db-model/src/inventory.rs b/nexus/db-model/src/inventory.rs index 5479994d340..87e6d9c5974 100644 --- a/nexus/db-model/src/inventory.rs +++ b/nexus/db-model/src/inventory.rs @@ -34,6 +34,7 @@ use nexus_db_schema::schema::inv_zone_manifest_zone; use nexus_db_schema::schema::{ hw_baseboard_id, inv_caboose, inv_clickhouse_keeper_membership, inv_cockroachdb_status, inv_collection, inv_collection_error, inv_dataset, + inv_fmd_host_case, inv_fmd_resource, inv_fmd_status, inv_host_phase_1_active_slot, inv_host_phase_1_flash_hash, inv_internal_dns, inv_last_reconciliation_dataset_result, inv_last_reconciliation_disk_result, @@ -64,6 +65,8 @@ use omicron_common::update::OmicronInstallManifestSource; use omicron_common::zpool_name::ZpoolName; use omicron_uuid_kinds::DatasetKind; use omicron_uuid_kinds::DatasetUuid; +use omicron_uuid_kinds::FmdHostCaseKind; +use omicron_uuid_kinds::FmdResourceKind; use omicron_uuid_kinds::InternalZpoolKind; use omicron_uuid_kinds::MupdateKind; use omicron_uuid_kinds::MupdateOverrideKind; @@ -85,6 +88,9 @@ use omicron_uuid_kinds::{CollectionUuid, OmicronZoneUuid}; use sled_agent_types::inventory::BootImageHeader; use sled_agent_types::inventory::BootPartitionDetails; use sled_agent_types::inventory::ConfigReconcilerInventoryStatus; +use sled_agent_types::inventory::FmdHostCase; +use sled_agent_types::inventory::FmdInventoryResult; +use sled_agent_types::inventory::FmdResource; use 
sled_agent_types::inventory::HostPhase2DesiredContents; use sled_agent_types::inventory::HostPhase2DesiredSlots; use sled_agent_types::inventory::ManifestBootInventory; @@ -2128,6 +2134,120 @@ impl InvSvcEnabledNotOnlineParseError { } } +/// One row per (collection, sled) recording the outcome of FMD inventory +/// collection. `error_message` is `NULL` when the daemon was queried +/// successfully (even if it reported zero faults); set when collection +/// failed (e.g. on non-illumos sleds, or when the daemon was unreachable). +#[derive(Queryable, Clone, Debug, Selectable, Insertable)] +#[diesel(table_name = inv_fmd_status)] +pub struct InvFmdStatus { + pub inv_collection_id: DbTypedUuid, + pub sled_id: DbTypedUuid, + pub error_message: Option, +} + +impl InvFmdStatus { + pub fn new( + inv_collection_id: CollectionUuid, + sled_id: SledUuid, + result: &FmdInventoryResult, + ) -> Self { + let error_message = match result { + FmdInventoryResult::Available(_) => None, + FmdInventoryResult::Error { error } => Some(error.clone()), + }; + Self { + inv_collection_id: inv_collection_id.into(), + sled_id: sled_id.into(), + error_message, + } + } +} + +#[derive(Queryable, Clone, Debug, Selectable, Insertable)] +#[diesel(table_name = inv_fmd_host_case)] +pub struct InvFmdHostCase { + pub inv_collection_id: DbTypedUuid, + pub sled_id: DbTypedUuid, + pub case_id: DbTypedUuid, + pub code: String, + pub url: String, + pub event: Option, +} + +impl InvFmdHostCase { + pub fn new( + inv_collection_id: CollectionUuid, + sled_id: SledUuid, + case: &FmdHostCase, + ) -> Self { + Self { + inv_collection_id: inv_collection_id.into(), + sled_id: sled_id.into(), + case_id: case.uuid.into(), + code: case.code.clone(), + url: case.url.clone(), + event: case.event.clone(), + } + } +} + +impl From for FmdHostCase { + fn from(row: InvFmdHostCase) -> Self { + Self { + uuid: row.case_id.into(), + code: row.code, + url: row.url, + event: row.event, + } + } +} + +#[derive(Queryable, Clone, Debug, 
Selectable, Insertable)] +#[diesel(table_name = inv_fmd_resource)] +pub struct InvFmdResource { + pub inv_collection_id: DbTypedUuid, + pub sled_id: DbTypedUuid, + pub resource_id: DbTypedUuid, + pub fmri: String, + pub case_id: DbTypedUuid, + pub faulty: bool, + pub unusable: bool, + pub invisible: bool, +} + +impl InvFmdResource { + pub fn new( + inv_collection_id: CollectionUuid, + sled_id: SledUuid, + resource: &FmdResource, + ) -> Self { + Self { + inv_collection_id: inv_collection_id.into(), + sled_id: sled_id.into(), + resource_id: resource.uuid.into(), + fmri: resource.fmri.clone(), + case_id: resource.case_id.into(), + faulty: resource.faulty, + unusable: resource.unusable, + invisible: resource.invisible, + } + } +} + +impl From for FmdResource { + fn from(row: InvFmdResource) -> Self { + Self { + uuid: row.resource_id.into(), + fmri: row.fmri, + case_id: row.case_id.into(), + faulty: row.faulty, + unusable: row.unusable, + invisible: row.invisible, + } + } +} + // See [`sled_agent_types::inventory::SvcEnabledNotOnlineState`]. impl_enum_type!( InvSvcEnabledNotOnlineStateEnum: diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index c927e8cfc64..7d230d2ecd6 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock}; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: Version = Version::new(253, 0, 0); +pub const SCHEMA_VERSION: Version = Version::new(254, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -28,6 +28,7 @@ pub static KNOWN_VERSIONS: LazyLock> = LazyLock::new(|| { // | leaving the first copy as an example for the next person. 
// v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(254, "inv-fmd"), KnownVersion::new(253, "delete-nexus-default-allow-firewall-rule"), KnownVersion::new(252, "fm-support-bundle-and-alert-request-comments"), KnownVersion::new(251, "fm-sitrep-next-inv-min-time-started"), diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index 0a66531be52..69e48b1bb2c 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -1791,6 +1791,38 @@ table! { } } +table! { + inv_fmd_status (inv_collection_id, sled_id) { + inv_collection_id -> Uuid, + sled_id -> Uuid, + error_message -> Nullable, + } +} + +table! { + inv_fmd_host_case (inv_collection_id, sled_id, case_id) { + inv_collection_id -> Uuid, + sled_id -> Uuid, + case_id -> Uuid, + code -> Text, + url -> Text, + event -> Nullable, + } +} + +table! { + inv_fmd_resource (inv_collection_id, sled_id, resource_id) { + inv_collection_id -> Uuid, + sled_id -> Uuid, + resource_id -> Uuid, + fmri -> Text, + case_id -> Uuid, + faulty -> Bool, + unusable -> Bool, + invisible -> Bool, + } +} + table! { inv_sled_agent (inv_collection_id, sled_id) { inv_collection_id -> Uuid, diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 42bad504270..46b66f311c5 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -5131,6 +5131,48 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_svc_enabled_not_online_parse_error PRIMARY KEY (inv_collection_id, sled_id, id) ); +CREATE TABLE IF NOT EXISTS omicron.public.inv_fmd_status ( + inv_collection_id UUID NOT NULL, + sled_id UUID NOT NULL, + -- NULL when FMD data was successfully collected. Set to the error + -- string when FMD collection failed (e.g. on non-illumos sleds, or + -- when the daemon was unreachable). 
+ error_message TEXT, + + PRIMARY KEY (inv_collection_id, sled_id) +); + +CREATE TABLE IF NOT EXISTS omicron.public.inv_fmd_host_case ( + inv_collection_id UUID NOT NULL, + sled_id UUID NOT NULL, + case_id UUID NOT NULL, + code TEXT NOT NULL, + url TEXT NOT NULL, + -- The full FMD fault event payload as JSON, if present. Stored as + -- JSONB without parsing — Nexus does not interpret the FMD event + -- schema; it round-trips verbatim for downstream tooling (e.g. omdb). + event JSONB, + + PRIMARY KEY (inv_collection_id, sled_id, case_id) +); + +CREATE TABLE IF NOT EXISTS omicron.public.inv_fmd_resource ( + inv_collection_id UUID NOT NULL, + sled_id UUID NOT NULL, + resource_id UUID NOT NULL, + -- Fault Management Resource Identifier + -- (e.g. "dev:////pci@af,0/pci1022,1483@3,5"). + fmri TEXT NOT NULL, + -- The case_id pairs with a corresponding row in inv_fmd_host_case + -- under the same (inv_collection_id, sled_id) partition. + case_id UUID NOT NULL, + faulty BOOL NOT NULL, + unusable BOOL NOT NULL, + invisible BOOL NOT NULL, + + PRIMARY KEY (inv_collection_id, sled_id, resource_id) +); + /* * Various runtime configuration switches for reconfigurator * @@ -8475,7 +8517,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '253.0.0', NULL) + (TRUE, NOW(), NOW(), '254.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/schema/crdb/inv-fmd/up01.sql b/schema/crdb/inv-fmd/up01.sql new file mode 100644 index 00000000000..de8f7c2c448 --- /dev/null +++ b/schema/crdb/inv-fmd/up01.sql @@ -0,0 +1,10 @@ +CREATE TABLE IF NOT EXISTS omicron.public.inv_fmd_status ( + inv_collection_id UUID NOT NULL, + sled_id UUID NOT NULL, + -- NULL when FMD data was successfully collected. Set to the error + -- string when FMD collection failed (e.g. on non-illumos sleds, or + -- when the daemon was unreachable). 
+ error_message TEXT, + + PRIMARY KEY (inv_collection_id, sled_id) +); diff --git a/schema/crdb/inv-fmd/up02.sql b/schema/crdb/inv-fmd/up02.sql new file mode 100644 index 00000000000..7907cc8ac9f --- /dev/null +++ b/schema/crdb/inv-fmd/up02.sql @@ -0,0 +1,13 @@ +CREATE TABLE IF NOT EXISTS omicron.public.inv_fmd_host_case ( + inv_collection_id UUID NOT NULL, + sled_id UUID NOT NULL, + case_id UUID NOT NULL, + code TEXT NOT NULL, + url TEXT NOT NULL, + -- The full FMD fault event payload as JSON, if present. Stored as + -- JSONB without parsing — Nexus does not interpret the FMD event + -- schema; it round-trips verbatim for downstream tooling (e.g. omdb). + event JSONB, + + PRIMARY KEY (inv_collection_id, sled_id, case_id) +); diff --git a/schema/crdb/inv-fmd/up03.sql b/schema/crdb/inv-fmd/up03.sql new file mode 100644 index 00000000000..9bf3e4a7930 --- /dev/null +++ b/schema/crdb/inv-fmd/up03.sql @@ -0,0 +1,16 @@ +CREATE TABLE IF NOT EXISTS omicron.public.inv_fmd_resource ( + inv_collection_id UUID NOT NULL, + sled_id UUID NOT NULL, + resource_id UUID NOT NULL, + -- Fault Management Resource Identifier + -- (e.g. "dev:////pci@af,0/pci1022,1483@3,5"). + fmri TEXT NOT NULL, + -- The case_id pairs with a corresponding row in inv_fmd_host_case + -- under the same (inv_collection_id, sled_id) partition. + case_id UUID NOT NULL, + faulty BOOL NOT NULL, + unusable BOOL NOT NULL, + invisible BOOL NOT NULL, + + PRIMARY KEY (inv_collection_id, sled_id, resource_id) +); From 00e634707ef002b1d2f9970c5e622dc738377d02 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 29 Apr 2026 10:20:28 -0700 Subject: [PATCH 13/24] nexus-types/inventory: add fmd field on SledAgent + builder passthrough Wires the fmd field added to sled-agent's Inventory by the parent PR through into Nexus's in-memory inventory representation. The collector builder copies inventory.fmd verbatim. 
The DB read path will populate it from the inv_fmd_* tables in a follow-on commit; for now, the read path substitutes Available(empty) so existing tests round-trip cleanly. --- nexus/db-queries/src/db/datastore/inventory.rs | 4 ++++ nexus/inventory/src/builder.rs | 1 + nexus/types/src/inventory.rs | 2 ++ nexus/types/src/inventory/display.rs | 1 + 4 files changed, 8 insertions(+) diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index cd47ca48dcf..f37d666c70a 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -4637,6 +4637,10 @@ impl DataStore { reference_measurements: last_reconciliation_measurements .remove(&sled_id) .unwrap_or_default(), + // Populated by the read path in a follow-on commit. + fmd: sled_agent_types::inventory::FmdInventoryResult::Available( + sled_agent_types::inventory::FmdInventory::default(), + ), }; sled_agents .insert_unique(sled_agent) diff --git a/nexus/inventory/src/builder.rs b/nexus/inventory/src/builder.rs index 08ff5ea2538..fb9e4890a8b 100644 --- a/nexus/inventory/src/builder.rs +++ b/nexus/inventory/src/builder.rs @@ -678,6 +678,7 @@ impl CollectionBuilder { smf_services_enabled_not_online: inventory .smf_services_enabled_not_online, reference_measurements: inventory.reference_measurements, + fmd: inventory.fmd, }; self.sleds diff --git a/nexus/types/src/inventory.rs b/nexus/types/src/inventory.rs index 887fb615599..26218753b32 100644 --- a/nexus/types/src/inventory.rs +++ b/nexus/types/src/inventory.rs @@ -35,6 +35,7 @@ use serde_with::serde_as; use sled_agent_types_versions::latest::inventory::ConfigReconcilerInventory; use sled_agent_types_versions::latest::inventory::ConfigReconcilerInventoryResult; use sled_agent_types_versions::latest::inventory::ConfigReconcilerInventoryStatus; +use sled_agent_types_versions::latest::inventory::FmdInventoryResult; use 
sled_agent_types_versions::latest::inventory::InventoryDataset; use sled_agent_types_versions::latest::inventory::InventoryDisk; use sled_agent_types_versions::latest::inventory::InventoryZpool; @@ -649,6 +650,7 @@ pub struct SledAgent { pub file_source_resolver: OmicronFileSourceResolverInventory, pub smf_services_enabled_not_online: SvcsEnabledNotOnlineResult, pub reference_measurements: IdOrdMap, + pub fmd: FmdInventoryResult, } impl IdOrdItem for SledAgent { diff --git a/nexus/types/src/inventory/display.rs b/nexus/types/src/inventory/display.rs index 0964ba23d92..a410b22c04a 100644 --- a/nexus/types/src/inventory/display.rs +++ b/nexus/types/src/inventory/display.rs @@ -625,6 +625,7 @@ fn display_sleds( file_source_resolver, smf_services_enabled_not_online, reference_measurements, + fmd: _, } = sled; writeln!( From cfb3eea40d7eafc5e97a8342e82b2a902d489dc5 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 29 Apr 2026 10:24:19 -0700 Subject: [PATCH 14/24] datastore: write + prune for inv_fmd_* tables Insert one InvFmdStatus row per sled in each inventory collection, plus a row per case and resource when collection succeeded. Wire the three tables into the existing prune transaction so old collections clean up after themselves. 
--- .../db-queries/src/db/datastore/inventory.rs | 148 ++++++++++++++++++ 1 file changed, 148 insertions(+) diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index f37d666c70a..553b7c4c43f 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -39,6 +39,9 @@ use nexus_db_model::InvCollectionError; use nexus_db_model::InvConfigReconcilerStatus; use nexus_db_model::InvConfigReconcilerStatusKind; use nexus_db_model::InvDataset; +use nexus_db_model::InvFmdHostCase; +use nexus_db_model::InvFmdResource; +use nexus_db_model::InvFmdStatus; use nexus_db_model::InvHostPhase1ActiveSlot; use nexus_db_model::InvHostPhase1FlashHash; use nexus_db_model::InvInternalDns; @@ -456,6 +459,59 @@ impl DataStore { }) .collect(); + // Pull FMD inventory out of all sled agents. We always record one + // status row per sled (capturing the success/failure discriminant) + // and, when collection succeeded, a row per case and per resource. + let fmd_status_rows: Vec<_> = collection + .sled_agents + .iter() + .map(|sled_agent| { + InvFmdStatus::new( + collection_id, + sled_agent.sled_id, + &sled_agent.fmd, + ) + }) + .collect(); + let fmd_host_case_rows: Vec<_> = collection + .sled_agents + .iter() + .flat_map(|sled_agent| { + let cases = match &sled_agent.fmd { + sled_agent_types::inventory::FmdInventoryResult::Available( + inv, + ) => Some(&inv.cases), + sled_agent_types::inventory::FmdInventoryResult::Error { + .. + } => None, + }; + cases.into_iter().flatten().map(|case| { + InvFmdHostCase::new(collection_id, sled_agent.sled_id, case) + }) + }) + .collect(); + let fmd_resource_rows: Vec<_> = collection + .sled_agents + .iter() + .flat_map(|sled_agent| { + let resources = match &sled_agent.fmd { + sled_agent_types::inventory::FmdInventoryResult::Available( + inv, + ) => Some(&inv.resources), + sled_agent_types::inventory::FmdInventoryResult::Error { + .. 
+ } => None, + }; + resources.into_iter().flatten().map(|resource| { + InvFmdResource::new( + collection_id, + sled_agent.sled_id, + resource, + ) + }) + }) + .collect(); + // Build up a list of `OmicronSledConfig`s we need to insert. Each sled // has 0-3: // @@ -1430,7 +1486,62 @@ impl DataStore { } } + // Insert FMD status rows (one per sled). + { + use nexus_db_schema::schema::inv_fmd_status::dsl; + + let batch_size = SQL_BATCH_SIZE.get().try_into().unwrap(); + let mut rows = fmd_status_rows.into_iter(); + loop { + let some_rows = + rows.by_ref().take(batch_size).collect::>(); + if some_rows.is_empty() { + break; + } + let _ = diesel::insert_into(dsl::inv_fmd_status) + .values(some_rows) + .execute_async(&conn) + .await?; + } + } + // Insert FMD host case rows (zero or more per sled). + { + use nexus_db_schema::schema::inv_fmd_host_case::dsl; + + let batch_size = SQL_BATCH_SIZE.get().try_into().unwrap(); + let mut rows = fmd_host_case_rows.into_iter(); + loop { + let some_rows = + rows.by_ref().take(batch_size).collect::>(); + if some_rows.is_empty() { + break; + } + let _ = diesel::insert_into(dsl::inv_fmd_host_case) + .values(some_rows) + .execute_async(&conn) + .await?; + } + } + + // Insert FMD resource rows (zero or more per sled). 
+ { + use nexus_db_schema::schema::inv_fmd_resource::dsl; + + let batch_size = SQL_BATCH_SIZE.get().try_into().unwrap(); + let mut rows = fmd_resource_rows.into_iter(); + loop { + let some_rows = + rows.by_ref().take(batch_size).collect::>(); + if some_rows.is_empty() { + break; + } + let _ = diesel::insert_into(dsl::inv_fmd_resource) + .values(some_rows) + .execute_async(&conn) + .await?; + } + } // Insert rows for all the sled config reconciler disk results { @@ -2164,6 +2275,9 @@ impl DataStore { nlast_reconciliation_orphaned_datasets: usize, nlast_reconciliation_zone_results: usize, nlast_reconciliation_measurements: usize, + nfmd_status: usize, + nfmd_host_cases: usize, + nfmd_resources: usize, nzone_manifest_zones: usize, nzone_manifest_measurements: usize, nzone_manifest_non_boot: usize, @@ -2204,6 +2318,9 @@ impl DataStore { nlast_reconciliation_orphaned_datasets, nlast_reconciliation_zone_results, nlast_reconciliation_measurements, + nfmd_status, + nfmd_host_cases, + nfmd_resources, nzone_manifest_zones, nzone_manifest_measurements, nzone_manifest_non_boot, @@ -2382,6 +2499,31 @@ impl DataStore { .await? }; + // Remove FMD inventory rows. + let nfmd_status = { + use nexus_db_schema::schema::inv_fmd_status::dsl; + diesel::delete(dsl::inv_fmd_status.filter( + dsl::inv_collection_id.eq(db_collection_id), + )) + .execute_async(&conn) + .await? + }; + let nfmd_host_cases = { + use nexus_db_schema::schema::inv_fmd_host_case::dsl; + diesel::delete(dsl::inv_fmd_host_case.filter( + dsl::inv_collection_id.eq(db_collection_id), + )) + .execute_async(&conn) + .await? + }; + let nfmd_resources = { + use nexus_db_schema::schema::inv_fmd_resource::dsl; + diesel::delete(dsl::inv_fmd_resource.filter( + dsl::inv_collection_id.eq(db_collection_id), + )) + .execute_async(&conn) + .await? + }; // Remove rows associated with zone resolver inventory. 
let nzone_manifest_zones = { @@ -2596,6 +2738,9 @@ impl DataStore { nlast_reconciliation_orphaned_datasets, nlast_reconciliation_zone_results, nlast_reconciliation_measurements, + nfmd_status, + nfmd_host_cases, + nfmd_resources, nzone_manifest_zones, nzone_manifest_measurements, nzone_manifest_non_boot, @@ -2647,6 +2792,9 @@ impl DataStore { nlast_reconciliation_zone_results, "nlast_reconciliation_measurements" => nlast_reconciliation_measurements, + "nfmd_status" => nfmd_status, + "nfmd_host_cases" => nfmd_host_cases, + "nfmd_resources" => nfmd_resources, "nzone_manifest_zones" => nzone_manifest_zones, "nzone_manifest_measurements" => nzone_manifest_measurements, "nzone_manifest_non_boot" => nzone_manifest_non_boot, From 8e47394589c68c01ab2d8c50012d3adce7422a9c Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 29 Apr 2026 10:26:58 -0700 Subject: [PATCH 15/24] datastore: read path for FMD inventory Loads inv_fmd_status, inv_fmd_host_case, and inv_fmd_resource for the collection and reconstructs SledAgent.fmd. Status row's NULL error_message indicates Available; non-NULL becomes Error{error}. A missing status row falls back to Available with whatever cases/resources were found (defensive, in case of historical data predating this PR). --- .../db-queries/src/db/datastore/inventory.rs | 112 +++++++++++++++++- 1 file changed, 108 insertions(+), 4 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index 553b7c4c43f..e4ab70dcb4f 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -4199,6 +4199,89 @@ impl DataStore { measurements }; + // Load all FMD inventory rows. We expect at most ~tens of cases or + // resources per sled, so we don't bother paginating. 
+ let mut fmd_status_by_sled: BTreeMap> = { + use nexus_db_schema::schema::inv_fmd_status::dsl; + let rows = dsl::inv_fmd_status + .filter(dsl::inv_collection_id.eq(db_id)) + .select(InvFmdStatus::as_select()) + .load_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + })?; + rows.into_iter() + .map(|row| (row.sled_id.into(), row.error_message)) + .collect() + }; + + let mut fmd_cases_by_sled: BTreeMap< + SledUuid, + IdOrdMap, + > = { + use nexus_db_schema::schema::inv_fmd_host_case::dsl; + let rows = dsl::inv_fmd_host_case + .filter(dsl::inv_collection_id.eq(db_id)) + .select(InvFmdHostCase::as_select()) + .load_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + })?; + let mut by_sled: BTreeMap< + SledUuid, + IdOrdMap, + > = BTreeMap::new(); + for row in rows { + let sled_id: SledUuid = row.sled_id.into(); + by_sled + .entry(sled_id) + .or_default() + .insert_unique(row.into()) + .map_err(|err| { + Error::internal_error(&format!( + "unexpected duplicate FMD case: {}", + InlineErrorChain::new(&err) + )) + })?; + } + by_sled + }; + + let mut fmd_resources_by_sled: BTreeMap< + SledUuid, + IdOrdMap, + > = { + use nexus_db_schema::schema::inv_fmd_resource::dsl; + let rows = dsl::inv_fmd_resource + .filter(dsl::inv_collection_id.eq(db_id)) + .select(InvFmdResource::as_select()) + .load_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + })?; + let mut by_sled: BTreeMap< + SledUuid, + IdOrdMap, + > = BTreeMap::new(); + for row in rows { + let sled_id: SledUuid = row.sled_id.into(); + by_sled + .entry(sled_id) + .or_default() + .insert_unique(row.into()) + .map_err(|err| { + Error::internal_error(&format!( + "unexpected duplicate FMD resource: {}", + InlineErrorChain::new(&err) + )) + })?; + } + by_sled + }; + // Load all the config reconciler zone results; build a map of maps // keyed by sled ID. 
let mut last_reconciliation_zone_results = { @@ -4785,10 +4868,31 @@ impl DataStore { reference_measurements: last_reconciliation_measurements .remove(&sled_id) .unwrap_or_default(), - // Populated by the read path in a follow-on commit. - fmd: sled_agent_types::inventory::FmdInventoryResult::Available( - sled_agent_types::inventory::FmdInventory::default(), - ), + fmd: { + use sled_agent_types::inventory::{ + FmdInventory, FmdInventoryResult, + }; + let cases = + fmd_cases_by_sled.remove(&sled_id).unwrap_or_default(); + let resources = fmd_resources_by_sled + .remove(&sled_id) + .unwrap_or_default(); + // The status row's error_message column distinguishes + // Available (NULL) from Error (the message). If no row + // exists at all (i.e. an older collection predates this + // migration), fall back to Available with whatever + // case/resource rows we found, which will normally be + // empty. + match fmd_status_by_sled.remove(&sled_id) { + Some(Some(error)) => { + FmdInventoryResult::Error { error } + } + _ => FmdInventoryResult::Available(FmdInventory { + cases, + resources, + }), + } + }, }; sled_agents .insert_unique(sled_agent) From 39053f794ef695c73d5056c1ea5aee97766d1229 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 29 Apr 2026 10:35:38 -0700 Subject: [PATCH 16/24] Display wrappers + omdb golden output for FMD inventory Adds Display wrappers for FmdInventoryResult/FmdInventory/FmdHostCase/ FmdResource on the sled-agent types. Wires them into nexus/types/src/inventory/display.rs::display_sleds so that `omdb db inventory collections show` (and reconfigurator-cli scripts that print sled inventories) include the FMD section. The FmdHostCase event payload is the FMD nvlist serialized to JSON; we intentionally don't interpret the schema, so it's pretty-printed verbatim under the case heading. 
Also seeds the representative test inventory (nexus/inventory examples) with a single fault case + resource so the inv_fmd_* tables get rows under test_representative_collection_populates_database. The reconfigurator-cli golden outputs grow a 'fmd:' section accordingly. --- .../tests/output/cmds-example-stdout | 6 + .../output/cmds-mupdate-update-flow-stdout | 6 + .../cmds-nexus-generation-autobump-stdout | 6 + .../tests/output/cmds-target-release-stdout | 6 + .../tests/output/cmds-unsafe-zone-mgs-stdout | 6 + nexus/inventory/src/examples.rs | 28 +++- nexus/types/src/inventory/display.rs | 6 +- .../types/versions/src/impls/inventory.rs | 121 +++++++++++++++++- sled-agent/types/versions/src/latest.rs | 4 + 9 files changed, 184 insertions(+), 5 deletions(-) diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout index f8cf0bd29be..c4cb0672eba 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout @@ -1739,6 +1739,8 @@ LEDGERED SLED CONFIG reconciler task status: idle (finished at after running for s) reference measurements: (measurement set is empty) + fmd: + no faults reported SMF SERVICES STATUS no data on SMF services has been collected @@ -1890,6 +1892,8 @@ LEDGERED SLED CONFIG reconciler task status: idle (finished at after running for s) reference measurements: (measurement set is empty) + fmd: + no faults reported SMF SERVICES STATUS no data on SMF services has been collected @@ -2134,6 +2138,8 @@ LEDGERED SLED CONFIG reconciler task status: idle (finished at after running for s) reference measurements: (measurement set is empty) + fmd: + no faults reported SMF SERVICES STATUS no data on SMF services has been collected diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout index 
3f7a146fd52..b20a3575a16 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout @@ -326,6 +326,8 @@ LEDGERED SLED CONFIG reconciler task status: idle (finished at after running for s) reference measurements: (measurement set is empty) + fmd: + no faults reported SMF SERVICES STATUS no data on SMF services has been collected @@ -461,6 +463,8 @@ LEDGERED SLED CONFIG reconciler task status: idle (finished at after running for s) reference measurements: (measurement set is empty) + fmd: + no faults reported SMF SERVICES STATUS no data on SMF services has been collected @@ -584,6 +588,8 @@ LEDGERED SLED CONFIG reconciler task status: idle (finished at after running for s) reference measurements: (measurement set is empty) + fmd: + no faults reported SMF SERVICES STATUS no data on SMF services has been collected diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-nexus-generation-autobump-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-nexus-generation-autobump-stdout index 11ed4e50970..67cc9d20531 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-nexus-generation-autobump-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-nexus-generation-autobump-stdout @@ -762,6 +762,8 @@ LEDGERED SLED CONFIG reconciler task status: idle (finished at after running for s) reference measurements: (measurement set is empty) + fmd: + no faults reported SMF SERVICES STATUS no data on SMF services has been collected @@ -955,6 +957,8 @@ LEDGERED SLED CONFIG reconciler task status: idle (finished at after running for s) reference measurements: (measurement set is empty) + fmd: + no faults reported SMF SERVICES STATUS no data on SMF services has been collected @@ -1148,6 +1152,8 @@ LEDGERED SLED CONFIG reconciler task status: idle (finished at after running for s) reference measurements: (measurement set is empty) + fmd: + no faults reported SMF 
SERVICES STATUS no data on SMF services has been collected diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-target-release-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-target-release-stdout index ea202ca9b46..6ef7ef4621a 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-target-release-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-target-release-stdout @@ -746,6 +746,8 @@ LEDGERED SLED CONFIG reconciler task status: idle (finished at after running for s) reference measurements: (measurement set is empty) + fmd: + no faults reported SMF SERVICES STATUS no data on SMF services has been collected @@ -939,6 +941,8 @@ LEDGERED SLED CONFIG reconciler task status: idle (finished at after running for s) reference measurements: (measurement set is empty) + fmd: + no faults reported SMF SERVICES STATUS no data on SMF services has been collected @@ -1132,6 +1136,8 @@ LEDGERED SLED CONFIG reconciler task status: idle (finished at after running for s) reference measurements: (measurement set is empty) + fmd: + no faults reported SMF SERVICES STATUS no data on SMF services has been collected diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-unsafe-zone-mgs-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-unsafe-zone-mgs-stdout index e759bbbe11e..0e42aac9d93 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-unsafe-zone-mgs-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-unsafe-zone-mgs-stdout @@ -800,6 +800,8 @@ LEDGERED SLED CONFIG reconciler task status: idle (finished at after running for s) reference measurements: (measurement set is empty) + fmd: + no faults reported SMF SERVICES STATUS no data on SMF services has been collected @@ -993,6 +995,8 @@ LEDGERED SLED CONFIG reconciler task status: idle (finished at after running for s) reference measurements: (measurement set is empty) + fmd: + no faults reported SMF SERVICES STATUS no data on SMF services has been collected @@ -1186,6 +1190,8 @@ 
LEDGERED SLED CONFIG reconciler task status: idle (finished at after running for s) reference measurements: (measurement set is empty) + fmd: + no faults reported SMF SERVICES STATUS no data on SMF services has been collected diff --git a/nexus/inventory/src/examples.rs b/nexus/inventory/src/examples.rs index 06cbb4950d1..40bf3842e27 100644 --- a/nexus/inventory/src/examples.rs +++ b/nexus/inventory/src/examples.rs @@ -1099,6 +1099,32 @@ pub fn sled_agent( result: ConfigReconcilerInventoryResult::Ok, }); + // Synthesize a representative FMD payload: a single faulted resource + // diagnosed by a single case. This keeps the per-table-population test + // happy and gives downstream golden-output tests something to render. + let case_id = omicron_uuid_kinds::FmdHostCaseUuid::new_v4(); + let resource_id = omicron_uuid_kinds::FmdResourceUuid::new_v4(); + let mut fmd_cases = iddqd::IdOrdMap::new(); + fmd_cases.insert_overwrite(sled_agent_types::inventory::FmdHostCase { + uuid: case_id, + code: "PCIEX-8000-DJ".to_string(), + url: "http://illumos.org/msg/PCIEX-8000-DJ".to_string(), + event: Some(serde_json::json!({"class": "fault.io.pci.bus"})), + }); + let mut fmd_resources = iddqd::IdOrdMap::new(); + fmd_resources.insert_overwrite(sled_agent_types::inventory::FmdResource { + uuid: resource_id, + fmri: "dev:////pci@af,0/pci1022,1483@3,5".to_string(), + case_id, + faulty: true, + unusable: false, + invisible: false, + }); + let fmd = FmdInventoryResult::Available(FmdInventory { + cases: fmd_cases, + resources: fmd_resources, + }); + Inventory { baseboard, reservoir_size: ByteCount::from(1024), @@ -1117,6 +1143,6 @@ pub fn sled_agent( file_source_resolver, smf_services_enabled_not_online, reference_measurements, - fmd: FmdInventoryResult::Available(FmdInventory::default()), + fmd, } } diff --git a/nexus/types/src/inventory/display.rs b/nexus/types/src/inventory/display.rs index a410b22c04a..f7cd65e585e 100644 --- a/nexus/types/src/inventory/display.rs +++ 
b/nexus/types/src/inventory/display.rs @@ -625,7 +625,7 @@ fn display_sleds( file_source_resolver, smf_services_enabled_not_online, reference_measurements, - fmd: _, + fmd, } = sled; writeln!( @@ -915,6 +915,10 @@ fn display_sleds( } } + writeln!(indented, "fmd:")?; + let mut indent2 = IndentWriter::new(" ", &mut indented); + write!(indent2, "{}", fmd.display())?; + f = indented.into_inner(); display_svcs_enabled_not_online(smf_services_enabled_not_online, f)?; } diff --git a/sled-agent/types/versions/src/impls/inventory.rs b/sled-agent/types/versions/src/impls/inventory.rs index e6af2c3cd85..5aedde4ed9b 100644 --- a/sled-agent/types/versions/src/impls/inventory.rs +++ b/sled-agent/types/versions/src/impls/inventory.rs @@ -21,9 +21,10 @@ use tufaceous_artifact::{ArtifactHash, KnownArtifactKind}; use crate::latest::inventory::{ BootImageHeader, BootPartitionContents, BootPartitionDetails, - ConfigReconcilerInventory, ConfigReconcilerInventoryResult, - HostPhase2DesiredContents, HostPhase2DesiredSlots, ManifestBootInventory, - ManifestInventory, ManifestNonBootInventory, MupdateOverrideBootInventory, + ConfigReconcilerInventory, ConfigReconcilerInventoryResult, FmdHostCase, + FmdInventory, FmdInventoryResult, FmdResource, HostPhase2DesiredContents, + HostPhase2DesiredSlots, ManifestBootInventory, ManifestInventory, + ManifestNonBootInventory, MupdateOverrideBootInventory, MupdateOverrideInventory, MupdateOverrideNonBootInventory, NetworkInterface, OmicronFileSourceResolverInventory, OmicronSledConfig, OmicronZoneConfig, OmicronZoneImageSource, OmicronZoneType, @@ -910,6 +911,120 @@ impl fmt::Display for SingleMeasurementInventoryDisplay<'_> { } } +impl FmdInventoryResult { + pub fn display(&self) -> FmdInventoryResultDisplay<'_> { + FmdInventoryResultDisplay { inner: self } + } +} + +/// a displayer for [`FmdInventoryResult`] +pub struct FmdInventoryResultDisplay<'a> { + inner: &'a FmdInventoryResult, +} + +impl fmt::Display for FmdInventoryResultDisplay<'_> { + fn 
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self.inner { + FmdInventoryResult::Available(inv) => { + write!(f, "{}", inv.display()) + } + FmdInventoryResult::Error { error } => { + writeln!(f, "FMD collection failed: {error}") + } + } + } +} + +impl FmdInventory { + pub fn display(&self) -> FmdInventoryDisplay<'_> { + FmdInventoryDisplay { inner: self } + } +} + +/// a displayer for [`FmdInventory`] +pub struct FmdInventoryDisplay<'a> { + inner: &'a FmdInventory, +} + +impl fmt::Display for FmdInventoryDisplay<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let FmdInventory { cases, resources } = self.inner; + if cases.is_empty() && resources.is_empty() { + writeln!(f, "no faults reported")?; + return Ok(()); + } + writeln!(f, "cases ({}):", cases.len())?; + for case in cases { + let mut indent = IndentWriter::new(" ", &mut *f); + write!(indent, "{}", case.display())?; + } + writeln!(f, "resources ({}):", resources.len())?; + for resource in resources { + let mut indent = IndentWriter::new(" ", &mut *f); + write!(indent, "{}", resource.display())?; + } + Ok(()) + } +} + +impl FmdHostCase { + pub fn display(&self) -> FmdHostCaseDisplay<'_> { + FmdHostCaseDisplay { inner: self } + } +} + +/// a displayer for [`FmdHostCase`] +pub struct FmdHostCaseDisplay<'a> { + inner: &'a FmdHostCase, +} + +impl fmt::Display for FmdHostCaseDisplay<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let FmdHostCase { uuid, code, url, event } = self.inner; + writeln!(f, "case {uuid} ({code})")?; + writeln!(f, " url: {url}")?; + // The event payload is the FMD nvlist serialized to JSON. We + // intentionally do not interpret it; round-trip pretty-printing + // is enough to make it human-readable. 
+ if let Some(event) = event { + match serde_json::to_string_pretty(event) { + Ok(rendered) => { + writeln!(f, " event:")?; + let mut indent = IndentWriter::new(" ", &mut *f); + writeln!(indent, "{rendered}")?; + } + Err(_) => writeln!(f, " event: ")?, + } + } + Ok(()) + } +} + +impl FmdResource { + pub fn display(&self) -> FmdResourceDisplay<'_> { + FmdResourceDisplay { inner: self } + } +} + +/// a displayer for [`FmdResource`] +pub struct FmdResourceDisplay<'a> { + inner: &'a FmdResource, +} + +impl fmt::Display for FmdResourceDisplay<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let FmdResource { uuid, fmri, case_id, faulty, unusable, invisible } = + self.inner; + writeln!(f, "resource {uuid} (case {case_id})")?; + writeln!(f, " fmri: {fmri}")?; + writeln!( + f, + " faulty: {faulty}, unusable: {unusable}, invisible: {invisible}" + )?; + Ok(()) + } +} + #[derive(Debug, thiserror::Error, PartialEq, Eq)] #[error("unrecognized zpool health value `{0}`")] pub struct ZpoolHealthParseError(pub String); diff --git a/sled-agent/types/versions/src/latest.rs b/sled-agent/types/versions/src/latest.rs index 1b8c8277fa5..d6bbd7e0763 100644 --- a/sled-agent/types/versions/src/latest.rs +++ b/sled-agent/types/versions/src/latest.rs @@ -187,6 +187,10 @@ pub mod inventory { pub use crate::v37::inventory::FmdResource; pub use crate::v37::inventory::Inventory; + pub use crate::impls::inventory::FmdHostCaseDisplay; + pub use crate::impls::inventory::FmdInventoryDisplay; + pub use crate::impls::inventory::FmdInventoryResultDisplay; + pub use crate::impls::inventory::FmdResourceDisplay; pub use crate::impls::inventory::ManifestBootInventoryDisplay; pub use crate::impls::inventory::ManifestInventoryDisplay; pub use crate::impls::inventory::ManifestNonBootInventoryDisplay; From d44c5888aa458466b9d69b7e5d8b08adb7f1595d Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 14 May 2026 14:10:54 -0700 Subject: [PATCH 17/24] omicron-rpaths: panic on illumos if no 
DEP_*_LIBDIRS env vars are set If a build.rs calls configure_default_omicron_rpaths() but contributes no RPATH entries on illumos, the caller almost certainly forgot a direct `*-sys` dep in Cargo.toml. Catching this at build time avoids silently producing a binary with NEEDED libfmd_adm.so.1 (or libpq.so.5) but no RPATH, which would only surface as an ld.so.1 failure at process startup. The check is illumos-only because fmd-adm-sys is illumos-gated; on Linux a caller can legitimately contribute nothing. --- rpaths/src/lib.rs | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/rpaths/src/lib.rs b/rpaths/src/lib.rs index 94719d6c7d1..3f95f67e368 100644 --- a/rpaths/src/lib.rs +++ b/rpaths/src/lib.rs @@ -143,6 +143,21 @@ mod internal { configure_rpaths_from_env_var(&mut rpaths, &env_var_name); } + // If none of the expected env vars were set, the caller opted into + // this plumbing but has no direct `*-sys` dep that would contribute. + // Only enforced on illumos: on Linux some `*-sys` deps (e.g. + // fmd-adm-sys) are target-gated to illumos, so a caller can + // legitimately contribute nothing on Linux. + #[cfg(target_os = "illumos")] + assert!( + !rpaths.is_empty(), + "omicron-rpaths: configure_default_omicron_rpaths() was called \ + but none of {:?} were set. Add a direct dep on the \ + corresponding *-sys crate(s) to your Cargo.toml, or remove \ + the call from build.rs.", + RPATH_ENV_VARS, + ); + for r in rpaths { println!("{}", emit_rpath(&r)); } From 604a83138ae8091a7d56c0d2244ff0576a86d9dc Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 14 May 2026 14:53:34 -0700 Subject: [PATCH 18/24] Replace FmdInventoryResult with Result hawkw asked why we were reinventing Result. The answer for the wire format is openapi-lint requires snake_case property names, and Result's default serde representation produces 'Ok'/'Err'. 
Use omicron_common::snake_case_result (already used for other Result fields in the inventory) to keep the Rust API as a normal Result while presenting snake_case on the wire. --- clients/sled-agent-client/src/lib.rs | 1 - .../src/db/datastore/physical_disk.rs | 9 +- nexus/inventory/src/examples.rs | 3 +- .../src/test_util/host_phase_2_test_state.rs | 3 +- .../planning/src/mgs_updates/test_helpers.rs | 5 +- nexus/reconfigurator/planning/src/system.rs | 5 +- ...100.json => sled-agent-40.0.0-ca4068.json} | 89 ++++++++----------- openapi/sled-agent/sled-agent-latest.json | 2 +- sled-agent/rack-setup/src/plan/service.rs | 3 +- sled-agent/rack-setup/src/service.rs | 8 +- sled-agent/src/fmd.rs | 30 +++---- sled-agent/src/sim/sled_agent.rs | 10 +-- .../src/add_fmd_to_inventory/inventory.rs | 18 ++-- sled-agent/types/versions/src/latest.rs | 1 - 14 files changed, 75 insertions(+), 112 deletions(-) rename openapi/sled-agent/{sled-agent-40.0.0-2cd100.json => sled-agent-40.0.0-ca4068.json} (99%) diff --git a/clients/sled-agent-client/src/lib.rs b/clients/sled-agent-client/src/lib.rs index 6578f1b2dcd..5a4939193e8 100644 --- a/clients/sled-agent-client/src/lib.rs +++ b/clients/sled-agent-client/src/lib.rs @@ -62,7 +62,6 @@ progenitor::generate_api!( ExternalIpv6Config = sled_agent_types_versions::latest::instance::ExternalIpv6Config, FmdHostCase = sled_agent_types_versions::latest::inventory::FmdHostCase, FmdInventory = sled_agent_types_versions::latest::inventory::FmdInventory, - FmdInventoryResult = sled_agent_types_versions::latest::inventory::FmdInventoryResult, FmdResource = sled_agent_types_versions::latest::inventory::FmdResource, Generation = omicron_common::api::external::Generation, Hostname = omicron_common::api::external::Hostname, diff --git a/nexus/db-queries/src/db/datastore/physical_disk.rs b/nexus/db-queries/src/db/datastore/physical_disk.rs index efce69ec435..d314ed5454e 100644 --- a/nexus/db-queries/src/db/datastore/physical_disk.rs +++ 
b/nexus/db-queries/src/db/datastore/physical_disk.rs @@ -773,10 +773,9 @@ mod test { use omicron_test_utils::dev; use omicron_uuid_kinds::ZpoolUuid; use sled_agent_types::inventory::{ - Baseboard, ConfigReconcilerInventoryStatus, FmdInventory, - FmdInventoryResult, Inventory, InventoryDisk, - OmicronFileSourceResolverInventory, SledCpuFamily, SledRole, - SvcsEnabledNotOnlineResult, + Baseboard, ConfigReconcilerInventoryStatus, FmdInventory, Inventory, + InventoryDisk, OmicronFileSourceResolverInventory, SledCpuFamily, + SledRole, SvcsEnabledNotOnlineResult, }; use std::num::NonZeroU32; @@ -1138,7 +1137,7 @@ mod test { smf_services_enabled_not_online: SvcsEnabledNotOnlineResult::DataUnavailable, reference_measurements: IdOrdMap::new(), - fmd: FmdInventoryResult::Available(FmdInventory::default()), + fmd: Ok(FmdInventory::default()), }, ) .unwrap(); diff --git a/nexus/inventory/src/examples.rs b/nexus/inventory/src/examples.rs index 06cbb4950d1..90c502b677b 100644 --- a/nexus/inventory/src/examples.rs +++ b/nexus/inventory/src/examples.rs @@ -52,7 +52,6 @@ use sled_agent_types::inventory::ConfigReconcilerInventory; use sled_agent_types::inventory::ConfigReconcilerInventoryResult; use sled_agent_types::inventory::ConfigReconcilerInventoryStatus; use sled_agent_types::inventory::FmdInventory; -use sled_agent_types::inventory::FmdInventoryResult; use sled_agent_types::inventory::HostPhase2DesiredSlots; use sled_agent_types::inventory::Inventory; use sled_agent_types::inventory::InventoryDataset; @@ -1117,6 +1116,6 @@ pub fn sled_agent( file_source_resolver, smf_services_enabled_not_online, reference_measurements, - fmd: FmdInventoryResult::Available(FmdInventory::default()), + fmd: Ok(FmdInventory::default()), } } diff --git a/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs b/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs index 873bc8b49fb..0cede8c217c 100644 --- a/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs +++ 
b/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs @@ -240,7 +240,6 @@ mod api_impl { use sled_agent_types::inventory::ConfigReconcilerInventory; use sled_agent_types::inventory::ConfigReconcilerInventoryStatus; use sled_agent_types::inventory::FmdInventory; - use sled_agent_types::inventory::FmdInventoryResult; use sled_agent_types::inventory::HostPhase2DesiredContents; use sled_agent_types::inventory::HostPhase2DesiredSlots; use sled_agent_types::inventory::Inventory; @@ -379,7 +378,7 @@ mod api_impl { remove_mupdate_override: None, boot_partitions, }), - fmd: FmdInventoryResult::Available(FmdInventory::default()), + fmd: Ok(FmdInventory::default()), file_source_resolver: OmicronFileSourceResolverInventory { zone_manifest: ManifestInventory { boot_disk_path: Utf8PathBuf::new(), diff --git a/nexus/reconfigurator/planning/src/mgs_updates/test_helpers.rs b/nexus/reconfigurator/planning/src/mgs_updates/test_helpers.rs index d438be6795c..20f450ec294 100644 --- a/nexus/reconfigurator/planning/src/mgs_updates/test_helpers.rs +++ b/nexus/reconfigurator/planning/src/mgs_updates/test_helpers.rs @@ -40,7 +40,6 @@ use sled_agent_types::inventory::BootPartitionDetails; use sled_agent_types::inventory::ConfigReconcilerInventory; use sled_agent_types::inventory::ConfigReconcilerInventoryStatus; use sled_agent_types::inventory::FmdInventory; -use sled_agent_types::inventory::FmdInventoryResult; use sled_agent_types::inventory::HostPhase2DesiredSlots; use sled_agent_types::inventory::Inventory; use sled_agent_types::inventory::OmicronFileSourceResolverInventory; @@ -1376,9 +1375,7 @@ impl<'a> TestBoardCollectionBuilder<'a> { smf_services_enabled_not_online: SvcsEnabledNotOnlineResult::DataUnavailable, reference_measurements: IdOrdMap::new(), - fmd: FmdInventoryResult::Available( - FmdInventory::default(), - ), + fmd: Ok(FmdInventory::default()), }, ) .unwrap(); diff --git a/nexus/reconfigurator/planning/src/system.rs b/nexus/reconfigurator/planning/src/system.rs index 
31e99714198..b13104d8ce9 100644 --- a/nexus/reconfigurator/planning/src/system.rs +++ b/nexus/reconfigurator/planning/src/system.rs @@ -62,7 +62,6 @@ use sled_agent_types::inventory::Baseboard; use sled_agent_types::inventory::ConfigReconcilerInventory; use sled_agent_types::inventory::ConfigReconcilerInventoryStatus; use sled_agent_types::inventory::FmdInventory; -use sled_agent_types::inventory::FmdInventoryResult; use sled_agent_types::inventory::Inventory; use sled_agent_types::inventory::InventoryDataset; use sled_agent_types::inventory::InventoryDisk; @@ -1507,7 +1506,7 @@ impl Sled { smf_services_enabled_not_online: SvcsEnabledNotOnlineResult::DataUnavailable, reference_measurements: iddqd::IdOrdMap::new(), - fmd: FmdInventoryResult::Available(FmdInventory::default()), + fmd: Ok(FmdInventory::default()), } }; @@ -1692,7 +1691,7 @@ impl Sled { reference_measurements: inv_sled_agent .reference_measurements .clone(), - fmd: FmdInventoryResult::Available(FmdInventory::default()), + fmd: Ok(FmdInventory::default()), }; Sled { diff --git a/openapi/sled-agent/sled-agent-40.0.0-2cd100.json b/openapi/sled-agent/sled-agent-40.0.0-ca4068.json similarity index 99% rename from openapi/sled-agent/sled-agent-40.0.0-2cd100.json rename to openapi/sled-agent/sled-agent-40.0.0-ca4068.json index 4a8c784c91c..585de26873f 100644 --- a/openapi/sled-agent/sled-agent-40.0.0-2cd100.json +++ b/openapi/sled-agent/sled-agent-40.0.0-ca4068.json @@ -5478,57 +5478,6 @@ "resources" ] }, - "FmdInventoryResult": { - "description": "Result of querying FMD for fault information.", - "oneOf": [ - { - "description": "FMD data was successfully collected.", - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": [ - "available" - ] - }, - "value": { - "$ref": "#/components/schemas/FmdInventory" - } - }, - "required": [ - "type", - "value" - ] - }, - { - "description": "FMD data collection failed or is not available on this platform.", - "type": "object", - "properties": { - 
"type": { - "type": "string", - "enum": [ - "error" - ] - }, - "value": { - "type": "object", - "properties": { - "error": { - "type": "string" - } - }, - "required": [ - "error" - ] - } - }, - "required": [ - "type", - "value" - ] - } - ] - }, "FmdResource": { "description": "A resource affected by a diagnosed fault.", "type": "object", @@ -6221,7 +6170,43 @@ "$ref": "#/components/schemas/OmicronFileSourceResolverInventory" }, "fmd": { - "$ref": "#/components/schemas/FmdInventoryResult" + "x-rust-type": { + "crate": "std", + "parameters": [ + { + "$ref": "#/components/schemas/FmdInventory" + }, + { + "type": "string" + } + ], + "path": "::std::result::Result", + "version": "*" + }, + "oneOf": [ + { + "type": "object", + "properties": { + "ok": { + "$ref": "#/components/schemas/FmdInventory" + } + }, + "required": [ + "ok" + ] + }, + { + "type": "object", + "properties": { + "err": { + "type": "string" + } + }, + "required": [ + "err" + ] + } + ] }, "last_reconciliation": { "nullable": true, diff --git a/openapi/sled-agent/sled-agent-latest.json b/openapi/sled-agent/sled-agent-latest.json index b6627080a18..17e2779581d 120000 --- a/openapi/sled-agent/sled-agent-latest.json +++ b/openapi/sled-agent/sled-agent-latest.json @@ -1 +1 @@ -sled-agent-40.0.0-2cd100.json \ No newline at end of file +sled-agent-40.0.0-ca4068.json \ No newline at end of file diff --git a/sled-agent/rack-setup/src/plan/service.rs b/sled-agent/rack-setup/src/plan/service.rs index c05e5ef43b7..48de2b1d3e9 100644 --- a/sled-agent/rack-setup/src/plan/service.rs +++ b/sled-agent/rack-setup/src/plan/service.rs @@ -1363,7 +1363,6 @@ mod tests { use sled_agent_types::early_networking::RackNetworkConfig; use sled_agent_types::inventory::ConfigReconcilerInventoryStatus; use sled_agent_types::inventory::FmdInventory; - use sled_agent_types::inventory::FmdInventoryResult; use sled_agent_types::inventory::OmicronFileSourceResolverInventory; use sled_agent_types::inventory::SledCpuFamily; use 
sled_agent_types::inventory::SvcsEnabledNotOnlineResult; @@ -1536,7 +1535,7 @@ mod tests { smf_services_enabled_not_online: SvcsEnabledNotOnlineResult::DataUnavailable, reference_measurements: IdOrdMap::new(), - fmd: FmdInventoryResult::Available(FmdInventory::default()), + fmd: Ok(FmdInventory::default()), }, is_scrimlet, ) diff --git a/sled-agent/rack-setup/src/service.rs b/sled-agent/rack-setup/src/service.rs index e715964d1e2..d4e18fefcd0 100644 --- a/sled-agent/rack-setup/src/service.rs +++ b/sled-agent/rack-setup/src/service.rs @@ -1897,9 +1897,9 @@ mod test { early_networking::RackNetworkConfig, inventory::{ Baseboard, ConfigReconcilerInventoryStatus, FmdInventory, - FmdInventoryResult, Inventory, InventoryDisk, - OmicronFileSourceResolverInventory, OmicronZoneType, SledCpuFamily, - SledRole, SvcsEnabledNotOnlineResult, + Inventory, InventoryDisk, OmicronFileSourceResolverInventory, + OmicronZoneType, SledCpuFamily, SledRole, + SvcsEnabledNotOnlineResult, }, }; use std::net::{IpAddr, Ipv4Addr, Ipv6Addr}; @@ -1960,7 +1960,7 @@ mod test { smf_services_enabled_not_online: SvcsEnabledNotOnlineResult::DataUnavailable, reference_measurements: IdOrdMap::new(), - fmd: FmdInventoryResult::Available(FmdInventory::default()), + fmd: Ok(FmdInventory::default()), }, true, ) diff --git a/sled-agent/src/fmd.rs b/sled-agent/src/fmd.rs index af2f5182766..f9d1b8341fa 100644 --- a/sled-agent/src/fmd.rs +++ b/sled-agent/src/fmd.rs @@ -4,16 +4,14 @@ //! Collects fault information from the illumos Fault Management Daemon (FMD). 
-use sled_agent_types::inventory::FmdInventoryResult; +use sled_agent_types::inventory::FmdInventory; use slog::Logger; #[cfg(target_os = "illumos")] mod illumos { use fmd_adm::{FmdAdm, InvisibleResources, NvList, NvValue}; use omicron_uuid_kinds::{FmdHostCaseUuid, FmdResourceUuid, GenericUuid}; - use sled_agent_types::inventory::{ - FmdHostCase, FmdInventory, FmdInventoryResult, FmdResource, - }; + use sled_agent_types::inventory::{FmdHostCase, FmdInventory, FmdResource}; use slog::Logger; use slog::warn; @@ -66,14 +64,12 @@ mod illumos { serde_json::Value::Object(map) } - pub(super) fn collect(log: Logger) -> FmdInventoryResult { + pub(super) fn collect(log: Logger) -> Result<FmdInventory, String> { let adm = match FmdAdm::open() { Ok(adm) => adm, Err(e) => { warn!(log, "failed to open fmd"; "error" => %e); - return FmdInventoryResult::Error { - error: format!("failed to open fmd: {e}"), - }; + return Err(format!("failed to open fmd: {e}")); } }; @@ -92,9 +88,7 @@ mod illumos { .collect(), Err(e) => { warn!(log, "failed to list fmd cases"; "error" => %e); - return FmdInventoryResult::Error { - error: format!("failed to list fmd cases: {e}"), - }; + return Err(format!("failed to list fmd cases: {e}")); } }; @@ -122,17 +116,17 @@ mod illumos { .collect(), Err(e) => { warn!(log, "failed to list fmd resources"; "error" => %e); - return FmdInventoryResult::Error { - error: format!("failed to list fmd resources: {e}"), - }; + return Err(format!("failed to list fmd resources: {e}")); } }; - FmdInventoryResult::Available(FmdInventory { cases, resources }) + Ok(FmdInventory { cases, resources }) } } -pub(crate) async fn collect_fmd_inventory(log: &Logger) -> FmdInventoryResult { +pub(crate) async fn collect_fmd_inventory( + log: &Logger, +) -> Result<FmdInventory, String> { #[cfg(target_os = "illumos")] { // FMD queries go through door calls to fmd(1M) and can block, so run @@ -148,9 +142,7 @@ pub(crate) async fn collect_fmd_inventory(log: &Logger) -> FmdInventoryResult { #[cfg(not(target_os = "illumos"))] { let _
= log; - FmdInventoryResult::Error { - error: "fmd not supported on this platform".to_string(), - } + Err("fmd not supported on this platform".to_string()) } } diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index f5b9b519dd6..a992f103d81 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -61,10 +61,10 @@ use sled_agent_types::instance::{ }; use sled_agent_types::inventory::{ ConfigReconcilerInventory, ConfigReconcilerInventoryResult, - ConfigReconcilerInventoryStatus, FmdInventory, FmdInventoryResult, - HostPhase2DesiredSlots, Inventory, InventoryDataset, InventoryDisk, - InventoryZpool, OmicronFileSourceResolverInventory, OmicronSledConfig, - OmicronZonesConfig, SingleMeasurementInventory, SledRole, ZpoolHealth, + ConfigReconcilerInventoryStatus, FmdInventory, HostPhase2DesiredSlots, + Inventory, InventoryDataset, InventoryDisk, InventoryZpool, + OmicronFileSourceResolverInventory, OmicronSledConfig, OmicronZonesConfig, + SingleMeasurementInventory, SledRole, ZpoolHealth, }; use sled_agent_types::support_bundle::SupportBundleMetadata; use sled_agent_types::system_networking::SystemNetworkingConfig; @@ -994,7 +994,7 @@ impl SledAgent { ), smf_services_enabled_not_online, reference_measurements, - fmd: FmdInventoryResult::Available(FmdInventory::default()), + fmd: Ok(FmdInventory::default()), }) } diff --git a/sled-agent/types/versions/src/add_fmd_to_inventory/inventory.rs b/sled-agent/types/versions/src/add_fmd_to_inventory/inventory.rs index 8eeeb0484c9..5f505d2307d 100644 --- a/sled-agent/types/versions/src/add_fmd_to_inventory/inventory.rs +++ b/sled-agent/types/versions/src/add_fmd_to_inventory/inventory.rs @@ -4,6 +4,8 @@ use iddqd::{IdOrdItem, IdOrdMap, id_upcast}; use omicron_common::api::external::ByteCount; +use omicron_common::snake_case_result; +use omicron_common::snake_case_result::SnakeCaseResult; use omicron_uuid_kinds::{FmdHostCaseUuid, FmdResourceUuid, SledUuid}; use 
schemars::JsonSchema; use serde::{Deserialize, Serialize}; @@ -75,16 +77,6 @@ impl IdOrdItem for FmdResource { id_upcast!(); } -/// Result of querying FMD for fault information. -#[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize, JsonSchema)] -#[serde(tag = "type", content = "value", rename_all = "snake_case")] -pub enum FmdInventoryResult { - /// FMD data was successfully collected. - Available(FmdInventory), - /// FMD data collection failed or is not available on this platform. - Error { error: String }, -} - /// Successfully collected FMD fault data. #[derive( Clone, Debug, Default, PartialEq, Eq, Deserialize, Serialize, JsonSchema, @@ -115,7 +107,11 @@ pub struct Inventory { pub smf_services_enabled_not_online: v37::inventory::SvcsEnabledNotOnlineResult, pub reference_measurements: IdOrdMap, - pub fmd: FmdInventoryResult, + #[serde(with = "snake_case_result")] + #[schemars( + schema_with = "SnakeCaseResult::::json_schema" + )] + pub fmd: Result, } impl From for v37::inventory::Inventory { diff --git a/sled-agent/types/versions/src/latest.rs b/sled-agent/types/versions/src/latest.rs index 70df5c1c703..d656855cedf 100644 --- a/sled-agent/types/versions/src/latest.rs +++ b/sled-agent/types/versions/src/latest.rs @@ -185,7 +185,6 @@ pub mod inventory { pub use crate::v40::inventory::FmdHostCase; pub use crate::v40::inventory::FmdInventory; - pub use crate::v40::inventory::FmdInventoryResult; pub use crate::v40::inventory::FmdResource; pub use crate::v40::inventory::Inventory; From 040b8151ac6accfe215b4b169d67b3a96c407ac1 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 14 May 2026 15:33:33 -0700 Subject: [PATCH 19/24] Add foreign-key comments to inv_fmd_* tables Match the convention used by other inv_* tables in dbinit.sql. Addresses review feedback on #10345. 
--- schema/crdb/dbinit.sql | 10 ++++++++-- schema/crdb/inv-fmd/up01.sql | 2 ++ schema/crdb/inv-fmd/up02.sql | 2 ++ schema/crdb/inv-fmd/up03.sql | 6 ++++-- 4 files changed, 16 insertions(+), 4 deletions(-) diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 5adb3612be8..b6b8f534ad9 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -5229,7 +5229,9 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_svc_enabled_not_online_parse_error ); CREATE TABLE IF NOT EXISTS omicron.public.inv_fmd_status ( + -- (foreign key into `inv_collection` table) inv_collection_id UUID NOT NULL, + -- guaranteed to match a row in this collection's `inv_sled_agent` sled_id UUID NOT NULL, -- NULL when FMD data was successfully collected. Set to the error -- string when FMD collection failed (e.g. on non-illumos sleds, or @@ -5240,7 +5242,9 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_fmd_status ( ); CREATE TABLE IF NOT EXISTS omicron.public.inv_fmd_host_case ( + -- (foreign key into `inv_collection` table) inv_collection_id UUID NOT NULL, + -- guaranteed to match a row in this collection's `inv_sled_agent` sled_id UUID NOT NULL, case_id UUID NOT NULL, code TEXT NOT NULL, @@ -5254,14 +5258,16 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_fmd_host_case ( ); CREATE TABLE IF NOT EXISTS omicron.public.inv_fmd_resource ( + -- (foreign key into `inv_collection` table) inv_collection_id UUID NOT NULL, + -- guaranteed to match a row in this collection's `inv_sled_agent` sled_id UUID NOT NULL, resource_id UUID NOT NULL, -- Fault Management Resource Identifier -- (e.g. "dev:////pci@af,0/pci1022,1483@3,5"). fmri TEXT NOT NULL, - -- The case_id pairs with a corresponding row in inv_fmd_host_case - -- under the same (inv_collection_id, sled_id) partition. 
+ -- (foreign key into `inv_fmd_host_case`, with the same + -- (inv_collection_id, sled_id)) case_id UUID NOT NULL, faulty BOOL NOT NULL, unusable BOOL NOT NULL, diff --git a/schema/crdb/inv-fmd/up01.sql b/schema/crdb/inv-fmd/up01.sql index de8f7c2c448..7a5917c4f99 100644 --- a/schema/crdb/inv-fmd/up01.sql +++ b/schema/crdb/inv-fmd/up01.sql @@ -1,5 +1,7 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_fmd_status ( + -- (foreign key into `inv_collection` table) inv_collection_id UUID NOT NULL, + -- guaranteed to match a row in this collection's `inv_sled_agent` sled_id UUID NOT NULL, -- NULL when FMD data was successfully collected. Set to the error -- string when FMD collection failed (e.g. on non-illumos sleds, or diff --git a/schema/crdb/inv-fmd/up02.sql b/schema/crdb/inv-fmd/up02.sql index 7907cc8ac9f..daba123b7e4 100644 --- a/schema/crdb/inv-fmd/up02.sql +++ b/schema/crdb/inv-fmd/up02.sql @@ -1,5 +1,7 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_fmd_host_case ( + -- (foreign key into `inv_collection` table) inv_collection_id UUID NOT NULL, + -- guaranteed to match a row in this collection's `inv_sled_agent` sled_id UUID NOT NULL, case_id UUID NOT NULL, code TEXT NOT NULL, diff --git a/schema/crdb/inv-fmd/up03.sql b/schema/crdb/inv-fmd/up03.sql index 9bf3e4a7930..bb6ee964f24 100644 --- a/schema/crdb/inv-fmd/up03.sql +++ b/schema/crdb/inv-fmd/up03.sql @@ -1,12 +1,14 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_fmd_resource ( + -- (foreign key into `inv_collection` table) inv_collection_id UUID NOT NULL, + -- guaranteed to match a row in this collection's `inv_sled_agent` sled_id UUID NOT NULL, resource_id UUID NOT NULL, -- Fault Management Resource Identifier -- (e.g. "dev:////pci@af,0/pci1022,1483@3,5"). fmri TEXT NOT NULL, - -- The case_id pairs with a corresponding row in inv_fmd_host_case - -- under the same (inv_collection_id, sled_id) partition. 
+ -- (foreign key into `inv_fmd_host_case`, with the same + -- (inv_collection_id, sled_id)) case_id UUID NOT NULL, faulty BOOL NOT NULL, unusable BOOL NOT NULL, From f677b4bb9e884e744f71154d3b9c84f3296c8e09 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 14 May 2026 15:41:09 -0700 Subject: [PATCH 20/24] sled-agent: use InlineErrorChain for FMD collection errors Captures the full `source()` chain on the three error paths in `fmd::illumos::collect` (FMD open / list cases / list resources). The underlying `fmd_adm::Error` variants `Nul(NulError)` and `Uuid(uuid::Error)` carry inner errors via `#[from]` that the bare `Display` impl can drop, so `InlineErrorChain` actually surfaces the extra context. Matches the existing pattern used elsewhere in `omicron-sled-agent` (e.g. `instance_manager.rs:989`). --- sled-agent/src/fmd.rs | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/sled-agent/src/fmd.rs b/sled-agent/src/fmd.rs index f9d1b8341fa..fe50f76f892 100644 --- a/sled-agent/src/fmd.rs +++ b/sled-agent/src/fmd.rs @@ -14,6 +14,7 @@ mod illumos { use sled_agent_types::inventory::{FmdHostCase, FmdInventory, FmdResource}; use slog::Logger; use slog::warn; + use slog_error_chain::InlineErrorChain; pub(super) fn nvvalue_to_json(value: &NvValue) -> serde_json::Value { match value { @@ -68,8 +69,9 @@ mod illumos { let adm = match FmdAdm::open() { Ok(adm) => adm, Err(e) => { - warn!(log, "failed to open fmd"; "error" => %e); - return Err(format!("failed to open fmd: {e}")); + let err = InlineErrorChain::new(&e); + warn!(log, "failed to open fmd"; &err); + return Err(format!("failed to open fmd: {err}")); } }; @@ -87,8 +89,9 @@ mod illumos { }) .collect(), Err(e) => { - warn!(log, "failed to list fmd cases"; "error" => %e); - return Err(format!("failed to list fmd cases: {e}")); + let err = InlineErrorChain::new(&e); + warn!(log, "failed to list fmd cases"; &err); + return Err(format!("failed to list fmd cases: {err}")); } }; @@ -115,8 
+118,9 @@ mod illumos { }) .collect(), Err(e) => { - warn!(log, "failed to list fmd resources"; "error" => %e); - return Err(format!("failed to list fmd resources: {e}")); + let err = InlineErrorChain::new(&e); + warn!(log, "failed to list fmd resources"; &err); + return Err(format!("failed to list fmd resources: {err}")); } }; From e43ae1ffe3a543a4c852b756e45dca1d1eb3136f Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 14 May 2026 17:55:22 -0700 Subject: [PATCH 21/24] Add typed FmdInventoryError and per-sled FMD bounds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces `Result<FmdInventory, String>` with a typed `FmdInventoryError` (addresses hawkw's r3173718186 on PR #10283) and introduces cap constants `FMD_MAX_CASES` / `FMD_MAX_RESOURCES` (1000 each). The error type is flat — a `kind: FmdInventoryErrorKind` discriminator plus a free-form `message`. Three variants: - `FmdError`: catch-all for FMD daemon failures (open / list cases / list resources, or platform doesn't support FMD). - `TooManyCases` / `TooManyResources`: bound exceeded; producer refuses to report a partial set because a count this high is itself a signal operators should investigate directly. Counts/limits are included in the `message` string, so downstream display shows them without needing structured fields.
--- clients/sled-agent-client/src/lib.rs | 2 + ...068.json => sled-agent-40.0.0-600e45.json} | 46 +++++- openapi/sled-agent/sled-agent-latest.json | 2 +- sled-agent/src/fmd.rs | 133 ++++++++++++------ .../src/add_fmd_to_inventory/inventory.rs | 55 +++++++- sled-agent/types/versions/src/latest.rs | 4 + 6 files changed, 194 insertions(+), 48 deletions(-) rename openapi/sled-agent/{sled-agent-40.0.0-ca4068.json => sled-agent-40.0.0-600e45.json} (99%) diff --git a/clients/sled-agent-client/src/lib.rs b/clients/sled-agent-client/src/lib.rs index 5a4939193e8..1293d57c4c4 100644 --- a/clients/sled-agent-client/src/lib.rs +++ b/clients/sled-agent-client/src/lib.rs @@ -62,6 +62,8 @@ progenitor::generate_api!( ExternalIpv6Config = sled_agent_types_versions::latest::instance::ExternalIpv6Config, FmdHostCase = sled_agent_types_versions::latest::inventory::FmdHostCase, FmdInventory = sled_agent_types_versions::latest::inventory::FmdInventory, + FmdInventoryError = sled_agent_types_versions::latest::inventory::FmdInventoryError, + FmdInventoryErrorKind = sled_agent_types_versions::latest::inventory::FmdInventoryErrorKind, FmdResource = sled_agent_types_versions::latest::inventory::FmdResource, Generation = omicron_common::api::external::Generation, Hostname = omicron_common::api::external::Hostname, diff --git a/openapi/sled-agent/sled-agent-40.0.0-ca4068.json b/openapi/sled-agent/sled-agent-40.0.0-600e45.json similarity index 99% rename from openapi/sled-agent/sled-agent-40.0.0-ca4068.json rename to openapi/sled-agent/sled-agent-40.0.0-600e45.json index 585de26873f..fa6c04f9104 100644 --- a/openapi/sled-agent/sled-agent-40.0.0-ca4068.json +++ b/openapi/sled-agent/sled-agent-40.0.0-600e45.json @@ -5478,6 +5478,48 @@ "resources" ] }, + "FmdInventoryError": { + "description": "An error reported by sled-agent in place of an [`FmdInventory`].\n\n`kind` is a typed discriminator suitable for filtering / monitoring. 
`message` is a human-readable description (built via `Display`); it is informational only and should not be parsed.", + "type": "object", + "properties": { + "kind": { + "$ref": "#/components/schemas/FmdInventoryErrorKind" + }, + "message": { + "type": "string" + } + }, + "required": [ + "kind", + "message" + ] + }, + "FmdInventoryErrorKind": { + "description": "Classification of an [`FmdInventoryError`].\n\n`FmdError` is a catch-all for any FMD-side failure: the daemon was unreachable, a case/resource listing failed, or the platform doesn't have FMD at all. The accompanying message disambiguates these cases. `TooManyCases` and `TooManyResources` are first-class because exceeding those bounds is operationally distinct from a transient FMD failure.", + "oneOf": [ + { + "description": "Catch-all for FMD-side failures.", + "type": "string", + "enum": [ + "fmd_error" + ] + }, + { + "description": "Number of FMD cases exceeded [`FMD_MAX_CASES`].", + "type": "string", + "enum": [ + "too_many_cases" + ] + }, + { + "description": "Number of FMD resources exceeded [`FMD_MAX_RESOURCES`].", + "type": "string", + "enum": [ + "too_many_resources" + ] + } + ] + }, "FmdResource": { "description": "A resource affected by a diagnosed fault.", "type": "object", @@ -6177,7 +6219,7 @@ "$ref": "#/components/schemas/FmdInventory" }, { - "type": "string" + "$ref": "#/components/schemas/FmdInventoryError" } ], "path": "::std::result::Result", @@ -6199,7 +6241,7 @@ "type": "object", "properties": { "err": { - "type": "string" + "$ref": "#/components/schemas/FmdInventoryError" } }, "required": [ diff --git a/openapi/sled-agent/sled-agent-latest.json b/openapi/sled-agent/sled-agent-latest.json index 17e2779581d..5f70b30a1ee 120000 --- a/openapi/sled-agent/sled-agent-latest.json +++ b/openapi/sled-agent/sled-agent-latest.json @@ -1 +1 @@ -sled-agent-40.0.0-ca4068.json \ No newline at end of file +sled-agent-40.0.0-600e45.json \ No newline at end of file diff --git a/sled-agent/src/fmd.rs 
b/sled-agent/src/fmd.rs index fe50f76f892..c7cd8e3a60d 100644 --- a/sled-agent/src/fmd.rs +++ b/sled-agent/src/fmd.rs @@ -4,14 +4,17 @@ //! Collects fault information from the illumos Fault Management Daemon (FMD). -use sled_agent_types::inventory::FmdInventory; +use sled_agent_types::inventory::{FmdInventory, FmdInventoryError}; use slog::Logger; #[cfg(target_os = "illumos")] mod illumos { use fmd_adm::{FmdAdm, InvisibleResources, NvList, NvValue}; use omicron_uuid_kinds::{FmdHostCaseUuid, FmdResourceUuid, GenericUuid}; - use sled_agent_types::inventory::{FmdHostCase, FmdInventory, FmdResource}; + use sled_agent_types::inventory::{ + FMD_MAX_CASES, FMD_MAX_RESOURCES, FmdHostCase, FmdInventory, + FmdInventoryError, FmdInventoryErrorKind, FmdResource, + }; use slog::Logger; use slog::warn; use slog_error_chain::InlineErrorChain; @@ -65,64 +68,105 @@ mod illumos { serde_json::Value::Object(map) } - pub(super) fn collect(log: Logger) -> Result { + pub(super) fn collect( + log: Logger, + ) -> Result { let adm = match FmdAdm::open() { Ok(adm) => adm, Err(e) => { let err = InlineErrorChain::new(&e); warn!(log, "failed to open fmd"; &err); - return Err(format!("failed to open fmd: {err}")); + return Err(FmdInventoryError { + kind: FmdInventoryErrorKind::FmdError, + message: format!("failed to open fmd: {err}"), + }); } }; - let cases = match adm.cases(None) { - Ok(cases) => cases - .into_iter() - .map(|c| { - let fmd_adm::CaseInfo { uuid, code, url, event } = c; - FmdHostCase { - uuid: FmdHostCaseUuid::from_untyped_uuid(uuid), - code, - url, - event: event.as_ref().map(nvlist_to_json), - } - }) - .collect(), + let raw_cases = match adm.cases(None) { + Ok(cases) => cases, Err(e) => { let err = InlineErrorChain::new(&e); warn!(log, "failed to list fmd cases"; &err); - return Err(format!("failed to list fmd cases: {err}")); + return Err(FmdInventoryError { + kind: FmdInventoryErrorKind::FmdError, + message: format!("failed to list fmd cases: {err}"), + }); } }; + let 
case_count = raw_cases.len(); + if case_count as u64 > u64::from(FMD_MAX_CASES) { + warn!( + log, "too many fmd cases reported, refusing partial inventory"; + "count" => case_count, "limit" => FMD_MAX_CASES, + ); + return Err(FmdInventoryError { + kind: FmdInventoryErrorKind::TooManyCases, + message: format!( + "too many fmd cases ({case_count} > limit {FMD_MAX_CASES})" + ), + }); + } + let cases: iddqd::IdOrdMap<_> = raw_cases + .into_iter() + .map(|c| { + let fmd_adm::CaseInfo { uuid, code, url, event } = c; + FmdHostCase { + uuid: FmdHostCaseUuid::from_untyped_uuid(uuid), + code, + url, + event: event.as_ref().map(nvlist_to_json), + } + }) + .collect(); - let resources = match adm.resources(InvisibleResources::Included) { - Ok(resources) => resources - .into_iter() - .map(|r| { - let fmd_adm::ResourceInfo { - fmri, - uuid, - case, - faulty, - unusable, - invisible, - } = r; - FmdResource { - fmri, - uuid: FmdResourceUuid::from_untyped_uuid(uuid), - case_id: FmdHostCaseUuid::from_untyped_uuid(case), - faulty, - unusable, - invisible, - } - }) - .collect(), + let raw_resources = match adm.resources(InvisibleResources::Included) { + Ok(resources) => resources, Err(e) => { let err = InlineErrorChain::new(&e); warn!(log, "failed to list fmd resources"; &err); - return Err(format!("failed to list fmd resources: {err}")); + return Err(FmdInventoryError { + kind: FmdInventoryErrorKind::FmdError, + message: format!("failed to list fmd resources: {err}"), + }); } }; + let resource_count = raw_resources.len(); + if resource_count as u64 > u64::from(FMD_MAX_RESOURCES) { + warn!( + log, + "too many fmd resources reported, refusing partial inventory"; + "count" => resource_count, "limit" => FMD_MAX_RESOURCES, + ); + return Err(FmdInventoryError { + kind: FmdInventoryErrorKind::TooManyResources, + message: format!( + "too many fmd resources \ + ({resource_count} > limit {FMD_MAX_RESOURCES})" + ), + }); + } + let resources: iddqd::IdOrdMap<_> = raw_resources + .into_iter() + 
.map(|r| { + let fmd_adm::ResourceInfo { + fmri, + uuid, + case, + faulty, + unusable, + invisible, + } = r; + FmdResource { + fmri, + uuid: FmdResourceUuid::from_untyped_uuid(uuid), + case_id: FmdHostCaseUuid::from_untyped_uuid(case), + faulty, + unusable, + invisible, + } + }) + .collect(); Ok(FmdInventory { cases, resources }) } @@ -130,7 +174,7 @@ mod illumos { pub(crate) async fn collect_fmd_inventory( log: &Logger, -) -> Result { +) -> Result { #[cfg(target_os = "illumos")] { // FMD queries go through door calls to fmd(1M) and can block, so run @@ -146,7 +190,10 @@ pub(crate) async fn collect_fmd_inventory( #[cfg(not(target_os = "illumos"))] { let _ = log; - Err("fmd not supported on this platform".to_string()) + Err(FmdInventoryError { + kind: sled_agent_types::inventory::FmdInventoryErrorKind::FmdError, + message: "fmd not supported on this platform".to_string(), + }) } } diff --git a/sled-agent/types/versions/src/add_fmd_to_inventory/inventory.rs b/sled-agent/types/versions/src/add_fmd_to_inventory/inventory.rs index 5f505d2307d..9cb3034ecc5 100644 --- a/sled-agent/types/versions/src/add_fmd_to_inventory/inventory.rs +++ b/sled-agent/types/versions/src/add_fmd_to_inventory/inventory.rs @@ -86,6 +86,57 @@ pub struct FmdInventory { pub resources: IdOrdMap, } +/// Maximum number of FMD cases sled-agent will report for a single sled. +/// Exceeding this returns [`FmdInventoryErrorKind::TooManyCases`] rather than +/// silently truncating: a count this high indicates a pathological state +/// operators should investigate directly via `fmadm`. +pub const FMD_MAX_CASES: u32 = 1000; + +/// Maximum number of FMD resources sled-agent will report for a single sled. +/// See [`FMD_MAX_CASES`] for rationale. +pub const FMD_MAX_RESOURCES: u32 = 1000; + +/// Classification of an [`FmdInventoryError`]. +/// +/// `FmdError` is a catch-all for any FMD-side failure: the daemon was +/// unreachable, a case/resource listing failed, or the platform doesn't have +/// FMD at all. 
The accompanying message disambiguates these cases. +/// `TooManyCases` and `TooManyResources` are first-class because exceeding +/// those bounds is operationally distinct from a transient FMD failure. +#[derive( + Clone, Copy, Debug, PartialEq, Eq, Deserialize, Serialize, JsonSchema, +)] +#[serde(rename_all = "snake_case")] +pub enum FmdInventoryErrorKind { + /// Catch-all for FMD-side failures. + FmdError, + /// Number of FMD cases exceeded [`FMD_MAX_CASES`]. + TooManyCases, + /// Number of FMD resources exceeded [`FMD_MAX_RESOURCES`]. + TooManyResources, +} + +/// An error reported by sled-agent in place of an [`FmdInventory`]. +/// +/// `kind` is a typed discriminator suitable for filtering / monitoring. +/// `message` is a human-readable description (built via `Display`); it is +/// informational only and should not be parsed. +#[derive( + Clone, + Debug, + PartialEq, + Eq, + Deserialize, + Serialize, + JsonSchema, + thiserror::Error, +)] +#[error("{message}")] +pub struct FmdInventoryError { + pub kind: FmdInventoryErrorKind, + pub message: String, +} + /// Identity and basic status information about this sled agent #[derive(Clone, Debug, Deserialize, JsonSchema, Serialize)] pub struct Inventory { @@ -109,9 +160,9 @@ pub struct Inventory { pub reference_measurements: IdOrdMap, #[serde(with = "snake_case_result")] #[schemars( - schema_with = "SnakeCaseResult::::json_schema" + schema_with = "SnakeCaseResult::::json_schema" )] - pub fmd: Result, + pub fmd: Result, } impl From for v37::inventory::Inventory { diff --git a/sled-agent/types/versions/src/latest.rs b/sled-agent/types/versions/src/latest.rs index d656855cedf..60591115103 100644 --- a/sled-agent/types/versions/src/latest.rs +++ b/sled-agent/types/versions/src/latest.rs @@ -183,8 +183,12 @@ pub mod inventory { pub use crate::v37::inventory::SvcsEnabledNotOnline; pub use crate::v37::inventory::SvcsEnabledNotOnlineResult; + pub use crate::v40::inventory::FMD_MAX_CASES; + pub use 
crate::v40::inventory::FMD_MAX_RESOURCES; pub use crate::v40::inventory::FmdHostCase; pub use crate::v40::inventory::FmdInventory; + pub use crate::v40::inventory::FmdInventoryError; + pub use crate::v40::inventory::FmdInventoryErrorKind; pub use crate::v40::inventory::FmdResource; pub use crate::v40::inventory::Inventory; From 25e6ceafd43317b94e79e01023441382bca1d82c Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 15 May 2026 08:25:17 -0700 Subject: [PATCH 22/24] Split inv-fmd up01.sql into per-DDL files The schema test `test_migration_verification_files` enforces at most one DDL statement per up*.sql file for schema versions after 220. The previous up01.sql had two (CREATE TYPE + CREATE TABLE). Split into: - up01.sql: CREATE TYPE fmd_inventory_error_kind - up02.sql: CREATE TABLE inv_fmd_status - up03.sql: CREATE TABLE inv_fmd_host_case (was up02) - up04.sql: CREATE TABLE inv_fmd_resource (was up03) --- schema/crdb/inv-fmd/up01.sql | 20 -------------------- schema/crdb/inv-fmd/up02.sql | 22 +++++++++++++--------- schema/crdb/inv-fmd/up03.sql | 19 ++++++++----------- schema/crdb/inv-fmd/up04.sql | 18 ++++++++++++++++++ 4 files changed, 39 insertions(+), 40 deletions(-) create mode 100644 schema/crdb/inv-fmd/up04.sql diff --git a/schema/crdb/inv-fmd/up01.sql b/schema/crdb/inv-fmd/up01.sql index 1910f9aad8c..3aac46a3768 100644 --- a/schema/crdb/inv-fmd/up01.sql +++ b/schema/crdb/inv-fmd/up01.sql @@ -9,23 +9,3 @@ CREATE TYPE IF NOT EXISTS omicron.public.fmd_inventory_error_kind AS ENUM ( -- Number of FMD resources reported by the sled exceeded the limit. 'too_many_resources' ); - -CREATE TABLE IF NOT EXISTS omicron.public.inv_fmd_status ( - -- (foreign key into `inv_collection` table) - inv_collection_id UUID NOT NULL, - -- guaranteed to match a row in this collection's `inv_sled_agent` - sled_id UUID NOT NULL, - -- Classifies the failure mode when FMD inventory collection failed. - -- NULL iff `error_message` is NULL (FMD was successfully collected). 
- error_kind omicron.public.fmd_inventory_error_kind, - -- Display() of the original error; informational only, do not parse. - -- The `error_kind` discriminator is the structured signal. - -- NULL iff `error_kind` is NULL. - error_message TEXT, - - CONSTRAINT error_kind_and_message_together CHECK ( - (error_kind IS NULL) = (error_message IS NULL) - ), - - PRIMARY KEY (inv_collection_id, sled_id) -); diff --git a/schema/crdb/inv-fmd/up02.sql b/schema/crdb/inv-fmd/up02.sql index daba123b7e4..13f5f65a3b6 100644 --- a/schema/crdb/inv-fmd/up02.sql +++ b/schema/crdb/inv-fmd/up02.sql @@ -1,15 +1,19 @@ -CREATE TABLE IF NOT EXISTS omicron.public.inv_fmd_host_case ( +CREATE TABLE IF NOT EXISTS omicron.public.inv_fmd_status ( -- (foreign key into `inv_collection` table) inv_collection_id UUID NOT NULL, -- guaranteed to match a row in this collection's `inv_sled_agent` sled_id UUID NOT NULL, - case_id UUID NOT NULL, - code TEXT NOT NULL, - url TEXT NOT NULL, - -- The full FMD fault event payload as JSON, if present. Stored as - -- JSONB without parsing — Nexus does not interpret the FMD event - -- schema; it round-trips verbatim for downstream tooling (e.g. omdb). - event JSONB, + -- Classifies the failure mode when FMD inventory collection failed. + -- NULL iff `error_message` is NULL (FMD was successfully collected). + error_kind omicron.public.fmd_inventory_error_kind, + -- Display() of the original error; informational only, do not parse. + -- The `error_kind` discriminator is the structured signal. + -- NULL iff `error_kind` is NULL. 
+ error_message TEXT, - PRIMARY KEY (inv_collection_id, sled_id, case_id) + CONSTRAINT error_kind_and_message_together CHECK ( + (error_kind IS NULL) = (error_message IS NULL) + ), + + PRIMARY KEY (inv_collection_id, sled_id) ); diff --git a/schema/crdb/inv-fmd/up03.sql b/schema/crdb/inv-fmd/up03.sql index bb6ee964f24..daba123b7e4 100644 --- a/schema/crdb/inv-fmd/up03.sql +++ b/schema/crdb/inv-fmd/up03.sql @@ -1,18 +1,15 @@ -CREATE TABLE IF NOT EXISTS omicron.public.inv_fmd_resource ( +CREATE TABLE IF NOT EXISTS omicron.public.inv_fmd_host_case ( -- (foreign key into `inv_collection` table) inv_collection_id UUID NOT NULL, -- guaranteed to match a row in this collection's `inv_sled_agent` sled_id UUID NOT NULL, - resource_id UUID NOT NULL, - -- Fault Management Resource Identifier - -- (e.g. "dev:////pci@af,0/pci1022,1483@3,5"). - fmri TEXT NOT NULL, - -- (foreign key into `inv_fmd_host_case`, with the same - -- (inv_collection_id, sled_id)) case_id UUID NOT NULL, - faulty BOOL NOT NULL, - unusable BOOL NOT NULL, - invisible BOOL NOT NULL, + code TEXT NOT NULL, + url TEXT NOT NULL, + -- The full FMD fault event payload as JSON, if present. Stored as + -- JSONB without parsing — Nexus does not interpret the FMD event + -- schema; it round-trips verbatim for downstream tooling (e.g. omdb). + event JSONB, - PRIMARY KEY (inv_collection_id, sled_id, resource_id) + PRIMARY KEY (inv_collection_id, sled_id, case_id) ); diff --git a/schema/crdb/inv-fmd/up04.sql b/schema/crdb/inv-fmd/up04.sql new file mode 100644 index 00000000000..bb6ee964f24 --- /dev/null +++ b/schema/crdb/inv-fmd/up04.sql @@ -0,0 +1,18 @@ +CREATE TABLE IF NOT EXISTS omicron.public.inv_fmd_resource ( + -- (foreign key into `inv_collection` table) + inv_collection_id UUID NOT NULL, + -- guaranteed to match a row in this collection's `inv_sled_agent` + sled_id UUID NOT NULL, + resource_id UUID NOT NULL, + -- Fault Management Resource Identifier + -- (e.g. "dev:////pci@af,0/pci1022,1483@3,5"). 
+ fmri TEXT NOT NULL, + -- (foreign key into `inv_fmd_host_case`, with the same + -- (inv_collection_id, sled_id)) + case_id UUID NOT NULL, + faulty BOOL NOT NULL, + unusable BOOL NOT NULL, + invisible BOOL NOT NULL, + + PRIMARY KEY (inv_collection_id, sled_id, resource_id) +); From fa93ec85f48984ee88350f18a5914fae7d0aefee Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 20 May 2026 13:55:51 -0700 Subject: [PATCH 23/24] review feedback --- Cargo.lock | 1 + .../db-queries/src/db/datastore/inventory.rs | 40 ++++++----- schema/crdb/dbinit.sql | 4 +- schema/crdb/inv-fmd/up03.sql | 4 +- sled-agent/types/versions/Cargo.toml | 1 + .../types/versions/src/impls/inventory.rs | 68 +++++++++++++++++++ .../tests/output/fmd_inventory_display.txt | 19 ++++++ 7 files changed, 117 insertions(+), 20 deletions(-) create mode 100644 sled-agent/types/versions/tests/output/fmd_inventory_display.txt diff --git a/Cargo.lock b/Cargo.lock index 459efe13cc0..42bde814d05 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -13829,6 +13829,7 @@ dependencies = [ "camino", "chrono", "daft", + "expectorate", "iddqd", "indent_write", "ipnetwork", diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index f3ad63aad5e..c364578904a 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -4207,14 +4207,18 @@ impl DataStore { let err = match (row.error_kind, row.error_message) { (Some(kind), Some(message)) => Some((kind, message)), (None, None) => None, - _ => unreachable!( - "inv_fmd_status CHECK constraint enforces \ - error_kind and error_message agree on NULL" - ), + _ => { + return Err(Error::internal_error( + "inv_fmd_status row violates \ + error_kind_and_message_together CHECK \ + constraint: exactly one of (error_kind, \ + error_message) is NULL", + )); + } }; - (row.sled_id.into(), err) + Ok((row.sled_id.into(), err)) }) - .collect() + .collect::, _>>()? 
}; let mut fmd_cases_by_sled: BTreeMap< @@ -4240,12 +4244,12 @@ impl DataStore { .entry(sled_id) .or_default() .insert_unique(row.into()) - .map_err(|err| { - Error::internal_error(&format!( - "unexpected duplicate FMD case: {}", - InlineErrorChain::new(&err) - )) - })?; + .map_err(|err| Error::InternalError { + internal_message: format!( + "unexpected duplicate FMD case: {}", + InlineErrorChain::new(&err) + ), + })?; } by_sled }; @@ -4273,12 +4277,12 @@ impl DataStore { .entry(sled_id) .or_default() .insert_unique(row.into()) - .map_err(|err| { - Error::internal_error(&format!( - "unexpected duplicate FMD resource: {}", - InlineErrorChain::new(&err) - )) - })?; + .map_err(|err| Error::InternalError { + internal_message: format!( + "unexpected duplicate FMD resource: {}", + InlineErrorChain::new(&err) + ), + })?; } by_sled }; diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 0f8d599e7ac..dde9452e962 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -5270,7 +5270,9 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_fmd_host_case ( url TEXT NOT NULL, -- The full FMD fault event payload as JSON, if present. Stored as -- JSONB without parsing — Nexus does not interpret the FMD event - -- schema; it round-trips verbatim for downstream tooling (e.g. omdb). + -- schema. JSONB normalizes whitespace and key order, so the value is + -- preserved structurally (not byte-for-byte) for downstream tooling + -- (e.g. omdb). event JSONB, PRIMARY KEY (inv_collection_id, sled_id, case_id) diff --git a/schema/crdb/inv-fmd/up03.sql b/schema/crdb/inv-fmd/up03.sql index daba123b7e4..9a2b6c92e22 100644 --- a/schema/crdb/inv-fmd/up03.sql +++ b/schema/crdb/inv-fmd/up03.sql @@ -8,7 +8,9 @@ CREATE TABLE IF NOT EXISTS omicron.public.inv_fmd_host_case ( url TEXT NOT NULL, -- The full FMD fault event payload as JSON, if present. 
Stored as -- JSONB without parsing — Nexus does not interpret the FMD event - -- schema; it round-trips verbatim for downstream tooling (e.g. omdb). + -- schema. JSONB normalizes whitespace and key order, so the value is + -- preserved structurally (not byte-for-byte) for downstream tooling + -- (e.g. omdb). event JSONB, PRIMARY KEY (inv_collection_id, sled_id, case_id) diff --git a/sled-agent/types/versions/Cargo.toml b/sled-agent/types/versions/Cargo.toml index 14c5bab313b..aa3c50ed6d7 100644 --- a/sled-agent/types/versions/Cargo.toml +++ b/sled-agent/types/versions/Cargo.toml @@ -45,6 +45,7 @@ uuid.workspace = true [dev-dependencies] assert_matches.workspace = true +expectorate.workspace = true omicron-common = { workspace = true, features = ["testing"] } omicron-test-utils.workspace = true proptest.workspace = true diff --git a/sled-agent/types/versions/src/impls/inventory.rs b/sled-agent/types/versions/src/impls/inventory.rs index bca742e28d1..461180f885e 100644 --- a/sled-agent/types/versions/src/impls/inventory.rs +++ b/sled-agent/types/versions/src/impls/inventory.rs @@ -1176,3 +1176,71 @@ pub enum SourceNatConfigError { )] UnalignedPortPair { first_port: u16, last_port: u16 }, } + +#[cfg(test)] +mod tests { + use super::*; + use crate::latest::inventory::{FmdInventoryError, FmdInventoryErrorKind}; + use iddqd::IdOrdMap; + use omicron_uuid_kinds::{FmdHostCaseUuid, FmdResourceUuid, GenericUuid}; + use uuid::Uuid; + + #[test] + fn fmd_inventory_result_display_snapshot() { + let case_uuid = FmdHostCaseUuid::from_untyped_uuid(Uuid::from_u128( + 0xfeed_face_dead_beef_dead_beef_dead_beef, + )); + let resource_uuid = FmdResourceUuid::from_untyped_uuid( + Uuid::from_u128(0xbada_55ca_fe00_0000_0000_0000_0000_0001), + ); + + let mut cases = IdOrdMap::new(); + cases + .insert_unique(FmdHostCase { + uuid: case_uuid, + code: "JOKE-9001-FAKE".to_string(), + url: "http://example.invalid/msg/JOKE-9001-FAKE".to_string(), + event: Some(serde_json::json!({ + "class": 
"fault.vibes.off", + "spookiness": 9001, + "suspects": ["casper", "slimer"], + })), + }) + .expect("case uuid is unique"); + + let mut resources = IdOrdMap::new(); + resources + .insert_unique(FmdResource { + uuid: resource_uuid, + fmri: "ghost:///not/a/real/fmri".to_string(), + case_id: case_uuid, + faulty: true, + unusable: false, + invisible: false, + }) + .expect("resource uuid is unique"); + + let ok: Result = + Ok(FmdInventory { cases, resources }); + let err: Result = + Err(FmdInventoryError { + kind: FmdInventoryErrorKind::FmdError, + message: "haunted by an absent fmd daemon".to_string(), + }); + + let mut out = String::new(); + out.push_str("--- ok variant ---\n"); + out.push_str( + &FmdInventoryResultDisplay::new(&ok).to_string(), + ); + out.push_str("--- err variant ---\n"); + out.push_str( + &FmdInventoryResultDisplay::new(&err).to_string(), + ); + + expectorate::assert_contents( + "tests/output/fmd_inventory_display.txt", + &out, + ); + } +} diff --git a/sled-agent/types/versions/tests/output/fmd_inventory_display.txt b/sled-agent/types/versions/tests/output/fmd_inventory_display.txt new file mode 100644 index 00000000000..65e21f14250 --- /dev/null +++ b/sled-agent/types/versions/tests/output/fmd_inventory_display.txt @@ -0,0 +1,19 @@ +--- ok variant --- +cases (1): + case feedface-dead-beef-dead-beefdeadbeef (JOKE-9001-FAKE) + url: http://example.invalid/msg/JOKE-9001-FAKE + event: + { + "class": "fault.vibes.off", + "spookiness": 9001, + "suspects": [ + "casper", + "slimer" + ] + } +resources (1): + resource bada55ca-fe00-0000-0000-000000000001 (case feedface-dead-beef-dead-beefdeadbeef) + fmri: ghost:///not/a/real/fmri + faulty: true, unusable: false, invisible: false +--- err variant --- +FMD collection failed: haunted by an absent fmd daemon From 1e81977390c6f0ce86f254b318d3f490b0dc6e8f Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 21 May 2026 14:50:38 -0700 Subject: [PATCH 24/24] Address eliza review nits: SQL comment, bool padding, 
snapshot example - schema/crdb/inv-fmd/up01.sql + dbinit.sql: document "no partial data is recorded" for too_many_resources too (matching too_many_cases). - sled-agent/types/versions/src/impls/inventory.rs: pad bools in FmdResourceDisplay to width 5 so faulty/unusable/invisible columns line up across multiple resources. - Snapshot test: cover a second case (event: None) and a second resource with the bool flags flipped, so the aligned bool output is visible in the expectorate file. --- schema/crdb/dbinit.sql | 3 ++- schema/crdb/inv-fmd/up01.sql | 3 ++- .../types/versions/src/impls/inventory.rs | 26 ++++++++++++++++++- .../tests/output/fmd_inventory_display.txt | 11 +++++--- 4 files changed, 37 insertions(+), 6 deletions(-) diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 33764bc3471..24297d6f643 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -5236,7 +5236,8 @@ CREATE TYPE IF NOT EXISTS omicron.public.fmd_inventory_error_kind AS ENUM ( -- Number of FMD cases reported by the sled exceeded the producer's -- limit; no partial data is recorded. 'too_many_cases', - -- Number of FMD resources reported by the sled exceeded the limit. + -- Number of FMD resources reported by the sled exceeded the limit; + -- no partial data is recorded. 'too_many_resources' ); diff --git a/schema/crdb/inv-fmd/up01.sql b/schema/crdb/inv-fmd/up01.sql index 3aac46a3768..e813c601230 100644 --- a/schema/crdb/inv-fmd/up01.sql +++ b/schema/crdb/inv-fmd/up01.sql @@ -6,6 +6,7 @@ CREATE TYPE IF NOT EXISTS omicron.public.fmd_inventory_error_kind AS ENUM ( -- Number of FMD cases reported by the sled exceeded the producer's -- limit; no partial data is recorded. 'too_many_cases', - -- Number of FMD resources reported by the sled exceeded the limit. + -- Number of FMD resources reported by the sled exceeded the limit; + -- no partial data is recorded. 
'too_many_resources' ); diff --git a/sled-agent/types/versions/src/impls/inventory.rs b/sled-agent/types/versions/src/impls/inventory.rs index 654c2fcb1f1..b1ca6d9fe3e 100644 --- a/sled-agent/types/versions/src/impls/inventory.rs +++ b/sled-agent/types/versions/src/impls/inventory.rs @@ -1015,7 +1015,7 @@ impl fmt::Display for FmdResourceDisplay<'_> { writeln!(f, " fmri: {fmri}")?; writeln!( f, - " faulty: {faulty}, unusable: {unusable}, invisible: {invisible}" + " faulty: {faulty:<5} unusable: {unusable:<5} invisible: {invisible:<5}" )?; Ok(()) } @@ -1190,9 +1190,15 @@ mod tests { let case_uuid = FmdHostCaseUuid::from_untyped_uuid(Uuid::from_u128( 0xfeed_face_dead_beef_dead_beef_dead_beef, )); + let case_uuid_2 = FmdHostCaseUuid::from_untyped_uuid(Uuid::from_u128( + 0xfeed_face_dead_beef_dead_beef_dead_b00f, + )); let resource_uuid = FmdResourceUuid::from_untyped_uuid( Uuid::from_u128(0xbada_55ca_fe00_0000_0000_0000_0000_0001), ); + let resource_uuid_2 = FmdResourceUuid::from_untyped_uuid( + Uuid::from_u128(0xbada_55ca_fe00_0000_0000_0000_0000_0002), + ); let mut cases = IdOrdMap::new(); cases @@ -1207,6 +1213,14 @@ mod tests { })), }) .expect("case uuid is unique"); + cases + .insert_unique(FmdHostCase { + uuid: case_uuid_2, + code: "BOO-0000-FAKE".to_string(), + url: "http://example.invalid/msg/BOO-0000-FAKE".to_string(), + event: None, + }) + .expect("case uuid is unique"); let mut resources = IdOrdMap::new(); resources @@ -1219,6 +1233,16 @@ mod tests { invisible: false, }) .expect("resource uuid is unique"); + resources + .insert_unique(FmdResource { + uuid: resource_uuid_2, + fmri: "ghost:///also/not/a/real/fmri".to_string(), + case_id: case_uuid_2, + faulty: false, + unusable: true, + invisible: true, + }) + .expect("resource uuid is unique"); let ok: Result = Ok(FmdInventory { cases, resources }); diff --git a/sled-agent/types/versions/tests/output/fmd_inventory_display.txt b/sled-agent/types/versions/tests/output/fmd_inventory_display.txt index 
65e21f14250..e38555b235f 100644 --- a/sled-agent/types/versions/tests/output/fmd_inventory_display.txt +++ b/sled-agent/types/versions/tests/output/fmd_inventory_display.txt @@ -1,5 +1,7 @@ --- ok variant --- -cases (1): +cases (2): + case feedface-dead-beef-dead-beefdeadb00f (BOO-0000-FAKE) + url: http://example.invalid/msg/BOO-0000-FAKE case feedface-dead-beef-dead-beefdeadbeef (JOKE-9001-FAKE) url: http://example.invalid/msg/JOKE-9001-FAKE event: @@ -11,9 +13,12 @@ cases (1): "slimer" ] } -resources (1): +resources (2): resource bada55ca-fe00-0000-0000-000000000001 (case feedface-dead-beef-dead-beefdeadbeef) fmri: ghost:///not/a/real/fmri - faulty: true, unusable: false, invisible: false + faulty: true unusable: false invisible: false + resource bada55ca-fe00-0000-0000-000000000002 (case feedface-dead-beef-dead-beefdeadb00f) + fmri: ghost:///also/not/a/real/fmri + faulty: false unusable: true invisible: true --- err variant --- FMD collection failed: haunted by an absent fmd daemon