Skip to content

Commit a3a58cc

Browse files
authored
Add FMD fault inventory to sled-agent API (#10283)
Exposes data from [fmd-adm](https://github.com/oxidecomputer/fmd-adm) through the sled-agent inventory endpoint. We're extracting:

- Cases: diagnosed faults with UUID, diagnostic code, URL, and the full event nvlist serialized as JSON
- Resources: affected components with FMRI and fault status flags

I'm only exposing this through the API right now - Nexus isn't yet shoving it into the DB. Soon! But I wanted feedback on this data first.

To give you a sense of "what does case/resource data look like", here's what I pulled out of Atrium, using `fmd-adm`:

```
{ "type": "available", "value": { "cases": [ { "code": "SUNOS-8000-KL", "event": { "class": "list.suspect", "code": "SUNOS-8000-KL", "de": { "authority": { "chassis-id": "DL9016712A0001", "product-id": "R152-Z32-00", "server-id": "atrium", "version": 0 }, "mod-name": "software-diagnosis", "mod-version": "0.1", "scheme": "fmd", "version": 0 }, "diag-time": [ 1667146378, 127967 ], "fault-list": [ { "asru": { "object": { "path": "/var/crash/atrium/.359346d5-c134-c44c-b0fa-db4a08a292d4" }, "scheme": "sw", "version": 0 }, "certainty": 100, "class": "defect.sunos.kernel.panic", "crashtime": 1667146083, "dump-dir": "/var/crash/atrium", "dump-files": [ "vmdump.2" ], "os-instance-uuid": "359346d5-c134-c44c-b0fa-db4a08a292d4", "panic-time": "Sun Oct 30 16:08:03 2022 UTC", "panicstack": "genunix:kadmin+627 () | genunix:uadmin+17d () | unix:brand_sys_syscall32+186 () | ", "panicstr": "forced crash dump initiated at user request", "resource": { "object": { "path": "/var/crash/atrium/.359346d5-c134-c44c-b0fa-db4a08a292d4" }, "scheme": "sw", "version": 0 }, "savecore-succcess": true, "version": 0 } ], "fault-list-sz": 1, "fault-status": [ 1 ], "severity": "Major", "uuid": "359346d5-c134-c44c-b0fa-db4a08a292d4", "version": 0 }, "url": "http://illumos.org/msg/SUNOS-8000-KL", "uuid": "359346d5-c134-c44c-b0fa-db4a08a292d4" }, { "code": "PCIEX-8000-DJ", "event": { "class": "list.suspect", "code": "PCIEX-8000-DJ", "de": { "authority": {
"chassis-id": "DL9016712A0001", "product-id": "R152-Z32-00", "server-id": "atrium", "version": 0 }, "mod-name": "eft", "mod-version": "1.16", "scheme": "fmd", "version": 0 }, "diag-time": [ 1729703082, 937698 ], "fault-list": [ { "asru": { "device-path": "/pci@af,0/pci1022,1483@3,5/pci1458,0@0", "scheme": "dev", "version": 0 }, "certainty": 40, "class": "fault.io.pciex.device-interr", "fru": { "authority": { "chassis-id": "DL9016712A0001", "product-id": "R152-Z32-00", "server-id": "atrium" }, "hc-list": [ { "hc-id": "0", "hc-name": "motherboard" } ], "hc-root": "", "scheme": "hc", "version": 0 }, "location": "MB", "resource": { "authority": { "chassis-id": "DL9016712A0001", "product-id": "R152-Z32-00", "server-id": "atrium" }, "hc-list": [ { "hc-id": "0", "hc-name": "motherboard" }, { "hc-id": "19", "hc-name": "hostbridge" }, { "hc-id": "19", "hc-name": "pciexrc" }, { "hc-id": "195", "hc-name": "pciexbus" }, { "hc-id": "0", "hc-name": "pciexdev" }, { "hc-id": "0", "hc-name": "pciexfn" } ], "hc-list-sz": 6, "hc-root": "", "scheme": "hc", "version": 0 }, "version": 0 }, { "asru": { "device-path": "/pci@af,0/pci1022,1483@3,5", "scheme": "dev", "version": 0 }, "certainty": 20, "class": "fault.io.pciex.device-interr", "fru": { "authority": { "chassis-id": "DL9016712A0001", "product-id": "R152-Z32-00", "server-id": "atrium" }, "hc-list": [ { "hc-id": "0", "hc-name": "motherboard" } ], "hc-root": "", "scheme": "hc", "version": 0 }, "location": "MB", "resource": { "authority": { "chassis-id": "DL9016712A0001", "product-id": "R152-Z32-00", "server-id": "atrium" }, "hc-list": [ { "hc-id": "0", "hc-name": "motherboard" }, { "hc-id": "19", "hc-name": "hostbridge" }, { "hc-id": "19", "hc-name": "pciexrc" } ], "hc-list-sz": 3, "hc-root": "", "scheme": "hc", "version": 0 }, "version": 0 }, { "asru": { "device-path": "/pci@af,0/pci1022,1483@3,5/pci1458,0@0", "scheme": "dev", "version": 0 }, "certainty": 20, "class": "fault.io.pciex.bus-noresp", "fru": { "authority": { 
"chassis-id": "DL9016712A0001", "product-id": "R152-Z32-00", "server-id": "atrium" }, "hc-list": [ { "hc-id": "0", "hc-name": "motherboard" } ], "hc-root": "", "scheme": "hc", "version": 0 }, "location": "MB", "resource": { "authority": { "chassis-id": "DL9016712A0001", "product-id": "R152-Z32-00", "server-id": "atrium" }, "hc-list": [ { "hc-id": "0", "hc-name": "motherboard" }, { "hc-id": "19", "hc-name": "hostbridge" }, { "hc-id": "19", "hc-name": "pciexrc" }, { "hc-id": "195", "hc-name": "pciexbus" }, { "hc-id": "0", "hc-name": "pciexdev" }, { "hc-id": "0", "hc-name": "pciexfn" } ], "hc-list-sz": 6, "hc-root": "", "scheme": "hc", "version": 0 }, "version": 0 }, { "asru": { "device-path": "/pci@af,0/pci1022,1483@3,5", "scheme": "dev", "version": 0 }, "certainty": 20, "class": "fault.io.pciex.device-noresp", "fru": { "authority": { "chassis-id": "DL9016712A0001", "product-id": "R152-Z32-00", "server-id": "atrium" }, "hc-list": [ { "hc-id": "0", "hc-name": "motherboard" } ], "hc-root": "", "scheme": "hc", "version": 0 }, "location": "MB", "resource": { "authority": { "chassis-id": "DL9016712A0001", "product-id": "R152-Z32-00", "server-id": "atrium" }, "hc-list": [ { "hc-id": "0", "hc-name": "motherboard" }, { "hc-id": "19", "hc-name": "hostbridge" }, { "hc-id": "19", "hc-name": "pciexrc" } ], "hc-list-sz": 3, "hc-root": "", "scheme": "hc", "version": 0 }, "version": 0 } ], "fault-list-sz": 4, "fault-status": [ 1, 1, 1, 1 ], "severity": "Major", "uuid": "71b830c4-cef2-410b-afc8-9c6f504a3c02", "version": 0 }, "url": "http://illumos.org/msg/PCIEX-8000-DJ", "uuid": "71b830c4-cef2-410b-afc8-9c6f504a3c02" }, { "code": "SUNOS-8000-KL", "event": { "class": "list.suspect", "code": "SUNOS-8000-KL", "de": { "authority": { "chassis-id": "DL9016712A0001", "product-id": "R152-Z32-00", "server-id": "atrium", "version": 0 }, "mod-name": "software-diagnosis", "mod-version": "0.1", "scheme": "fmd", "version": 0 }, "diag-time": [ 1644520051, 664144 ], "fault-list": [ { "asru": { 
"object": { "path": "/var/crash/atrium/.8fbb2f00-47e0-ef18-b56d-d5475cae27f2" }, "scheme": "sw", "version": 0 }, "certainty": 100, "class": "defect.sunos.kernel.panic", "crashtime": 1644519866, "dump-dir": "/var/crash/atrium", "dump-files": [ "vmdump.0" ], "os-instance-uuid": "8fbb2f00-47e0-ef18-b56d-d5475cae27f2", "panic-time": "Thu Feb 10 19:04:26 2022 UTC", "panicstack": "unix:real_mode_stop_cpu_stage2_end+c60d () | unix:trap+1169 () | unix:cmntrap+e9 () | unix:bcopy+368 () | kstat:read_kstat_data+1c6 () | kstat:kstat_ioctl+5b () | genunix:cdev_ioctl+2b () | specfs:spec_ioctl+45 () | genunix:fop_ioctl+5b () | genunix:ioctl+153 () | unix:brand_sys_syscall32+186 () | ", "panicstr": "BAD TRAP: type=e (#pf Page fault) rp=fffffe00f5df4910 addr=fffffeb691b5139c", "resource": { "object": { "path": "/var/crash/atrium/.8fbb2f00-47e0-ef18-b56d-d5475cae27f2" }, "scheme": "sw", "version": 0 }, "savecore-succcess": true, "version": 0 } ], "fault-list-sz": 1, "fault-status": [ 1 ], "severity": "Major", "uuid": "8fbb2f00-47e0-ef18-b56d-d5475cae27f2", "version": 0 }, "url": "http://illumos.org/msg/SUNOS-8000-KL", "uuid": "8fbb2f00-47e0-ef18-b56d-d5475cae27f2" }, { "code": "SUNOS-8000-KL", "event": { "class": "list.suspect", "code": "SUNOS-8000-KL", "de": { "authority": { "chassis-id": "DL9016712A0001", "product-id": "R152-Z32-00", "server-id": "atrium", "version": 0 }, "mod-name": "software-diagnosis", "mod-version": "0.1", "scheme": "fmd", "version": 0 }, "diag-time": [ 1648150968, 960574 ], "fault-list": [ { "asru": { "object": { "path": "/var/crash/atrium/.934d446d-d1db-4f12-88f2-eadd1d0cae22" }, "scheme": "sw", "version": 0 }, "certainty": 100, "class": "defect.sunos.kernel.panic", "crashtime": 1648150784, "dump-dir": "/var/crash/atrium", "dump-files": [ "vmdump.1" ], "os-instance-uuid": "934d446d-d1db-4f12-88f2-eadd1d0cae22", "panic-time": "Thu Mar 24 19:39:44 2022 UTC", "panicstack": "unix:real_mode_stop_cpu_stage2_end+c60d () | unix:trap+1169 () | unix:cmntrap+e9 () | 
vmm:vmm_kstat_update_vcpu+23 () | kstat:read_kstat_data+f5 () | kstat:kstat_ioctl+5b () | genunix:cdev_ioctl+2b () | specfs:spec_ioctl+45 () | genunix:fop_ioctl+5b () | genunix:ioctl+153 () | unix:brand_sys_syscall32+186 () | ", "panicstr": "BAD TRAP: type=e (#pf Page fault) rp=fffffe00f734a950 addr=fffffeb1e67c43d0", "resource": { "object": { "path": "/var/crash/atrium/.934d446d-d1db-4f12-88f2-eadd1d0cae22" }, "scheme": "sw", "version": 0 }, "savecore-succcess": true, "version": 0 } ], "fault-list-sz": 1, "fault-status": [ 1 ], "severity": "Major", "uuid": "934d446d-d1db-4f12-88f2-eadd1d0cae22", "version": 0 }, "url": "http://illumos.org/msg/SUNOS-8000-KL", "uuid": "934d446d-d1db-4f12-88f2-eadd1d0cae22" }, { "code": "SUNOS-8000-J0", "event": { "class": "list.suspect", "code": "SUNOS-8000-J0", "de": { "authority": { "chassis-id": "DL9016712A0001", "product-id": "R152-Z32-00", "server-id": "atrium", "version": 0 }, "mod-name": "eft", "mod-version": "1.16", "scheme": "fmd", "version": 0 }, "diag-time": [ 1729703082, 591677 ], "fault-list": [ { "certainty": 50, "class": "defect.sunos.eft.unexpected_telemetry", "reason": "no valid path to component was found in ereport.io.pciex.rc.nfe-msg", "resource": { "device-path": "/pci@af,0", "scheme": "dev", "version": 0 }, "response": false, "retire": false, "version": 0 }, { "certainty": 50, "class": "fault.sunos.eft.unexpected_telemetry", "reason": "no valid path to component was found in ereport.io.pciex.rc.nfe-msg", "resource": { "device-path": "/pci@af,0", "scheme": "dev", "version": 0 }, "response": false, "retire": false, "version": 0 } ], "fault-list-sz": 2, "fault-status": [ 3, 3 ], "severity": "Major", "uuid": "cb7808a1-0ae4-4609-859f-772b541fdafb", "version": 0 }, "url": "http://illumos.org/msg/SUNOS-8000-J0", "uuid": "cb7808a1-0ae4-4609-859f-772b541fdafb" }, { "code": "SUNOS-8000-KL", "event": { "class": "list.suspect", "code": "SUNOS-8000-KL", "de": { "authority": { "chassis-id": "DL9016712A0001", "product-id": 
"R152-Z32-00", "server-id": "atrium", "version": 0 }, "mod-name": "software-diagnosis", "mod-version": "0.1", "scheme": "fmd", "version": 0 }, "diag-time": [ 1685805999, 471268 ], "fault-list": [ { "asru": { "object": { "path": "/var/crash/atrium/.f389ce27-4486-e994-9c34-c5836914f27f" }, "scheme": "sw", "version": 0 }, "certainty": 100, "class": "defect.sunos.kernel.panic", "crashtime": 1685805741, "dump-dir": "/var/crash/atrium", "dump-files": [ "vmdump.3" ], "os-instance-uuid": "f389ce27-4486-e994-9c34-c5836914f27f", "panic-time": "Sat Jun 3 15:22:21 2023 UTC", "panicstack": "fffffffff78b52f3 () | unix:av_dispatch_nmivect+32 () | unix:nmiint+155 () | unix:i86_mwait+12 () | unix:cpu_idle_mwait+14b () | unix:cpu_idle_adaptive+19 () | unix:idle+11b () | unix:thread_start+b () | ", "panicstr": "NMI received\n", "resource": { "object": { "path": "/var/crash/atrium/.f389ce27-4486-e994-9c34-c5836914f27f" }, "scheme": "sw", "version": 0 }, "savecore-succcess": true, "version": 0 } ], "fault-list-sz": 1, "fault-status": [ 1 ], "severity": "Major", "uuid": "f389ce27-4486-e994-9c34-c5836914f27f", "version": 0 }, "url": "http://illumos.org/msg/SUNOS-8000-KL", "uuid": "f389ce27-4486-e994-9c34-c5836914f27f" } ], "resources": [ { "case_id": "71b830c4-cef2-410b-afc8-9c6f504a3c02", "faulty": true, "fmri": "dev:////pci@af,0/pci1022,1483@3,5", "invisible": false, "unusable": false, "uuid": "d41964aa-62da-480b-bff1-35e0d442843c" }, { "case_id": "71b830c4-cef2-410b-afc8-9c6f504a3c02", "faulty": true, "fmri": "dev:////pci@af,0/pci1022,1483@3,5/pci1458,0@0", "invisible": false, "unusable": false, "uuid": "01f332af-ed19-42cf-a623-3b5767b513f7" }, { "case_id": "359346d5-c134-c44c-b0fa-db4a08a292d4", "faulty": true, "fmri": "sw:///:path=/var/crash/atrium/.359346d5-c134-c44c-b0fa-db4a08a292d4", "invisible": false, "unusable": false, "uuid": "4206a805-00e3-cb06-bf1a-8bf69f8c8be1" }, { "case_id": "8fbb2f00-47e0-ef18-b56d-d5475cae27f2", "faulty": true, "fmri": 
"sw:///:path=/var/crash/atrium/.8fbb2f00-47e0-ef18-b56d-d5475cae27f2", "invisible": false, "unusable": false, "uuid": "9a8e0d9a-a68c-6578-9d41-dacd39a4a819" }, { "case_id": "934d446d-d1db-4f12-88f2-eadd1d0cae22", "faulty": true, "fmri": "sw:///:path=/var/crash/atrium/.934d446d-d1db-4f12-88f2-eadd1d0cae22", "invisible": false, "unusable": false, "uuid": "607b8a84-4e32-44ad-97e6-b595faffdd1d" }, { "case_id": "f389ce27-4486-e994-9c34-c5836914f27f", "faulty": true, "fmri": "sw:///:path=/var/crash/atrium/.f389ce27-4486-e994-9c34-c5836914f27f", "invisible": false, "unusable": false, "uuid": "79411ef0-9ffc-e7f3-b9d6-9aa908881603" } ] } } ```
1 parent 17ceb72 commit a3a58cc

42 files changed

Lines changed: 1063 additions & 70 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

.cargo/xtask.toml

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -50,3 +50,12 @@ binary_allow_list = [
5050
"sled-agent",
5151
"sled-agent-sim",
5252
]
53+
54+
# libfmd_adm is the illumos Fault Management Daemon admin library, used by
55+
# sled-agent to collect FMD case/resource information for inventory.
56+
[libraries."libfmd_adm.so.1"]
57+
binary_allow_list = [
58+
"omicron-dev",
59+
"sled-agent",
60+
"sled-agent-sim",
61+
]

Cargo.lock

Lines changed: 50 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -514,6 +514,8 @@ filetime = "0.2.26"
514514
flate2 = "1.1.2"
515515
float-ord = "0.3.2"
516516
flume = "0.11.1"
517+
fmd-adm = { git = "https://github.com/oxidecomputer/fmd-adm", rev = "846361bf0a698a8c7efefd97b2828b9aa74858c4" }
518+
fmd-adm-sys = { git = "https://github.com/oxidecomputer/fmd-adm", rev = "846361bf0a698a8c7efefd97b2828b9aa74858c4" }
517519
foreign-types = "0.3.2"
518520
fs-err = "3.1.1"
519521
futures = "0.3.31"

clients/sled-agent-client/src/lib.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,11 @@ progenitor::generate_api!(
6060
ExternalIpConfig = sled_agent_types_versions::latest::instance::ExternalIpConfig,
6161
ExternalIpv4Config = sled_agent_types_versions::latest::instance::ExternalIpv4Config,
6262
ExternalIpv6Config = sled_agent_types_versions::latest::instance::ExternalIpv6Config,
63+
FmdHostCase = sled_agent_types_versions::latest::inventory::FmdHostCase,
64+
FmdInventory = sled_agent_types_versions::latest::inventory::FmdInventory,
65+
FmdInventoryError = sled_agent_types_versions::latest::inventory::FmdInventoryError,
66+
FmdInventoryErrorKind = sled_agent_types_versions::latest::inventory::FmdInventoryErrorKind,
67+
FmdResource = sled_agent_types_versions::latest::inventory::FmdResource,
6368
Generation = omicron_common::api::external::Generation,
6469
Hostname = omicron_common::api::external::Hostname,
6570
ImportExportPolicy = sled_agent_types_versions::latest::early_networking::ImportExportPolicy,

cockroach-admin/Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,10 @@ toml.workspace = true
3939

4040
omicron-workspace-hack.workspace = true
4141

42+
[target.'cfg(target_os = "illumos")'.dependencies]
43+
# See omicron-rpaths for more about the "fmd-adm-sys" dependency.
44+
fmd-adm-sys.workspace = true
45+
4246
[dev-dependencies]
4347
expectorate.workspace = true
4448
nexus-test-utils.workspace = true

dev-tools/omdb/Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,10 @@ update-engine.workspace = true
9696
url.workspace = true
9797
uuid.workspace = true
9898

99+
[target.'cfg(target_os = "illumos")'.dependencies]
100+
# See omicron-rpaths for more about the "fmd-adm-sys" dependency.
101+
fmd-adm-sys.workspace = true
102+
99103
[dev-dependencies]
100104
camino-tempfile.workspace = true
101105
expectorate.workspace = true

dev-tools/omicron-dev/Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,10 @@ signal-hook-tokio.workspace = true
3131
tokio.workspace = true
3232
toml.workspace = true
3333

34+
[target.'cfg(target_os = "illumos")'.dependencies]
35+
# See omicron-rpaths for more about the "fmd-adm-sys" dependency.
36+
fmd-adm-sys.workspace = true
37+
3438
[dev-dependencies]
3539
expectorate.workspace = true
3640
omicron-dev-lib.workspace = true

end-to-end-tests/Cargo.toml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,14 @@ name = "end-to-end-tests"
33
version = "0.1.0"
44
edition.workspace = true
55
license = "MPL-2.0"
6+
build = "build.rs"
67

78
[lints]
89
workspace = true
910

11+
[build-dependencies]
12+
omicron-rpaths.workspace = true
13+
1014
[dependencies]
1115
anstyle.workspace = true
1216
anyhow = { workspace = true, features = ["backtrace"] }
@@ -50,3 +54,7 @@ thiserror.workspace = true
5054
tokio = { workspace = true, features = ["macros", "rt-multi-thread"] }
5155
toml.workspace = true
5256
uuid.workspace = true
57+
58+
[target.'cfg(target_os = "illumos")'.dependencies]
59+
# See omicron-rpaths for more about the "fmd-adm-sys" dependency.
60+
fmd-adm-sys.workspace = true

end-to-end-tests/build.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
// This Source Code Form is subject to the terms of the Mozilla Public
2+
// License, v. 2.0. If a copy of the MPL was not distributed with this
3+
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
4+
5+
// See omicron-rpaths for documentation.
6+
// NOTE: This file MUST be kept in sync with the other build.rs files in this
7+
// repository.
8+
fn main() {
9+
omicron_rpaths::configure_default_omicron_rpaths();
10+
}

nexus/Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,10 @@ omicron-workspace-hack.workspace = true
166166
omicron-uuid-kinds.workspace = true
167167
zip = { workspace = true, features = ["jiff-02"] }
168168

169+
[target.'cfg(target_os = "illumos")'.dependencies]
170+
# See omicron-rpaths for more about the "fmd-adm-sys" dependency.
171+
fmd-adm-sys.workspace = true
172+
169173
[dev-dependencies]
170174
async-bb8-diesel.workspace = true
171175
camino-tempfile.workspace = true

0 commit comments

Comments
 (0)