Commit a3a58cc
authored
Add FMD fault inventory to sled-agent API (#10283)
Exposes data from [fmd-adm](https://github.com/oxidecomputer/fmd-adm)
through the sled-agent inventory endpoint.
We're extracting:
- Cases: diagnosed faults with UUID, diagnostic code, URL, and the full
event nvlist serialized as JSON
- Resources: affected components with FMRI and fault status flags
I'm only exposing this through the API right now - Nexus isn't yet
shoving it into the DB. Soon! But I wanted feedback
on this data first.
To give you a sense of "what does case/resource data look like", here's
what I pulled out of Atrium, using `fmd-adm`:
```
{
"type": "available",
"value": {
"cases": [
{
"code": "SUNOS-8000-KL",
"event": {
"class": "list.suspect",
"code": "SUNOS-8000-KL",
"de": {
"authority": {
"chassis-id": "DL9016712A0001",
"product-id": "R152-Z32-00",
"server-id": "atrium",
"version": 0
},
"mod-name": "software-diagnosis",
"mod-version": "0.1",
"scheme": "fmd",
"version": 0
},
"diag-time": [
1667146378,
127967
],
"fault-list": [
{
"asru": {
"object": {
"path": "/var/crash/atrium/.359346d5-c134-c44c-b0fa-db4a08a292d4"
},
"scheme": "sw",
"version": 0
},
"certainty": 100,
"class": "defect.sunos.kernel.panic",
"crashtime": 1667146083,
"dump-dir": "/var/crash/atrium",
"dump-files": [
"vmdump.2"
],
"os-instance-uuid": "359346d5-c134-c44c-b0fa-db4a08a292d4",
"panic-time": "Sun Oct 30 16:08:03 2022 UTC",
"panicstack": "genunix:kadmin+627 () | genunix:uadmin+17d () | unix:brand_sys_syscall32+186 () | ",
"panicstr": "forced crash dump initiated at user request",
"resource": {
"object": {
"path": "/var/crash/atrium/.359346d5-c134-c44c-b0fa-db4a08a292d4"
},
"scheme": "sw",
"version": 0
},
"savecore-succcess": true,
"version": 0
}
],
"fault-list-sz": 1,
"fault-status": [
1
],
"severity": "Major",
"uuid": "359346d5-c134-c44c-b0fa-db4a08a292d4",
"version": 0
},
"url": "http://illumos.org/msg/SUNOS-8000-KL",
"uuid": "359346d5-c134-c44c-b0fa-db4a08a292d4"
},
{
"code": "PCIEX-8000-DJ",
"event": {
"class": "list.suspect",
"code": "PCIEX-8000-DJ",
"de": {
"authority": {
"chassis-id": "DL9016712A0001",
"product-id": "R152-Z32-00",
"server-id": "atrium",
"version": 0
},
"mod-name": "eft",
"mod-version": "1.16",
"scheme": "fmd",
"version": 0
},
"diag-time": [
1729703082,
937698
],
"fault-list": [
{
"asru": {
"device-path": "/pci@af,0/pci1022,1483@3,5/pci1458,0@0",
"scheme": "dev",
"version": 0
},
"certainty": 40,
"class": "fault.io.pciex.device-interr",
"fru": {
"authority": {
"chassis-id": "DL9016712A0001",
"product-id": "R152-Z32-00",
"server-id": "atrium"
},
"hc-list": [
{
"hc-id": "0",
"hc-name": "motherboard"
}
],
"hc-root": "",
"scheme": "hc",
"version": 0
},
"location": "MB",
"resource": {
"authority": {
"chassis-id": "DL9016712A0001",
"product-id": "R152-Z32-00",
"server-id": "atrium"
},
"hc-list": [
{
"hc-id": "0",
"hc-name": "motherboard"
},
{
"hc-id": "19",
"hc-name": "hostbridge"
},
{
"hc-id": "19",
"hc-name": "pciexrc"
},
{
"hc-id": "195",
"hc-name": "pciexbus"
},
{
"hc-id": "0",
"hc-name": "pciexdev"
},
{
"hc-id": "0",
"hc-name": "pciexfn"
}
],
"hc-list-sz": 6,
"hc-root": "",
"scheme": "hc",
"version": 0
},
"version": 0
},
{
"asru": {
"device-path": "/pci@af,0/pci1022,1483@3,5",
"scheme": "dev",
"version": 0
},
"certainty": 20,
"class": "fault.io.pciex.device-interr",
"fru": {
"authority": {
"chassis-id": "DL9016712A0001",
"product-id": "R152-Z32-00",
"server-id": "atrium"
},
"hc-list": [
{
"hc-id": "0",
"hc-name": "motherboard"
}
],
"hc-root": "",
"scheme": "hc",
"version": 0
},
"location": "MB",
"resource": {
"authority": {
"chassis-id": "DL9016712A0001",
"product-id": "R152-Z32-00",
"server-id": "atrium"
},
"hc-list": [
{
"hc-id": "0",
"hc-name": "motherboard"
},
{
"hc-id": "19",
"hc-name": "hostbridge"
},
{
"hc-id": "19",
"hc-name": "pciexrc"
}
],
"hc-list-sz": 3,
"hc-root": "",
"scheme": "hc",
"version": 0
},
"version": 0
},
{
"asru": {
"device-path": "/pci@af,0/pci1022,1483@3,5/pci1458,0@0",
"scheme": "dev",
"version": 0
},
"certainty": 20,
"class": "fault.io.pciex.bus-noresp",
"fru": {
"authority": {
"chassis-id": "DL9016712A0001",
"product-id": "R152-Z32-00",
"server-id": "atrium"
},
"hc-list": [
{
"hc-id": "0",
"hc-name": "motherboard"
}
],
"hc-root": "",
"scheme": "hc",
"version": 0
},
"location": "MB",
"resource": {
"authority": {
"chassis-id": "DL9016712A0001",
"product-id": "R152-Z32-00",
"server-id": "atrium"
},
"hc-list": [
{
"hc-id": "0",
"hc-name": "motherboard"
},
{
"hc-id": "19",
"hc-name": "hostbridge"
},
{
"hc-id": "19",
"hc-name": "pciexrc"
},
{
"hc-id": "195",
"hc-name": "pciexbus"
},
{
"hc-id": "0",
"hc-name": "pciexdev"
},
{
"hc-id": "0",
"hc-name": "pciexfn"
}
],
"hc-list-sz": 6,
"hc-root": "",
"scheme": "hc",
"version": 0
},
"version": 0
},
{
"asru": {
"device-path": "/pci@af,0/pci1022,1483@3,5",
"scheme": "dev",
"version": 0
},
"certainty": 20,
"class": "fault.io.pciex.device-noresp",
"fru": {
"authority": {
"chassis-id": "DL9016712A0001",
"product-id": "R152-Z32-00",
"server-id": "atrium"
},
"hc-list": [
{
"hc-id": "0",
"hc-name": "motherboard"
}
],
"hc-root": "",
"scheme": "hc",
"version": 0
},
"location": "MB",
"resource": {
"authority": {
"chassis-id": "DL9016712A0001",
"product-id": "R152-Z32-00",
"server-id": "atrium"
},
"hc-list": [
{
"hc-id": "0",
"hc-name": "motherboard"
},
{
"hc-id": "19",
"hc-name": "hostbridge"
},
{
"hc-id": "19",
"hc-name": "pciexrc"
}
],
"hc-list-sz": 3,
"hc-root": "",
"scheme": "hc",
"version": 0
},
"version": 0
}
],
"fault-list-sz": 4,
"fault-status": [
1,
1,
1,
1
],
"severity": "Major",
"uuid": "71b830c4-cef2-410b-afc8-9c6f504a3c02",
"version": 0
},
"url": "http://illumos.org/msg/PCIEX-8000-DJ",
"uuid": "71b830c4-cef2-410b-afc8-9c6f504a3c02"
},
{
"code": "SUNOS-8000-KL",
"event": {
"class": "list.suspect",
"code": "SUNOS-8000-KL",
"de": {
"authority": {
"chassis-id": "DL9016712A0001",
"product-id": "R152-Z32-00",
"server-id": "atrium",
"version": 0
},
"mod-name": "software-diagnosis",
"mod-version": "0.1",
"scheme": "fmd",
"version": 0
},
"diag-time": [
1644520051,
664144
],
"fault-list": [
{
"asru": {
"object": {
"path": "/var/crash/atrium/.8fbb2f00-47e0-ef18-b56d-d5475cae27f2"
},
"scheme": "sw",
"version": 0
},
"certainty": 100,
"class": "defect.sunos.kernel.panic",
"crashtime": 1644519866,
"dump-dir": "/var/crash/atrium",
"dump-files": [
"vmdump.0"
],
"os-instance-uuid": "8fbb2f00-47e0-ef18-b56d-d5475cae27f2",
"panic-time": "Thu Feb 10 19:04:26 2022 UTC",
"panicstack": "unix:real_mode_stop_cpu_stage2_end+c60d () | unix:trap+1169 () | unix:cmntrap+e9 () | unix:bcopy+368 () | kstat:read_kstat_data+1c6 () | kstat:kstat_ioctl+5b () | genunix:cdev_ioctl+2b () | specfs:spec_ioctl+45 () | genunix:fop_ioctl+5b () | genunix:ioctl+153 () | unix:brand_sys_syscall32+186 () | ",
"panicstr": "BAD TRAP: type=e (#pf Page fault) rp=fffffe00f5df4910 addr=fffffeb691b5139c",
"resource": {
"object": {
"path": "/var/crash/atrium/.8fbb2f00-47e0-ef18-b56d-d5475cae27f2"
},
"scheme": "sw",
"version": 0
},
"savecore-succcess": true,
"version": 0
}
],
"fault-list-sz": 1,
"fault-status": [
1
],
"severity": "Major",
"uuid": "8fbb2f00-47e0-ef18-b56d-d5475cae27f2",
"version": 0
},
"url": "http://illumos.org/msg/SUNOS-8000-KL",
"uuid": "8fbb2f00-47e0-ef18-b56d-d5475cae27f2"
},
{
"code": "SUNOS-8000-KL",
"event": {
"class": "list.suspect",
"code": "SUNOS-8000-KL",
"de": {
"authority": {
"chassis-id": "DL9016712A0001",
"product-id": "R152-Z32-00",
"server-id": "atrium",
"version": 0
},
"mod-name": "software-diagnosis",
"mod-version": "0.1",
"scheme": "fmd",
"version": 0
},
"diag-time": [
1648150968,
960574
],
"fault-list": [
{
"asru": {
"object": {
"path": "/var/crash/atrium/.934d446d-d1db-4f12-88f2-eadd1d0cae22"
},
"scheme": "sw",
"version": 0
},
"certainty": 100,
"class": "defect.sunos.kernel.panic",
"crashtime": 1648150784,
"dump-dir": "/var/crash/atrium",
"dump-files": [
"vmdump.1"
],
"os-instance-uuid": "934d446d-d1db-4f12-88f2-eadd1d0cae22",
"panic-time": "Thu Mar 24 19:39:44 2022 UTC",
"panicstack": "unix:real_mode_stop_cpu_stage2_end+c60d () | unix:trap+1169 () | unix:cmntrap+e9 () | vmm:vmm_kstat_update_vcpu+23 () | kstat:read_kstat_data+f5 () | kstat:kstat_ioctl+5b () | genunix:cdev_ioctl+2b () | specfs:spec_ioctl+45 () | genunix:fop_ioctl+5b () | genunix:ioctl+153 () | unix:brand_sys_syscall32+186 () | ",
"panicstr": "BAD TRAP: type=e (#pf Page fault) rp=fffffe00f734a950 addr=fffffeb1e67c43d0",
"resource": {
"object": {
"path": "/var/crash/atrium/.934d446d-d1db-4f12-88f2-eadd1d0cae22"
},
"scheme": "sw",
"version": 0
},
"savecore-succcess": true,
"version": 0
}
],
"fault-list-sz": 1,
"fault-status": [
1
],
"severity": "Major",
"uuid": "934d446d-d1db-4f12-88f2-eadd1d0cae22",
"version": 0
},
"url": "http://illumos.org/msg/SUNOS-8000-KL",
"uuid": "934d446d-d1db-4f12-88f2-eadd1d0cae22"
},
{
"code": "SUNOS-8000-J0",
"event": {
"class": "list.suspect",
"code": "SUNOS-8000-J0",
"de": {
"authority": {
"chassis-id": "DL9016712A0001",
"product-id": "R152-Z32-00",
"server-id": "atrium",
"version": 0
},
"mod-name": "eft",
"mod-version": "1.16",
"scheme": "fmd",
"version": 0
},
"diag-time": [
1729703082,
591677
],
"fault-list": [
{
"certainty": 50,
"class": "defect.sunos.eft.unexpected_telemetry",
"reason": "no valid path to component was found in ereport.io.pciex.rc.nfe-msg",
"resource": {
"device-path": "/pci@af,0",
"scheme": "dev",
"version": 0
},
"response": false,
"retire": false,
"version": 0
},
{
"certainty": 50,
"class": "fault.sunos.eft.unexpected_telemetry",
"reason": "no valid path to component was found in ereport.io.pciex.rc.nfe-msg",
"resource": {
"device-path": "/pci@af,0",
"scheme": "dev",
"version": 0
},
"response": false,
"retire": false,
"version": 0
}
],
"fault-list-sz": 2,
"fault-status": [
3,
3
],
"severity": "Major",
"uuid": "cb7808a1-0ae4-4609-859f-772b541fdafb",
"version": 0
},
"url": "http://illumos.org/msg/SUNOS-8000-J0",
"uuid": "cb7808a1-0ae4-4609-859f-772b541fdafb"
},
{
"code": "SUNOS-8000-KL",
"event": {
"class": "list.suspect",
"code": "SUNOS-8000-KL",
"de": {
"authority": {
"chassis-id": "DL9016712A0001",
"product-id": "R152-Z32-00",
"server-id": "atrium",
"version": 0
},
"mod-name": "software-diagnosis",
"mod-version": "0.1",
"scheme": "fmd",
"version": 0
},
"diag-time": [
1685805999,
471268
],
"fault-list": [
{
"asru": {
"object": {
"path": "/var/crash/atrium/.f389ce27-4486-e994-9c34-c5836914f27f"
},
"scheme": "sw",
"version": 0
},
"certainty": 100,
"class": "defect.sunos.kernel.panic",
"crashtime": 1685805741,
"dump-dir": "/var/crash/atrium",
"dump-files": [
"vmdump.3"
],
"os-instance-uuid": "f389ce27-4486-e994-9c34-c5836914f27f",
"panic-time": "Sat Jun 3 15:22:21 2023 UTC",
"panicstack": "fffffffff78b52f3 () | unix:av_dispatch_nmivect+32 () | unix:nmiint+155 () | unix:i86_mwait+12 () | unix:cpu_idle_mwait+14b () | unix:cpu_idle_adaptive+19 () | unix:idle+11b () | unix:thread_start+b () | ",
"panicstr": "NMI received\n",
"resource": {
"object": {
"path": "/var/crash/atrium/.f389ce27-4486-e994-9c34-c5836914f27f"
},
"scheme": "sw",
"version": 0
},
"savecore-succcess": true,
"version": 0
}
],
"fault-list-sz": 1,
"fault-status": [
1
],
"severity": "Major",
"uuid": "f389ce27-4486-e994-9c34-c5836914f27f",
"version": 0
},
"url": "http://illumos.org/msg/SUNOS-8000-KL",
"uuid": "f389ce27-4486-e994-9c34-c5836914f27f"
}
],
"resources": [
{
"case_id": "71b830c4-cef2-410b-afc8-9c6f504a3c02",
"faulty": true,
"fmri": "dev:////pci@af,0/pci1022,1483@3,5",
"invisible": false,
"unusable": false,
"uuid": "d41964aa-62da-480b-bff1-35e0d442843c"
},
{
"case_id": "71b830c4-cef2-410b-afc8-9c6f504a3c02",
"faulty": true,
"fmri": "dev:////pci@af,0/pci1022,1483@3,5/pci1458,0@0",
"invisible": false,
"unusable": false,
"uuid": "01f332af-ed19-42cf-a623-3b5767b513f7"
},
{
"case_id": "359346d5-c134-c44c-b0fa-db4a08a292d4",
"faulty": true,
"fmri": "sw:///:path=/var/crash/atrium/.359346d5-c134-c44c-b0fa-db4a08a292d4",
"invisible": false,
"unusable": false,
"uuid": "4206a805-00e3-cb06-bf1a-8bf69f8c8be1"
},
{
"case_id": "8fbb2f00-47e0-ef18-b56d-d5475cae27f2",
"faulty": true,
"fmri": "sw:///:path=/var/crash/atrium/.8fbb2f00-47e0-ef18-b56d-d5475cae27f2",
"invisible": false,
"unusable": false,
"uuid": "9a8e0d9a-a68c-6578-9d41-dacd39a4a819"
},
{
"case_id": "934d446d-d1db-4f12-88f2-eadd1d0cae22",
"faulty": true,
"fmri": "sw:///:path=/var/crash/atrium/.934d446d-d1db-4f12-88f2-eadd1d0cae22",
"invisible": false,
"unusable": false,
"uuid": "607b8a84-4e32-44ad-97e6-b595faffdd1d"
},
{
"case_id": "f389ce27-4486-e994-9c34-c5836914f27f",
"faulty": true,
"fmri": "sw:///:path=/var/crash/atrium/.f389ce27-4486-e994-9c34-c5836914f27f",
"invisible": false,
"unusable": false,
"uuid": "79411ef0-9ffc-e7f3-b9d6-9aa908881603"
}
]
}
}
```
1 parent 17ceb72 commit a3a58cc
42 files changed
Lines changed: 1063 additions & 70 deletions
File tree
- .cargo
- clients/sled-agent-client/src
- cockroach-admin
- dev-tools
- omdb
- omicron-dev
- end-to-end-tests
- nexus
- db-queries
- src/db/datastore
- inventory
- src
- metrics-producer-gc
- mgs-updates/src/test_util
- reconfigurator
- cli-integration-tests
- execution
- planning/src
- mgs_updates
- saga-recovery
- test-utils
- ntp-admin
- openapi/sled-agent
- rpaths/src
- sled-agent
- api/src
- rack-setup/src
- plan
- src
- sim
- types/versions/src
- add_fmd_to_inventory
- uuid-kinds/src
Some content is hidden
Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
50 | 50 | | |
51 | 51 | | |
52 | 52 | | |
| 53 | + | |
| 54 | + | |
| 55 | + | |
| 56 | + | |
| 57 | + | |
| 58 | + | |
| 59 | + | |
| 60 | + | |
| 61 | + | |
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
514 | 514 | | |
515 | 515 | | |
516 | 516 | | |
| 517 | + | |
| 518 | + | |
517 | 519 | | |
518 | 520 | | |
519 | 521 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
60 | 60 | | |
61 | 61 | | |
62 | 62 | | |
| 63 | + | |
| 64 | + | |
| 65 | + | |
| 66 | + | |
| 67 | + | |
63 | 68 | | |
64 | 69 | | |
65 | 70 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
39 | 39 | | |
40 | 40 | | |
41 | 41 | | |
| 42 | + | |
| 43 | + | |
| 44 | + | |
| 45 | + | |
42 | 46 | | |
43 | 47 | | |
44 | 48 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
96 | 96 | | |
97 | 97 | | |
98 | 98 | | |
| 99 | + | |
| 100 | + | |
| 101 | + | |
| 102 | + | |
99 | 103 | | |
100 | 104 | | |
101 | 105 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
31 | 31 | | |
32 | 32 | | |
33 | 33 | | |
| 34 | + | |
| 35 | + | |
| 36 | + | |
| 37 | + | |
34 | 38 | | |
35 | 39 | | |
36 | 40 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
3 | 3 | | |
4 | 4 | | |
5 | 5 | | |
| 6 | + | |
6 | 7 | | |
7 | 8 | | |
8 | 9 | | |
9 | 10 | | |
| 11 | + | |
| 12 | + | |
| 13 | + | |
10 | 14 | | |
11 | 15 | | |
12 | 16 | | |
| |||
50 | 54 | | |
51 | 55 | | |
52 | 56 | | |
| 57 | + | |
| 58 | + | |
| 59 | + | |
| 60 | + | |
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
| 1 | + | |
| 2 | + | |
| 3 | + | |
| 4 | + | |
| 5 | + | |
| 6 | + | |
| 7 | + | |
| 8 | + | |
| 9 | + | |
| 10 | + | |
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
166 | 166 | | |
167 | 167 | | |
168 | 168 | | |
| 169 | + | |
| 170 | + | |
| 171 | + | |
| 172 | + | |
169 | 173 | | |
170 | 174 | | |
171 | 175 | | |
| |||
0 commit comments