Skip to content

Commit 7029ca8

Browse files
authored
Add support for NVMe drives in smartmon.py (#2035)
Adds support for collecting SMART metrics from NVMe drives using pySMART and smartctl's JSON output. Includes updates to the deployment playbooks, tests, and dashboards.
1 parent 8207567 commit 7029ca8

11 files changed

Lines changed: 997 additions & 488 deletions

File tree

etc/kayobe/ansible/deployment/get-nvme-drives.yml

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,31 @@
33
hosts: overcloud
44
gather_facts: no
55
tasks:
6-
- name: Retrieve NVMe device information
7-
ansible.builtin.command: "nvme list -o json"
8-
register: nvme_list
6+
- name: Scan for NVMe devices with smartctl
7+
ansible.builtin.command: "smartctl -d nvme --scan -j"
8+
register: smartctl_scan
99
changed_when: false
1010
become: true
1111

12+
- name: Extract NVMe device paths
13+
ansible.builtin.set_fact:
14+
nvme_devices: "{{ smartctl_scan.stdout | from_json | json_query('devices[].info_name') | default([]) }}"
15+
changed_when: false
16+
17+
- name: Retrieve NVMe device information via smartctl
18+
ansible.builtin.command: "smartctl -i -j {{ item }}"
19+
register: smartctl_info
20+
loop: "{{ nvme_devices }}"
21+
loop_control:
22+
label: "{{ item }}"
23+
changed_when: false
24+
become: true
25+
when: nvme_devices | length > 0
26+
1227
- name: Parse NVMe device model names
1328
ansible.builtin.set_fact:
14-
nvme_models: "{{ nvme_models | default([]) + [item.ModelNumber] }}"
15-
loop: "{{ nvme_list.stdout | from_json | json_query('Devices[].{ModelNumber: ModelNumber}') }}"
29+
nvme_models: "{{ nvme_models | default([]) + [item.model_name] }}"
30+
loop: "{{ smartctl_info.results | default([]) | map(attribute='stdout') | map('from_json') | selectattr('model_name', 'defined') | list }}"
1631
changed_when: false
1732

1833
- name: Set unique NVMe models as host facts

etc/kayobe/ansible/deployment/smartmon-tools.yml

Lines changed: 8 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,10 @@
22
- name: Install and set up SMART monitoring tools
33
hosts: overcloud
44
tasks:
5-
- name: Ensure smartmontools, jq, nvme-cli and cron/cronie are installed
5+
- name: Ensure smartmontools, jq, and cron/cronie are installed
66
ansible.builtin.package:
77
name:
88
- smartmontools
9-
- nvme-cli
109
- jq
1110
- "{{ 'cron' if ansible_facts['distribution'] == 'Ubuntu' else 'cronie' }}"
1211
state: present
@@ -54,7 +53,7 @@
5453
enabled: true
5554
become: true
5655

57-
- name: Copy smartmon.py and nvmemon.sh from scripts folder
56+
- name: Copy smartmon.py from scripts folder
5857
ansible.builtin.copy:
5958
src: "{{ lookup('env', 'KAYOBE_CONFIG_PATH') }}/ansible/scripts/{{ item }}"
6059
dest: /usr/local/bin/{{ item }}
@@ -63,7 +62,6 @@
6362
mode: "0700"
6463
loop:
6564
- smartmon.py
66-
- nvmemon.sh
6765
become: true
6866

6967
- name: Set PATH Variable for cron
@@ -84,17 +82,6 @@
8482
mv -f /var/lib/docker/volumes/textfile/_data/smartmon.prom.temp /var/lib/docker/volumes/textfile/_data/smartmon.prom
8583
become: true
8684

87-
- name: Schedule cronjob to run nvmemon.sh every 5 minutes and save output to file
88-
ansible.builtin.cron:
89-
name: SMART metrics for drive monitoring using nvmemon.sh
90-
user: root
91-
minute: "*/5"
92-
job: >-
93-
umask 0022 && /usr/local/bin/nvmemon.sh >
94-
/var/lib/docker/volumes/textfile/_data/nvmemon.prom.temp &&
95-
mv -f /var/lib/docker/volumes/textfile/_data/nvmemon.prom.temp /var/lib/docker/volumes/textfile/_data/nvmemon.prom
96-
become: true
97-
9885
- name: Remove old cronjobs if present
9986
ansible.builtin.cron:
10087
name: SMART metrics for drive monitoring using {{ item }}
@@ -104,11 +91,15 @@
10491
loop:
10592
- smartmon
10693
- nvmemon
94+
- nvmemon.sh
10795

108-
- name: Remove old smartmon.sh if present
96+
- name: Remove old monitoring scripts if present
10997
ansible.builtin.file:
110-
path: /usr/local/bin/smartmon.sh
98+
path: /usr/local/bin/{{ item }}
11199
state: absent
100+
loop:
101+
- smartmon.sh
102+
- nvmemon.sh
112103
become: true
113104

114105
- name: Gather NVMe drives and generate dwpd ratings

etc/kayobe/ansible/scripts/generate_fixtures.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#!/usr/bin/env python3
22
import json
33
import re
4+
import subprocess
45
from pySMART import DeviceList
56

67
SMARTMON_ATTRS = {
@@ -63,6 +64,8 @@
6364
"critical_comp_time",
6465
}
6566

67+
SMARTCTL_PATH = "/usr/sbin/smartctl"
68+
6669
DISK_INFO = {
6770
"name",
6871
"interface",
@@ -84,6 +87,15 @@ def camel_to_snake(name):
8487
"""
8588
return re.sub(r'(?<!^)(?=[A-Z])', '_', name).lower()
8689

90+
def canonical_device_path(name):
    """
    Return ``name`` as an absolute ``/dev`` path suitable for smartctl.

    Names that already begin with ``/dev/`` are returned untouched; anything
    else (for example a bare ``nvme0``) gets the ``/dev/`` prefix prepended.
    """
    # Already canonical: hand it back unchanged.
    if name.startswith("/dev/"):
        return name

    return f"/dev/{name}"
98+
8799
def attrs_to_dict(obj, allowed_keys):
88100
"""
89101
Build {attr: value} for every public, non-callable attribute whose
@@ -105,14 +117,52 @@ def attrs_to_dict(obj, allowed_keys):
105117
attributes[name] = value
106118
return attributes
107119

120+
def smartctl_json(device_name, device_type):
    """
    Run ``smartctl -x -j`` against a device and return the decoded JSON.

    An explicit ``-d <device_type>`` is passed for every device type except
    NVMe (compared case-insensitively) or an empty/missing type.

    Returns an empty dict when no device name is given, when smartctl cannot
    be launched (``OSError``), when it prints nothing on stdout, or when its
    output is not valid JSON.
    """
    if not device_name:
        return {}

    device_path = canonical_device_path(device_name)

    # Assemble the argument vector in its final order: options first, then
    # the optional "-d <type>" pair, and the device path last.
    argv = [SMARTCTL_PATH, "-x", "-j"]
    if device_type and device_type.lower() != "nvme":
        argv.extend(["-d", device_type])
    argv.append(device_path)

    try:
        # check=False: a non-zero exit status does not raise here, so any
        # JSON smartctl printed is still parsed below.
        completed = subprocess.run(
            argv,
            capture_output=True,
            check=False,
            text=True,
        )
    except OSError:
        return {}

    raw_output = completed.stdout
    if not raw_output:
        return {}

    try:
        payload = json.loads(raw_output)
    except json.JSONDecodeError:
        return {}
    return payload
155+
108156
for disk in DeviceList().devices:
109157

110158
fixtures = {}
111159
disk_info = attrs_to_dict(disk, DISK_INFO)
112160
if_stats = attrs_to_dict(disk.if_attributes, SMARTMON_ATTRS)
161+
smartctl_payload = smartctl_json(disk.name, disk.interface)
113162

114163
fixtures["device_info"] = disk_info
115164
fixtures["if_attributes"] = if_stats
165+
fixtures["smartctl"] = smartctl_payload
116166

117167
print(f'Disk: {disk.name}: \n')
118168
print(json.dumps(fixtures, indent=2, default=str))

etc/kayobe/ansible/scripts/nvmemon.sh

Lines changed: 0 additions & 150 deletions
This file was deleted.

0 commit comments

Comments
 (0)