Skip to content

Commit 31bc5fa

Browse files
authored
Merge pull request #2322 from stackhpc/smartmon-historical-fails
Fix smartmon health status reporting
2 parents 6a27ea9 + f480e4a commit 31bc5fa

2 files changed

Lines changed: 79 additions & 1 deletion

File tree

etc/kayobe/ansible/scripts/smartmon.py

Lines changed: 68 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,65 @@ def load_dwpd_ratings(path=DWPD_RATINGS_PATH):
154154
# Populate DWPD_RATINGS by calling load_dwpd_ratings() with its default path argument.
DWPD_RATINGS = load_dwpd_ratings()
155155

156156

157+
# Helper: Identify historical temperature/airflow attribute failures
158+
def is_historical_temperature_attr_failure(attribute):
    """
    Return True when a pySMART attribute failure represents only a historical
    temperature/airflow threshold breach.

    Some disks keep WHEN_FAILED=In_the_past forever after an overheating event.
    pySMART turns that into assessment=WARN, which is useful to expose, but it
    should not make the main smart_healthy metric look like an active disk
    failure.

    Args:
        attribute: Object exposing optional ``when_failed`` and ``name``
            attributes. Missing or falsy values are treated as empty strings,
            so ``None`` entries are accepted and yield False.

    Returns:
        bool: True only when ``when_failed`` normalises (strip + lower-case)
        to ``"in_the_past"`` and the normalised name contains
        ``"temperature"`` or ``"airflow"``.
    """
    # Normalise defensively: the field may be absent, None, or oddly cased.
    when_failed = str(getattr(attribute, "when_failed", "") or "").strip().lower()
    if when_failed != "in_the_past":
        # Anything other than a past failure is not "historical".
        return False

    name = str(getattr(attribute, "name", "") or "").strip().lower()
    return any(keyword in name for keyword in ("temperature", "airflow"))
175+
176+
177+
def get_failed_smart_attributes(device):
    """
    Return pySMART attributes with a meaningful WHEN_FAILED value.

    Args:
        device: Object exposing an optional ``attributes`` iterable. A missing
            or falsy ``attributes`` value is treated as an empty list.

    Returns:
        list: The entries of ``device.attributes``, in their original order,
        whose ``when_failed`` value (stripped and lower-cased) is non-empty
        and is not one of ``"-"``, ``"none"`` or ``"never"``. Entries without
        a ``when_failed`` attribute (including ``None`` entries) are skipped.
    """
    # Normalised WHEN_FAILED values that mean "this attribute has not failed".
    # The empty string covers missing, None and whitespace-only values.
    not_failed = {"", "-", "none", "never"}
    return [
        attribute
        for attribute in getattr(device, "attributes", []) or []
        if str(getattr(attribute, "when_failed", "") or "").strip().lower()
        not in not_failed
    ]
187+
188+
189+
def smart_health_value(device):
    """
    Convert pySMART assessment into the exported healthy metric.

    PASS is healthy. WARN is also treated as healthy only when every failed
    attribute is a historical temperature/airflow threshold breach. Other WARN
    states, FAIL states, current failures, and non-temperature historical
    failures remain unhealthy.

    Args:
        device: Object exposing an ``assessment`` value and, for the WARN
            case, an ``attributes`` iterable. A missing or falsy assessment
            is treated as unhealthy.

    Returns:
        int: 1 when the device should be reported healthy, otherwise 0.
    """
    # Read the assessment defensively, like the attribute helpers do, so an
    # object without the field is reported unhealthy instead of raising.
    assessment = str(getattr(device, "assessment", "") or "").strip().upper()

    if assessment == "PASS":
        return 1
    if assessment != "WARN":
        return 0

    failed_attrs = get_failed_smart_attributes(device)
    # all() is True for an empty iterable, so explicitly require at least one
    # failed attribute: a WARN with no identifiable cause stays unhealthy.
    only_historical_temperature = bool(failed_attrs) and all(
        is_historical_temperature_attr_failure(attribute)
        for attribute in failed_attrs
    )
    return 1 if only_historical_temperature else 0
214+
215+
157216
def get_rated_dwpd(model_name):
158217
"""
159218
Look up DWPD rating for the given model name, defaulting to 1.0.
@@ -224,6 +283,7 @@ def parse_device_info(device):
224283
"device_model": device.model or "",
225284
"serial_number": serial_number,
226285
"firmware_version": device.firmware or "",
286+
"assessment": device.assessment or "",
227287
}
228288
sorted_labels = sorted(labels.items())
229289
label_str = ",".join(f'{k}="{v}"' for k, v in sorted_labels)
@@ -240,10 +300,17 @@ def parse_device_info(device):
240300
f'smartmon_device_smart_enabled{{{metric_labels}}} {float(1) if device.smart_enabled else float(0)}'
241301
)
242302
if device.assessment:
243-
is_healthy = 1 if device.assessment.upper() == "PASS" else 0
303+
is_healthy = smart_health_value(device)
244304
metrics.append(
245305
f'smartmon_device_smart_healthy{{{metric_labels}}} {float(is_healthy)}'
246306
)
307+
failed_attrs = get_failed_smart_attributes(device)
308+
historical_temperature_attr_failure = 1 if failed_attrs and all(
309+
is_historical_temperature_attr_failure(attribute) for attribute in failed_attrs
310+
) else 0
311+
metrics.append(
312+
f'smartmon_device_historical_temperature_failure{{{metric_labels}}} {float(historical_temperature_attr_failure)}'
313+
)
247314

248315
# Explicitly collect top-level temperature if available (fixes SCSI temperature issue)
249316
# pySMART exposes 'temperature' as a top-level property which we can use for SCSI,
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
---
2+
features:
3+
- |
4+
The smartmon exporter now treats historical SMART temperature and airflow
5+
threshold breaches as non-critical when calculating the
6+
smartmon_device_smart_healthy metric. This prevents disks with only a past
7+
over-temperature event from being reported as actively unhealthy.
8+
9+
A new smartmon_device_historical_temperature_failure metric is exported so
10+
these historical temperature or airflow threshold breaches can still be
11+
viewed and alerted on separately as a warning if required.

0 commit comments

Comments
 (0)