@@ -154,6 +154,65 @@ def load_dwpd_ratings(path=DWPD_RATINGS_PATH):
154154DWPD_RATINGS = load_dwpd_ratings ()
155155
156156
157+ # Helper: Identify historical temperature/airflow attribute failures
def is_historical_temperature_attr_failure(attribute):
    """
    Tell whether a pySMART attribute records only a past thermal breach.

    Some disks keep WHEN_FAILED=In_the_past forever after an overheating
    event, and pySMART reports that as assessment=WARN. The breach is worth
    exposing, but it should not make the main smart_healthy metric look like
    an active disk failure.

    The ``when_failed`` and ``name`` fields are read defensively: a missing
    or falsy value counts as an empty string, and both are trimmed and
    lower-cased before comparison.

    Returns True only when ``when_failed`` is ``in_the_past`` and the name
    contains ``temperature`` or ``airflow``; otherwise False.
    """
    def _normalized(field):
        # Missing fields and falsy values (None, "") both collapse to "".
        raw = getattr(attribute, field, "")
        return str(raw or "").strip().lower()

    failure_state = _normalized("when_failed")
    attr_name = _normalized("name")

    # Anything other than a purely historical failure is out of scope here.
    if failure_state != "in_the_past":
        return False

    return any(keyword in attr_name for keyword in ("temperature", "airflow"))
175+
176+
def get_failed_smart_attributes(device):
    """
    Collect the pySMART attributes of *device* with a meaningful WHEN_FAILED.

    ``device.attributes`` is read defensively: a missing or falsy value is
    treated as an empty list. Each entry's ``when_failed`` is trimmed and
    lower-cased; entries whose value is empty, ``-``, ``none`` or ``never``
    are left out (entries without the field at all count as empty).

    Returns a new list of the remaining attributes in their original order.
    """
    # Normalised WHEN_FAILED values that mean "this attribute never failed".
    no_failure_markers = ("", "-", "none", "never")
    candidates = getattr(device, "attributes", []) or []

    return [
        entry
        for entry in candidates
        if str(getattr(entry, "when_failed", "") or "").strip().lower()
        not in no_failure_markers
    ]
187+
188+
def smart_health_value(device):
    """
    Map a device's pySMART assessment onto the exported healthy metric.

    The assessment is trimmed and upper-cased (a falsy assessment counts as
    an empty string) and then interpreted as follows:

    * ``PASS`` -> 1.
    * ``WARN`` -> 1 only when the device has at least one failed attribute
      and every failed attribute is a historical temperature/airflow
      threshold breach; otherwise 0.
    * anything else (FAIL, empty, unknown) -> 0.

    Returns the integer 1 for healthy and 0 for unhealthy.
    """
    verdict = str(device.assessment or "").strip().upper()

    if verdict == "PASS":
        return 1

    if verdict == "WARN":
        flagged = get_failed_smart_attributes(device)
        # A WARN is forgiven only when there is something to explain it and
        # all of it is past overheating; an unexplained WARN stays unhealthy.
        if flagged and all(map(is_historical_temperature_attr_failure, flagged)):
            return 1

    return 0
214+
215+
157216def get_rated_dwpd (model_name ):
158217 """
159218 Look up DWPD rating for the given model name, defaulting to 1.0.
@@ -224,6 +283,7 @@ def parse_device_info(device):
224283 "device_model" : device .model or "" ,
225284 "serial_number" : serial_number ,
226285 "firmware_version" : device .firmware or "" ,
286+ "assessment" : device .assessment or "" ,
227287 }
228288 sorted_labels = sorted (labels .items ())
229289 label_str = "," .join (f'{ k } ="{ v } "' for k , v in sorted_labels )
@@ -240,10 +300,17 @@ def parse_device_info(device):
240300 f'smartmon_device_smart_enabled{{{ metric_labels } }} { float (1 ) if device .smart_enabled else float (0 )} '
241301 )
242302 if device .assessment :
243- is_healthy = 1 if device . assessment . upper () == "PASS" else 0
303+ is_healthy = smart_health_value ( device )
244304 metrics .append (
245305 f'smartmon_device_smart_healthy{{{ metric_labels } }} { float (is_healthy )} '
246306 )
307+ failed_attrs = get_failed_smart_attributes (device )
308+ historical_temperature_attr_failure = 1 if failed_attrs and all (
309+ is_historical_temperature_attr_failure (attribute ) for attribute in failed_attrs
310+ ) else 0
311+ metrics .append (
312+ f'smartmon_device_historical_temperature_failure{{{ metric_labels } }} { float (historical_temperature_attr_failure )} '
313+ )
247314
248315 # Explicitly collect top-level temperature if available (fixes SCSI temperature issue)
249316 # pySMART exposes 'temperature' as a top-level property which we can use for SCSI,
0 commit comments