1515 MessageReceived , NodeBase , WaypointPayload ,
1616 DevicePayload , TransmissionPayload , EnvironmentPayload ,
1717 NodeHealthCheck , InsufficientDataError ,
18- AdvertiseInstancePayload
18+ AdvertiseInstancePayload , AlertSettings
1919)
2020from meshtastic_listener .utils import coords_int_to_float , load_node_env_var , system_stats
2121
@@ -273,7 +273,7 @@ def __traceroute_upstream__(self) -> None:
273273 This function is designed to run in a thread in a loop.
274274 '''
275275
276- favorites = self .db .select_favorite_nodes ()
276+ favorites = self .db .get_favorite_nodes ()
277277 if len (favorites ) > 0 :
278278 logging .info (f'Favorite nodes set to: { [self .__sanitize_string__ (str (f .longName )) for f in favorites ]} ' )
279279 else :
@@ -328,6 +328,62 @@ def __advertise_instance__(self) -> None:
328328 self .__check_listener_instances__ ()
329329 self .__sleep_with_exit__ (60 )
330330
def __check_traceroute_responses__(self, alert_settings: AlertSettings, lookback_ts: int) -> None:
    '''
    Alert the admins about favorite nodes whose recent traceroutes mostly fail.

    For every favorite node, the traceroutes sent from the local node since
    lookback_ts are fetched and the percentage of attempts that carry an
    rxTime (counted as successful) is computed. When there are at least 3
    attempts and that percentage is at or below
    alert_settings.tracerouteFailureThreshold, the message is logged as a
    warning and sent to the admins as a priority notification.
    '''
    # a success rate over fewer attempts than this is too noisy to alert on
    min_attempts = 3

    for node in self.db.get_favorite_nodes():
        traceroute_results = self.db.get_traceroute_results_by_node(
            source_id=self.local_node_id,
            target_id=node.nodeNum,
            lookback_ts=lookback_ts
        )
        total = len(traceroute_results)
        if total < min_attempts:
            continue

        successful = sum(1 for t in traceroute_results if t.rxTime is not None)
        rate = successful / total * 100
        if rate > alert_settings.tracerouteFailureThreshold:
            continue

        long_name = self.__sanitize_string__(str(node.longName))
        alert_msg = (
            f'Low Traceroute Success Rate to favorite node {node.nodeNum} ({long_name}): '
            f'{rate:.2f}% over last {total} attempts.'
        )
        logging.warning(alert_msg)
        self.__notify_admins__(alert_msg, priority=True)
def __create_node_health_alert__(self, alert_settings: AlertSettings, health_check_stats: NodeHealthCheck) -> str:
    '''
    Build the alert text for every health metric that crosses its threshold.

    Channel usage, traceroute success rate, temperature, humidity, CPU usage
    and memory usage from health_check_stats are compared against the
    matching thresholds in alert_settings.

    Returns one newline-terminated line per triggered alert, in the order
    listed above, or an empty string when no threshold is crossed.
    '''
    alerts = []

    ### CHANNEL UTILIZATION ###
    if health_check_stats.channelUsage >= alert_settings.channelUsageThreshold:
        alerts.append(f'High Channel Usage: {health_check_stats.channelUsage}%')

    ### TRACEROUTE SUCCESS RATE ###
    trace_stats = health_check_stats.tracerouteStatistics
    trace_avg = trace_stats.average()
    if trace_avg <= alert_settings.tracerouteFailureThreshold and trace_stats.total >= 30:
        # 30 for minimum statistical significance
        alerts.append(f'Low TR Success Rate: {trace_avg}%')

    ### TEMPERATURE ###
    temperature = health_check_stats.environmentMetrics.temperature
    if temperature is not None:
        # the limits are configurable through alert_settings; for reference,
        # the rated ambient operating temperature for the Nebra Outdoor Miner is -20C to 80C
        # https://helium.nebra.com/datasheets/hotspots/outdoor/Nebra%20Outdoor%20Hotspot%20Datasheet.pdf
        if temperature >= alert_settings.highTemperatureThreshold:
            alerts.append(f'High Temperature: {temperature}°C')
        elif temperature <= alert_settings.lowTemperatureThreshold:
            alerts.append(f'Low Temperature: {temperature}°C')

    ### HUMIDITY ###
    humidity = health_check_stats.environmentMetrics.relativeHumidity
    if humidity is not None and humidity >= alert_settings.highHumidityThreshold:
        alerts.append(f'High Humidity: {humidity}%')

    ### SYSTEM STATS ###
    resources = health_check_stats.systemResources
    if resources.cpuUsagePercent >= alert_settings.cpuUsageThreshold:
        alerts.append(f'High CPU Usage: {resources.cpuUsagePercent}%')
    if resources.memoryUsagePercent >= alert_settings.memoryUsageThreshold:
        alerts.append(f'High Memory Usage: {resources.memoryUsagePercent}%')

    # every alert keeps its own trailing newline, so callers can strip() the result
    return ''.join(f'{line}\n' for line in alerts)
385+
386+
331387 def __check_node_health__ (self ) -> None :
332388 '''
333389 Using the software host node ID, pull the last n hours of metrics and see what general trends are.
@@ -341,11 +397,14 @@ def __check_node_health__(self) -> None:
341397 while not self .shutdown_flag .is_set ():
342398 try :
343399 settings = self .db .get_alert_settings ()
344- logging .debug (f'Fetched alert settings from DB: { settings .model_dump ()} ' )
345-
346400 now = time .time ()
347401 lookback_ts = int (now - timedelta (hours = lookback_hours ).total_seconds ())
348402
403+ self .__check_traceroute_responses__ (
404+ alert_settings = settings ,
405+ lookback_ts = lookback_ts
406+ )
407+
349408 health_check_stats = NodeHealthCheck (
350409 nodeNum = self .local_node_id ,
351410 startTs = lookback_ts ,
@@ -354,7 +413,7 @@ def __check_node_health__(self) -> None:
354413 node_num = self .local_node_id ,
355414 lookback_ts = lookback_ts
356415 ),
357- tracerouteStatistics = self .db .return_traceroute_success_rate (
416+ tracerouteStatistics = self .db .get_traceroute_success_rate (
358417 from_id = self .local_node_id ,
359418 lookback_ts = lookback_ts
360419 ),
@@ -365,39 +424,10 @@ def __check_node_health__(self) -> None:
365424 systemResources = system_stats ()
366425 )
367426
368- alert_context = ''
369-
370- ### CHANNEL UTILIZATION ###
371- if health_check_stats .channelUsage >= settings .channelUsageThreshold :
372- alert_context += f'High Channel Usage: { health_check_stats .channelUsage } %\n '
373-
374- ### TRACEROUTE SUCCESS RATE ###
375- trace_avg = health_check_stats .tracerouteStatistics .average ()
376- if trace_avg <= settings .tracerouteFailureThreshold and health_check_stats .tracerouteStatistics .total >= 30 :
377- # 30 for minimum statistical significance
378- alert_context += f'Low TR Success Rate: { trace_avg } %\n '
379-
380- ### TEMPERATURE ###
381- if health_check_stats .environmentMetrics .temperature is not None :
382- # https://helium.nebra.com/datasheets/hotspots/outdoor/Nebra%20Outdoor%20Hotspot%20Datasheet.pdf
383- # the rated ambient operating temperature for the Nebra Outdoor Miner is -20C to 80C
384- # give a buffer of +-20C for high and low temp warnings
385- if health_check_stats .environmentMetrics .temperature >= settings .highTemperatureThreshold :
386- alert_context += f'High Temperature: { health_check_stats .environmentMetrics .temperature } °C\n '
387- elif health_check_stats .environmentMetrics .temperature <= settings .lowTemperatureThreshold :
388- alert_context += f'Low Temperature: { health_check_stats .environmentMetrics .temperature } °C\n '
389-
390- ### HUMIDITY ###
391- if health_check_stats .environmentMetrics .relativeHumidity is not None :
392- if health_check_stats .environmentMetrics .relativeHumidity >= settings .highHumidityThreshold :
393- alert_context += f'High Humidity: { health_check_stats .environmentMetrics .relativeHumidity } %\n '
394-
395- ### SYSTEM STATS ###
396- if health_check_stats .systemResources .cpuUsagePercent >= settings .cpuUsageThreshold :
397- alert_context += f'High CPU Usage: { health_check_stats .systemResources .cpuUsagePercent } %\n '
398- if health_check_stats .systemResources .memoryUsagePercent >= settings .memoryUsageThreshold :
399- alert_context += f'High Memory Usage: { health_check_stats .systemResources .memoryUsagePercent } %\n '
400-
427+ alert_context = self .__create_node_health_alert__ (
428+ alert_settings = settings ,
429+ health_check_stats = health_check_stats
430+ )
401431 if alert_context != '' :
402432 self .__notify_admins__ (f'Node: { self .interface .getLongName ()} \n { alert_context .strip ()} ' , priority = True )
403433
0 commit comments