Skip to content

Commit 8bde2e5

Browse files
committed
tests: add basic system health tests
Signed-off-by: Paul Spooren <mail@aparcar.org>
1 parent effe22c commit 8bde2e5

1 file changed

Lines changed: 192 additions & 0 deletions

File tree

tests/test_system_health.py

Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,192 @@
1+
"""System health monitoring tests for OpenWrt."""
2+
3+
import re
4+
5+
6+
class TestSystemHealth:
7+
"""Tests for monitoring system health and resource usage."""
8+
9+
def test_cpu_load(self, ssh_command, results_bag):
10+
"""Test CPU load is within acceptable limits."""
11+
# Get load average for 1, 5, and 15 minutes
12+
output = ssh_command.run_check("uptime")
13+
load_match = re.search(r"load average: ([\d.]+), ([\d.]+), ([\d.]+)", output[0])
14+
15+
assert load_match, "Could not parse load average"
16+
17+
load_1min = float(load_match.group(1))
18+
load_5min = float(load_match.group(2))
19+
load_15min = float(load_match.group(3))
20+
21+
results_bag["cpu_load"] = {
22+
"1min": load_1min,
23+
"5min": load_5min,
24+
"15min": load_15min,
25+
}
26+
27+
# Load should generally be less than 2x CPU count for healthy system
28+
assert load_15min < 1, f"15-minute load average {load_15min} is stragely high"
29+
30+
def test_memory_usage(self, ssh_command, results_bag):
31+
"""Test memory usage and check for memory leaks."""
32+
# Parse memory information
33+
mem_output = ssh_command.run_check("free -m")
34+
mem_lines = mem_output[1].split()
35+
36+
total_mem = int(mem_lines[1])
37+
used_mem = int(mem_lines[2])
38+
free_mem = int(mem_lines[3])
39+
available_mem = int(mem_lines[6]) if len(mem_lines) > 6 else free_mem
40+
41+
# Calculate percentage
42+
mem_percent = (used_mem / total_mem) * 100
43+
44+
results_bag["memory_usage"] = {
45+
"total_mb": total_mem,
46+
"used_mb": used_mem,
47+
"free_mb": free_mem,
48+
"available_mb": available_mem,
49+
"percent_used": round(mem_percent, 2),
50+
}
51+
52+
# Memory usage should be less than 90%
53+
assert mem_percent < 90, f"Memory usage {mem_percent:.1f}% is too high"
54+
55+
# Should have at least 10MB available
56+
assert available_mem > 10, f"Only {available_mem}MB available memory"
57+
58+
def test_filesystem_usage(self, ssh_command, results_bag):
59+
"""Test filesystem usage on critical mount points."""
60+
# Check key filesystems
61+
filesystems = ["/", "/tmp", "/overlay"]
62+
fs_usage = {}
63+
64+
for fs in filesystems:
65+
# Skip if filesystem doesn't exist
66+
if ssh_command.run(f"test -d {fs}")[2] != 0:
67+
continue
68+
69+
df_output = ssh_command.run_check(f"df -h {fs} | tail -1")
70+
parts = df_output[0].split()
71+
72+
if len(parts) >= 5:
73+
fs_usage[fs] = {
74+
"filesystem": parts[0],
75+
"size": parts[1],
76+
"used": parts[2],
77+
"available": parts[3],
78+
"percent_used": int(parts[4].rstrip("%")),
79+
}
80+
81+
results_bag["filesystem_usage"] = fs_usage
82+
83+
# Check critical filesystems aren't full
84+
for fs, usage in fs_usage.items():
85+
assert usage["percent_used"] < 95, (
86+
f"Filesystem {fs} is {usage['percent_used']}% full"
87+
)
88+
89+
def test_system_uptime(self, ssh_command, results_bag):
90+
"""Test and record system uptime."""
91+
uptime_output = ssh_command.run_check("cat /proc/uptime")
92+
uptime_seconds = float(uptime_output[0].split()[0])
93+
94+
assert uptime_seconds > 3600.0, "System uptime is over 1 hour"
95+
96+
def test_temperature_sensors(self, ssh_command, results_bag):
97+
"""Test temperature sensors if available."""
98+
# Check if thermal zones exist
99+
thermal_zones = ssh_command.run(
100+
"ls /sys/class/thermal/thermal_zone*/temp 2>/dev/null"
101+
)[0]
102+
103+
if thermal_zones:
104+
temperatures = {}
105+
for zone in thermal_zones:
106+
if zone:
107+
temp_raw = ssh_command.run_check(f"cat {zone}")[0]
108+
temp_celsius = int(temp_raw) / 1000
109+
zone_name = zone.split("/")[-2]
110+
temperatures[zone_name] = temp_celsius
111+
112+
results_bag["temperatures"] = temperatures
113+
114+
# Check if any temperature is critically high (>85°C)
115+
for zone, temp in temperatures.items():
116+
assert temp < 85, f"Temperature in {zone} is critically high: {temp}°C"
117+
118+
def test_kernel_ring_buffer(self, ssh_command):
119+
"""Check kernel ring buffer for critical errors."""
120+
dmesg_output = "\n".join(ssh_command.run_check("dmesg"))
121+
122+
# Critical error patterns to check
123+
critical_patterns = [
124+
r"Out of memory",
125+
r"Kernel panic",
126+
r"BUG:",
127+
r"WARNING:",
128+
r"Unable to handle kernel",
129+
r"Oops:",
130+
r"segfault",
131+
r"stack overflow",
132+
r"corruption",
133+
r"hung task",
134+
]
135+
136+
errors_found = []
137+
for pattern in critical_patterns:
138+
matches = re.findall(f".*{pattern}.*", dmesg_output, re.IGNORECASE)
139+
if matches:
140+
errors_found.extend(matches)
141+
142+
assert not errors_found, (
143+
f"Critical errors found in kernel log: {errors_found[:5]}"
144+
) # Show first 5
145+
146+
def test_process_count(self, ssh_command, results_bag):
147+
"""Test number of running processes is reasonable."""
148+
proc_count = int(ssh_command.run_check("ps | wc -l")[0])
149+
150+
results_bag["process_count"] = proc_count
151+
152+
# Alert if too many processes (possible fork bomb or resource leak)
153+
assert proc_count < 300, f"Too many processes running: {proc_count}"
154+
155+
# Alert if too few processes (system might not be fully functional)
156+
assert proc_count > 20, f"Too few processes running: {proc_count}"
157+
158+
def test_zombie_processes(self, ssh_command):
159+
"""Check for zombie processes."""
160+
zombies = ssh_command.run("ps | grep -E '\\s+Z\\s+' | grep -v grep")[0]
161+
162+
zombie_count = len(zombies)
163+
164+
assert zombie_count == 0, f"Found {zombie_count} zombie processes:\n{zombies}"
165+
166+
def test_entropy_available(self, ssh_command):
167+
"""Test that sufficient entropy is available for cryptographic operations."""
168+
entropy = int(
169+
ssh_command.run_check("cat /proc/sys/kernel/random/entropy_avail")[0]
170+
)
171+
172+
# Should have at least 256 bits of entropy
173+
assert entropy >= 256, f"Insufficient entropy available: {entropy} bits"
174+
175+
def test_open_file_descriptors(self, ssh_command, results_bag):
176+
"""Test system-wide open file descriptors."""
177+
# Get current and max file descriptors
178+
fd_info = ssh_command.run_check("cat /proc/sys/fs/file-nr")[0].split()
179+
180+
allocated_fds = int(fd_info[0])
181+
max_fds = int(fd_info[2])
182+
183+
fd_percent = (allocated_fds / max_fds) * 100
184+
185+
results_bag["file_descriptors"] = {
186+
"allocated": allocated_fds,
187+
"maximum": max_fds,
188+
"percent_used": round(fd_percent, 2),
189+
}
190+
191+
# Should not be close to the limit
192+
assert fd_percent < 80, f"Too many file descriptors in use: {fd_percent:.1f}%"

0 commit comments

Comments
 (0)