Skip to content

Commit 77e0499

Browse files
committed
Merge branch 'cluster-health'
2 parents 71eaeec + 512189e commit 77e0499

8 files changed

Lines changed: 357 additions & 2 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,4 @@ check_vsphere
99
check_vsphere_*
1010
*.zip
1111
dist/
12+
constraints.txt

CHANGES.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
# v0.3.21
2+
3+
* introduce `cluster-health` mode
4+
15
# v0.3.20
26

37
* fix some exception handling bug

Makefile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
.PHONY: all clean test
22

33
PYTHON=python3
4+
CONSTRAINTS := $(if $(wildcard constraints.txt),-c constraints.txt,)
45

56
all: check_vsphere_bundle check_vsphere
67

78
check_vsphere_bundle:
8-
pip install --no-cache-dir --no-compile --target allinone .
9+
pip install $(CONSTRAINTS) --no-cache-dir --no-compile --target allinone .
910
mv allinone/bin/check_vsphere allinone/__main__.py
1011
$(PYTHON) -m zipapp -c -p '/usr/bin/env python3' allinone
1112
rm -rf allinone
Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
import unittest
2+
from monplugin import Status
3+
from typing import List
4+
5+
6+
def check_cluster_health(failed: int, members: int, thresholds: List[str]) -> Status:
    """Translate a count of failed cluster members into a monitoring status.

    Args:
        failed: Number of members currently considered failed.
        members: Total number of members in the cluster.
        thresholds: Rules of the form ``[max_members:]warn:crit``. ``warn``
            and ``crit`` are absolute counts or percentages of ``members``
            (e.g. ``"30%"``). A rule without ``max_members`` is the fallback
            for any cluster size; exactly one fallback is required.

    Returns:
        ``Status.CRITICAL`` when ``failed`` reaches the critical limit of the
        applicable rule, ``Status.WARNING`` when it reaches the warning limit,
        ``Status.OK`` otherwise. The applicable rule is the one with the
        smallest ``max_members`` that is still ``>= members``.

    Raises:
        ValueError: If a rule is malformed, contains a non-numeric value,
            repeats a ``max_members`` (or the fallback), or if no fallback
            rule is given. Every rule is validated, not only the one that
            ends up being selected.
    """

    def parse_threshold(t_str: str) -> tuple:
        """Parse threshold string and return (max_members, warn_str, crit_str)."""
        parts = t_str.split(":")

        if len(parts) == 3:
            try:
                max_m = int(parts[0])
            except ValueError:
                # The int() traceback adds nothing for the user.
                raise ValueError(
                    f"Invalid max_members value: '{parts[0]}'"
                ) from None
            return (max_m, parts[1], parts[2])
        if len(parts) == 2:
            # No max_members: this rule is the fallback for every size.
            return (float("inf"), parts[0], parts[1])
        raise ValueError(f"Malformed threshold string: '{t_str}'")

    def resolve_value(s: str) -> float:
        """Resolve a threshold value (can be a number or percentage)."""
        text = s.strip()
        is_percent = text.endswith("%")
        number = text[:-1] if is_percent else text
        try:
            value = float(number)
        except ValueError:
            raise ValueError(f"Invalid threshold value: '{s}'") from None
        # NaN compares false against everything and would silently disable
        # the alert, so treat it as a configuration error.
        if value != value:
            raise ValueError(f"Invalid threshold value: '{s}'")
        if is_percent:
            return (value / 100.0) * members
        return value

    # Parse and validate all thresholds
    parsed = {}
    for t_str in thresholds:
        max_m, warn_str, crit_str = parse_threshold(t_str)

        if max_m in parsed:
            label = (
                "Fallback (Infinity)"
                if max_m == float("inf")
                else f"max_members {max_m}"
            )
            raise ValueError(f"Duplicate threshold configuration found for {label}")

        # Resolve eagerly so a broken rule for another cluster size is
        # reported now instead of when the cluster happens to grow/shrink.
        resolve_value(warn_str)
        resolve_value(crit_str)

        parsed[max_m] = (warn_str, crit_str)

    if float("inf") not in parsed:
        raise ValueError(
            "No fallback threshold (Infinity) provided in thresholds list."
        )

    # Select the tightest threshold applicable to `members`; the fallback
    # (infinity) guarantees that at least one candidate exists.
    warn_str, crit_str = parsed[min(m for m in parsed if m >= members)]

    # Compare failed count against resolved limits
    if failed >= resolve_value(crit_str):
        return Status.CRITICAL
    if failed >= resolve_value(warn_str):
        return Status.WARNING
    return Status.OK
58+
59+
60+
class TestThreshold(unittest.TestCase):
    """Table-driven checks of check_cluster_health."""

    def _expect(self, expected, thresholds, cases):
        """Assert that every (failed, members) pair yields ``expected``."""
        for failed, members in cases:
            self.assertEqual(
                check_cluster_health(failed, members, thresholds), expected
            )

    def test_it(self):
        thresholds = ["3:1:1", "4:1:2", "5:1:2", "10:1:30%", "3:30%"]

        self._expect(Status.OK, thresholds, [
            (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6), (0, 9),
            (0, 10), (0, 11), (1, 11), (0, 100), (1, 100), (2, 100),
        ])

        self._expect(Status.WARNING, thresholds, [
            (1, 4), (1, 5), (1, 6), (1, 9), (1, 10), (2, 10),
            (3, 100), (28, 100), (29, 100),
        ])

        self._expect(Status.CRITICAL, thresholds, [
            (1, 1), (2, 1), (1, 2), (9, 3), (2, 4), (3, 4), (3, 4),
            (2, 5), (3, 5), (3, 10), (30, 100), (39, 100), (1000, 100),
        ])

        # A lone two-part rule is the fallback: warn at 1, crit at 100 %.
        fallback_only = ["1:100%"]
        self._expect(Status.OK, fallback_only, [(0, 10)])
        self._expect(Status.WARNING, fallback_only, [(1, 10)])
        self._expect(Status.CRITICAL, fallback_only, [(1, 1)])

    def test_duplicate_error(self):
        duplicate_configs = (
            ["10:1:2", "10:2:3", "5:6"],  # max_members 10 given twice
            ["10:1:2", "2:3", "5:6"],     # two fallback rules
        )
        for config in duplicate_configs:
            with self.assertRaises(ValueError):
                check_cluster_health(0, 5, config)

    def test_too_large_warn(self):
        # "20:10%" is a fallback with warn=20 and crit=10 % of members.
        outcome = check_cluster_health(2, 10, ["20:10%"])
        self.assertEqual(outcome, Status.CRITICAL)

    def test_missing_fallback_error(self):
        without_fallback = (
            ["10:1:2"],
            ["10:1:2", "20:2:3"],
        )
        for config in without_fallback:
            with self.assertRaises(ValueError):
                check_cluster_health(0, 5, config)

checkvsphere/tools/helper.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
# along with this program. If not, see <https://www.gnu.org/licenses/>.
1616

1717
import re
18+
import argparse
1819
from pyVmomi import vim, vmodl
1920
from . import serviceutil
2021

@@ -121,6 +122,17 @@ def get_metric(perfMgr, perfCounterStr, perfInstance):
121122
return (None, None)
122123

123124

125+
def cluster_health_type(value):
    """argparse ``type=`` callable for ``--cluster-threshold``.

    Accepts ``[max_members:]warn_threshold:crit_threshold`` where
    ``max_members`` is an integer and both thresholds are numbers,
    optionally suffixed with ``%``.

    Returns:
        ``value`` unchanged, so the consumer still receives the raw string.

    Raises:
        argparse.ArgumentTypeError: If the number of ``:``-separated parts is
            wrong or one of the parts is not a valid number, so that the
            mistake is reported as a usage error instead of a later traceback.
    """
    parts = value.split(":")

    if len(parts) not in (2, 3):
        raise argparse.ArgumentTypeError(
            "argument must be [max_members:]warn_threshold:crit_threshold"
        )

    if len(parts) == 3:
        try:
            int(parts[0])
        except ValueError:
            raise argparse.ArgumentTypeError(
                f"max_members must be an integer, got '{parts[0]}'"
            ) from None

    # The last two parts are always warn and crit.
    for threshold in parts[-2:]:
        text = threshold.strip()
        if text.endswith("%"):
            text = text[:-1]
        try:
            number = float(text)
        except ValueError:
            number = float("nan")
        # NaN (given literally or set above) can never be reached by a count.
        if number != number:
            raise argparse.ArgumentTypeError(
                f"threshold must be a number or percentage, got '{threshold}'"
            )

    return value
134+
135+
124136
class CheckArgument:
125137
def __init__(self):
126138
pass
@@ -138,6 +150,21 @@ def __init__(self):
138150
},
139151
}
140152

153+
CLUSTER_NAME = {
154+
'name_or_flags': ['--cluster-name'],
155+
'options': {'action': 'store', 'help': 'Cluster name'}
156+
}
157+
158+
CLUSTER_THRESHOLD = {
159+
'name_or_flags': ['--cluster-threshold'],
160+
'options': {
161+
'default': [],
162+
'action': 'append',
163+
'type': cluster_health_type,
164+
'help': 'cluster threshold: [max_members:]warn_threshold:crit_threshold'
165+
},
166+
}
167+
141168
WARNING_THRESHOLD = {
142169
'name_or_flags': ['--warning'],
143170
'options': {'action': 'store', 'help': 'warning threshold'},

checkvsphere/vcmd/clusterhealth.py

Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
#!/usr/bin/env python3
2+
3+
# Copyright (C) 2023 ConSol Consulting & Solutions Software GmbH
4+
#
5+
# This program is free software: you can redistribute it and/or modify
6+
# it under the terms of the GNU Affero General Public License as
7+
# published by the Free Software Foundation, either version 3 of the
8+
# License, or (at your option) any later version.
9+
#
10+
# This program is distributed in the hope that it will be useful,
11+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
12+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13+
# GNU Affero General Public License for more details.
14+
#
15+
# You should have received a copy of the GNU Affero General Public License
16+
# along with this program. If not, see <https://www.gnu.org/licenses/>.
17+
18+
"""
19+
checks the health of a cluster in vsphere
20+
"""
21+
22+
# Sub-command name of this check (presumably what the check_vsphere
# dispatcher uses to select this module -- confirm against the entry point).
__cmd__ = 'cluster-health'
23+
24+
import logging
25+
from pyVmomi import vim, vmodl
26+
from pprint import pprint as pp
27+
from monplugin import Check, Status
28+
from ..tools import cli, service_instance
29+
from ..tools.cluster_health import check_cluster_health
30+
from ..tools.helper import (
31+
CheckArgument,
32+
find_entity_views,
33+
process_retrieve_content
34+
)
35+
36+
# Parsed command-line arguments; assigned by run() via `global args`.
args = None
37+
38+
def host_is_failed(args, h):
    """Tell whether a host counts as failed under the selected conditions.

    Args:
        args: Parsed arguments; only ``args.faulty`` (an iterable of
            condition names) is read.
        h: Property dict of one host with the keys ``name``,
            ``overallStatus`` and ``runtime``.

    Returns:
        True as soon as one enabled condition matches, False otherwise.
    """
    name = h['name']
    faulty = set(args.faulty)

    # vSphere reports the unknown state as "gray", while the option is
    # advertised as "overallStatusGrey"; accept both spellings.
    if 'overallStatusGrey' in faulty:
        faulty.add('overallStatusGray')

    overall = h['overallStatus'].capitalize()

    if f'overallStatus{overall}' in faulty:
        logging.debug(f"{name} overallStatus {h['overallStatus']}")
        return True

    if "inMaintenance" in faulty and h['runtime'].inMaintenanceMode:
        logging.debug(f"{name} in maintenance")
        return True

    if "inQuarantine" in faulty and h['runtime'].inQuarantineMode:
        logging.debug(f"{name} quarantined")
        return True

    if "inStandby" in faulty and h['runtime'].standbyMode != "none":
        logging.debug(f"{name} standbyMode {h['runtime'].standbyMode}")
        return True

    if "notconnected" in faulty and h['runtime'].connectionState != "connected":
        logging.debug(f"{name} connectionState {h['runtime'].connectionState}")
        return True

    return False
60+
61+
def host_in_standby(h):
    """Return True when the host's ``runtime.standbyMode`` is not ``"none"``.

    Args:
        h: Property dict of one host with the keys ``name`` and ``runtime``.
    """
    in_standby = h['runtime'].standbyMode != "none"
    if in_standby:
        logging.debug(f"{h['name']} in standby, not considered part of cluster")
    return in_standby


def run():
    """Entry point of the ``cluster-health`` check.

    Parses the command line, looks up the cluster by name, fetches the
    properties of its hosts, counts the failed ones and exits through
    ``check.exit`` with the status computed by ``check_cluster_health``.
    """
    global args
    parser = cli.Parser()
    parser.add_required_arguments(CheckArgument.CLUSTER_NAME)
    parser.add_required_arguments(CheckArgument.CLUSTER_THRESHOLD)
    parser.add_optional_arguments({
        'name_or_flags': ['--nostandby'],
        'options': {
            'action': 'store_true',
            'default': False,
            'help': 'Standby nodes are not considered part of the cluster'
        }
    })
    parser.add_optional_arguments({
        'name_or_flags': ['--faulty'],
        'options': {
            'action': 'append',
            'default': [],
            'help': 'Things that are considered faulty (*=default): *inMaintenance, '
                    '*notconnected, inStandby, inQuarantine, overallStatusRed, '
                    'overallStatusYellow, overallStatusGrey'
        }
    })
    args = parser.get_args()
    if not args.faulty:
        args.faulty = ['notconnected', 'inMaintenance']

    # if yellow is faulty, red is definitely faulty as well
    if 'overallStatusYellow' in args.faulty:
        args.faulty.append('overallStatusRed')

    si = service_instance.connect(args)
    check = Check()
    hosts = []

    try:
        res = find_entity_views(
            si,
            vim.ClusterComputeResource,
            begin_entity=si.content.rootFolder,
            sieve=( {'name': args.cluster_name} ),
            properties=["name", "host"],
        )[0]
        hosts = res['props']['host']
    except IndexError:
        # An empty result list means no cluster matched the given name.
        check.exit(Status.UNKNOWN, f"{args.cluster_name} not found")

    if not hosts:
        check.exit(Status.CRITICAL, "Cluster is empty")

    hosts = resolve_hosts(si, hosts)
    # With --nostandby, hosts in standby are not considered part of the
    # cluster; without it they stay members (and can be counted as failed
    # through --faulty inStandby).
    if args.nostandby:
        hosts = [h for h in hosts if not host_in_standby(h)]

    cluster_size = len(hosts)
    failed_members = sum(1 for h in hosts if host_is_failed(args, h))
    cluster_state = check_cluster_health(failed_members, cluster_size, args.cluster_threshold)
    check.exit(
        cluster_state,
        f"{cluster_size-failed_members} out of {cluster_size} cluster members are ok"
    )
127+
128+
129+
def resolve_hosts(si, hosts):
    """Fetch name, overallStatus and runtime of the given hosts in one call.

    Args:
        si: Connected service instance; its ``content.propertyCollector`` is
            used for the retrieval.
        hosts: Host object references to look up.

    Returns:
        A list with one dict per returned object, mapping each retrieved
        property name to its value.
    """
    collector = vmodl.query.PropertyCollector
    fetch = si.content.propertyCollector.RetrieveContents

    # Only the three properties the health check evaluates are requested.
    wanted_props = collector.PropertySpec(
        all=False,
        pathSet=['name', 'overallStatus', 'runtime'],
        type=vim.HostSystem
    )
    query = collector.FilterSpec(
        objectSet=[collector.ObjectSpec(obj=host) for host in hosts],
        propSet=[wanted_props],
    )

    return [
        {prop.name: prop.val for prop in content.propSet}
        for content in fetch([query])
    ]
152+
153+
# Run the check when this module is executed directly as a script.
if __name__ == "__main__":
    run()

docs/cmd/cluster-health.md

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
---
2+
title: cluster-health
3+
---
4+
5+
## Description
6+
7+
The `cluster-health` command checks the health of a vSphere cluster, evaluating node status against user‑defined thresholds.
8+
9+
## Options
10+
11+
Besides the [general options](../../general-options/) this command supports the following options:
12+
13+
| option | description |
14+
|---|---|
15+
| `--cluster-name CLUSTER_NAME` | Name of the cluster to check |
16+
| `--cluster-threshold CLUSTER_THRESHOLD` | Cluster threshold: `[max_members:]warn:crit`. Numbers or percentages; max_members optional. |
17+
| `--nostandby` | Standby nodes are not considered part of the cluster |
18+
| `--faulty FAULTY` | Fault conditions to treat as failures (e.g., `*inMaintenance`, `*notconnected`, `inStandby`, `inQuarantine`, `overallStatusRed`, `overallStatusYellow`, `overallStatusGrey`). `*` marks default entries |
19+
20+
## --cluster-threshold details
21+
22+
`--cluster-threshold CLUSTER_THRESHOLD`
23+
24+
The syntax is `[max_members:]warn_threshold:crit_threshold`.
25+
26+
- `max_members` (optional) – applies the rule to clusters with at most this many nodes; when several rules match, the one with the smallest `max_members` wins. If omitted, the rule serves as the fallback for any cluster size.
27+
- `warn_threshold` – number or percentage of faulty nodes that triggers a **WARNING**.
28+
- `crit_threshold` – number or percentage of faulty nodes that triggers a **CRITICAL**.
29+
30+
Thresholds can be absolute numbers (e.g., `1`) or percentages of the cluster size (e.g., `30%`). Mixed forms are allowed (e.g., `10:1:30%`). Multiple `--cluster-threshold` flags may be given for different cluster sizes. Exactly one `--cluster-threshold` must omit `max_members`; it acts as the fallback for any cluster size.
31+
32+
Examples:
33+
34+
- `1:1:1` – a 1‑node cluster is critical if any node fails.
35+
- `4:1:3` – a cluster with up to 4 nodes (2–4 nodes when combined with the `1:1:1` rule above) warns at 1 faulty node and is critical at 3.
36+
- `30%:50%` – for clusters larger than 4 nodes, warn at ≥30 % failures, critical at ≥50 % failures.
37+
38+
## Examples
39+
40+
```bash
41+
check_vsphere cluster-health \
42+
--host vcenter.example.com \
43+
-nossl \
44+
-u naemon@vsphere.local \
45+
-p "PW" \
46+
--cluster-threshold '1:1:1' \
47+
--cluster-threshold '4:1:3' \
48+
--cluster-threshold '30%:50%' \
49+
--cluster-name CLUSTERNAME
50+
```

0 commit comments

Comments
 (0)