Skip to content

Commit ac4a21b

Browse files
20035 FIX Service discovery: Fix timeout when enabling/disabling a service on hosts with many disabled services
When enabling or disabling a single service during service discovery on a host that already had many disabled services, the operation could run into the automation timeout (around 110 seconds) and fail. For every service change, Checkmk verifies the resulting "Disabled services" rules. Due to a regression, all services that were already disabled on the host were needlessly re-processed on every save, triggering one rule-matching automation call per disabled service. On hosts with a hundred or more disabled services, the accumulated time exceeded the automation timeout. Only the services whose state actually changes are now processed, so enabling or disabling a service is fast again regardless of how many disabled services the host has. SUP-29608 Change-Id: Iaffc3e388b178c87be5269ffa1daab33fc673541
1 parent 8e3db41 commit ac4a21b

4 files changed

Lines changed: 161 additions & 1 deletion

File tree

.werks/20035.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
[//]: # (werk v3)
2+
# Service discovery: Fix timeout when enabling/disabling a service on hosts with many disabled services
3+
4+
key | value
5+
---------- | ---
6+
date | 2026-06-25T12:21:35+00:00
7+
version | 2.5.0p8
8+
class | fix
9+
edition | community
10+
component | checks
11+
level | 1
12+
compatible | yes
13+
14+
When enabling or disabling a single service during service discovery on a host that already had many disabled services, the operation could run into the automation timeout (around 110 seconds) and fail.
15+
16+
For every service change, Checkmk verifies the resulting "Disabled services" rules.
17+
Due to a regression, all services that were already disabled on the host were needlessly re-processed on every save, triggering one rule-matching automation call per disabled service.
18+
On hosts with a hundred or more disabled services, the accumulated time exceeded the automation timeout.
19+
20+
Only the services whose state actually changes are now processed, so enabling or disabling a service is fast again regardless of how many disabled services the host has.
21+

cmk/gui/watolib/services.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1050,6 +1050,13 @@ def _case_ignored(
10501050
remove_disabled_rule.add(descr)
10511051
if table_target == DiscoveryState.MONITORED:
10521052
autochecks_to_save[key] = value
1053+
# Note: disabled services must not be written to the autochecks file (CMK-33299), hence
1054+
# they are excluded from autochecks_to_save above. They must, however, still be registered
1055+
# in saved_services: this set is what cancels already-disabled, unchanged services out of
1056+
# add_disabled_rule in compute_discovery_transition. Omitting them here would push every
1057+
# already-disabled service of the host into add_disabled_rule on each save, causing one
1058+
# rule-match automation per service in EnabledDisabledServicesEditor (automation timeout).
1059+
if table_target in [DiscoveryState.MONITORED, DiscoveryState.IGNORED]:
10531060
saved_services.add(descr)
10541061
if table_target == DiscoveryState.IGNORED:
10551062
add_disabled_rule.add(descr)

tests/unit/cmk/gui/watolib/test_do_discovery.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,8 +167,11 @@ def _get_combinations() -> list:
167167
{MOCK_DESC},
168168
),
169169
(DiscoveryState.IGNORED, DiscoveryState.IGNORED): (
170+
# An already-disabled service that stays disabled must not be written to the autochecks
171+
# file (empty autochecks_to_save), but it is registered in saved_services so that
172+
# compute_discovery_transition cancels it out of add_disabled_rule again. See CMK-26792.
170173
{},
171-
set(),
174+
{MOCK_DESC},
172175
{MOCK_DESC},
173176
set(),
174177
),

tests/unit/cmk/gui/watolib/test_services.py

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1250,6 +1250,135 @@ def test_perform_discovery_single_update__ignore(
12501250
assert add_disabled_rule == {"MSSQL S2DT Instance"}
12511251

12521252

1253+
def test_perform_discovery_single_update__ignore_does_not_re_add_existing_disabled_services(
1254+
mocker: MockerFixture,
1255+
sample_host_name: HostName,
1256+
sample_host: Host,
1257+
mock_set_autochecks: MagicMock,
1258+
) -> None:
1259+
"""Disabling a single service must not funnel all already-disabled services of the host into
1260+
add_disabled_rule again.
1261+
1262+
Each entry in add_disabled_rule triggers a separate analyze_service_rule_matches automation in
1263+
EnabledDisabledServicesEditor. On a host with many already-disabled services this caused one
1264+
automation call per service and ran into the automation timeout, even though the user only
1265+
toggled a single service. See SUP-29608, CMK-26792, CMK-33299.
1266+
"""
1267+
already_disabled = [f"Disabled service {i}" for i in range(5)]
1268+
mock_save_function = mocker.patch(
1269+
"cmk.gui.watolib.services.Discovery._save_host_service_enable_disable_rules",
1270+
return_value=None,
1271+
)
1272+
mocker.patch(
1273+
"cmk.gui.watolib.services.local_discovery_preview",
1274+
return_value=ServiceDiscoveryPreviewResult(
1275+
output="",
1276+
check_table=[],
1277+
nodes_check_table={},
1278+
host_labels={},
1279+
new_labels={},
1280+
vanished_labels={},
1281+
changed_labels={},
1282+
source_results={"agent": (0, "Success")},
1283+
labels_by_host={sample_host_name: []},
1284+
config_warnings=(),
1285+
),
1286+
)
1287+
1288+
previous_discovery_result = DiscoveryResult(
1289+
job_status={
1290+
"state": "finished",
1291+
"started": 1764593093.764405,
1292+
"pid": 604583,
1293+
"loginfo": {"JobProgressUpdate": [], "JobResult": [], "JobException": []},
1294+
"is_active": False,
1295+
"duration": 0.36932802200317383,
1296+
"title": "Refresh",
1297+
"stoppable": True,
1298+
"deletable": True,
1299+
"user": "cmkadmin",
1300+
"estimated_duration": 0.0,
1301+
"ppid": 604485,
1302+
"logfile_path": "~/var/log/web.log",
1303+
"acknowledged_by": None,
1304+
"lock_wato": False,
1305+
"host_name": sample_host_name,
1306+
},
1307+
check_table_created=1764596025,
1308+
check_table=[
1309+
# The single monitored service the user explicitly disables.
1310+
CheckPreviewEntry(
1311+
check_source="unchanged",
1312+
check_plugin_name="mssql_instance",
1313+
ruleset_name="mssql_instance",
1314+
discovery_ruleset_name=None,
1315+
item="S2DT",
1316+
old_discovered_parameters={},
1317+
new_discovered_parameters={},
1318+
effective_parameters={},
1319+
description="MSSQL S2DT Instance",
1320+
state=0,
1321+
output="nobody cares",
1322+
metrics=[],
1323+
old_labels={},
1324+
new_labels={},
1325+
found_on_nodes=[sample_host_name],
1326+
),
1327+
# A large number of services that are already disabled and stay disabled. These are
1328+
# not selected by the user and must not end up in add_disabled_rule.
1329+
*[
1330+
CheckPreviewEntry(
1331+
check_source="ignored",
1332+
check_plugin_name="local",
1333+
ruleset_name=None,
1334+
discovery_ruleset_name=None,
1335+
item=descr,
1336+
old_discovered_parameters={},
1337+
new_discovered_parameters={},
1338+
effective_parameters={},
1339+
description=descr,
1340+
state=0,
1341+
output="nobody cares",
1342+
metrics=[],
1343+
old_labels={},
1344+
new_labels={},
1345+
found_on_nodes=[sample_host_name],
1346+
)
1347+
for descr in already_disabled
1348+
],
1349+
],
1350+
nodes_check_table={},
1351+
host_labels={},
1352+
new_labels={},
1353+
vanished_labels={},
1354+
changed_labels={},
1355+
labels_by_host={sample_host_name: []},
1356+
sources={"agent": (0, "[agent] Success")},
1357+
config_warnings=(),
1358+
)
1359+
1360+
perform_service_discovery(
1361+
action=DiscoveryAction.SINGLE_UPDATE,
1362+
discovery_result=previous_discovery_result,
1363+
selected_services=(("mssql_instance", "S2DT"),),
1364+
update_source="unchanged",
1365+
update_target="ignored",
1366+
host=sample_host,
1367+
raise_errors=True,
1368+
automation_config=LocalAutomationConfig(),
1369+
user_permission_config=UserPermissionSerializableConfig({}, {}, []),
1370+
pprint_value=False,
1371+
debug=False,
1372+
use_git=False,
1373+
)
1374+
1375+
mock_save_function.assert_called_once()
1376+
remove_disabled_rule, add_disabled_rule, *_ = mock_save_function.call_args_list[0][0]
1377+
assert len(remove_disabled_rule) == 0
1378+
# Only the explicitly selected service, none of the already-disabled ones.
1379+
assert add_disabled_rule == {"MSSQL S2DT Instance"}
1380+
1381+
12531382
@pytest.mark.usefixtures("inline_background_jobs")
12541383
class TestPerformDiscoverySingleUpdate:
12551384
check_table = [

0 commit comments

Comments
 (0)