diff --git a/changelog/69492.fixed.md b/changelog/69492.fixed.md new file mode 100644 index 000000000000..43a4b3e9f2e3 --- /dev/null +++ b/changelog/69492.fixed.md @@ -0,0 +1 @@ +Fixed ``lgpo_reg`` failures on Windows when ``Registry.pol`` is temporarily locked by the Group Policy service or other processes. Salt now uses ``EnterCriticalPolicySection`` / ``LeaveCriticalPolicySection`` from ``userenv.dll`` — the same synchronization primitive used by the GP engine — to serialize read-modify-write access to ``Registry.pol``. A retry loop with configurable attempts and delay is also applied for non-GP lockers such as antivirus scanners or VSS snapshots that do not participate in the GP critical section handshake. diff --git a/salt/modules/win_lgpo_reg.py b/salt/modules/win_lgpo_reg.py index 0290cd534808..5fb505998620 100644 --- a/salt/modules/win_lgpo_reg.py +++ b/salt/modules/win_lgpo_reg.py @@ -523,29 +523,31 @@ def set_value( msg = f"{v_type} data must be an integer" raise SaltInvocationError(msg) - pol_data = read_reg_pol(policy_class=policy_class) - - found_key, found_name = _find_value(pol_data, key, v_name) - - if found_key: - if found_name: - if "**del." in found_name: - log.debug("LGPO_REG Mod: Found disabled name: %s", found_name) - pol_data[found_key][v_name] = pol_data[found_key].pop(found_name) - found_name = v_name - log.debug("LGPO_REG Mod: Updating value: %s", found_name) - pol_data[found_key][found_name] = {"data": v_data, "type": v_type} + machine = policy_class == "Machine" + with salt.utils.win_lgpo_reg._policy_lock(machine=machine): + pol_data = read_reg_pol(policy_class=policy_class) + + found_key, found_name = _find_value(pol_data, key, v_name) + + if found_key: + if found_name: + if "**del." in found_name: + log.debug("LGPO_REG Mod: Found disabled name: %s", found_name) + pol_data[found_key][v_name] = pol_data[found_key].pop(found_name) + found_name = v_name + log.debug("LGPO_REG Mod: Updating value: %s", found_name) + pol_data[found_key][found_name] = {"data": v_data, "type": v_type} + else: + log.debug("LGPO_REG Mod: Setting new value: %s", found_name) + pol_data[found_key][v_name] = {"data": v_data, "type": v_type} else: - log.debug("LGPO_REG Mod: Setting new value: %s", found_name) - pol_data[found_key][v_name] = {"data": v_data, "type": v_type} - else: - log.debug("LGPO_REG Mod: Adding new key and value: %s", found_name) - pol_data[key] = {v_name: {"data": v_data, "type": v_type}} + log.debug("LGPO_REG Mod: Adding new key and value: %s", found_name) + pol_data[key] = {v_name: {"data": v_data, "type": v_type}} - success = True - if not write_reg_pol(pol_data, policy_class=policy_class): - log.error("LGPO_REG Mod: Failed to write registry.pol file") - success = False + success = True + if not write_reg_pol(pol_data, policy_class=policy_class): + log.error("LGPO_REG Mod: Failed to write registry.pol file") + success = False # Resolve auto-detect: skip registry write on Domain Controllers where # HKLM\SOFTWARE\Policies\ is write-protected by AD security hardening. @@ -661,40 +663,42 @@ def disable_value( else: raise SaltInvocationError("An invalid policy class was specified") - pol_data = read_reg_pol(policy_class=policy_class) + machine = policy_class == "Machine" + with salt.utils.win_lgpo_reg._policy_lock(machine=machine): + pol_data = read_reg_pol(policy_class=policy_class) - found_key, found_name = _find_value(pol_data, key, v_name) + found_key, found_name = _find_value(pol_data, key, v_name) - pol_modified = False - if found_key: - if found_name: - if "**del." in found_name: - log.debug("LGPO_REG Mod: Already disabled: %s", v_name) + pol_modified = False + if found_key: + if found_name: + if "**del." in found_name: + log.debug("LGPO_REG Mod: Already disabled: %s", v_name) + else: + log.debug("LGPO_REG Mod: Disabling value name: %s", v_name) + pol_data[found_key].pop(found_name) + found_name = f"**del.{found_name}" + pol_data[found_key][found_name] = {"data": " ", "type": "REG_SZ"} + pol_modified = True else: - log.debug("LGPO_REG Mod: Disabling value name: %s", v_name) - pol_data[found_key].pop(found_name) - found_name = f"**del.{found_name}" - pol_data[found_key][found_name] = {"data": " ", "type": "REG_SZ"} + log.debug("LGPO_REG Mod: Setting new disabled value name: %s", v_name) + pol_data[found_key][f"**del.{v_name}"] = { + "data": " ", + "type": "REG_SZ", + } pol_modified = True else: - log.debug("LGPO_REG Mod: Setting new disabled value name: %s", v_name) - pol_data[found_key][f"**del.{v_name}"] = { - "data": " ", - "type": "REG_SZ", - } + log.debug( + "LGPO_REG Mod: Adding new key and disabled value name: %s", found_name + ) + pol_data[key] = {f"**del.{v_name}": {"data": " ", "type": "REG_SZ"}} pol_modified = True - else: - log.debug( - "LGPO_REG Mod: Adding new key and disabled value name: %s", found_name - ) - pol_data[key] = {f"**del.{v_name}": {"data": " ", "type": "REG_SZ"}} - pol_modified = True - - success = True - if pol_modified: - if not write_reg_pol(pol_data, policy_class=policy_class): - log.error("LGPO_REG Mod: Failed to write registry.pol file") - success = False + + success = True + if pol_modified: + if not write_reg_pol(pol_data, policy_class=policy_class): + log.error("LGPO_REG Mod: Failed to write registry.pol file") + success = False # Resolve auto-detect: skip registry delete on Domain Controllers. if write_registry is None: @@ -813,29 +817,31 @@ def delete_value( else: raise SaltInvocationError("An invalid policy class was specified") - pol_data = read_reg_pol(policy_class=policy_class) + machine = policy_class == "Machine" + with salt.utils.win_lgpo_reg._policy_lock(machine=machine): + pol_data = read_reg_pol(policy_class=policy_class) - found_key, found_name = _find_value(pol_data, key, v_name) + found_key, found_name = _find_value(pol_data, key, v_name) - pol_modified = False - if found_key: - if found_name: - log.debug("LGPO_REG Mod: Removing value name: %s", found_name) - pol_data[found_key].pop(found_name) - pol_modified = True - if len(pol_data[found_key]) == 0: - log.debug("LGPO_REG Mod: Removing empty key: %s", found_key) - pol_data.pop(found_key) + pol_modified = False + if found_key: + if found_name: + log.debug("LGPO_REG Mod: Removing value name: %s", found_name) + pol_data[found_key].pop(found_name) + pol_modified = True + if len(pol_data[found_key]) == 0: + log.debug("LGPO_REG Mod: Removing empty key: %s", found_key) + pol_data.pop(found_key) + else: + log.debug("LGPO_REG Mod: Value name not found: %s", v_name) else: - log.debug("LGPO_REG Mod: Value name not found: %s", v_name) - else: - log.debug("LGPO_REG Mod: Key not found: %s", key) + log.debug("LGPO_REG Mod: Key not found: %s", key) - success = True - if pol_modified: - if not write_reg_pol(pol_data, policy_class=policy_class): - log.error("LGPO_REG Mod: Failed to write registry.pol file") - success = False + success = True + if pol_modified: + if not write_reg_pol(pol_data, policy_class=policy_class): + log.error("LGPO_REG Mod: Failed to write registry.pol file") + success = False # Resolve auto-detect: skip registry delete on Domain Controllers. if write_registry is None: diff --git a/salt/utils/win_lgpo_reg.py b/salt/utils/win_lgpo_reg.py index 801c8c808199..171be874ebc3 100644 --- a/salt/utils/win_lgpo_reg.py +++ b/salt/utils/win_lgpo_reg.py @@ -4,10 +4,13 @@ """ import ctypes +import ctypes.wintypes import logging import os import re import struct +import time +from contextlib import contextmanager import salt.modules.win_file import salt.utils.files @@ -151,12 +154,82 @@ def read_reg_pol_file(reg_pol_path): return return_data +def _write_with_retry(path, data, mode, retry_count, retry_delay): + """ + Write data to a file, retrying on Windows sharing violations (winerror 32). + Fails immediately on any other error (e.g. winerror 5, true access denied). + """ + for attempt in range(1, retry_count + 1): + try: + with salt.utils.files.fopen(path, mode) as f: + f.write(data) + return + except PermissionError as e: + if e.winerror != 32 or attempt == retry_count: + raise + log.warning( + "LGPO_REG Util: %s is locked (attempt %d/%d). " + "Retrying in %d seconds...", + path, + attempt, + retry_count, + retry_delay, + ) + time.sleep(retry_delay) + + +@contextmanager +def _policy_lock(machine=True): + """ + Context manager that holds the Windows GP critical section for the duration + of a read-modify-write cycle on Registry.pol. + + EnterCriticalPolicySection / LeaveCriticalPolicySection are the same + primitives gpsvc uses internally, so holding this lock prevents the GP + service from opening the policy file concurrently. + + EnterCriticalPolicySection is a blocking call — it does not return until + the critical section is acquired. If gpsvc currently holds it (e.g. during + a background GP refresh), this call blocks until gpsvc releases it, at which + point gpsvc will also have released any file lock on Registry.pol. No retry + loop is needed here; the blocking behavior is the wait mechanism. + + The only residual risk after acquiring the critical section is a non-GP + locker (AV scanner, VSS) that does not participate in this handshake; the + _write_with_retry layer handles those. + + If the lock cannot be acquired (returns NULL), a CommandExecutionError is + raised. NULL from this API always indicates a genuine system error (handle + exhaustion, access denied creating the kernel object, etc.) — not a "GP + service is busy" condition. The blocking behavior handles the busy case. + Proceeding silently without the lock would risk an uncoordinated + read-modify-write followed by a confusing downstream write error. + """ + userenv = ctypes.WinDLL("userenv.dll") + userenv.EnterCriticalPolicySection.restype = ctypes.wintypes.HANDLE + userenv.EnterCriticalPolicySection.argtypes = [ctypes.c_bool] + userenv.LeaveCriticalPolicySection.restype = ctypes.c_bool + userenv.LeaveCriticalPolicySection.argtypes = [ctypes.wintypes.HANDLE] + + handle = userenv.EnterCriticalPolicySection(machine) + if not handle: + raise CommandExecutionError( + "LGPO_REG Util: Failed to acquire GP critical section" + ) + try: + yield + finally: + userenv.LeaveCriticalPolicySection(handle) + + def write_reg_pol_data( data_to_write, policy_file_path, gpt_extension, gpt_extension_guid, gpt_ini_path=GPT_INI_PATH, + retry_count=10, + retry_delay=5, ): """ Helper function to actually write the data to a Registry.pol file @@ -178,6 +251,17 @@ def write_reg_pol_data( gpt_ini_path (str): The path to the gpt.ini file + retry_count (int): Number of attempts to make when a write fails due + to a sharing violation (``winerror 32``). Sharing violations occur + when a process such as an antivirus scanner or VSS holds the file + open with an incompatible sharing mode. The GP critical section + (see :func:`_policy_lock`) prevents races with ``gpsvc`` itself, + so retries are primarily a fallback for those other lockers. + Default is ``10``. + + retry_delay (int): Seconds to wait between retry attempts when a + sharing violation is encountered. Default is ``5``. + Returns: bool: True if successful @@ -191,14 +275,14 @@ def write_reg_pol_data( if data_to_write is None: data_to_write = b"" try: - with salt.utils.files.fopen(policy_file_path, "wb") as pol_file: - reg_pol_header = REG_POL_HEADER.encode("utf-16-le") - if not data_to_write.startswith(reg_pol_header): - log.debug("LGPO_REG Util: Writing header to %s", policy_file_path) - pol_file.write(reg_pol_header) - log.debug("LGPO_REG Util: Writing to %s", policy_file_path) - pol_file.write(data_to_write) - # TODO: This needs to be more specific + reg_pol_header = REG_POL_HEADER.encode("utf-16-le") + if not data_to_write.startswith(reg_pol_header): + log.debug("LGPO_REG Util: Writing header to %s", policy_file_path) + data_to_write = reg_pol_header + data_to_write + log.debug("LGPO_REG Util: Writing to %s", policy_file_path) + _write_with_retry( + policy_file_path, data_to_write, "wb", retry_count, retry_delay + ) except Exception as e: # pylint: disable=broad-except msg = ( "An error occurred attempting to write to registry.pol\n" @@ -297,12 +381,10 @@ def write_reg_pol_data( ) if gpt_ini_data: try: - with salt.utils.files.fopen(gpt_ini_path, "w") as gpt_file: - gpt_file.write(gpt_ini_data) - # TODO: This needs to be more specific + _write_with_retry(gpt_ini_path, gpt_ini_data, "w", retry_count, retry_delay) except Exception as e: # pylint: disable=broad-except msg = ( - "An error occurred attempting to write the gpg.ini file.\n" + "An error occurred attempting to write the gpt.ini file.\n" "path: {}\n" "exception: {}".format(gpt_ini_path, e) ) diff --git a/tests/pytests/unit/modules/test_win_lgpo_reg.py b/tests/pytests/unit/modules/test_win_lgpo_reg.py index 0e95ac9653cc..2ebcb361756f 100644 --- a/tests/pytests/unit/modules/test_win_lgpo_reg.py +++ b/tests/pytests/unit/modules/test_win_lgpo_reg.py @@ -751,3 +751,78 @@ def test_delete_value_warns_on_domain_gpo(reg_pol_mach): ) as mock_log: lgpo_reg.delete_value(key="SOFTWARE\\MyKey1", v_name="MyValue1") mock_log.warning.assert_called_once() + + +# --------------------------------------------------------------------------- +# _policy_lock integration — verify each RMW function acquires the lock +# --------------------------------------------------------------------------- + + +@pytest.fixture +def _mock_rmw(): + """Patch out read/write and _policy_lock so RMW functions run without I/O.""" + from contextlib import contextmanager + + lock_calls = [] + + @contextmanager + def fake_lock(machine=True): + lock_calls.append(machine) + yield + + with patch( + "salt.utils.win_lgpo_reg._policy_lock", side_effect=fake_lock + ), patch.object(lgpo_reg, "read_reg_pol", return_value={}), patch.object( + lgpo_reg, "write_reg_pol", return_value=True + ): + yield lock_calls + + +def test_set_value_acquires_machine_lock(_mock_rmw): + """set_value acquires the machine critical section for policy_class=Machine.""" + lgpo_reg.set_value( + key="SOFTWARE\\MyKey", + v_name="MyVal", + v_data=1, + v_type="REG_DWORD", + policy_class="Machine", + ) + assert _mock_rmw == [True] + + +def test_set_value_acquires_user_lock(_mock_rmw): + """set_value acquires the user critical section for policy_class=User.""" + lgpo_reg.set_value( + key="SOFTWARE\\MyKey", + v_name="MyVal", + v_data="x", + v_type="REG_SZ", + policy_class="User", + ) + assert _mock_rmw == [False] + + +def test_disable_value_acquires_machine_lock(_mock_rmw): + """disable_value acquires the machine critical section.""" + lgpo_reg.disable_value( + key="SOFTWARE\\MyKey", v_name="MyVal", policy_class="Machine" + ) + assert _mock_rmw == [True] + + +def test_disable_value_acquires_user_lock(_mock_rmw): + """disable_value acquires the user critical section.""" + lgpo_reg.disable_value(key="SOFTWARE\\MyKey", v_name="MyVal", policy_class="User") + assert _mock_rmw == [False] + + +def test_delete_value_acquires_machine_lock(_mock_rmw): + """delete_value acquires the machine critical section.""" + lgpo_reg.delete_value(key="SOFTWARE\\MyKey", v_name="MyVal", policy_class="Machine") + assert _mock_rmw == [True] + + +def test_delete_value_acquires_user_lock(_mock_rmw): + """delete_value acquires the user critical section.""" + lgpo_reg.delete_value(key="SOFTWARE\\MyKey", v_name="MyVal", policy_class="User") + assert _mock_rmw == [False] diff --git a/tests/pytests/unit/utils/test_win_lgpo_reg.py b/tests/pytests/unit/utils/test_win_lgpo_reg.py index 57e12e3a1160..8e50fe99c67f 100644 --- a/tests/pytests/unit/utils/test_win_lgpo_reg.py +++ b/tests/pytests/unit/utils/test_win_lgpo_reg.py @@ -675,3 +675,108 @@ def test_refresh_policy_returns_false_on_exception(): ): result = win_lgpo_reg.refresh_policy() assert result is False + + +# --------------------------------------------------------------------------- +# _write_with_retry +# --------------------------------------------------------------------------- + + +def _sharing_violation(): + """Return a PermissionError that looks like ERROR_SHARING_VIOLATION (32).""" + err = PermissionError(13, "Permission denied") + err.winerror = 32 + return err + + +def _access_denied(): + """Return a PermissionError that looks like ERROR_ACCESS_DENIED (5).""" + err = PermissionError(13, "Permission denied") + err.winerror = 5 + return err + + +def test_write_with_retry_sharing_violation_retries(): + """Retries on winerror 32 (sharing violation) and succeeds once the lock clears.""" + mock_fopen = MagicMock() + mock_fopen.side_effect = [_sharing_violation(), _sharing_violation(), MagicMock()] + + with patch("salt.utils.files.fopen", mock_fopen), patch( + "salt.utils.win_lgpo_reg.time.sleep" + ) as mock_sleep: + win_lgpo_reg._write_with_retry("test.pol", b"data", "wb", 5, 1) + + assert mock_fopen.call_count == 3 + assert mock_sleep.call_count == 2 + + +def test_write_with_retry_access_denied_no_retry(): + """Does not retry on winerror 5 (true access denied); fails on first attempt.""" + mock_fopen = MagicMock(side_effect=_access_denied()) + + with patch("salt.utils.files.fopen", mock_fopen), patch( + "salt.utils.win_lgpo_reg.time.sleep" + ) as mock_sleep: + with pytest.raises(PermissionError): + win_lgpo_reg._write_with_retry("test.pol", b"data", "wb", 10, 1) + + assert mock_fopen.call_count == 1 + mock_sleep.assert_not_called() + + +def test_write_with_retry_sharing_violation_exhausted(): + """Raises after retry_count attempts when the lock never releases.""" + mock_fopen = MagicMock(side_effect=_sharing_violation()) + + with patch("salt.utils.files.fopen", mock_fopen), patch( + "salt.utils.win_lgpo_reg.time.sleep" + ): + with pytest.raises(PermissionError): + win_lgpo_reg._write_with_retry("test.pol", b"data", "wb", 3, 1) + + assert mock_fopen.call_count == 3 + + +# --------------------------------------------------------------------------- +# _policy_lock +# --------------------------------------------------------------------------- + + +def _make_userenv_mock(handle_value): + """Return a mock userenv.dll where EnterCriticalPolicySection returns handle_value.""" + mock_dll = MagicMock() + mock_dll.EnterCriticalPolicySection.return_value = handle_value + mock_dll.LeaveCriticalPolicySection.return_value = True + return mock_dll + + +def test_policy_lock_acquires_and_releases(): + """Acquires the critical section and releases it after the with block.""" + handle = 12345 + mock_dll = _make_userenv_mock(handle) + with patch("salt.utils.win_lgpo_reg.ctypes.WinDLL", return_value=mock_dll): + with win_lgpo_reg._policy_lock(machine=True): + pass + mock_dll.EnterCriticalPolicySection.assert_called_once_with(True) + mock_dll.LeaveCriticalPolicySection.assert_called_once_with(handle) + + +def test_policy_lock_releases_on_exception(): + """Releases the critical section even when the body raises.""" + handle = 99 + mock_dll = _make_userenv_mock(handle) + with patch("salt.utils.win_lgpo_reg.ctypes.WinDLL", return_value=mock_dll): + with pytest.raises(RuntimeError): + with win_lgpo_reg._policy_lock(machine=False): + raise RuntimeError("body error") + mock_dll.LeaveCriticalPolicySection.assert_called_once_with(handle) + + +def test_policy_lock_null_handle_raises(): + """Raises CommandExecutionError when EnterCriticalPolicySection returns NULL.""" + mock_dll = _make_userenv_mock(0) + with patch("salt.utils.win_lgpo_reg.ctypes.WinDLL", return_value=mock_dll): + with pytest.raises(CommandExecutionError): + with win_lgpo_reg._policy_lock(): + pass # should not be reached + mock_dll.LeaveCriticalPolicySection.assert_not_called()