Skip to content

Commit df87f47

Browse files
committed
[fix] Reconcile stuck "modified" config status on checksum request #1330
When a device applies a new configuration but fails to report its status back (e.g. due to a transient HTTP 502 from the controller), the config remains in "modified" state on the server forever. The device's agent will compare its local checksum with the remote checksum on the next polling cycle and find them identical, so it will not re-download or re-report. Without manual intervention the status stays "modified" indefinitely. Detect this condition on the DeviceChecksumView: if the config has been in "modified" state longer than a 5 minute grace period and the device is actively polling (proven by the very checksum request we are handling), set the status to "applied". Implementation notes: - Use the cached device object's config status as a fast path: if the cached status is not "modified" we return immediately with zero extra database queries, preserving the existing zero-query guarantee of the cached checksum path. - A second fast path checks the cached "modified" timestamp: if the grace period has not yet elapsed based on the cached timestamp alone, we skip the DB query entirely. - Only when both fast paths pass do we re-query Config fresh from the database inside a transaction with select_for_update to avoid race conditions with concurrent requests. - The re-query uses .only() on the fields we need to keep it cheap. - A 5 minute grace period avoids fighting an in-flight apply that has not had time to report yet. - The entire method is wrapped in try/except to ensure reconciliation never fails the checksum endpoint. Closes #1330
1 parent 0d17acd commit df87f47

File tree

2 files changed

+148
-0
lines changed

2 files changed

+148
-0
lines changed

openwisp_controller/config/controller/views.py

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from django.core.exceptions import FieldDoesNotExist, ValidationError
99
from django.db import transaction
1010
from django.db.models import Q
11+
from django.utils import timezone
1112
from django.utils.decorators import method_decorator
1213
from django.utils.translation import gettext_lazy as _
1314
from django.views.decorators.csrf import csrf_exempt
@@ -144,6 +145,13 @@ class DeviceChecksumView(UpdateLastIpMixin, GetDeviceView):
144145
returns device's configuration checksum
145146
"""
146147

148+
# Grace period before reconciling a "modified" status.
149+
# If the config has been in "modified" state for longer than this,
150+
# and the device is actively requesting its checksum (proving it's
151+
# online and polling), we assume a previous report_status call was
152+
# lost due to a transient error and reconcile the status to "applied".
153+
_STATUS_RECONCILE_GRACE_SECONDS = 300 # 5 minutes
154+
147155
def get(self, request, pk):
148156
device = self.get_device()
149157
bad_request = forbid_unallowed(request, "GET", "key", device.key)
@@ -156,10 +164,90 @@ def get(self, request, pk):
156164
checksum_requested.send(
157165
sender=device.__class__, instance=device, request=request
158166
)
167+
self._reconcile_modified_status(device)
159168
return ControllerResponse(
160169
device.config.get_cached_checksum(), content_type="text/plain"
161170
)
162171

172+
@staticmethod
173+
def _reconcile_modified_status(device):
174+
"""
175+
Reconciles config status for devices stuck in "modified" state.
176+
177+
When a device applies a new configuration but fails to report its
178+
status back (e.g., due to a transient HTTP error like 502), the
179+
config remains in "modified" state on the controller even though
180+
the device already has the current configuration.
181+
182+
The device's agent will compare its local checksum with the
183+
remote checksum on the next polling cycle and find they match,
184+
so it won't re-download or re-report. The status stays "modified"
185+
indefinitely.
186+
187+
This method detects this condition: if the config has been in
188+
"modified" state for longer than the grace period and the device
189+
is actively polling (proven by this very checksum request), we
190+
set the status to "applied".
191+
192+
Fast path: the cached device object (from cache_memoize) is used
193+
first to check the status. Only when the cached status indicates
194+
"modified" do we re-query the database for the fresh state. This
195+
keeps the checksum endpoint zero-query for the common case where
196+
the status is already "applied".
197+
"""
198+
# Best-effort: never let reconciliation fail the checksum endpoint.
199+
try:
200+
# Fast path 1: if the cached device's config is not "modified",
201+
# there is nothing to reconcile.
202+
try:
203+
cached_config = device.config
204+
except Config.DoesNotExist:
205+
return
206+
if cached_config is None or cached_config.status != "modified":
207+
return
208+
209+
# Fast path 2: even if cached status is "modified", the cached
210+
# `modified` timestamp gives a lower bound on the real elapsed
211+
# time. If that lower bound is below the grace period, the real
212+
# value cannot be above it either, so skip the DB query entirely.
213+
grace = DeviceChecksumView._STATUS_RECONCILE_GRACE_SECONDS
214+
cached_elapsed = (timezone.now() - cached_config.modified).total_seconds()
215+
if cached_elapsed < grace:
216+
return
217+
218+
# Slow path: re-read config fresh from the database because
219+
# cache_memoize has a 30-day TTL and may be serving a stale
220+
# "modified" status that was already reconciled earlier.
221+
# Use select_for_update inside a transaction to avoid race
222+
# conditions with concurrent requests.
223+
with transaction.atomic():
224+
try:
225+
config = (
226+
Config.objects.select_for_update()
227+
.only("id", "status", "modified")
228+
.get(device=device)
229+
)
230+
except Config.DoesNotExist:
231+
return
232+
if config.status != "modified":
233+
return
234+
elapsed = (timezone.now() - config.modified).total_seconds()
235+
if elapsed < grace:
236+
return
237+
config.set_status_applied()
238+
logger.info(
239+
"Reconciled config status for device %s: was 'modified' "
240+
"for %d seconds with device actively polling, "
241+
"setting to 'applied'.",
242+
device,
243+
int(elapsed),
244+
)
245+
except Exception:
246+
logger.exception(
247+
"Failed to reconcile config status for device %s",
248+
device,
249+
)
250+
163251
@cache_memoize(
164252
timeout=Config._CHECKSUM_CACHE_TIMEOUT, args_rewrite=get_device_args_rewrite
165253
)

openwisp_controller/config/tests/test_controller.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,66 @@ def test_device_checksum_requested_signal_is_emitted(self):
270270
request=response.wsgi_request,
271271
)
272272

273+
def test_device_checksum_reconciles_modified_status(self):
274+
"""
275+
When a device with status "modified" requests its checksum,
276+
and enough time has passed (grace period), the status should
277+
be automatically reconciled to "applied".
278+
"""
279+
from datetime import timedelta
280+
281+
from django.utils import timezone
282+
283+
d = self._create_device_config()
284+
c = d.config
285+
c.set_status_modified()
286+
self.assertEqual(c.status, "modified")
287+
url = reverse("controller:device_checksum", args=[d.pk])
288+
289+
# First request within grace period: status should stay "modified"
290+
response = self.client.get(url, {"key": d.key})
291+
self.assertEqual(response.status_code, 200)
292+
c.refresh_from_db()
293+
self.assertEqual(c.status, "modified")
294+
295+
# Simulate that grace period has elapsed by backdating the
296+
# modified timestamp.
297+
Config.objects.filter(pk=c.pk).update(
298+
modified=timezone.now()
299+
- timedelta(
300+
seconds=DeviceChecksumView._STATUS_RECONCILE_GRACE_SECONDS + 300
301+
)
302+
)
303+
304+
# Second request after grace period: status should be reconciled
305+
# to "applied"
306+
response = self.client.get(url, {"key": d.key})
307+
self.assertEqual(response.status_code, 200)
308+
c.refresh_from_db()
309+
self.assertEqual(c.status, "applied")
310+
311+
def test_device_checksum_no_reconcile_for_applied(self):
312+
"""Status "applied" should not be changed."""
313+
d = self._create_device_config()
314+
c = d.config
315+
c.set_status_applied()
316+
url = reverse("controller:device_checksum", args=[d.pk])
317+
response = self.client.get(url, {"key": d.key})
318+
self.assertEqual(response.status_code, 200)
319+
c.refresh_from_db()
320+
self.assertEqual(c.status, "applied")
321+
322+
def test_device_checksum_no_reconcile_within_grace_period(self):
323+
"""Status should not be reconciled if within the grace period."""
324+
d = self._create_device_config()
325+
c = d.config
326+
c.set_status_modified()
327+
url = reverse("controller:device_checksum", args=[d.pk])
328+
response = self.client.get(url, {"key": d.key})
329+
self.assertEqual(response.status_code, 200)
330+
c.refresh_from_db()
331+
self.assertEqual(c.status, "modified")
332+
273333
def test_device_checksum_bad_uuid(self):
274334
d = self._create_device_config()
275335
pk = "{}-wrong".format(d.pk)

0 commit comments

Comments
 (0)