Skip to content

Commit c1b47bc

Browse files
ldrozdz93 and claude committed
perf(plugin): retry batch apply on Postgres deadlock
The whole batch runs inside an outer transaction.atomic(), so all data + tag + audit writes for the batch become visible to readers in one atomic transition. The trade-off is a longer lock-hold window per batch, which can cause Postgres deadlocks (SQLSTATE 40P01) when two concurrent batches touch the same rows in different orders.

Wrap BulkApplyView's outer transaction.atomic() in a bounded retry loop. On a Postgres deadlock or serialization failure (40001), the whole batch transaction is rolled back; the retry loop re-enters with fresh deferred_changelog accumulators (no stale state from the aborted attempt).

Behaviour:
- New plugin setting: batch_apply_deadlock_retry_max_count, default 3 (three retries after the initial attempt = up to 4 attempts total).
- 0 disables retries entirely.
- Backoff is jittered exponential: 50ms * 2^attempt * U(0.5, 1.5).
- On success after retries, the response carries X-Diode-Batch-Retries: <n> so the reconciler-side log can correlate.
- On exhaustion, the original OperationalError is re-raised; caller handles the failure as before.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent fae4c3d commit c1b47bc

3 files changed

Lines changed: 72 additions & 7 deletions

File tree

netbox_diode_plugin/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,12 @@ class NetBoxDiodePluginConfig(PluginConfig):
4747
# Override the displayed Diode target URL without affecting internal
4848
# communication (e.g. to show the external ingress address).
4949
"diode_target_display": None,
50+
51+
# Max number of retries when the batch apply endpoint hits a
52+
# Postgres deadlock (40P01) or serialization failure (40001).
53+
# 0 disables retries; default 3 means up to three retries after
54+
# the initial attempt (4 attempts total).
55+
"batch_apply_deadlock_retry_max_count": 3,
5056
}
5157

5258

netbox_diode_plugin/api/views.py

Lines changed: 59 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,18 @@
22
# Copyright 2025 NetBox Labs, Inc.
33
"""Diode NetBox Plugin - API Views."""
44
import logging
5+
import random
56
import re
7+
import time
68

79
from django.apps import apps
810
from django.db import transaction
11+
from django.db.utils import OperationalError
912
from rest_framework import status, views
1013
from rest_framework.exceptions import ValidationError
1114
from rest_framework.response import Response
1215

16+
from ..plugin_config import get_batch_apply_deadlock_retry_max_count
1317
from .applier import apply_changeset
1418
from .authentication import DiodeOAuth2Authentication
1519
from .common import (
@@ -27,6 +31,20 @@
2731
require_scopes,
2832
)
2933

34+
# Postgres SQLSTATEs we retry on. 40P01 is deadlock_detected; 40001 is
35+
# serialization_failure (can show up on the SERIALIZABLE isolation
36+
# level path, harmless to retry the same way).
37+
_DEADLOCK_PGCODES = ("40P01", "40001")
38+
39+
40+
def _extract_pgcode(exc: OperationalError) -> str | None:
41+
"""Return the Postgres SQLSTATE on a wrapped Django OperationalError, or None."""
42+
inner = exc.__cause__
43+
pgcode = getattr(inner, "pgcode", None)
44+
if pgcode is None:
45+
pgcode = getattr(exc, "pgcode", None)
46+
return pgcode
47+
3048
logger = logging.getLogger("netbox.diode_data")
3149

3250

@@ -364,6 +382,46 @@ def _post(self, request, *args, **kwargs):
364382
if len(change_sets) == 0:
365383
raise ValidationError({"change_sets": ["change_sets must not be empty"]})
366384

385+
max_retries = int(get_batch_apply_deadlock_retry_max_count() or 0)
386+
attempt = 0
387+
while True:
388+
try:
389+
results = self._apply_batch(change_sets, request)
390+
break
391+
except OperationalError as exc:
392+
pgcode = _extract_pgcode(exc)
393+
if pgcode not in _DEADLOCK_PGCODES:
394+
raise
395+
if attempt >= max_retries:
396+
logger.error(
397+
"batch apply: deadlock retries exhausted "
398+
"(attempts=%d, pgcode=%s)",
399+
attempt + 1, pgcode,
400+
)
401+
raise
402+
# Jittered exponential backoff: 50ms * 2^attempt * U(0.5, 1.5)
403+
sleep_s = 0.05 * (2 ** attempt) * (0.5 + random.random())
404+
logger.warning(
405+
"batch apply: %s, retrying "
406+
"(attempt=%d/%d, sleep=%.3fs)",
407+
"deadlock" if pgcode == "40P01" else "serialization failure",
408+
attempt + 1, max_retries + 1, sleep_s,
409+
)
410+
time.sleep(sleep_s)
411+
attempt += 1
412+
413+
http_status = (
414+
status.HTTP_207_MULTI_STATUS
415+
if any(r.get("errors") for r in results)
416+
else status.HTTP_200_OK
417+
)
418+
resp = Response({"results": results}, status=http_status)
419+
if attempt > 0:
420+
resp["X-Diode-Batch-Retries"] = str(attempt)
421+
return resp
422+
423+
def _apply_batch(self, change_sets, request):
424+
"""Apply all changesets in one outer transaction; caller wraps for retries."""
367425
results = []
368426
with deferred_changelog() as defc:
369427
for entry in change_sets:
@@ -390,13 +448,7 @@ def _post(self, request, *args, **kwargs):
390448
).to_dict()
391449
results.append(result)
392450
defc.flush()
393-
394-
http_status = (
395-
status.HTTP_207_MULTI_STATUS
396-
if any(r.get("errors") for r in results)
397-
else status.HTTP_200_OK
398-
)
399-
return Response({"results": results}, status=http_status)
451+
return results
400452

401453

402454
class GetDefaultBranchView(views.APIView):

netbox_diode_plugin/plugin_config.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from netbox.plugins import get_plugin_config
1111

1212
__all__ = (
13+
"get_batch_apply_deadlock_retry_max_count",
1314
"get_diode_auth_introspect_url",
1415
"get_diode_user",
1516
)
@@ -99,3 +100,9 @@ def get_diode_user():
99100
def get_required_token_audience():
100101
"""Returns the require token audience."""
101102
return get_plugin_config("netbox_diode_plugin", "required_token_audience")
103+
104+
def get_batch_apply_deadlock_retry_max_count():
    """Return the ``batch_apply_deadlock_retry_max_count`` plugin setting.

    The value is looked up through ``get_plugin_config`` for the
    ``netbox_diode_plugin`` plugin and returned unmodified.
    """
    setting_name = "batch_apply_deadlock_retry_max_count"
    return get_plugin_config("netbox_diode_plugin", setting_name)

0 commit comments

Comments
 (0)