Skip to content

Commit de8aa33

Browse files
ldrozdz93claude
andcommitted
perf(plugin): retry batch apply on Postgres deadlock
The whole batch runs inside an outer transaction.atomic(), so all data + tag + audit writes for the batch become visible to readers in one atomic transition. The trade-off is a longer lock-hold window per batch, which can cause Postgres deadlocks (SQLSTATE 40P01) when two concurrent batches touch the same rows in different orders. Wrap BulkApplyView's outer transaction.atomic() in a bounded retry loop. On a Postgres deadlock or serialization failure (40001), the whole batch transaction is rolled back; the retry loop re-enters with fresh deferred_changelog accumulators (no stale state from the aborted attempt). Behaviour: - New plugin setting: batch_apply_deadlock_retry_max_count, default 3 (three retries after the initial attempt = up to 4 attempts total). - 0 disables retries entirely. - Backoff is jittered exponential: 50ms * 2^attempt * U(0.5, 1.5). - On success after retries, the response carries X-Diode-Batch-Retries: <n> so the reconciler-side log can correlate. - On exhaustion, the original OperationalError is re-raised; caller handles the failure as before. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent a126d0d commit de8aa33

3 files changed

Lines changed: 72 additions & 7 deletions

File tree

netbox_diode_plugin/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,12 @@ class NetBoxDiodePluginConfig(PluginConfig):
4747
# Override the displayed Diode target URL without affecting internal
4848
# communication (e.g. to show the external ingress address).
4949
"diode_target_display": None,
50+
51+
# Max number of retries when the batch apply endpoint hits a
52+
# Postgres deadlock (40P01) or serialization failure (40001).
53+
# 0 disables retries; default 3 means up to three retries after
54+
# the initial attempt (4 attempts total).
55+
"batch_apply_deadlock_retry_max_count": 3,
5056
}
5157

5258

netbox_diode_plugin/api/views.py

Lines changed: 59 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,18 @@
22
# Copyright 2025 NetBox Labs, Inc.
33
"""Diode NetBox Plugin - API Views."""
44
import logging
5+
import random
56
import re
7+
import time
68

79
from django.apps import apps
810
from django.db import transaction
11+
from django.db.utils import OperationalError
912
from rest_framework import status, views
1013
from rest_framework.exceptions import ValidationError
1114
from rest_framework.response import Response
1215

16+
from ..plugin_config import get_batch_apply_deadlock_retry_max_count
1317
from .applier import apply_changeset
1418
from .authentication import DiodeOAuth2Authentication
1519
from .common import (
@@ -27,6 +31,20 @@
2731
require_scopes,
2832
)
2933

34+
# Postgres SQLSTATEs we retry on. 40P01 is deadlock_detected; 40001 is
35+
# serialization_failure (can show up on the SERIALIZABLE isolation
36+
# level path, harmless to retry the same way).
37+
_DEADLOCK_PGCODES = ("40P01", "40001")
38+
39+
40+
def _extract_pgcode(exc: OperationalError) -> str | None:
41+
"""Return the Postgres SQLSTATE on a wrapped Django OperationalError, or None."""
42+
inner = exc.__cause__
43+
pgcode = getattr(inner, "pgcode", None)
44+
if pgcode is None:
45+
pgcode = getattr(exc, "pgcode", None)
46+
return pgcode
47+
3048
logger = logging.getLogger("netbox.diode_data")
3149

3250

@@ -367,6 +385,46 @@ def _post(self, request, *args, **kwargs):
367385
if len(change_sets) == 0:
368386
raise ValidationError({"change_sets": ["change_sets must not be empty"]})
369387

388+
max_retries = int(get_batch_apply_deadlock_retry_max_count() or 0)
389+
attempt = 0
390+
while True:
391+
try:
392+
results = self._apply_batch(change_sets, request)
393+
break
394+
except OperationalError as exc:
395+
pgcode = _extract_pgcode(exc)
396+
if pgcode not in _DEADLOCK_PGCODES:
397+
raise
398+
if attempt >= max_retries:
399+
logger.error(
400+
"batch apply: deadlock retries exhausted "
401+
"(attempts=%d, pgcode=%s)",
402+
attempt + 1, pgcode,
403+
)
404+
raise
405+
# Jittered exponential backoff: 50ms * 2^attempt * U(0.5, 1.5)
406+
sleep_s = 0.05 * (2 ** attempt) * (0.5 + random.random())
407+
logger.warning(
408+
"batch apply: %s, retrying "
409+
"(attempt=%d/%d, sleep=%.3fs)",
410+
"deadlock" if pgcode == "40P01" else "serialization failure",
411+
attempt + 1, max_retries + 1, sleep_s,
412+
)
413+
time.sleep(sleep_s)
414+
attempt += 1
415+
416+
http_status = (
417+
status.HTTP_207_MULTI_STATUS
418+
if any(r.get("errors") for r in results)
419+
else status.HTTP_200_OK
420+
)
421+
resp = Response({"results": results}, status=http_status)
422+
if attempt > 0:
423+
resp["X-Diode-Batch-Retries"] = str(attempt)
424+
return resp
425+
426+
def _apply_batch(self, change_sets, request):
427+
"""Apply all changesets in one outer transaction; caller wraps for retries."""
370428
results = []
371429
# Outer transaction.atomic() makes the whole batch atomic to readers:
372430
# data writes + tag-link writes + audit-log bulk_create commit together.
@@ -398,13 +456,7 @@ def _post(self, request, *args, **kwargs):
398456
).to_dict()
399457
results.append(result)
400458
defc.flush()
401-
402-
http_status = (
403-
status.HTTP_207_MULTI_STATUS
404-
if any(r.get("errors") for r in results)
405-
else status.HTTP_200_OK
406-
)
407-
return Response({"results": results}, status=http_status)
459+
return results
408460

409461

410462
class GetDefaultBranchView(views.APIView):

netbox_diode_plugin/plugin_config.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from netbox.plugins import get_plugin_config
1111

1212
__all__ = (
13+
"get_batch_apply_deadlock_retry_max_count",
1314
"get_diode_auth_introspect_url",
1415
"get_diode_user",
1516
)
@@ -99,3 +100,9 @@ def get_diode_user():
99100
def get_required_token_audience():
    """Return the plugin's ``required_token_audience`` setting."""
    audience = get_plugin_config("netbox_diode_plugin", "required_token_audience")
    return audience
103+
104+
def get_batch_apply_deadlock_retry_max_count():
    """Return the ``batch_apply_deadlock_retry_max_count`` plugin setting.

    The value is the maximum number of retries the batch apply endpoint
    makes after a Postgres deadlock or serialization failure.
    """
    setting_name = "batch_apply_deadlock_retry_max_count"
    return get_plugin_config("netbox_diode_plugin", setting_name)

0 commit comments

Comments
 (0)