Skip to content

Commit 44caf6d

Browse files
perf: address PR review feedback for large-scale delete safety
- Stream finding IDs via iterator()+batched instead of materializing the full ID list into memory. Prevents OOM on 4.5M+ finding deletes. - Add SET LOCAL statement_timeout (300s) and deadlock error logging to cascade_delete SQL execution. Prevents runaway queries from holding locks indefinitely and surfaces deadlock errors in logs. - Reuse scope_ids subquery variable and replace .exists()+.count() with a single .count() call to avoid evaluating the subquery twice. - Add comment explaining why FileUpload uses per-object ORM delete (custom delete() removes files from disk; file attachments are rare). - Scope fix_loop_duplicates to the deletion set instead of scanning the full findings table. The double self-join is cheap when filtered to only findings in the scope being deleted. - Document that pre_bulk_delete_findings signal receivers must not materialize the full queryset (use .filter()/.iterator() instead). - Add skip_m2m_for parameter to cascade_delete so bulk_delete_findings can tell it Finding M2M was already cleaned by bulk_clear_finding_m2m, avoiding redundant tag count aggregation queries.
1 parent 161abd0 commit 44caf6d

4 files changed

Lines changed: 71 additions & 32 deletions

File tree

dojo/finding/helper.py

Lines changed: 33 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -632,13 +632,13 @@ def prepare_duplicates_for_delete(obj):
632632

633633
logger.debug("prepare_duplicates_for_delete: %s %d", type(obj).__name__, obj.id)
634634

635-
# should not be needed in normal healthy instances.
636-
# but in that case it's a cheap count query and we might as well run it to be safe
637-
fix_loop_duplicates()
638-
639635
# Build scope as a subquery — never materialized into Python memory
640636
scope_ids_subquery = Finding.objects.filter(**{scope_field: obj}).values_list("id", flat=True)
641637

638+
# Fix any transitive duplicate loops within scope before reconfiguring clusters.
639+
# Scoped to the deletion set to avoid a full-table self-join on large instances.
640+
fix_loop_duplicates(scope_qs=Finding.objects.filter(**{scope_field: obj}))
641+
642642
if not scope_ids_subquery.exists():
643643
logger.debug("no findings in scope, nothing to prepare")
644644
return
@@ -747,7 +747,10 @@ def bulk_clear_finding_m2m(finding_qs):
747747
count, through_model._meta.db_table,
748748
)
749749

750-
# Delete FileUpload objects via ORM so custom delete() removes files from disk
750+
# Delete FileUpload objects via ORM one-by-one so the custom
751+
# FileUpload.delete() method fires and removes files from disk storage.
752+
# Bulk deletion would orphan files on disk. File attachments are uncommon
753+
# so the per-object overhead is negligible in practice.
751754
if file_ids:
752755
for file_upload in FileUpload.objects.filter(id__in=file_ids).iterator():
753756
file_upload.delete()
@@ -771,41 +774,52 @@ def bulk_delete_findings(finding_qs, chunk_size=1000):
771774

772775
pre_bulk_delete_findings.send(sender=Finding, finding_qs=finding_qs)
773776
bulk_clear_finding_m2m(finding_qs)
774-
finding_ids = list(finding_qs.values_list("id", flat=True).order_by("id"))
775-
total_chunks = (len(finding_ids) + chunk_size - 1) // chunk_size
776-
for i in range(0, len(finding_ids), chunk_size):
777-
chunk = finding_ids[i:i + chunk_size]
777+
for chunk_num, chunk_ids in enumerate(
778+
batched(
779+
finding_qs.values_list("id", flat=True).order_by("id").iterator(chunk_size=chunk_size),
780+
chunk_size,
781+
strict=False,
782+
),
783+
start=1,
784+
):
778785
with transaction.atomic():
779-
cascade_delete(Finding, Finding.objects.filter(id__in=chunk), skip_relations={Finding})
786+
cascade_delete(Finding, Finding.objects.filter(id__in=chunk_ids), skip_relations={Finding}, skip_m2m_for={Finding})
780787
logger.info(
781-
"bulk_delete_findings: deleted chunk %d/%d (%d findings)",
782-
i // chunk_size + 1, total_chunks, len(chunk),
788+
"bulk_delete_findings: deleted chunk %d (%d findings)",
789+
chunk_num, len(chunk_ids),
783790
)
784791

785792

786-
def fix_loop_duplicates():
793+
def fix_loop_duplicates(scope_qs=None):
787794
"""Due to bugs in the past and even currently when under high parallel load, there can be transitive duplicates."""
788795
""" i.e. A -> B -> C. This can lead to problems when deleting findingns, performing deduplication, etc """
789796
# Build base queryset without selecting full rows to minimize memory
790-
loop_qs = Finding.objects.filter(duplicate_finding__isnull=False, original_finding__isnull=False)
797+
base_qs = Finding.objects.filter(duplicate_finding__isnull=False, original_finding__isnull=False)
798+
if scope_qs is not None:
799+
base_qs = base_qs.filter(id__in=scope_qs.values_list("id", flat=True))
791800

792801
# Use COUNT(*) at the DB instead of materializing the queryset
793-
loop_count = loop_qs.count()
802+
loop_count = base_qs.count()
794803

795804
if loop_count > 0:
796805
deduplicationLogger.warning("fix_loop_duplicates: found %d findings with duplicate loops", loop_count)
797806
# Stream IDs only in descending order to avoid loading full Finding rows
798-
for find_id in loop_qs.order_by("-id").values_list("id", flat=True).iterator(chunk_size=1000):
807+
for find_id in base_qs.order_by("-id").values_list("id", flat=True).iterator(chunk_size=1000):
799808
deduplicationLogger.warning("fix_loop_duplicates: fixing loop for finding %d", find_id)
800809
removeLoop(find_id, 50)
801810

802-
new_originals = Finding.objects.filter(duplicate_finding__isnull=True, duplicate=True)
803-
for f in new_originals:
811+
new_originals_qs = Finding.objects.filter(duplicate_finding__isnull=True, duplicate=True)
812+
if scope_qs is not None:
813+
new_originals_qs = new_originals_qs.filter(id__in=scope_qs.values_list("id", flat=True))
814+
for f in new_originals_qs:
804815
deduplicationLogger.info(f"New Original: {f.id}")
805816
f.duplicate = False
806817
super(Finding, f).save(skip_validation=True)
807818

808-
loop_count = Finding.objects.filter(duplicate_finding__isnull=False, original_finding__isnull=False).count()
819+
recheck_qs = Finding.objects.filter(duplicate_finding__isnull=False, original_finding__isnull=False)
820+
if scope_qs is not None:
821+
recheck_qs = recheck_qs.filter(id__in=scope_qs.values_list("id", flat=True))
822+
loop_count = recheck_qs.count()
809823
deduplicationLogger.info(f"{loop_count} Finding found which still has Loops, please run fix loop duplicates again")
810824
return loop_count
811825

dojo/signals.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,8 @@
33
# Sent before bulk-deleting findings via cascade_delete.
44
# Receivers can dispatch integrator notifications, collect metrics, etc.
55
# Provides: finding_qs (QuerySet of findings about to be deleted)
6+
#
7+
# IMPORTANT: The queryset may contain millions of rows. Receivers MUST NOT
8+
# call list(), len(), or otherwise materialize the full queryset into memory.
9+
# Use .filter(), .iterator(), or aggregation queries instead.
610
pre_bulk_delete_findings = Signal()

dojo/utils.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2094,13 +2094,15 @@ def async_delete_task(obj, **kwargs):
20942094
# Step 3: Delete outside-scope duplicates first — these point to findings
20952095
# in the main scope via duplicate_finding FK, so they must be removed before
20962096
# the originals to avoid FK violations during chunked deletion.
2097+
scope_ids = finding_qs.values_list("id", flat=True)
20972098
outside_dupes_qs = (
2098-
Finding.objects.filter(duplicate_finding_id__in=finding_qs.values_list("id", flat=True))
2099-
.exclude(id__in=finding_qs.values_list("id", flat=True))
2099+
Finding.objects.filter(duplicate_finding_id__in=scope_ids)
2100+
.exclude(id__in=scope_ids)
21002101
)
21012102
chunk_size = get_setting("ASYNC_OBEJECT_DELETE_CHUNK_SIZE")
2102-
if outside_dupes_qs.exists():
2103-
logger.info("ASYNC_DELETE: Deleting %d outside-scope duplicates first", outside_dupes_qs.count())
2103+
outside_count = outside_dupes_qs.count()
2104+
if outside_count:
2105+
logger.info("ASYNC_DELETE: Deleting %d outside-scope duplicates first", outside_count)
21042106
bulk_delete_findings(outside_dupes_qs, chunk_size=chunk_size)
21052107

21062108
# Step 4: Delete the main scope findings
@@ -2109,6 +2111,8 @@ def async_delete_task(obj, **kwargs):
21092111
# Step 5: Delete the top-level object and all remaining children (Tests,
21102112
# Engagements, Endpoints, etc.) via cascade_delete. Findings are already
21112113
# gone, so skip_relations={Finding} avoids walking empty relations.
2114+
# Single transaction is fine here — the heavy relations (Findings,
2115+
# Endpoint_Status) are already deleted; only lightweight rows remain.
21122116
pk_query = type(obj).objects.filter(pk=obj.pk)
21132117
with transaction.atomic():
21142118
cascade_delete(type(obj), pk_query, skip_relations={Finding})

dojo/utils_cascade_delete.py

Lines changed: 26 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
import logging
1212

13-
from django.db import models, transaction
13+
from django.db import OperationalError, models, transaction
1414
from django.db.models.sql.compiler import SQLDeleteCompiler
1515

1616
logger = logging.getLogger(__name__)
@@ -35,11 +35,19 @@ def get_update_sql(query, **updatespec):
3535
return q.get_compiler(query.db).as_sql()
3636

3737

38+
STATEMENT_TIMEOUT = "300s"
39+
40+
3841
def execute_compiled_sql(sql, params=None):
3942
"""Execute compiled SQL directly via connection.cursor()."""
40-
with transaction.get_connection().cursor() as cur:
41-
cur.execute(sql, params or None)
42-
return cur.rowcount
43+
try:
44+
with transaction.get_connection().cursor() as cur:
45+
cur.execute(f"SET LOCAL statement_timeout = '{STATEMENT_TIMEOUT}'")
46+
cur.execute(sql, params or None)
47+
return cur.rowcount
48+
except OperationalError:
49+
logger.exception("cascade_delete SQL failed (possible deadlock or timeout): %s", sql[:200])
50+
raise
4351

4452

4553
def execute_delete_sql(query):
@@ -52,7 +60,7 @@ def execute_update_sql(query, **updatespec):
5260
return execute_compiled_sql(*get_update_sql(query, **updatespec))
5361

5462

55-
def cascade_delete(from_model, instance_pk_query, skip_relations=None, base_model=None, level=0):
63+
def cascade_delete(from_model, instance_pk_query, skip_relations=None, skip_m2m_for=None, base_model=None, level=0):
5664
"""
5765
Recursively walk Django model relations and execute compiled SQL
5866
to perform cascade DELETE / SET_NULL without the Collector.
@@ -67,6 +75,8 @@ def cascade_delete(from_model, instance_pk_query, skip_relations=None, base_mode
6775
from_model: The model class to delete from.
6876
instance_pk_query: QuerySet selecting the records to delete.
6977
skip_relations: Set of model classes to skip (e.g. self-referential FKs).
78+
skip_m2m_for: Set of model classes whose M2M cleanup was already done
79+
by the caller (avoids redundant tag count queries).
7080
base_model: Root model class (set automatically on first call).
7181
level: Recursion depth (for logging only).
7282
@@ -76,6 +86,8 @@ def cascade_delete(from_model, instance_pk_query, skip_relations=None, base_mode
7686
"""
7787
if skip_relations is None:
7888
skip_relations = set()
89+
if skip_m2m_for is None:
90+
skip_m2m_for = set()
7991
if base_model is None:
8092
base_model = from_model
8193

@@ -122,6 +134,7 @@ def cascade_delete(from_model, instance_pk_query, skip_relations=None, base_mode
122134
cascade_delete(
123135
related_model, related_pk_query,
124136
skip_relations=skip_relations,
137+
skip_m2m_for=skip_m2m_for,
125138
base_model=base_model,
126139
level=level + 1,
127140
)
@@ -139,15 +152,19 @@ def cascade_delete(from_model, instance_pk_query, skip_relations=None, base_mode
139152
)
140153

141154
# Clear M2M through tables before deleting (not discovered by _meta.related_objects).
142-
# Tag fields are handled via bulk_remove_all_tags to maintain tag counts correctly.
143-
from dojo.tag_utils import bulk_remove_all_tags # noqa: PLC0415 circular import
155+
# Skip if the caller already handled M2M cleanup for this model (e.g. bulk_clear_finding_m2m).
156+
if from_model not in skip_m2m_for:
157+
from dojo.tag_utils import bulk_remove_all_tags # noqa: PLC0415 circular import
144158

145-
bulk_remove_all_tags(from_model, instance_pk_query)
159+
bulk_remove_all_tags(from_model, instance_pk_query)
146160

147161
for m2m_field in from_model._meta.many_to_many:
148-
# Skip tag fields — already handled above
162+
# Skip tag fields — handled by bulk_remove_all_tags above
149163
if hasattr(m2m_field, "tag_options"):
150164
continue
165+
# Skip if caller already cleaned M2M for this model
166+
if from_model in skip_m2m_for:
167+
continue
151168
through_model = m2m_field.remote_field.through
152169
fk_column = None
153170
for field in through_model._meta.get_fields():

0 commit comments

Comments
 (0)