perf: address PR review feedback for large-scale delete safety

valentijnscholten · valentijnscholten · commit 44caf6db2a36 · 2026-03-24T21:49:09.000+01:00
- Stream finding IDs via iterator()+batched instead of materializing
  the full ID list into memory. Prevents OOM on 4.5M+ finding deletes.

- Add SET LOCAL statement_timeout (300s) and deadlock error logging to
  cascade_delete SQL execution. Prevents runaway queries from holding
  locks indefinitely and surfaces deadlock errors in logs.

- Reuse a single scope_ids subquery variable in async_delete_task, and replace
  .exists()+.count() with one .count() call so the query runs only once.

- Add comment explaining why FileUpload uses per-object ORM delete
  (custom delete() removes files from disk; file attachments are rare).

- Add an optional scope_qs argument to fix_loop_duplicates and pass the
  deletion set from prepare_duplicates_for_delete, so the double self-join
  no longer scans the full findings table (unscoped calls are unchanged).

- Document that pre_bulk_delete_findings signal receivers must not
  materialize the full queryset (use .filter()/.iterator() instead).

- Add skip_m2m_for parameter to cascade_delete so bulk_delete_findings
  can signal that Finding M2M rows were already cleared by
  bulk_clear_finding_m2m, avoiding redundant tag count aggregation queries.
diff --git a/dojo/finding/helper.py b/dojo/finding/helper.py
@@ -632,13 +632,13 @@ def prepare_duplicates_for_delete(obj):
 
     logger.debug("prepare_duplicates_for_delete: %s %d", type(obj).__name__, obj.id)
 
-    # should not be needed in normal healthy instances.
-    # but in that case it's a cheap count query and we might as well run it to be safe
-    fix_loop_duplicates()
-
     # Build scope as a subquery — never materialized into Python memory
     scope_ids_subquery = Finding.objects.filter(**{scope_field: obj}).values_list("id", flat=True)
 
+    # Fix any transitive duplicate loops within scope before reconfiguring clusters.
+    # Scoped to the deletion set to avoid a full-table self-join on large instances.
+    fix_loop_duplicates(scope_qs=Finding.objects.filter(**{scope_field: obj}))
+
     if not scope_ids_subquery.exists():
         logger.debug("no findings in scope, nothing to prepare")
         return
@@ -747,7 +747,10 @@ def bulk_clear_finding_m2m(finding_qs):
                     count, through_model._meta.db_table,
                 )
 
-    # Delete FileUpload objects via ORM so custom delete() removes files from disk
+    # Delete FileUpload objects via ORM one-by-one so the custom
+    # FileUpload.delete() method fires and removes files from disk storage.
+    # Bulk deletion would orphan files on disk. File attachments are uncommon
+    # so the per-object overhead is negligible in practice.
     if file_ids:
         for file_upload in FileUpload.objects.filter(id__in=file_ids).iterator():
             file_upload.delete()
@@ -771,41 +774,52 @@ def bulk_delete_findings(finding_qs, chunk_size=1000):
 
     pre_bulk_delete_findings.send(sender=Finding, finding_qs=finding_qs)
     bulk_clear_finding_m2m(finding_qs)
-    finding_ids = list(finding_qs.values_list("id", flat=True).order_by("id"))
-    total_chunks = (len(finding_ids) + chunk_size - 1) // chunk_size
-    for i in range(0, len(finding_ids), chunk_size):
-        chunk = finding_ids[i:i + chunk_size]
+    for chunk_num, chunk_ids in enumerate(
+        batched(
+            finding_qs.values_list("id", flat=True).order_by("id").iterator(chunk_size=chunk_size),
+            chunk_size,
+            strict=False,
+        ),
+        start=1,
+    ):
         with transaction.atomic():
-            cascade_delete(Finding, Finding.objects.filter(id__in=chunk), skip_relations={Finding})
+            cascade_delete(Finding, Finding.objects.filter(id__in=chunk_ids), skip_relations={Finding}, skip_m2m_for={Finding})
         logger.info(
-            "bulk_delete_findings: deleted chunk %d/%d (%d findings)",
-            i // chunk_size + 1, total_chunks, len(chunk),
+            "bulk_delete_findings: deleted chunk %d (%d findings)",
+            chunk_num, len(chunk_ids),
         )
 
 
-def fix_loop_duplicates():
+def fix_loop_duplicates(scope_qs=None):
     """Due to bugs in the past and even currently when under high parallel load, there can be transitive duplicates."""
     """ i.e. A -> B -> C. This can lead to problems when deleting findingns, performing deduplication, etc """
     # Build base queryset without selecting full rows to minimize memory
-    loop_qs = Finding.objects.filter(duplicate_finding__isnull=False, original_finding__isnull=False)
+    base_qs = Finding.objects.filter(duplicate_finding__isnull=False, original_finding__isnull=False)
+    if scope_qs is not None:
+        base_qs = base_qs.filter(id__in=scope_qs.values_list("id", flat=True))
 
     # Use COUNT(*) at the DB instead of materializing the queryset
-    loop_count = loop_qs.count()
+    loop_count = base_qs.count()
 
     if loop_count > 0:
         deduplicationLogger.warning("fix_loop_duplicates: found %d findings with duplicate loops", loop_count)
         # Stream IDs only in descending order to avoid loading full Finding rows
-        for find_id in loop_qs.order_by("-id").values_list("id", flat=True).iterator(chunk_size=1000):
+        for find_id in base_qs.order_by("-id").values_list("id", flat=True).iterator(chunk_size=1000):
             deduplicationLogger.warning("fix_loop_duplicates: fixing loop for finding %d", find_id)
             removeLoop(find_id, 50)
 
-        new_originals = Finding.objects.filter(duplicate_finding__isnull=True, duplicate=True)
-        for f in new_originals:
+        new_originals_qs = Finding.objects.filter(duplicate_finding__isnull=True, duplicate=True)
+        if scope_qs is not None:
+            new_originals_qs = new_originals_qs.filter(id__in=scope_qs.values_list("id", flat=True))
+        for f in new_originals_qs:
             deduplicationLogger.info(f"New Original: {f.id}")
             f.duplicate = False
             super(Finding, f).save(skip_validation=True)
 
-        loop_count = Finding.objects.filter(duplicate_finding__isnull=False, original_finding__isnull=False).count()
+        recheck_qs = Finding.objects.filter(duplicate_finding__isnull=False, original_finding__isnull=False)
+        if scope_qs is not None:
+            recheck_qs = recheck_qs.filter(id__in=scope_qs.values_list("id", flat=True))
+        loop_count = recheck_qs.count()
         deduplicationLogger.info(f"{loop_count} Finding found which still has Loops, please run fix loop duplicates again")
     return loop_count
 
diff --git a/dojo/signals.py b/dojo/signals.py
@@ -3,4 +3,8 @@
 # Sent before bulk-deleting findings via cascade_delete.
 # Receivers can dispatch integrator notifications, collect metrics, etc.
 # Provides: finding_qs (QuerySet of findings about to be deleted)
+#
+# IMPORTANT: The queryset may contain millions of rows. Receivers MUST NOT
+# call list(), len(), or otherwise materialize the full queryset into memory.
+# Use .filter(), .iterator(), or aggregation queries instead.
 pre_bulk_delete_findings = Signal()
diff --git a/dojo/utils.py b/dojo/utils.py
@@ -2094,13 +2094,15 @@ def async_delete_task(obj, **kwargs):
         # Step 3: Delete outside-scope duplicates first — these point to findings
         # in the main scope via duplicate_finding FK, so they must be removed before
         # the originals to avoid FK violations during chunked deletion.
+        scope_ids = finding_qs.values_list("id", flat=True)
         outside_dupes_qs = (
-            Finding.objects.filter(duplicate_finding_id__in=finding_qs.values_list("id", flat=True))
-            .exclude(id__in=finding_qs.values_list("id", flat=True))
+            Finding.objects.filter(duplicate_finding_id__in=scope_ids)
+            .exclude(id__in=scope_ids)
         )
         chunk_size = get_setting("ASYNC_OBEJECT_DELETE_CHUNK_SIZE")
-        if outside_dupes_qs.exists():
-            logger.info("ASYNC_DELETE: Deleting %d outside-scope duplicates first", outside_dupes_qs.count())
+        outside_count = outside_dupes_qs.count()
+        if outside_count:
+            logger.info("ASYNC_DELETE: Deleting %d outside-scope duplicates first", outside_count)
             bulk_delete_findings(outside_dupes_qs, chunk_size=chunk_size)
 
         # Step 4: Delete the main scope findings
@@ -2109,6 +2111,8 @@ def async_delete_task(obj, **kwargs):
     # Step 5: Delete the top-level object and all remaining children (Tests,
     # Engagements, Endpoints, etc.) via cascade_delete. Findings are already
     # gone, so skip_relations={Finding} avoids walking empty relations.
+    # Single transaction is fine here — the heavy relations (Findings,
+    # Endpoint_Status) are already deleted; only lightweight rows remain.
     pk_query = type(obj).objects.filter(pk=obj.pk)
     with transaction.atomic():
         cascade_delete(type(obj), pk_query, skip_relations={Finding})
diff --git a/dojo/utils_cascade_delete.py b/dojo/utils_cascade_delete.py
@@ -10,7 +10,7 @@
 
 import logging
 
-from django.db import models, transaction
+from django.db import OperationalError, models, transaction
 from django.db.models.sql.compiler import SQLDeleteCompiler
 
 logger = logging.getLogger(__name__)
@@ -35,11 +35,19 @@ def get_update_sql(query, **updatespec):
     return q.get_compiler(query.db).as_sql()
 
 
+STATEMENT_TIMEOUT = "300s"
+
+
 def execute_compiled_sql(sql, params=None):
     """Execute compiled SQL directly via connection.cursor()."""
-    with transaction.get_connection().cursor() as cur:
-        cur.execute(sql, params or None)
-        return cur.rowcount
+    try:
+        with transaction.get_connection().cursor() as cur:
+            cur.execute(f"SET LOCAL statement_timeout = '{STATEMENT_TIMEOUT}'")
+            cur.execute(sql, params or None)
+            return cur.rowcount
+    except OperationalError:
+        logger.exception("cascade_delete SQL failed (possible deadlock or timeout): %s", sql[:200])
+        raise
 
 
 def execute_delete_sql(query):
@@ -52,7 +60,7 @@ def execute_update_sql(query, **updatespec):
     return execute_compiled_sql(*get_update_sql(query, **updatespec))
 
 
-def cascade_delete(from_model, instance_pk_query, skip_relations=None, base_model=None, level=0):
+def cascade_delete(from_model, instance_pk_query, skip_relations=None, skip_m2m_for=None, base_model=None, level=0):
     """
     Recursively walk Django model relations and execute compiled SQL
     to perform cascade DELETE / SET_NULL without the Collector.
@@ -67,6 +75,8 @@ def cascade_delete(from_model, instance_pk_query, skip_relations=None, base_mode
         from_model: The model class to delete from.
         instance_pk_query: QuerySet selecting the records to delete.
         skip_relations: Set of model classes to skip (e.g. self-referential FKs).
+        skip_m2m_for: Set of model classes whose M2M cleanup was already done
+                      by the caller (avoids redundant tag count queries).
         base_model: Root model class (set automatically on first call).
         level: Recursion depth (for logging only).
 
@@ -76,6 +86,8 @@ def cascade_delete(from_model, instance_pk_query, skip_relations=None, base_mode
     """
     if skip_relations is None:
         skip_relations = set()
+    if skip_m2m_for is None:
+        skip_m2m_for = set()
     if base_model is None:
         base_model = from_model
 
@@ -122,6 +134,7 @@ def cascade_delete(from_model, instance_pk_query, skip_relations=None, base_mode
             cascade_delete(
                 related_model, related_pk_query,
                 skip_relations=skip_relations,
+                skip_m2m_for=skip_m2m_for,
                 base_model=base_model,
                 level=level + 1,
             )
@@ -139,15 +152,19 @@ def cascade_delete(from_model, instance_pk_query, skip_relations=None, base_mode
             )
 
     # Clear M2M through tables before deleting (not discovered by _meta.related_objects).
-    # Tag fields are handled via bulk_remove_all_tags to maintain tag counts correctly.
-    from dojo.tag_utils import bulk_remove_all_tags  # noqa: PLC0415 circular import
+    # Skip if the caller already handled M2M cleanup for this model (e.g. bulk_clear_finding_m2m).
+    if from_model not in skip_m2m_for:
+        from dojo.tag_utils import bulk_remove_all_tags  # noqa: PLC0415 circular import
 
-    bulk_remove_all_tags(from_model, instance_pk_query)
+        bulk_remove_all_tags(from_model, instance_pk_query)
 
     for m2m_field in from_model._meta.many_to_many:
-        # Skip tag fields — already handled above
+        # Skip tag fields — handled by bulk_remove_all_tags above
         if hasattr(m2m_field, "tag_options"):
             continue
+        # Skip if caller already cleaned M2M for this model
+        if from_model in skip_m2m_for:
+            continue
         through_model = m2m_field.remote_field.through
         fk_column = None
         for field in through_model._meta.get_fields():