Skip to content

Commit 2282fb6

Browse files
refactor: scope prepare_duplicates_for_delete to full object, not per-engagement
Adds product= and product_type= parameters so the entire deletion scope is handled in one call, avoiding unnecessary reconfiguration of findings that are about to be deleted anyway. Uses subqueries instead of materializing ID sets, and chunks the originals loop with prefetch to bound memory. Reverts finding_delete to use ORM .delete() for single finding cascade deletes.
1 parent 4a79e34 commit 2282fb6

3 files changed

Lines changed: 59 additions & 42 deletions

File tree

dojo/finding/helper.py

Lines changed: 43 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import logging
22
from contextlib import suppress
33
from datetime import datetime
4+
from itertools import batched
45
from time import strftime
56

67
from django.conf import settings
@@ -563,8 +564,7 @@ def finding_delete(instance, **kwargs):
563564
duplicate_cluster = instance.original_finding.all()
564565
if duplicate_cluster:
565566
if settings.DUPLICATE_CLUSTER_CASCADE_DELETE:
566-
# Delete the entire duplicate cluster efficiently via bulk_delete_findings
567-
bulk_delete_findings(duplicate_cluster)
567+
duplicate_cluster.order_by("-id").delete()
568568
else:
569569
reconfigure_duplicate_cluster(instance, duplicate_cluster)
570570
else:
@@ -615,53 +615,65 @@ def reconfigure_duplicate_cluster(original, cluster_outside):
615615
cluster_outside.exclude(id=new_original.id).update(duplicate_finding=new_original)
616616

617617

618-
def prepare_duplicates_for_delete(test=None, engagement=None):
619-
logger.debug("prepare duplicates for delete, test: %s, engagement: %s", test.id if test else None, engagement.id if engagement else None)
620-
if test is None and engagement is None:
621-
logger.warning("nothing to prepare as test and engagement are None")
618+
def prepare_duplicates_for_delete(test=None, engagement=None, product=None, product_type=None):
619+
logger.debug(
620+
"prepare duplicates for delete, test: %s, engagement: %s, product: %s, product_type: %s",
621+
test.id if test else None,
622+
engagement.id if engagement else None,
623+
product.id if product else None,
624+
product_type.id if product_type else None,
625+
)
626+
if test is None and engagement is None and product is None and product_type is None:
627+
logger.warning("nothing to prepare as no scope object provided")
622628
return
623629

624630
# should not be needed in normal healthy instances.
625631
# but in that case it's a cheap count query and we might as well run it to be safe
626632
fix_loop_duplicates()
627633

628-
# Build scope filter
629-
scope_filter = {}
630-
if engagement:
631-
scope_filter["test__engagement"] = engagement
632-
if test:
633-
scope_filter["test"] = test
634+
# Build scope as a subquery — never materialized into Python memory
635+
if product_type:
636+
scope_filter = {"test__engagement__product__prod_type": product_type}
637+
elif product:
638+
scope_filter = {"test__engagement__product": product}
639+
elif engagement:
640+
scope_filter = {"test__engagement": engagement}
641+
else:
642+
scope_filter = {"test": test}
634643

635-
scope_finding_ids = set(
636-
Finding.objects.filter(**scope_filter).values_list("id", flat=True),
637-
)
638-
if not scope_finding_ids:
644+
scope_ids_subquery = Finding.objects.filter(**scope_filter).values_list("id", flat=True)
645+
646+
if not scope_ids_subquery.exists():
639647
logger.debug("no findings in scope, nothing to prepare")
640648
return
641649

642650
# Bulk-reset inside-scope duplicates: single UPDATE instead of per-original mass_model_updater.
643-
# Clears the duplicate_finding FK so Django's Collector won't trip over dangling references
644-
# when deleting findings in this scope.
651+
# Clears the duplicate_finding FK so cascade_delete won't trip over dangling self-references.
645652
inside_reset_count = Finding.objects.filter(
646653
duplicate=True,
647-
duplicate_finding_id__in=scope_finding_ids,
648-
id__in=scope_finding_ids,
654+
duplicate_finding_id__in=scope_ids_subquery,
655+
id__in=scope_ids_subquery,
649656
).update(duplicate_finding=None, duplicate=False)
650657
logger.debug("bulk-reset %d inside-scope duplicates", inside_reset_count)
651658

652659
# Reconfigure outside-scope duplicates: still per-original because each cluster
653660
# needs a new original chosen, status copied, and found_by updated.
654-
# Pre-filter to only originals that have at least one duplicate outside scope,
655-
# avoiding a per-original .exists() check.
656-
originals_with_outside_dupes = Finding.objects.filter(
657-
id__in=scope_finding_ids,
658-
original_finding__in=Finding.objects.exclude(id__in=scope_finding_ids),
659-
).distinct().prefetch_related("original_finding")
660-
661-
for original in originals_with_outside_dupes:
662-
# Inside-scope duplicates were already unlinked by the bulk UPDATE above,
663-
# so original_finding.all() now only contains outside-scope duplicates.
664-
reconfigure_duplicate_cluster(original, original.original_finding.all())
661+
# Chunked with prefetch_related to bound memory while avoiding N+1 queries.
662+
originals_ids = (
663+
Finding.objects.filter(
664+
id__in=scope_ids_subquery,
665+
original_finding__in=Finding.objects.exclude(id__in=scope_ids_subquery),
666+
)
667+
.distinct()
668+
.values_list("id", flat=True)
669+
.iterator(chunk_size=500)
670+
)
671+
672+
for chunk_ids in batched(originals_ids, 500):
673+
for original in Finding.objects.filter(id__in=chunk_ids).prefetch_related("original_finding"):
674+
# Inside-scope duplicates were already unlinked by the bulk UPDATE above,
675+
# so original_finding.all() now only contains outside-scope duplicates.
676+
reconfigure_duplicate_cluster(original, original.original_finding.all())
665677

666678

667679
@receiver(pre_delete, sender=Test)

dojo/utils.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@
6969
Languages,
7070
Notifications,
7171
Product,
72+
Product_Type,
7273
System_Settings,
7374
Test,
7475
Test_Type,
@@ -2094,11 +2095,12 @@ def async_delete_crawl_task(obj, model_list, **kwargs):
20942095
# Step 2: Prepare duplicate clusters (must happen before any deletion)
20952096
# When CASCADE_DELETE=True, reconfigure_duplicate_cluster skips deletion —
20962097
# we handle that below by expanding scope to include outside duplicates.
2097-
if isinstance(obj, Engagement):
2098-
prepare_duplicates_for_delete(engagement=obj)
2098+
if isinstance(obj, Product_Type):
2099+
prepare_duplicates_for_delete(product_type=obj)
20992100
elif isinstance(obj, Product):
2100-
for engagement in obj.engagement_set.all():
2101-
prepare_duplicates_for_delete(engagement=engagement)
2101+
prepare_duplicates_for_delete(product=obj)
2102+
elif isinstance(obj, Engagement):
2103+
prepare_duplicates_for_delete(engagement=obj)
21022104
elif isinstance(obj, Test):
21032105
prepare_duplicates_for_delete(test=obj)
21042106

unittests/test_prepare_duplicates_for_delete.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -224,20 +224,23 @@ def test_mixed_inside_and_outside_duplicates(self):
224224
self.assertIsNone(outside_dupe.duplicate_finding)
225225

226226
@override_settings(DUPLICATE_CLUSTER_CASCADE_DELETE=True)
227-
def test_cascade_delete_setting(self):
228-
"""When DUPLICATE_CLUSTER_CASCADE_DELETE=True, outside duplicates are deleted."""
227+
def test_cascade_delete_skips_outside_reconfigure(self):
228+
"""When DUPLICATE_CLUSTER_CASCADE_DELETE=True, outside duplicates are left untouched.
229+
230+
The caller (async_delete_crawl_task) handles deletion of outside-scope
231+
duplicates separately via bulk_delete_findings.
232+
"""
229233
original = self._create_finding(self.test1, "Original")
230234
outside_dupe = self._create_finding(self.test2, "Outside Dupe")
231235
self._make_duplicate(outside_dupe, original)
232-
outside_dupe_id = outside_dupe.id
233236

234237
with impersonate(self.testuser):
235238
prepare_duplicates_for_delete(test=self.test1)
236239

237-
self.assertFalse(
238-
Finding.objects.filter(id=outside_dupe_id).exists(),
239-
"Outside duplicate should be cascade-deleted",
240-
)
240+
outside_dupe.refresh_from_db()
241+
# Outside dupe is still a duplicate — not reconfigured or deleted
242+
self.assertTrue(outside_dupe.duplicate)
243+
self.assertEqual(outside_dupe.duplicate_finding_id, original.id)
241244

242245
def test_multiple_originals(self):
243246
"""Multiple originals in the same test each get their clusters handled."""

0 commit comments

Comments
 (0)