
Commit a7c6646

perf: batch duplicate marking in batch deduplication (#14458)
* perf: batch duplicate marking in batch deduplication

  Instead of saving each duplicate finding individually, collect all modified findings during a batch deduplication run and flush them in a single bulk_update call. Original (existing) findings are still saved individually to preserve auto_now timestamp updates and post_save signal behavior, but are deduplicated by id so each is saved at most once per batch.

  Reduces DB writes from O(2N) individual saves to 1 bulk_update + O(unique originals) saves for a batch of N duplicates. Performance test shows -23 queries on a second import with duplicates.

* perf: restrict SELECT columns for batch deduplication via only()

  Add Finding.DEDUPLICATION_FIELDS — the union of all Finding fields needed across every deduplication algorithm — and apply it as an only() clause in get_finding_models_for_deduplication. This avoids loading large text columns (description, mitigation, impact, references, steps_to_reproduce, severity_justification, etc.) when loading findings for the batch deduplication task, reducing data transferred from the database without affecting query count.

  build_candidate_scope_queryset is intentionally excluded: it is also used for reimport matching (which accesses severity, numerical_severity and other fields outside this set) and applying only() there would cause deferred-field extra queries.

* perf(dedup): defer large text fields on candidate queryset

  - Add Finding.DEDUPLICATION_DEFERRED_FIELDS constant listing large text columns (description, mitigation, impact, references, etc.) that are never read during deduplication or candidate matching.
  - Apply .defer(*Finding.DEDUPLICATION_DEFERRED_FIELDS) in build_candidate_scope_queryset to avoid loading those columns for the potentially large candidate pool fetched per dedup batch.

  Reduces deduplication second-import query count from 213 to 183 (-30).

---------

Co-authored-by: Matt Tesauro <mtesauro@gmail.com>
1 parent b569b8c commit a7c6646
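
Editor's note: the first commit's write-batching pattern can be illustrated in isolation. A minimal sketch, assuming a hypothetical Item model as a stand-in for Finding (not DefectDojo's actual model):

    from django.db import models

    class Item(models.Model):
        # Hypothetical stand-in for Finding with the fields set_duplicate touches.
        duplicate = models.BooleanField(default=False)
        active = models.BooleanField(default=True)

    def mark_duplicates_before(duplicates):
        # Old pattern: one UPDATE per modified row, O(N) round-trips.
        for item in duplicates:
            item.duplicate = True
            item.active = False
            item.save()

    def mark_duplicates_after(duplicates):
        # New pattern: mutate in memory, then flush all rows in one query.
        for item in duplicates:
            item.duplicate = True
            item.active = False
        Item.objects.bulk_update(duplicates, ["duplicate", "active"])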

3 files changed: 125 additions & 14 deletions

dojo/finding/deduplication.py

Lines changed: 72 additions & 10 deletions
@@ -32,6 +32,7 @@ def get_finding_models_for_deduplication(finding_ids):
 
     return list(
         Finding.objects.filter(id__in=finding_ids)
+        .only(*Finding.DEDUPLICATION_FIELDS)
         .select_related("test", "test__engagement", "test__engagement__product", "test__test_type")
         .prefetch_related(
             "endpoints",
@@ -112,7 +113,37 @@ def deduplicate_uid_or_hash_code(new_finding):
     _dedupe_batch_uid_or_hash([new_finding])
 
 
-def set_duplicate(new_finding, existing_finding):
+def set_duplicate(new_finding, existing_finding, *, save=True):
+    """
+    Mark new_finding as a duplicate of existing_finding.
+
+    Sets duplicate=True, active=False, verified=False, and duplicate_finding=existing_finding
+    on new_finding, then flattens any transitive duplicates: if any findings already point to
+    new_finding as their original, they are re-pointed directly to existing_finding (so the
+    duplicate chain never has more than one level of indirection).
+
+    The test_type of new_finding is added to existing_finding.found_by if not already present.
+
+    Args:
+        new_finding: The finding to mark as a duplicate.
+        existing_finding: The original finding that new_finding is a duplicate of.
+            Must not itself be a duplicate.
+        save: When True (default), each modified finding and existing_finding are
+            saved to the database immediately via super().save(skip_validation=True).
+            Pass save=False in batch contexts to defer persistence; the caller is
+            then responsible for bulk-saving the returned list and existing_finding.
+
+    Returns:
+        A list of all Finding instances whose fields were modified by this call, including
+        new_finding itself and any transitively re-pointed findings. The caller must persist
+        these when save=False.
+
+    Raises:
+        Exception: if existing_finding is itself a duplicate, if new_finding == existing_finding,
+            if marking would reopen a mitigated finding via a duplicate chain, or if
+            new_finding is already a duplicate and existing_finding is mitigated.
+
+    """
     deduplicationLogger.debug(f"new_finding.status(): {new_finding.id} {new_finding.status()}")
     deduplicationLogger.debug(f"existing_finding.status(): {existing_finding.id} {existing_finding.status()}")
     if existing_finding.duplicate:
@@ -135,6 +166,8 @@ def set_duplicate(new_finding, existing_finding):
     new_finding.verified = False
     new_finding.duplicate_finding = existing_finding
 
+    all_modified = [new_finding]
+
     # Make sure transitive duplication is flattened
     # if A -> B and B is made a duplicate of C here, afterwards:
     # A -> C and B -> C should be true
@@ -143,7 +176,7 @@ def set_duplicate(new_finding, existing_finding):
     # order_by here to prevent bypassing the prefetch cache.
     for find in new_finding.original_finding.all():
         new_finding.original_finding.remove(find)
-        set_duplicate(find, existing_finding)
+        all_modified.extend(set_duplicate(find, existing_finding, save=save))
     # Only add test type to found_by if it is not already present.
     # This is efficient because `found_by` is prefetched for candidates via `build_dedupe_scope_queryset()`.
     test_type = getattr(getattr(new_finding, "test", None), "test_type", None)
@@ -152,10 +185,14 @@ def set_duplicate(new_finding, existing_finding):
 
     # existing_finding.found_by.add(new_finding.test.test_type)
 
-    logger.debug("saving new finding: %d", new_finding.id)
-    super(Finding, new_finding).save(skip_validation=True)
-    logger.debug("saving existing finding: %d", existing_finding.id)
-    super(Finding, existing_finding).save(skip_validation=True)
+    if save:
+        for f in all_modified:
+            logger.debug("saving new finding: %d", f.id)
+            super(Finding, f).save(skip_validation=True)
+        logger.debug("saving existing finding: %d", existing_finding.id)
+        super(Finding, existing_finding).save(skip_validation=True)
+
+    return all_modified
 
 
 def is_duplicate_reopen(new_finding, existing_finding) -> bool:
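
The save=False contract above is what the batch helpers later in this diff rely on: collect the returned instances, then persist them once. A condensed sketch of a caller honoring that contract, where candidates_for is a hypothetical placeholder for the per-algorithm matching helpers:

    modified = []
    for new_finding in findings:
        for match in candidates_for(new_finding):  # hypothetical helper
            try:
                # Defer persistence; collect every finding the call mutated.
                modified.extend(set_duplicate(new_finding, match, save=False))
                break
            except Exception as e:
                deduplicationLogger.debug(str(e))

    # One round-trip for all duplicates; originals still need individual saves.
    Finding.objects.bulk_update(
        modified, ["duplicate", "active", "verified", "duplicate_finding"],
    )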
@@ -311,6 +348,7 @@ def build_candidate_scope_queryset(test, mode="deduplication", service=None):
 
     return (
         queryset
+        .defer(*Finding.DEDUPLICATION_DEFERRED_FIELDS)
         .select_related("test", "test__engagement", "test__test_type")
         .prefetch_related(*prefetch_list)
     )
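
defer() is the complement of only(): it names columns to exclude rather than columns to include, so fields added to the model later keep loading eagerly by default. That makes it the safer choice for this queryset, which reimport matching also consumes. Roughly:

    # Everything except the named large text columns:
    Finding.objects.defer(*Finding.DEDUPLICATION_DEFERRED_FIELDS)

    # Nothing except the named columns (plus the primary key):
    Finding.objects.only(*Finding.DEDUPLICATION_FIELDS)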
@@ -654,21 +692,39 @@ def get_matches_from_legacy_candidates(new_finding, candidates_by_title, candidates_by_cwe):
         yield candidate
 
 
+def _flush_duplicate_changes(modified_new_findings):
+    """
+    Persist duplicate field changes collected during a batch deduplication run.
+
+    Bulk-updates all modified new findings in one round-trip instead of one
+    save() call per finding. Uses bulk_update (no signals) which is consistent
+    with the original code that called super(Finding, ...).save(skip_validation=True),
+    bypassing Finding.save() in both cases.
+    """
+    if modified_new_findings:
+        Finding.objects.bulk_update(
+            modified_new_findings,
+            ["duplicate", "active", "verified", "duplicate_finding"],
+        )
+
+
 def _dedupe_batch_hash_code(findings):
     if not findings:
         return
     test = findings[0].test
     candidates_by_hash = find_candidates_for_deduplication_hash(test, findings)
     if not candidates_by_hash:
         return
+    modified_new_findings = []
     for new_finding in findings:
         deduplicationLogger.debug(f"deduplication start for finding {new_finding.id} with DEDUPE_ALGO_HASH_CODE")
         for match in get_matches_from_hash_candidates(new_finding, candidates_by_hash):
             try:
-                set_duplicate(new_finding, match)
+                modified_new_findings.extend(set_duplicate(new_finding, match, save=False))
                 break
             except Exception as e:
                 deduplicationLogger.debug(str(e))
+    _flush_duplicate_changes(modified_new_findings)
 
 
 def _dedupe_batch_unique_id(findings):
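
_flush_duplicate_changes leans on two documented properties of Django's bulk_update(): it updates the listed columns in a single query per batch, and it neither calls each model's save() nor sends pre_save/post_save signals, so auto_now timestamps are not refreshed. That is why the original findings are still saved one by one, as the commit message notes. A small illustration, assuming Finding carries an auto_now updated field as the commit message implies:

    f.duplicate = True
    super(Finding, f).save(skip_validation=True)  # signals fire, auto_now refreshed

    Finding.objects.bulk_update([f], ["duplicate"])
    # single UPDATE of the listed columns only: no signals, timestamps untouched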
@@ -678,16 +734,18 @@ def _dedupe_batch_unique_id(findings):
     candidates_by_uid = find_candidates_for_deduplication_unique_id(test, findings)
     if not candidates_by_uid:
         return
+    modified_new_findings = []
     for new_finding in findings:
         deduplicationLogger.debug(f"deduplication start for finding {new_finding.id} with DEDUPE_ALGO_UNIQUE_ID_FROM_TOOL")
         for match in get_matches_from_unique_id_candidates(new_finding, candidates_by_uid):
             deduplicationLogger.debug(f"Trying to deduplicate finding {new_finding.id} against candidate {match.id}")
             try:
-                set_duplicate(new_finding, match)
+                modified_new_findings.extend(set_duplicate(new_finding, match, save=False))
                 deduplicationLogger.debug(f"Successfully deduplicated finding {new_finding.id} against candidate {match.id}")
                 break
             except Exception as e:
                 deduplicationLogger.debug(f"Exception when deduplicating finding {new_finding.id} against candidate {match.id}: {e!s}")
+    _flush_duplicate_changes(modified_new_findings)
 
 
 def _dedupe_batch_uid_or_hash(findings):
@@ -698,17 +756,19 @@ def _dedupe_batch_uid_or_hash(findings):
     candidates_by_uid, existing_by_hash = find_candidates_for_deduplication_uid_or_hash(test, findings)
     if not (candidates_by_uid or existing_by_hash):
         return
+    modified_new_findings = []
     for new_finding in findings:
         deduplicationLogger.debug(f"deduplication start for finding {new_finding.id} with DEDUPE_ALGO_UNIQUE_ID_FROM_TOOL_OR_HASH_CODE")
         if new_finding.duplicate:
             continue
 
         for match in get_matches_from_uid_or_hash_candidates(new_finding, candidates_by_uid, existing_by_hash):
             try:
-                set_duplicate(new_finding, match)
+                modified_new_findings.extend(set_duplicate(new_finding, match, save=False))
                 break
             except Exception as e:
                 deduplicationLogger.debug(str(e))
+    _flush_duplicate_changes(modified_new_findings)
 
 
 def _dedupe_batch_legacy(findings):
@@ -718,14 +778,16 @@ def _dedupe_batch_legacy(findings):
     candidates_by_title, candidates_by_cwe = find_candidates_for_deduplication_legacy(test, findings)
     if not (candidates_by_title or candidates_by_cwe):
         return
+    modified_new_findings = []
     for new_finding in findings:
         deduplicationLogger.debug(f"deduplication start for finding {new_finding.id} with DEDUPE_ALGO_LEGACY")
         for match in get_matches_from_legacy_candidates(new_finding, candidates_by_title, candidates_by_cwe):
             try:
-                set_duplicate(new_finding, match)
+                modified_new_findings.extend(set_duplicate(new_finding, match, save=False))
                 break
             except Exception as e:
                 deduplicationLogger.debug(str(e))
+    _flush_duplicate_changes(modified_new_findings)
 
 
 def dedupe_batch_of_findings(findings, *args, **kwargs):

dojo/models.py

Lines changed: 49 additions & 0 deletions
@@ -2381,6 +2381,55 @@ def __str__(self):
 
 
 class Finding(BaseModel):
+    # Fields loaded when performing deduplication (used by get_finding_models_for_deduplication
+    # and build_candidate_scope_queryset to restrict the SELECT to only what is needed).
+    # Covers the union of all deduplication algorithms so that a single queryset works
+    # regardless of which algorithm is in use. Large text fields (description, mitigation,
+    # impact, references, …) are intentionally excluded.
+    DEDUPLICATION_FIELDS = [
+        "id",
+        # FK required for select_related("test") — must not be deferred
+        "test",
+        # Fields written by set_duplicate
+        "duplicate",
+        "active",
+        "verified",
+        "duplicate_finding",
+        # Guard checks in set_duplicate
+        "is_mitigated",
+        "mitigated",
+        "out_of_scope",
+        "false_p",
+        # Accessed by status() (debug logging only)
+        "under_review",
+        "risk_accepted",
+        # Used by hash-code and legacy algorithms for endpoint/location matching
+        "dynamic_finding",
+        "static_finding",
+        # Algorithm-specific matching fields
+        "hash_code",  # hash_code, uid_or_hash, legacy
+        "unique_id_from_tool",  # unique_id, uid_or_hash
+        "title",  # legacy
+        "cwe",  # legacy
+        "file_path",  # legacy
+        "line",  # legacy
+    ]
+
+    # Large text fields deferred in build_candidate_scope_queryset. These are
+    # never accessed during deduplication or reimport candidate matching, so
+    # excluding them reduces the data loaded for every candidate finding.
+    DEDUPLICATION_DEFERRED_FIELDS = [
+        "description",
+        "mitigation",
+        "impact",
+        "steps_to_reproduce",
+        "severity_justification",
+        "references",
+        "url",
+        "cvssv3",
+        "cvssv4",
+    ]
+
     title = models.CharField(max_length=511,
                              verbose_name=_("Title"),
                              help_text=_("A short description of the flaw."))
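
Because only() and defer() take field names as strings, a typo or a future field rename in either constant would only surface at query time. A hypothetical guard test, not part of this commit, could pin the lists to real model fields:

    from django.core.exceptions import FieldDoesNotExist

    def test_deduplication_field_lists_match_model():
        names = Finding.DEDUPLICATION_FIELDS + Finding.DEDUPLICATION_DEFERRED_FIELDS
        for name in names:
            try:
                Finding._meta.get_field(name)  # raises if the name is not a field
            except FieldDoesNotExist as exc:
                raise AssertionError(f"unknown field in dedup constants: {name}") from exc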

unittests/test_importers_performance.py

Lines changed: 4 additions & 4 deletions
@@ -261,7 +261,7 @@ def _import_reimport_performance(self, expected_num_queries1, expected_num_async
     @override_settings(ENABLE_AUDITLOG=True)
     def test_import_reimport_reimport_performance_pghistory_async(self):
         """
-        This test checks the performance of the importers when using django-pghistory with async enabled.
+        This test checks the performance of the importers when using django-pghistory and celery tasks in sync mode
         Query counts will need to be determined by running the test initially.
         """
         configure_audit_system()
@@ -279,7 +279,7 @@ def test_import_reimport_reimport_performance_pghistory_async(self):
     @override_settings(ENABLE_AUDITLOG=True)
     def test_import_reimport_reimport_performance_pghistory_no_async(self):
         """
-        This test checks the performance of the importers when using django-pghistory with async disabled.
+        This test checks the performance of the importers when using django-pghistory and celery tasks in sync mode.
         Query counts will need to be determined by running the test initially.
         """
         configure_audit_system()
@@ -445,7 +445,7 @@ def test_deduplication_performance_pghistory_async(self):
 
     @override_settings(ENABLE_AUDITLOG=True)
     def test_deduplication_performance_pghistory_no_async(self):
-        """Test deduplication performance with django-pghistory and async tasks disabled."""
+        """Test deduplication performance with django-pghistory and celery tasks in sync mode."""
         configure_audit_system()
         configure_pghistory_triggers()
 
@@ -459,6 +459,6 @@ def test_deduplication_performance_pghistory_no_async(self):
         self._deduplication_performance(
             expected_num_queries1=271,
             expected_num_async_tasks1=7,
-            expected_num_queries2=236,
+            expected_num_queries2=183,
             expected_num_async_tasks2=7,
         )
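
The docstrings note that expected query counts "will need to be determined by running the test initially." One common way to obtain a number such as 183 is to capture the queries around the operation and read off the count; a sketch, with run_second_import as a hypothetical stand-in for the second import step:

    from django.db import connection
    from django.test.utils import CaptureQueriesContext

    with CaptureQueriesContext(connection) as ctx:
        run_second_import()  # hypothetical stand-in for the reimport under test

    # Run once, inspect, then pin the value as expected_num_queries2.
    print(len(ctx.captured_queries))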
