Skip to content

Commit 70031e2

Browse files
optimize prefetching
1 parent a2f4b20 commit 70031e2

2 files changed

Lines changed: 29 additions & 21 deletions

File tree

dojo/finding/deduplication.py

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -90,9 +90,12 @@ def set_duplicate(new_finding, existing_finding):
9090
new_finding.duplicate_finding = existing_finding
9191

9292
# Make sure transitive duplication is flattened
93-
# if A -> B and B is made a duplicate of C here, aferwards:
93+
# if A -> B and B is made a duplicate of C here, afterwards:
9494
# A -> C and B -> C should be true
95-
for find in new_finding.original_finding.all().order_by("-id"):
95+
# Ordering is ensured by the prefetch in post_process_findings_batch
96+
# (we prefetch "original_finding" ordered by -id), so avoid calling
97+
# order_by here to prevent bypassing the prefetch cache.
98+
for find in new_finding.original_finding.all():
9699
new_finding.original_finding.remove(find)
97100
set_duplicate(find, existing_finding)
98101
existing_finding.found_by.add(new_finding.test.test_type)
@@ -181,10 +184,14 @@ def build_dedupe_scope_queryset(test):
181184
| Q(test__engagement__deduplication_on_engagement=False)
182185
)
183186

184-
return Finding.objects.filter(scope_q)
187+
return (
188+
Finding.objects.filter(scope_q)
189+
.select_related("test", "test__engagement", "test__test_type")
190+
.prefetch_related("endpoints")
191+
)
185192

186193

187-
def find_candidates_for_deduplication_hash(test, findings, *, include_product_scope_filter):
194+
def find_candidates_for_deduplication_hash(test, findings):
188195
base_queryset = build_dedupe_scope_queryset(test)
189196
hash_codes = {f.hash_code for f in findings if getattr(f, "hash_code", None) is not None}
190197
if not hash_codes:
@@ -202,7 +209,7 @@ def find_candidates_for_deduplication_hash(test, findings, *, include_product_sc
202209
return existing_by_hash
203210

204211

205-
def find_candidates_for_deduplication_unique_id(test, findings, *, include_product_scope_filter):
212+
def find_candidates_for_deduplication_unique_id(test, findings):
206213
base_queryset = build_dedupe_scope_queryset(test)
207214
unique_ids = {f.unique_id_from_tool for f in findings if getattr(f, "unique_id_from_tool", None) is not None}
208215
if not unique_ids:
@@ -250,7 +257,7 @@ def deduplicate_uid_or_hash_code_old(new_finding):
250257
continue
251258

252259

253-
def find_candidates_for_deduplication_uid_or_hash(test, findings, *, include_product_scope_filter):
260+
def find_candidates_for_deduplication_uid_or_hash(test, findings):
254261
base_queryset = build_dedupe_scope_queryset(test)
255262
hash_codes = {f.hash_code for f in findings if getattr(f, "hash_code", None) is not None}
256263
unique_ids = {f.unique_id_from_tool for f in findings if getattr(f, "unique_id_from_tool", None) is not None}
@@ -279,22 +286,15 @@ def find_candidates_for_deduplication_uid_or_hash(test, findings, *, include_pro
279286
return existing_by_uid, existing_by_hash
280287

281288

282-
def find_candidates_for_deduplication_legacy(test, findings, *, include_product_scope_filter):
289+
def find_candidates_for_deduplication_legacy(test, findings):
283290
base_queryset = build_dedupe_scope_queryset(test)
284291
titles = {f.title for f in findings if getattr(f, "title", None)}
285292
cwes = {f.cwe for f in findings if getattr(f, "cwe", 0)}
286293
cwes.discard(0)
287294
if not titles and not cwes:
288295
return {}, {}
289296

290-
existing_qs = base_queryset.filter(Q(title__in=titles) | Q(cwe__in=cwes)).exclude(duplicate=True).prefetch_related(
291-
"endpoints",
292-
"test",
293-
"test__engagement",
294-
"found_by",
295-
"original_finding",
296-
"test__test_type",
297-
).order_by("id")
297+
existing_qs = base_queryset.filter(Q(title__in=titles) | Q(cwe__in=cwes)).exclude(duplicate=True).order_by("id")
298298

299299
by_title = {}
300300
by_cwe = {}
@@ -436,7 +436,7 @@ def _dedupe_batch_hash_code(findings):
436436
if not findings:
437437
return
438438
test = findings[0].test
439-
candidates_by_hash = find_candidates_for_deduplication_hash(test, findings, include_product_scope_filter=True)
439+
candidates_by_hash = find_candidates_for_deduplication_hash(test, findings)
440440
if not candidates_by_hash:
441441
return
442442
for new_finding in findings:
@@ -453,7 +453,7 @@ def _dedupe_batch_unique_id(findings):
453453
if not findings:
454454
return
455455
test = findings[0].test
456-
candidates_by_uid = find_candidates_for_deduplication_unique_id(test, findings, include_product_scope_filter=True)
456+
candidates_by_uid = find_candidates_for_deduplication_unique_id(test, findings)
457457
if not candidates_by_uid:
458458
return
459459
for new_finding in findings:
@@ -471,7 +471,7 @@ def _dedupe_batch_uid_or_hash(findings):
471471
return
472472

473473
test = findings[0].test
474-
candidates_by_uid, existing_by_hash = find_candidates_for_deduplication_uid_or_hash(test, findings, include_product_scope_filter=True)
474+
candidates_by_uid, existing_by_hash = find_candidates_for_deduplication_uid_or_hash(test, findings)
475475
if not (candidates_by_uid or existing_by_hash):
476476
return
477477
for new_finding in findings:
@@ -492,7 +492,7 @@ def _dedupe_batch_legacy(findings):
492492
if not findings:
493493
return
494494
test = findings[0].test
495-
candidates_by_title, candidates_by_cwe = find_candidates_for_deduplication_legacy(test, findings, include_product_scope_filter=True)
495+
candidates_by_title, candidates_by_cwe = find_candidates_for_deduplication_legacy(test, findings)
496496
if not (candidates_by_title or candidates_by_cwe):
497497
return
498498
for new_finding in findings:

dojo/finding/helper.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from time import strftime
44

55
from django.conf import settings
6+
from django.db.models import Prefetch
67
from django.db.models.query_utils import Q
78
from django.db.models.signals import post_delete, pre_delete
89
from django.db.utils import IntegrityError
@@ -478,8 +479,15 @@ def post_process_findings_batch(finding_ids, *args, dedupe_option=True, rules_op
478479
# use list() to force a complete query execution and related objects to be loaded once
479480
findings = list(
480481
Finding.objects.filter(id__in=finding_ids)
481-
.select_related("test", "test__engagement", "test__engagement__product")
482-
.prefetch_related("endpoints"),
482+
.select_related("test", "test__engagement", "test__engagement__product", "test__test_type")
483+
.prefetch_related(
484+
"endpoints",
485+
# Prefetch duplicates of each new finding to avoid N+1 when set_duplicate iterates
486+
Prefetch(
487+
"original_finding",
488+
queryset=Finding.objects.only("id", "duplicate_finding_id").order_by("-id"),
489+
),
490+
),
483491
)
484492

485493
if not findings:

0 commit comments

Comments (0)