Skip to content

Commit ab18a94

Browse files
author
Valentijn Scholten
committed
prefetch better in dedupe command
1 parent edd8c04 commit ab18a94

3 files changed

Lines changed: 51 additions & 48 deletions

File tree

dojo/finding/deduplication.py

Lines changed: 31 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
import hyperlink
55
from django.conf import settings
6+
from django.db.models import Prefetch
67
from django.db.models.query_utils import Q
78

89
from dojo.celery import app
@@ -13,28 +14,22 @@
1314
deduplicationLogger = logging.getLogger("dojo.specific-loggers.deduplication")
1415

1516

16-
@dojo_model_to_id
17-
@dojo_async_task
18-
@app.task
19-
@dojo_model_from_id
20-
def do_dedupe_finding_task(new_finding, *args, **kwargs):
21-
return do_dedupe_finding(new_finding, *args, **kwargs)
17+
def get_finding_models_for_deduplication(finding_ids):
18+
"""
19+
Load findings with optimal prefetching for deduplication operations.
20+
This avoids N+1 queries when accessing test, engagement, product, endpoints, and original_finding.
2221
22+
Args:
23+
finding_ids: A list of Finding IDs
24+
25+
Returns:
26+
A list of Finding models with related objects prefetched
2327
24-
@dojo_async_task
25-
@app.task
26-
def do_dedupe_batch_task(finding_ids, *args, **kwargs):
27-
"""
28-
Async task to deduplicate a batch of findings. The findings are assumed to be in the same test.
29-
Similar to post_process_findings_batch but focused only on deduplication.
3028
"""
3129
if not finding_ids:
32-
return
33-
34-
from django.db.models import Prefetch # noqa: PLC0415
30+
return []
3531

36-
# Load findings with proper prefetching
37-
findings = list(
32+
return list(
3833
Finding.objects.filter(id__in=finding_ids)
3934
.select_related("test", "test__engagement", "test__engagement__product", "test__test_type")
4035
.prefetch_related(
@@ -47,6 +42,25 @@ def do_dedupe_batch_task(finding_ids, *args, **kwargs):
4742
),
4843
)
4944

45+
46+
@dojo_model_to_id
47+
@dojo_async_task
48+
@app.task
49+
@dojo_model_from_id
50+
def do_dedupe_finding_task(new_finding, *args, **kwargs):
51+
return do_dedupe_finding(new_finding, *args, **kwargs)
52+
53+
54+
@dojo_async_task
55+
@app.task
56+
def do_dedupe_batch_task(finding_ids, *args, **kwargs):
57+
"""
58+
Async task to deduplicate a batch of findings. The findings are assumed to be in the same test.
59+
Similar to post_process_findings_batch but focused only on deduplication.
60+
"""
61+
# Load findings with proper prefetching
62+
findings = get_finding_models_for_deduplication(finding_ids)
63+
5064
if not findings:
5165
logger.debug(f"no findings found for batch deduplication with IDs: {finding_ids}")
5266
return

dojo/finding/helper.py

Lines changed: 6 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
from time import strftime
44

55
from django.conf import settings
6-
from django.db.models import Prefetch
76
from django.db.models.query_utils import Q
87
from django.db.models.signals import post_delete, pre_delete
98
from django.db.utils import IntegrityError
@@ -18,7 +17,11 @@
1817
from dojo.decorators import dojo_async_task, dojo_model_from_id, dojo_model_to_id
1918
from dojo.endpoint.utils import save_endpoints_to_add
2019
from dojo.file_uploads.helper import delete_related_files
21-
from dojo.finding.deduplication import dedupe_batch_of_findings, do_dedupe_finding
20+
from dojo.finding.deduplication import (
21+
dedupe_batch_of_findings,
22+
do_dedupe_finding,
23+
get_finding_models_for_deduplication,
24+
)
2225
from dojo.models import (
2326
Endpoint,
2427
Endpoint_Status,
@@ -477,19 +480,7 @@ def post_process_findings_batch(finding_ids, *args, dedupe_option=True, rules_op
477480
system_settings = System_Settings.objects.get()
478481

479482
# use list() to force a complete query execution and related objects to be loaded once
480-
findings = list(
481-
Finding.objects.filter(id__in=finding_ids)
482-
.select_related("test", "test__engagement", "test__engagement__product", "test__test_type")
483-
.exclude(duplicate=True)
484-
.prefetch_related(
485-
"endpoints",
486-
# Prefetch duplicates of each new finding to avoid N+1 when set_duplicate iterates
487-
Prefetch(
488-
"original_finding",
489-
queryset=Finding.objects.only("id", "duplicate_finding_id").order_by("-id"),
490-
),
491-
),
492-
)
483+
findings = get_finding_models_for_deduplication(finding_ids)
493484

494485
if not findings:
495486
logger.debug(f"no findings found for batch deduplication with IDs: {finding_ids}")

dojo/management/commands/dedupe.py

Lines changed: 14 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
do_dedupe_batch_task,
99
do_dedupe_finding,
1010
do_dedupe_finding_task,
11+
get_finding_models_for_deduplication,
1112
)
1213
from dojo.models import Finding, Product
1314
from dojo.utils import (
@@ -71,13 +72,22 @@ def handle(self, *args, **options):
7172
findings = Finding.objects.all().filter(id__gt=0).exclude(duplicate=True)
7273
logger.info("######## Will process the full database with %d findings ########", findings.count())
7374

75+
# Prefetch related objects for synchronous deduplication
76+
findings = findings.select_related(
77+
"test", "test__engagement", "test__engagement__product", "test__test_type",
78+
).prefetch_related(
79+
"endpoints",
80+
Prefetch(
81+
"original_finding",
82+
queryset=Finding.objects.only("id", "duplicate_finding_id").order_by("-id"),
83+
),
84+
)
85+
7486
# Phase 1: update hash_codes without deduplicating
7587
if not dedupe_only:
7688
logger.info("######## Start Updating Hashcodes (foreground) ########")
7789

78-
# only prefetch here for hash_code calculation
79-
finds = findings.prefetch_related("endpoints", "test__test_type")
80-
mass_model_updater(Finding, finds, generate_hash_code, fields=["hash_code"], order="asc", log_prefix="hash_code computation ")
90+
mass_model_updater(Finding, findings, generate_hash_code, fields=["hash_code"], order="asc", log_prefix="hash_code computation ")
8191

8292
logger.info("######## Done Updating Hashcodes########")
8393

@@ -127,19 +137,7 @@ def _dedupe_batch_mode(self, findings_queryset, *, dedupe_sync: bool = True):
127137
if test_finding_ids:
128138
if dedupe_sync:
129139
# Synchronous: load findings and process immediately
130-
test_findings = list(
131-
findings_queryset.filter(test_id=test_id)
132-
.exclude(duplicate=True)
133-
.select_related("test", "test__engagement", "test__engagement__product", "test__test_type")
134-
.prefetch_related(
135-
"endpoints",
136-
# Prefetch duplicates of each finding to avoid N+1 when set_duplicate iterates
137-
Prefetch(
138-
"original_finding",
139-
queryset=Finding.objects.only("id", "duplicate_finding_id").order_by("-id"),
140-
),
141-
),
142-
)
140+
test_findings = get_finding_models_for_deduplication(test_finding_ids)
143141
logger.debug(f"Deduplicating batch of {len(test_findings)} findings for test {test_id}")
144142
dedupe_batch_of_findings(test_findings)
145143
else:

0 commit comments

Comments
 (0)