Improve prefetching in dedupe command

Valentijn Scholten · Valentijn Scholten · commit ab18a9432d58 · 2025-11-09T10:49:32.000+01:00
diff --git a/dojo/finding/deduplication.py b/dojo/finding/deduplication.py
@@ -3,6 +3,7 @@
 
 import hyperlink
 from django.conf import settings
+from django.db.models import Prefetch
 from django.db.models.query_utils import Q
 
 from dojo.celery import app
@@ -13,28 +14,22 @@
 deduplicationLogger = logging.getLogger("dojo.specific-loggers.deduplication")
 
 
-@dojo_model_to_id
-@dojo_async_task
-@app.task
-@dojo_model_from_id
-def do_dedupe_finding_task(new_finding, *args, **kwargs):
-    return do_dedupe_finding(new_finding, *args, **kwargs)
+def get_finding_models_for_deduplication(finding_ids):
+    """
+    Load findings with optimal prefetching for deduplication operations.
+    This avoids N+1 queries when accessing test, engagement, product, endpoints, and original_finding.
 
+    Args:
+        finding_ids: A list of Finding IDs
+
+    Returns:
+        A list of Finding models with related objects prefetched
 
-@dojo_async_task
-@app.task
-def do_dedupe_batch_task(finding_ids, *args, **kwargs):
-    """
-    Async task to deduplicate a batch of findings. The findings are assumed to be in the same test.
-    Similar to post_process_findings_batch but focused only on deduplication.
     """
     if not finding_ids:
-        return
-
-    from django.db.models import Prefetch  # noqa: PLC0415
+        return []
 
-    # Load findings with proper prefetching
-    findings = list(
+    return list(
         Finding.objects.filter(id__in=finding_ids)
         .select_related("test", "test__engagement", "test__engagement__product", "test__test_type")
         .prefetch_related(
@@ -47,6 +42,25 @@ def do_dedupe_batch_task(finding_ids, *args, **kwargs):
         ),
     )
 
+
+@dojo_model_to_id
+@dojo_async_task
+@app.task
+@dojo_model_from_id
+def do_dedupe_finding_task(new_finding, *args, **kwargs):
+    return do_dedupe_finding(new_finding, *args, **kwargs)
+
+
+@dojo_async_task
+@app.task
+def do_dedupe_batch_task(finding_ids, *args, **kwargs):
+    """
+    Async task to deduplicate a batch of findings. The findings are assumed to be in the same test.
+    Similar to post_process_findings_batch but focused only on deduplication.
+    """
+    # Load findings with proper prefetching
+    findings = get_finding_models_for_deduplication(finding_ids)
+
     if not findings:
         logger.debug(f"no findings found for batch deduplication with IDs: {finding_ids}")
         return
diff --git a/dojo/finding/helper.py b/dojo/finding/helper.py
@@ -3,7 +3,6 @@
 from time import strftime
 
 from django.conf import settings
-from django.db.models import Prefetch
 from django.db.models.query_utils import Q
 from django.db.models.signals import post_delete, pre_delete
 from django.db.utils import IntegrityError
@@ -18,7 +17,11 @@
 from dojo.decorators import dojo_async_task, dojo_model_from_id, dojo_model_to_id
 from dojo.endpoint.utils import save_endpoints_to_add
 from dojo.file_uploads.helper import delete_related_files
-from dojo.finding.deduplication import dedupe_batch_of_findings, do_dedupe_finding
+from dojo.finding.deduplication import (
+    dedupe_batch_of_findings,
+    do_dedupe_finding,
+    get_finding_models_for_deduplication,
+)
 from dojo.models import (
     Endpoint,
     Endpoint_Status,
@@ -477,19 +480,7 @@ def post_process_findings_batch(finding_ids, *args, dedupe_option=True, rules_op
     system_settings = System_Settings.objects.get()
 
     # use list() to force a complete query execution and related objects to be loaded once
-    findings = list(
-        Finding.objects.filter(id__in=finding_ids)
-        .select_related("test", "test__engagement", "test__engagement__product", "test__test_type")
-        .exclude(duplicate=True)
-        .prefetch_related(
-            "endpoints",
-            # Prefetch duplicates of each new finding to avoid N+1 when set_duplicate iterates
-            Prefetch(
-                "original_finding",
-                queryset=Finding.objects.only("id", "duplicate_finding_id").order_by("-id"),
-            ),
-        ),
-    )
+    findings = get_finding_models_for_deduplication(finding_ids)
 
     if not findings:
         logger.debug(f"no findings found for batch deduplication with IDs: {finding_ids}")
diff --git a/dojo/management/commands/dedupe.py b/dojo/management/commands/dedupe.py
@@ -8,6 +8,7 @@
     do_dedupe_batch_task,
     do_dedupe_finding,
     do_dedupe_finding_task,
+    get_finding_models_for_deduplication,
 )
 from dojo.models import Finding, Product
 from dojo.utils import (
@@ -71,13 +72,22 @@ def handle(self, *args, **options):
             findings = Finding.objects.all().filter(id__gt=0).exclude(duplicate=True)
             logger.info("######## Will process the full database with %d findings ########", findings.count())
 
+        # Prefetch related objects for hash_code computation and synchronous deduplication
+        findings = findings.select_related(
+            "test", "test__engagement", "test__engagement__product", "test__test_type",
+        ).prefetch_related(
+            "endpoints",
+            Prefetch(
+                "original_finding",
+                queryset=Finding.objects.only("id", "duplicate_finding_id").order_by("-id"),
+            ),
+        )
+
         # Phase 1: update hash_codes without deduplicating
         if not dedupe_only:
             logger.info("######## Start Updating Hashcodes (foreground) ########")
 
-            # only prefetch here for hash_code calculation
-            finds = findings.prefetch_related("endpoints", "test__test_type")
-            mass_model_updater(Finding, finds, generate_hash_code, fields=["hash_code"], order="asc", log_prefix="hash_code computation ")
+            mass_model_updater(Finding, findings, generate_hash_code, fields=["hash_code"], order="asc", log_prefix="hash_code computation ")
 
             logger.info("######## Done Updating Hashcodes########")
 
@@ -127,19 +137,7 @@ def _dedupe_batch_mode(self, findings_queryset, *, dedupe_sync: bool = True):
             if test_finding_ids:
                 if dedupe_sync:
                     # Synchronous: load findings and process immediately
-                    test_findings = list(
-                        findings_queryset.filter(test_id=test_id)
-                        .exclude(duplicate=True)
-                        .select_related("test", "test__engagement", "test__engagement__product", "test__test_type")
-                        .prefetch_related(
-                            "endpoints",
-                            # Prefetch duplicates of each finding to avoid N+1 when set_duplicate iterates
-                            Prefetch(
-                                "original_finding",
-                                queryset=Finding.objects.only("id", "duplicate_finding_id").order_by("-id"),
-                            ),
-                        ),
-                    )
+                    test_findings = get_finding_models_for_deduplication(test_finding_ids)
                     logger.debug(f"Deduplicating batch of {len(test_findings)} findings for test {test_id}")
                     dedupe_batch_of_findings(test_findings)
                 else: