
Commit 698ece0

valentijnscholten (Valentijn Scholten) authored
reimport: match findings in batches (#13889)

* fix logger NoneType
* reimport: use batch matching
* tests: always show all mismatched counts
* reimport: add test for internal duplicates during matching
* reimport: fix handling of duplicates inside the same report/batch
* add id
* reimport: optimize vulnerability_id processing
* vulnerability_id processing: remove duplicate delete, fix bug, add tests
* vulnerability_id processing: separate import/reimport
* reimport: add management command to reimport sample scans
* reimport: prefetch endpoint status
* reimport: bulk update endpoint statuses
* add script to update performance test counts easily
* add script to update performance test counts easily
* 8a57c38
* add upgrade notes
* add upgrade notes
* remove obsolete method
* remove obsolete method
* reimport: extract method to get candidates
* reimport: remove fallback to non-batch
* reimport: prep for Pro overrides
* add comment
* add comment
* update counts and script
* update counts and script

Co-authored-by: Valentijn Scholten <valentijn.scholten@iodigital.com>

1 parent a1478fb commit 698ece0

17 files changed

Lines changed: 2514 additions & 317 deletions

docs/content/en/open_source/upgrading/2.54.md

Lines changed: 15 additions & 3 deletions
@@ -1,8 +1,8 @@
 ---
 title: 'Upgrading to DefectDojo Version 2.54.x'
 toc_hide: true
-weight: -20251201
-description: Removal of django-auditlog and exclusive use of django-pghistory for audit logging & Dropped support for DD_PARSER_EXCLUDE
+weight: -20250804
+description: Removal of django-auditlog & Dropped support for DD_PARSER_EXCLUDE & Reimport performance improvements
 ---
 
 ## Breaking Change: Removal of django-auditlog
@@ -44,4 +44,16 @@ The backfill migration is not mandatory to succeed. If it fails for some reason,
 To simplify the management of the DefectDojo application, parser exclusions are no longer controlled via the environment variable DD_PARSER_EXCLUDE or application settings. This variable is now unsupported.
 From now on, you should use the active flag in the Test_Type model to enable or disable parsers. Only parsers associated with active Test_Type entries will be available for use.
 
-Check the [Release Notes](https://github.com/DefectDojo/django-DefectDojo/releases/tag/2.54.0) for the contents of the release.
+## Import/reimport performance improvements
+
+DefectDojo 2.54.x includes performance improvements for reimporting scan results, especially for large scans:
+
+- **Faster reimports** due to fewer database queries and more bulk operations.
+- **Reduced database load** during reimport matching and post-processing (helps avoid slowdowns/timeouts under heavy scan volume).
+- **More efficient endpoint status updates** during reimport of dynamic findings.
+- **Less churn when updating vulnerability IDs**, avoiding unnecessary deletes/writes when nothing changed.
+
+No action is required after upgrading. (Optional tuning knobs exist via `DD_IMPORT_REIMPORT_MATCH_BATCH_SIZE` and `DD_IMPORT_REIMPORT_DEDUPE_BATCH_SIZE`.)
+
+There are other instructions for upgrading to 2.54.x. Check the Release Notes for the contents of the release: `https://github.com/DefectDojo/django-DefectDojo/releases/tag/2.54.0`
+Check the [Release Notes](https://github.com/DefectDojo/django-DefectDojo/releases/tag/2.54.0) for the contents of the release.
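The "fewer database queries" claim above comes from matching findings in batches rather than one at a time. A minimal pure-Python sketch of the idea (hypothetical names, not DefectDojo's actual API): instead of one candidate-lookup query per finding, the hashes of a whole batch are sent in a single `IN (...)`-style query.

```python
# Illustrative sketch only: batch matching turns one-query-per-finding
# into one-query-per-batch. FakeDB stands in for the ORM.

def chunked(items, batch_size):
    """Yield successive batches of at most batch_size items."""
    for i in range(0, len(items), batch_size):
        yield items[i:i + batch_size]

class FakeDB:
    """Dict-backed stand-in for the database; counts issued queries."""
    def __init__(self, rows):
        self.rows = rows          # mapping hash_code -> finding record
        self.queries = 0

    def findings_with_hash_in(self, hash_codes):
        self.queries += 1         # one query per call, regardless of batch size
        return {h: self.rows[h] for h in hash_codes if h in self.rows}

def match_in_batches(db, new_hashes, batch_size=100):
    matched = {}
    for batch in chunked(new_hashes, batch_size):
        matched.update(db.findings_with_hash_in(batch))
    return matched

# 125 existing findings (even-numbered hashes), 250 incoming hashes:
db = FakeDB({f"h{i}": f"finding-{i}" for i in range(0, 250, 2)})
result = match_in_batches(db, [f"h{i}" for i in range(250)], batch_size=100)
print(db.queries)  # 3 batches of up to 100 hashes -> 3 queries instead of 250
```

With per-finding matching this workload would issue 250 queries; batching at size 100 issues 3, which is the shape of saving the batch-size knobs tune.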

dojo/finding/deduplication.py

Lines changed: 148 additions & 32 deletions
@@ -232,59 +232,123 @@ def are_endpoints_duplicates(new_finding, to_duplicate_finding):
     return False
 
 
-def build_dedupe_scope_queryset(test):
-    scope_on_engagement = test.engagement.deduplication_on_engagement
-    if scope_on_engagement:
-        scope_q = Q(test__engagement=test.engagement)
-    else:
-        # Product scope limited to current product, but exclude engagements that opted into engagement-scoped dedupe
-        scope_q = Q(test__engagement__product=test.engagement.product) & (
-            Q(test__engagement=test.engagement)
-            | Q(test__engagement__deduplication_on_engagement=False)
-        )
+def build_candidate_scope_queryset(test, mode="deduplication", service=None):
+    """
+    Build a queryset for candidate finding.
+
+    Args:
+        test: The test to scope from
+        mode: "deduplication" (can match across tests) or "reimport" (same test only)
+        service: Optional service filter (for deduplication mode, not used for reimport since service is in hash)
+
+    """
+    if mode == "reimport":
+        # For reimport, only filter by test. Service filtering is not needed because
+        # service is included in hash_code calculation (HASH_CODE_FIELDS_ALWAYS = ["service"]),
+        # so matching by hash_code automatically ensures correct service match.
+        queryset = Finding.objects.filter(test=test)
+    else:  # deduplication mode
+        scope_on_engagement = test.engagement.deduplication_on_engagement
+        if scope_on_engagement:
+            scope_q = Q(test__engagement=test.engagement)
+        else:
+            # Product scope limited to current product, but exclude engagements that opted into engagement-scoped dedupe
+            scope_q = Q(test__engagement__product=test.engagement.product) & (
+                Q(test__engagement=test.engagement)
+                | Q(test__engagement__deduplication_on_engagement=False)
+            )
+        queryset = Finding.objects.filter(scope_q)
+
+    # Base prefetches for both modes
+    prefetch_list = ["endpoints", "vulnerability_id_set", "found_by"]
+
+    # Additional prefetches for reimport mode
+    if mode == "reimport":
+        prefetch_list.extend([
+            "status_finding",
+            "status_finding__endpoint",
+        ])
 
     return (
-        Finding.objects.filter(scope_q)
+        queryset
         .select_related("test", "test__engagement", "test__test_type")
-        .prefetch_related("endpoints", "found_by")
+        .prefetch_related(*prefetch_list)
     )
 
 
-def find_candidates_for_deduplication_hash(test, findings):
-    base_queryset = build_dedupe_scope_queryset(test)
+def find_candidates_for_deduplication_hash(test, findings, mode="deduplication", service=None):
+    """
+    Find candidates by hash_code. Works for both deduplication and reimport.
+
+    Args:
+        test: The test to scope from
+        findings: List of findings to find candidates for
+        mode: "deduplication" or "reimport"
+        service: Optional service filter (for deduplication mode, not used for reimport since service is in hash)
+
+    """
+    base_queryset = build_candidate_scope_queryset(test, mode=mode, service=service)
     hash_codes = {f.hash_code for f in findings if getattr(f, "hash_code", None) is not None}
     if not hash_codes:
         return {}
-    existing_qs = (
-        base_queryset.filter(hash_code__in=hash_codes)
-        .exclude(hash_code=None)
-        .exclude(duplicate=True)
-        .order_by("id")
-    )
+
+    existing_qs = base_queryset.filter(hash_code__in=hash_codes).exclude(hash_code=None)
+    if mode == "deduplication":
+        existing_qs = existing_qs.exclude(duplicate=True)
+    existing_qs = existing_qs.order_by("id")
+
     existing_by_hash = {}
     for ef in existing_qs:
         existing_by_hash.setdefault(ef.hash_code, []).append(ef)
-    deduplicationLogger.debug(f"Found {len(existing_by_hash)} existing findings by hash codes")
+
+    log_msg = "for reimport" if mode == "reimport" else ""
+    deduplicationLogger.debug(f"Found {len(existing_by_hash)} existing findings by hash codes {log_msg}")
     return existing_by_hash
 
 
-def find_candidates_for_deduplication_unique_id(test, findings):
-    base_queryset = build_dedupe_scope_queryset(test)
+def find_candidates_for_deduplication_unique_id(test, findings, mode="deduplication", service=None):
+    """
+    Find candidates by unique_id_from_tool. Works for both deduplication and reimport.
+
+    Args:
+        test: The test to scope from
+        findings: List of findings to find candidates for
+        mode: "deduplication" or "reimport"
+        service: Optional service filter (for deduplication mode, not used for reimport since service is in hash)
+
+    """
+    base_queryset = build_candidate_scope_queryset(test, mode=mode, service=service)
     unique_ids = {f.unique_id_from_tool for f in findings if getattr(f, "unique_id_from_tool", None) is not None}
     if not unique_ids:
         return {}
-    existing_qs = base_queryset.filter(unique_id_from_tool__in=unique_ids).exclude(unique_id_from_tool=None).exclude(duplicate=True).order_by("id")
+
+    existing_qs = base_queryset.filter(unique_id_from_tool__in=unique_ids).exclude(unique_id_from_tool=None)
+    if mode == "deduplication":
+        existing_qs = existing_qs.exclude(duplicate=True)
     # unique_id_from_tool can only apply to the same test_type because it is parser dependent
-    existing_qs = existing_qs.filter(test__test_type=test.test_type)
+    existing_qs = existing_qs.filter(test__test_type=test.test_type).order_by("id")
+
     existing_by_uid = {}
     for ef in existing_qs:
         existing_by_uid.setdefault(ef.unique_id_from_tool, []).append(ef)
-    deduplicationLogger.debug(f"Found {len(existing_by_uid)} existing findings by unique IDs")
+
+    log_msg = "for reimport" if mode == "reimport" else ""
+    deduplicationLogger.debug(f"Found {len(existing_by_uid)} existing findings by unique IDs {log_msg}")
    return existing_by_uid
 
 
-def find_candidates_for_deduplication_uid_or_hash(test, findings):
-    base_queryset = build_dedupe_scope_queryset(test)
+def find_candidates_for_deduplication_uid_or_hash(test, findings, mode="deduplication", service=None):
+    """
+    Find candidates by unique_id_from_tool or hash_code. Works for both deduplication and reimport.
+
+    Args:
+        test: The test to scope from
+        findings: List of findings to find candidates for
+        mode: "deduplication" or "reimport"
+        service: Optional service filter (for deduplication mode, not used for reimport since service is in hash)
+
+    """
+    base_queryset = build_candidate_scope_queryset(test, mode=mode, service=service)
     hash_codes = {f.hash_code for f in findings if getattr(f, "hash_code", None) is not None}
     unique_ids = {f.unique_id_from_tool for f in findings if getattr(f, "unique_id_from_tool", None) is not None}
     if not hash_codes and not unique_ids:
@@ -298,7 +362,11 @@ def find_candidates_for_deduplication_uid_or_hash(test, findings):
         uid_q = Q(unique_id_from_tool__isnull=False, unique_id_from_tool__in=unique_ids) & Q(test__test_type=test.test_type)
         cond |= uid_q
 
-    existing_qs = base_queryset.filter(cond).exclude(duplicate=True).order_by("id")
+    existing_qs = base_queryset.filter(cond)
+    if mode == "deduplication":
+        # reimport matching will match against duplicates, import/deduplication doesn't.
+        existing_qs = existing_qs.exclude(duplicate=True)
+    existing_qs = existing_qs.order_by("id")
 
     existing_by_hash = {}
     existing_by_uid = {}
@@ -307,13 +375,15 @@ def find_candidates_for_deduplication_uid_or_hash(test, findings):
         existing_by_hash.setdefault(ef.hash_code, []).append(ef)
         if ef.unique_id_from_tool is not None:
             existing_by_uid.setdefault(ef.unique_id_from_tool, []).append(ef)
-    deduplicationLogger.debug(f"Found {len(existing_by_uid)} existing findings by unique IDs")
-    deduplicationLogger.debug(f"Found {len(existing_by_hash)} existing findings by hash codes")
+
+    log_msg = "for reimport" if mode == "reimport" else ""
+    deduplicationLogger.debug(f"Found {len(existing_by_uid)} existing findings by unique IDs {log_msg}")
+    deduplicationLogger.debug(f"Found {len(existing_by_hash)} existing findings by hash codes {log_msg}")
     return existing_by_uid, existing_by_hash
 
 
 def find_candidates_for_deduplication_legacy(test, findings):
-    base_queryset = build_dedupe_scope_queryset(test)
+    base_queryset = build_candidate_scope_queryset(test, mode="deduplication")
     titles = {f.title for f in findings if getattr(f, "title", None)}
     cwes = {f.cwe for f in findings if getattr(f, "cwe", 0)}
     cwes.discard(0)
@@ -335,6 +405,52 @@
     return by_title, by_cwe
 
 
+# TODO: should we align this with deduplication?
+def find_candidates_for_reimport_legacy(test, findings, service=None):
+    """
+    Find all existing findings in the test that match any of the given findings by title and severity.
+    Used for batch reimport to avoid 1+N query problem.
+    Legacy reimport matches by title (case-insensitive), severity, and numerical_severity.
+    Note: This function is kept separate because legacy reimport has fundamentally different matching logic
+    than legacy deduplication (title+severity vs title+CWE).
+    Note: service parameter is kept for backward compatibility but not used since service is in hash_code.
+    """
+    base_queryset = build_candidate_scope_queryset(test, mode="reimport", service=None)
+
+    # Collect all unique title/severity combinations
+    title_severity_pairs = set()
+    for finding in findings:
+        if finding.title:
+            title_severity_pairs.add((
+                finding.title.lower(),  # Case-insensitive matching
+                finding.severity,
+                Finding.get_numerical_severity(finding.severity),
+            ))
+
+    if not title_severity_pairs:
+        return {}
+
+    # Build query to find all matching findings
+    conditions = Q()
+    for title_lower, severity, numerical_severity in title_severity_pairs:
+        conditions |= (
+            Q(title__iexact=title_lower) &
+            Q(severity=severity) &
+            Q(numerical_severity=numerical_severity)
+        )
+
+    existing_qs = base_queryset.filter(conditions).order_by("id")
+
+    # Build dictionary keyed by (title_lower, severity) for quick lookup
+    existing_by_key = {}
+    for ef in existing_qs:
+        key = (ef.title.lower(), ef.severity)
+        existing_by_key.setdefault(key, []).append(ef)
+
+    deduplicationLogger.debug(f"Found {sum(len(v) for v in existing_by_key.values())} existing findings by legacy matching for reimport")
+    return existing_by_key
+
+
 def _is_candidate_older(new_finding, candidate):
     # Ensure the newer finding is marked as duplicate of the older finding
     is_older = candidate.id < new_finding.id
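The candidate lookups above all share one pattern: fetch every potential match in a single query, then bucket the results into a dict of lists keyed by `hash_code` (or `unique_id_from_tool`) so each new finding resolves its candidates with an O(1) lookup. A standalone sketch of that grouping step, using plain dicts in place of `Finding` models:

```python
# Minimal sketch of the setdefault-bucketing pattern from the diff above.
# Records are plain dicts standing in for Finding model instances.

def group_by_hash(existing_findings):
    """Bucket pre-fetched candidate findings by hash_code."""
    existing_by_hash = {}
    for ef in existing_findings:
        existing_by_hash.setdefault(ef["hash_code"], []).append(ef["id"])
    return existing_by_hash

rows = [
    {"id": 1, "hash_code": "abc"},
    {"id": 2, "hash_code": "abc"},   # duplicate hash within the same batch
    {"id": 3, "hash_code": "def"},
]
print(group_by_hash(rows))  # {'abc': [1, 2], 'def': [3]}
```

Because the rows are fetched with `order_by("id")`, each bucket's list is id-ordered, which is what lets later code deterministically pick the oldest candidate when several findings share a hash.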

dojo/finding/helper.py

Lines changed: 6 additions & 3 deletions
@@ -765,12 +765,15 @@ def add_endpoints(new_finding, form):
             endpoint=endpoint, defaults={"date": form.cleaned_data["date"] or timezone.now()})
 
 
-def save_vulnerability_ids(finding, vulnerability_ids):
+def save_vulnerability_ids(finding, vulnerability_ids, *, delete_existing: bool = True):
     # Remove duplicates
     vulnerability_ids = list(dict.fromkeys(vulnerability_ids))
 
-    # Remove old vulnerability ids
-    Vulnerability_Id.objects.filter(finding=finding).delete()
+    # Remove old vulnerability ids if requested
+    # Callers can set delete_existing=False when they know there are no existing IDs
+    # to avoid an unnecessary delete query (e.g., for new findings)
+    if delete_existing:
+        Vulnerability_Id.objects.filter(finding=finding).delete()
 
     # Save new vulnerability ids
     # Using bulk create throws Django 50 warnings about unsaved models...
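The `dict.fromkeys` line in the diff above is the standard Python idiom for removing duplicates while preserving first-seen order, which a plain `set()` would not guarantee. A quick illustration:

```python
# dict.fromkeys keeps insertion order (guaranteed since Python 3.7),
# so repeated vulnerability IDs are dropped without reshuffling the rest.
vulnerability_ids = ["CVE-2024-0001", "CVE-2024-0002", "CVE-2024-0001"]
deduped = list(dict.fromkeys(vulnerability_ids))
print(deduped)  # ['CVE-2024-0001', 'CVE-2024-0002']
```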

dojo/importers/base_importer.py

Lines changed: 12 additions & 10 deletions
@@ -32,7 +32,6 @@
     Test_Import,
     Test_Import_Finding_Action,
     Test_Type,
-    Vulnerability_Id,
 )
 from dojo.notifications.helper import create_notification
 from dojo.tag_utils import bulk_add_tags_to_instances
@@ -278,6 +277,7 @@ def determine_process_method(
     def determine_deduplication_algorithm(self) -> str:
         """
         Determines what dedupe algorithm to use for the Test being processed.
+        Overridden in Pro.
         :return: A string representing the dedupe algorithm to use.
         """
         return self.test.deduplication_algorithm
@@ -793,21 +793,23 @@ def process_cve(
 
         return finding
 
-    def process_vulnerability_ids(
+    def store_vulnerability_ids(
         self,
         finding: Finding,
     ) -> Finding:
         """
-        Parse the `unsaved_vulnerability_ids` field from findings after they are parsed
-        to create `Vulnerability_Id` objects with the finding associated correctly
-        """
-        if finding.unsaved_vulnerability_ids:
-            # Remove old vulnerability ids - keeping this call only because of flake8
-            Vulnerability_Id.objects.filter(finding=finding).delete()
+        Store vulnerability IDs for a finding.
+        Reads from finding.unsaved_vulnerability_ids and saves them overwriting existing ones.
+
+        Args:
+            finding: The finding to store vulnerability IDs for
 
-            # user the helper function
-            finding_helper.save_vulnerability_ids(finding, finding.unsaved_vulnerability_ids)
+        Returns:
+            The finding object
 
+        """
+        vulnerability_ids_to_process = finding.unsaved_vulnerability_ids or []
+        finding_helper.save_vulnerability_ids(finding, vulnerability_ids_to_process, delete_existing=False)
         return finding
 
     def process_files(
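The hunk above wires the importer to the new `delete_existing=False` path: a finding that was just created has no stored vulnerability IDs, so the DELETE query can be skipped. A pure-Python sketch of that idea (hypothetical names; a dict stands in for the `Vulnerability_Id` table):

```python
# Hedged sketch, not DefectDojo code: skipping the delete for brand-new
# findings saves one query per finding without changing the end state.

class FakeVulnerabilityIdTable:
    """Dict-backed stand-in for the Vulnerability_Id table."""
    def __init__(self):
        self.rows = {}            # finding_id -> list of vulnerability ids
        self.delete_queries = 0   # how many DELETEs the ORM would issue

    def save_vulnerability_ids(self, finding_id, ids, *, delete_existing=True):
        ids = list(dict.fromkeys(ids))      # drop repeats, keep order
        if delete_existing:
            self.delete_queries += 1        # would be a DELETE query in the ORM
            self.rows.pop(finding_id, None)
        self.rows[finding_id] = ids

table = FakeVulnerabilityIdTable()
# New finding: nothing stored yet, so the caller opts out of the delete.
table.save_vulnerability_ids(42, ["CVE-2024-1", "CVE-2024-1"], delete_existing=False)
print(table.rows[42], table.delete_queries)  # ['CVE-2024-1'] 0
```

On reimport of an existing finding the default `delete_existing=True` still clears stale IDs first, so only the new-finding path saves the query.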

dojo/importers/default_importer.py

Lines changed: 1 addition & 1 deletion
@@ -234,7 +234,7 @@ def process_findings(
             # Process any files
             self.process_files(finding)
             # Process vulnerability IDs
-            finding = self.process_vulnerability_ids(finding)
+            finding = self.store_vulnerability_ids(finding)
             # Categorize this finding as a new one
             new_findings.append(finding)
             # all data is already saved on the finding, we only need to trigger post processing in batches
