Commit bf73492
feat(importers): apply import-time tags per batch before post-processing

Previously, import-time tags (apply_tags_to_findings / apply_tags_to_endpoints) were applied only after all findings had been processed and the post-processing tasks (deduplication, rules) had already been dispatched, so those tasks could not see the tags. Import-time tags are now applied per batch in process_findings(), immediately after parser tags and location persistence and before dojo_dispatch_task(), following the same pattern already used by bulk_apply_parser_tags. Closed findings are intentionally excluded: they are absent from the current report and should not receive the import tags.

Also renames the reimporter's ambiguous batch_findings (the match-batch slice of unsaved parser objects) to unsaved_findings_batch, freeing batch_findings for the new dedupe-batch accumulator and preventing an iterator-mutation bug.

Consolidates apply_import_tags() into apply_import_tags_for_batch(), removing a redundant DB re-query and an unused Iterable import. Performance test query counts are updated accordingly: each import loses one query from the removed post-loop apply_import_tags call.
1 parent 6788368 commit bf73492
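The heart of the change is ordering: the tags have to be on the findings before the post-processing task is dispatched, because that task reads them from the database. Below is a minimal sketch of the before/after shapes; chunks(), apply_tags(), and dispatch_post_processing() are hypothetical stand-ins for the importer internals, not the real DefectDojo helpers.

# Minimal sketch of the ordering fix; all names here are hypothetical
# stand-ins, not the real DefectDojo helpers.

def chunks(items, size=1000):
    for i in range(0, len(items), size):
        yield items[i:i + size]

def apply_tags(batch, tags):
    for finding in batch:
        finding.setdefault("tags", []).extend(tags)

def dispatch_post_processing(batch):
    """Placeholder for dojo_dispatch_task(); dedupe/rules read tags here."""

def import_old(findings, tags):
    # Old shape: every batch is dispatched first and tags arrive afterwards,
    # so the dispatched dedupe/rules tasks never see them.
    for batch in chunks(findings):
        dispatch_post_processing(batch)
    apply_tags(findings, tags)

def import_new(findings, tags):
    # New shape: tag each batch, then dispatch, so the tasks see the tags.
    for batch in chunks(findings):
        apply_tags(batch, tags)
        dispatch_post_processing(batch)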

4 files changed: 45 additions & 79 deletions


dojo/importers/base_importer.py

Lines changed: 13 additions & 45 deletions
@@ -1,7 +1,6 @@
 import base64
 import logging
 import time
-from collections.abc import Iterable
 
 from django.conf import settings
 from django.core.exceptions import ValidationError
@@ -337,67 +336,36 @@ def update_test_tags(self):
         if self.tags is not None and len(self.tags) > 0:
             self.test.tags.set(self.tags)
 
-    def apply_import_tags(
-        self,
-        new_findings: Iterable[Finding] | None = None,
-        closed_findings: Iterable[Finding] | None = None,
-        reactivated_findings: Iterable[Finding] | None = None,
-        untouched_findings: Iterable[Finding] | None = None,
-    ) -> None:
-        """Apply tags to findings and endpoints from an import operation."""
-        # Normalize None values to empty lists and convert sets/other iterables to lists
-        if untouched_findings is None:
-            untouched_findings = []
-        elif not isinstance(untouched_findings, list):
-            untouched_findings = list(untouched_findings)
-
-        if reactivated_findings is None:
-            reactivated_findings = []
-        elif not isinstance(reactivated_findings, list):
-            reactivated_findings = list(reactivated_findings)
-
-        if closed_findings is None:
-            closed_findings = []
-        elif not isinstance(closed_findings, list):
-            closed_findings = list(closed_findings)
-
-        if new_findings is None:
-            new_findings = []
-        elif not isinstance(new_findings, list):
-            new_findings = list(new_findings)
-
-        # Collect all affected findings
-        findings_to_tag = new_findings + closed_findings + reactivated_findings + untouched_findings
+    def apply_import_tags_for_batch(self, findings: list[Finding]) -> None:
+        """
+        Apply import-time tags to a batch of already-saved findings and their endpoints.
 
-        if not findings_to_tag:
+        Called per batch inside process_findings(), before post_process_findings_batch is
+        dispatched, so that rules/deduplication tasks see the import tags on the findings.
+        """
+        if not findings or not self.tags:
             return
-
-        # Add any tags to the findings imported if necessary
-        if self.apply_tags_to_findings and self.tags:
-            findings_qs = Finding.objects.filter(id__in=[f.id for f in findings_to_tag])
+        if self.apply_tags_to_findings:
             try:
                 bulk_add_tags_to_instances(
                     tag_or_tags=self.tags,
-                    instances=findings_qs,
+                    instances=findings,
                     tag_field_name="tags",
                 )
             except IntegrityError:
-                # Fallback to safe per-instance tagging if concurrent deletes occur
-                for finding in findings_to_tag:
+                for finding in findings:
                     for tag in self.tags:
                         self.add_tags_safe(finding, tag)
-
-        # Add any tags to any locations/endpoints of the findings imported if necessary
-        if self.apply_tags_to_endpoints and self.tags:
-            locations_qs = self.location_handler.get_locations_for_tagging(findings_to_tag)
+        if self.apply_tags_to_endpoints:
+            locations_qs = self.location_handler.get_locations_for_tagging(findings)
             try:
                 bulk_add_tags_to_instances(
                     tag_or_tags=self.tags,
                     instances=locations_qs,
                     tag_field_name="tags",
                 )
             except IntegrityError:
-                for finding in findings_to_tag:
+                for finding in findings:
                     for location in self.location_handler.get_location_tag_fallback(finding):
                         for tag in self.tags:
                             self.add_tags_safe(location, tag)
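The dropped Finding.objects.filter(...) re-query above is where the one-query saving in the performance tests comes from: the batch already holds saved instances, so there is nothing to re-fetch. A standalone illustration of the difference, using an in-memory stand-in for the ORM rather than the real models:

# Standalone sketch of the removed re-query; FakeTable stands in for the
# ORM and the findings are plain dicts, not Finding instances.

class FakeTable:
    db_hits = 0

    def __init__(self, rows):
        self.rows = rows

    def filter_by_ids(self, ids):
        FakeTable.db_hits += 1  # the redundant round trip, once per import
        return [row for row in self.rows if row["id"] in ids]

def tag_instances(instances, tags):
    for obj in instances:
        obj.setdefault("tags", []).extend(tags)

findings = [{"id": 1}, {"id": 2}]      # already saved, already in memory
table = FakeTable(findings)

# Old apply_import_tags: re-fetch rows the caller already holds.
tag_instances(table.filter_by_ids({1, 2}), ["import-tag"])
assert FakeTable.db_hits == 1

# New apply_import_tags_for_batch: tag the in-memory batch directly.
tag_instances(findings, ["import-tag"])
assert FakeTable.db_hits == 1          # no additional query issued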

dojo/importers/default_importer.py

Lines changed: 5 additions & 5 deletions
@@ -132,11 +132,6 @@ def process_scan(
             new_findings=new_findings,
             closed_findings=closed_findings,
         )
-        # Apply tags to findings and endpoints/locations
-        self.apply_import_tags(
-            new_findings=new_findings,
-            closed_findings=closed_findings,
-        )
         # Send out some notifications to the user
         logger.debug("IMPORT_SCAN: Generating notifications")
         dojo_dispatch_task(
@@ -169,6 +164,7 @@ def process_findings(
     ) -> list[Finding]:
         # Batched post-processing (no chord): dispatch a task per 1000 findings or on final finding
         batch_finding_ids: list[int] = []
+        batch_findings: list[Finding] = []
         batch_max_size = getattr(settings, "IMPORT_REIMPORT_DEDUPE_BATCH_SIZE", 1000)
 
         """
@@ -259,6 +255,7 @@ def process_findings(
             push_to_jira = self.push_to_jira and ((not self.findings_groups_enabled or not self.group_by) or not finding_will_be_grouped)
             logger.debug("process_findings: computed push_to_jira=%s", push_to_jira)
             batch_finding_ids.append(finding.id)
+            batch_findings.append(finding)
 
             # If batch is full or we're at the end, persist locations/endpoints and dispatch
             if len(batch_finding_ids) >= batch_max_size or is_final_finding:
@@ -267,6 +264,9 @@ def process_findings(
                 # so rules/deduplication tasks see the tags already on the findings.
                 bulk_apply_parser_tags(findings_with_parser_tags)
                 findings_with_parser_tags.clear()
+                # Apply import-time tags before post-processing so rules/deduplication see them.
+                self.apply_import_tags_for_batch(batch_findings)
+                batch_findings.clear()
                 finding_ids_batch = list(batch_finding_ids)
                 batch_finding_ids.clear()
                 logger.debug("process_findings: dispatching batch with push_to_jira=%s (batch_size=%d, is_final=%s)",

dojo/importers/default_reimporter.py

Lines changed: 15 additions & 17 deletions
@@ -137,13 +137,6 @@ def process_scan(
             reactivated_findings=reactivated_findings,
             untouched_findings=untouched_findings,
         )
-        # Apply tags to findings and endpoints
-        self.apply_import_tags(
-            new_findings=new_findings,
-            closed_findings=closed_findings,
-            reactivated_findings=reactivated_findings,
-            untouched_findings=untouched_findings,
-        )
         # Send out som notifications to the user
         logger.debug("REIMPORT_SCAN: Generating notifications")
         updated_count = (
@@ -173,7 +166,7 @@ def process_scan(
 
     def get_reimport_match_candidates_for_batch(
         self,
-        batch_findings: list[Finding],
+        unsaved_findings_batch: list[Finding],
     ) -> tuple[dict, dict, dict]:
         """
         Fetch candidate matches for a batch of *unsaved* findings during reimport.
@@ -195,23 +188,23 @@ def get_reimport_match_candidates_for_batch(
         if self.deduplication_algorithm == "hash_code":
             candidates_by_hash = find_candidates_for_deduplication_hash(
                 self.test,
-                batch_findings,
+                unsaved_findings_batch,
                 mode="reimport",
             )
         elif self.deduplication_algorithm == "unique_id_from_tool":
             candidates_by_uid = find_candidates_for_deduplication_unique_id(
                 self.test,
-                batch_findings,
+                unsaved_findings_batch,
                 mode="reimport",
             )
         elif self.deduplication_algorithm == "unique_id_from_tool_or_hash_code":
             candidates_by_uid, candidates_by_hash = find_candidates_for_deduplication_uid_or_hash(
                 self.test,
-                batch_findings,
+                unsaved_findings_batch,
                 mode="reimport",
             )
         elif self.deduplication_algorithm == "legacy":
-            candidates_by_key = find_candidates_for_reimport_legacy(self.test, batch_findings)
+            candidates_by_key = find_candidates_for_reimport_legacy(self.test, unsaved_findings_batch)
 
         return candidates_by_hash, candidates_by_uid, candidates_by_key
 
@@ -308,6 +301,7 @@ def process_findings(
             cleaned_findings.append(sanitized)
 
         batch_finding_ids: list[int] = []
+        batch_findings: list[Finding] = []
         findings_with_parser_tags: list[tuple] = []
         # Batch size for deduplication/post-processing (only new findings)
         dedupe_batch_max_size = getattr(settings, "IMPORT_REIMPORT_DEDUPE_BATCH_SIZE", 1000)
@@ -318,13 +312,13 @@ def process_findings(
         # This avoids the 1+N query problem by fetching all candidates for a batch at once
        for batch_start in range(0, len(cleaned_findings), match_batch_max_size):
             batch_end = min(batch_start + match_batch_max_size, len(cleaned_findings))
-            batch_findings = cleaned_findings[batch_start:batch_end]
+            unsaved_findings_batch = cleaned_findings[batch_start:batch_end]
             is_final_batch = batch_end == len(cleaned_findings)
 
             logger.debug(f"Processing reimport batch {batch_start}-{batch_end} of {len(cleaned_findings)} findings")
 
             # Prepare findings in batch: set test, service, calculate hash codes
-            for unsaved_finding in batch_findings:
+            for unsaved_finding in unsaved_findings_batch:
                 # Some parsers provide "mitigated" field but do not set timezone (because they are probably not available in the report)
                 # Finding.mitigated is DateTimeField and it requires timezone
                 if unsaved_finding.mitigated and not unsaved_finding.mitigated.tzinfo:
@@ -342,12 +336,12 @@ def process_findings(
 
             # Fetch all candidates for this batch at once (batch candidate finding)
             candidates_by_hash, candidates_by_uid, candidates_by_key = self.get_reimport_match_candidates_for_batch(
-                batch_findings,
+                unsaved_findings_batch,
             )
 
             # Process each finding in the batch using pre-fetched candidates
-            for idx, unsaved_finding in enumerate(batch_findings):
-                is_final = is_final_batch and idx == len(batch_findings) - 1
+            for idx, unsaved_finding in enumerate(unsaved_findings_batch):
+                is_final = is_final_batch and idx == len(unsaved_findings_batch) - 1
 
                 # Match any findings to this new one coming in using pre-fetched candidates
                 matched_findings = self.match_finding_to_candidate_reimport(
@@ -403,6 +397,7 @@ def process_findings(
                 # all data is already saved on the finding, we only need to trigger post processing in batches
                 push_to_jira = self.push_to_jira and ((not self.findings_groups_enabled or not self.group_by) or not finding_will_be_grouped)
                 batch_finding_ids.append(finding.id)
+                batch_findings.append(finding)
 
                 # Post-processing batches (deduplication, rules, etc.) are separate from matching batches.
                 # These batches only contain "new" findings that were saved (not matched to existing findings).
@@ -425,6 +420,9 @@ def process_findings(
                     # so rules/deduplication tasks see the tags already on the findings.
                     bulk_apply_parser_tags(findings_with_parser_tags)
                     findings_with_parser_tags.clear()
+                    # Apply import-time tags before post-processing so rules/deduplication see them.
+                    self.apply_import_tags_for_batch(batch_findings)
+                    batch_findings.clear()
                     finding_ids_batch = list(batch_finding_ids)
                     batch_finding_ids.clear()
                     dojo_dispatch_task(
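The rename is not cosmetic. Had the dedupe accumulator reused the name batch_findings, the flush inside the loop would have cleared the very list being iterated, silently dropping the tail of each match batch. A minimal reproduction of that failure mode (simplified stand-in code, not the reimporter itself):

# Minimal reproduction of the iterator-mutation bug the rename prevents;
# simplified stand-in code, not the actual reimporter.

cleaned = [f"finding-{i}" for i in range(5)]

# Buggy shape: one name serves as both the match-batch slice (iterated)
# and the dedupe accumulator (appended to and cleared mid-loop).
batch_findings = cleaned[0:5]
visited = []
for idx, item in enumerate(batch_findings):
    visited.append(item)
    batch_findings.append(item)         # accumulate for "dispatch"
    if idx == 2:
        batch_findings.clear()          # flush clears the iterated list
assert visited == cleaned[:3]           # the last two findings are skipped

# Fixed shape: distinct names, as in this commit.
unsaved_findings_batch = cleaned[0:5]
batch_findings = []
visited = []
for idx, item in enumerate(unsaved_findings_batch):
    visited.append(item)
    batch_findings.append(item)
    if idx == 2:
        batch_findings.clear()          # safe: a different list
assert visited == cleaned               # every finding is processed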

unittests/test_importers_performance.py

Lines changed: 12 additions & 12 deletions
@@ -343,9 +343,9 @@ def test_import_reimport_reimport_performance_pghistory_async(self):
         configure_pghistory_triggers()
 
         self._import_reimport_performance(
-            expected_num_queries1=172,
+            expected_num_queries1=171,
             expected_num_async_tasks1=2,
-            expected_num_queries2=125,
+            expected_num_queries2=124,
             expected_num_async_tasks2=1,
             expected_num_queries3=29,
             expected_num_async_tasks3=1,
@@ -367,9 +367,9 @@ def test_import_reimport_reimport_performance_pghistory_no_async(self):
         testuser.usercontactinfo.save()
 
         self._import_reimport_performance(
-            expected_num_queries1=188,
+            expected_num_queries1=187,
             expected_num_async_tasks1=2,
-            expected_num_queries2=133,
+            expected_num_queries2=132,
             expected_num_async_tasks2=1,
             expected_num_queries3=37,
             expected_num_async_tasks3=1,
@@ -392,9 +392,9 @@ def test_import_reimport_reimport_performance_pghistory_no_async_with_product_gr
         self.system_settings(enable_product_grade=True)
 
         self._import_reimport_performance(
-            expected_num_queries1=198,
+            expected_num_queries1=197,
             expected_num_async_tasks1=4,
-            expected_num_queries2=143,
+            expected_num_queries2=142,
             expected_num_async_tasks2=3,
             expected_num_queries3=44,
             expected_num_async_tasks3=3,
@@ -633,9 +633,9 @@ def test_import_reimport_reimport_performance_pghistory_async(self):
         configure_pghistory_triggers()
 
         self._import_reimport_performance(
-            expected_num_queries1=179,
+            expected_num_queries1=178,
             expected_num_async_tasks1=2,
-            expected_num_queries2=134,
+            expected_num_queries2=133,
             expected_num_async_tasks2=1,
             expected_num_queries3=37,
             expected_num_async_tasks3=1,
@@ -657,9 +657,9 @@ def test_import_reimport_reimport_performance_pghistory_no_async(self):
         testuser.usercontactinfo.save()
 
         self._import_reimport_performance(
-            expected_num_queries1=197,
+            expected_num_queries1=196,
             expected_num_async_tasks1=2,
-            expected_num_queries2=144,
+            expected_num_queries2=143,
             expected_num_async_tasks2=1,
             expected_num_queries3=47,
             expected_num_async_tasks3=1,
@@ -682,9 +682,9 @@ def test_import_reimport_reimport_performance_pghistory_no_async_with_product_gr
         self.system_settings(enable_product_grade=True)
 
         self._import_reimport_performance(
-            expected_num_queries1=210,
+            expected_num_queries1=209,
             expected_num_async_tasks1=4,
-            expected_num_queries2=157,
+            expected_num_queries2=156,
             expected_num_async_tasks2=3,
             expected_num_queries3=54,
             expected_num_async_tasks3=3,
