From b87199b7c1d8e0c4a42de5d17a7c287fe14df1aa Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Wed, 15 Apr 2026 20:12:06 +0200 Subject: [PATCH 1/7] perf: bulk-apply parser-supplied per-finding tags during import MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit finding.tags.add() per finding calls tagulous's add() which does: - reload() → SELECT current tags (1 query) - _ensure_tags_in_db() → get_or_create per tag (T queries) - super().add() → INSERT through-table rows (1 query) - tag.increment() → UPDATE count per tag (T queries) For N findings with T parser-supplied tags: O(N·T) queries. Replace with bulk_apply_parser_tags() in tag_utils, which groups findings by tag name and calls bulk_add_tags_to_instances() once per unique tag: O(unique_tags) queries regardless of N. Tags are accumulated per batch and applied just before the post_process_findings_batch task is dispatched, so deduplication and rules tasks see the tags already written to the DB. Both default_importer and default_reimporter use the same approach. For the reimporter, finding_post_processing accepts an optional tag_accumulator list; when supplied, tags are accumulated rather than applied inline (backward-compatible for any direct callers). --- dojo/importers/default_importer.py | 15 ++++++++++++--- dojo/importers/default_reimporter.py | 24 ++++++++++++++++++++---- dojo/tag_utils.py | 23 ++++++++++++++++++++++- 3 files changed, 54 insertions(+), 8 deletions(-) diff --git a/dojo/importers/default_importer.py b/dojo/importers/default_importer.py index a57b6884152..0f0c4b7c0cb 100644 --- a/dojo/importers/default_importer.py +++ b/dojo/importers/default_importer.py @@ -20,6 +20,7 @@ ) from dojo.notifications.helper import create_notification from dojo.utils import get_full_url, perform_product_grading +from dojo.tag_utils import bulk_apply_parser_tags from dojo.validators import clean_tags logger = logging.getLogger(__name__) @@ -179,6 +180,7 @@ def process_findings( at import time """ new_findings = [] + findings_with_parser_tags: list[tuple] = [] logger.debug("starting import of %i parsed findings.", len(parsed_findings) if parsed_findings else 0) group_names_to_findings_dict = {} @@ -245,12 +247,13 @@ def process_findings( # TODO: Delete this after the move to Locations # Process any endpoints on the finding, or added on the form self.process_endpoints(finding, self.endpoints_to_add) - # Parsers must use unsaved_tags to store tags, so we can clean them + # Parsers must use unsaved_tags to store tags, so we can clean them. + # Accumulate for bulk application after the loop (O(unique_tags) instead of O(N·T)). cleaned_tags = clean_tags(finding.unsaved_tags) if isinstance(cleaned_tags, list): - finding.tags.add(*cleaned_tags) + findings_with_parser_tags.append((finding, cleaned_tags)) elif isinstance(cleaned_tags, str): - finding.tags.add(cleaned_tags) + findings_with_parser_tags.append((finding, [cleaned_tags])) # Process any files self.process_files(finding) # Process vulnerability IDs @@ -268,6 +271,12 @@ def process_findings( if len(batch_finding_ids) >= batch_max_size or is_final_finding: if not settings.V3_FEATURE_LOCATIONS: self.endpoint_manager.persist(user=self.user) + + # Apply parser-supplied tags for this batch before post-processing starts, + # so rules/deduplication tasks see the tags already on the findings. + bulk_apply_parser_tags(findings_with_parser_tags) + findings_with_parser_tags.clear() + finding_ids_batch = list(batch_finding_ids) batch_finding_ids.clear() logger.debug("process_findings: dispatching batch with push_to_jira=%s (batch_size=%d, is_final=%s)", diff --git a/dojo/importers/default_reimporter.py b/dojo/importers/default_reimporter.py index efeaa252eb8..dafd6f4a00c 100644 --- a/dojo/importers/default_reimporter.py +++ b/dojo/importers/default_reimporter.py @@ -26,6 +26,7 @@ Test, Test_Import, ) +from dojo.tag_utils import bulk_apply_parser_tags from dojo.utils import perform_product_grading from dojo.validators import clean_tags @@ -310,6 +311,7 @@ def process_findings( cleaned_findings.append(sanitized) batch_finding_ids: list[int] = [] + findings_with_parser_tags: list[tuple] = [] # Batch size for deduplication/post-processing (only new findings) dedupe_batch_max_size = getattr(settings, "IMPORT_REIMPORT_DEDUPE_BATCH_SIZE", 1000) # Batch size for candidate matching (all findings, before matching) @@ -417,6 +419,7 @@ def process_findings( finding, unsaved_finding, is_matched_finding=bool(matched_findings), + tag_accumulator=findings_with_parser_tags, ) # all data is already saved on the finding, we only need to trigger post processing in batches push_to_jira = self.push_to_jira and ((not self.findings_groups_enabled or not self.group_by) or not finding_will_be_grouped) @@ -440,6 +443,12 @@ def process_findings( if len(batch_finding_ids) >= dedupe_batch_max_size or is_final: if not settings.V3_FEATURE_LOCATIONS: self.endpoint_manager.persist(user=self.user) + + # Apply parser-supplied tags for this batch before post-processing starts, + # so rules/deduplication tasks see the tags already on the findings. + bulk_apply_parser_tags(findings_with_parser_tags) + findings_with_parser_tags.clear() + finding_ids_batch = list(batch_finding_ids) batch_finding_ids.clear() dojo_dispatch_task( @@ -976,6 +985,7 @@ def finding_post_processing( finding_from_report: Finding, *, is_matched_finding: bool = False, + tag_accumulator: list | None = None, ) -> Finding: """ Save all associated objects to the finding after it has been saved @@ -1006,10 +1016,16 @@ def finding_post_processing( finding_from_report.unsaved_tags = merged_tags if finding_from_report.unsaved_tags: cleaned_tags = clean_tags(finding_from_report.unsaved_tags) - if isinstance(cleaned_tags, list): - finding.tags.add(*cleaned_tags) - elif isinstance(cleaned_tags, str): - finding.tags.add(cleaned_tags) + if tag_accumulator is not None: + if isinstance(cleaned_tags, list): + tag_accumulator.append((finding, cleaned_tags)) + elif isinstance(cleaned_tags, str): + tag_accumulator.append((finding, [cleaned_tags])) + else: + if isinstance(cleaned_tags, list): + finding.tags.add(*cleaned_tags) + elif isinstance(cleaned_tags, str): + finding.tags.add(cleaned_tags) # Process any files if finding_from_report.unsaved_files: finding.unsaved_files = finding_from_report.unsaved_files diff --git a/dojo/tag_utils.py b/dojo/tag_utils.py index cf405034be4..e9800965c51 100644 --- a/dojo/tag_utils.py +++ b/dojo/tag_utils.py @@ -164,6 +164,27 @@ def bulk_add_tags_to_instances(tag_or_tags, instances, tag_field_name: str = "ta return total_created +def bulk_apply_parser_tags(findings_with_tags: list) -> None: + """Bulk-apply per-finding parser tags collected during an import loop. + + Reduces O(N·T) per-finding ``finding.tags.add()`` calls to O(unique_tags) queries + by grouping findings by tag name and calling ``bulk_add_tags_to_instances`` once per tag. + + Args: + findings_with_tags: list of ``(finding, [tag_str, ...])`` pairs accumulated + during the import loop (only for findings whose parser supplied tags). + """ + from collections import defaultdict # noqa: PLC0415 + + tag_to_findings: dict = defaultdict(list) + for finding, tag_list in findings_with_tags: + for tag in tag_list: + if tag: + tag_to_findings[tag].append(finding) + for tag_name, findings_for_tag in tag_to_findings.items(): + bulk_add_tags_to_instances(tag_or_tags=tag_name, instances=findings_for_tag) + + def bulk_remove_all_tags(model_class, instance_ids_qs): """ Remove all tags from instances identified by the given ID subquery. @@ -226,4 +247,4 @@ def bulk_remove_all_tags(model_class, instance_ids_qs): ) -__all__ = ["bulk_add_tags_to_instances", "bulk_remove_all_tags"] +__all__ = ["bulk_add_tags_to_instances", "bulk_apply_parser_tags", "bulk_remove_all_tags"] From cb7e56623681b7de10619043e670251afa9ae464 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Wed, 15 Apr 2026 20:30:45 +0200 Subject: [PATCH 2/7] chore: fix ruff linting errors in bulk-tag import code --- dojo/importers/default_importer.py | 2 +- dojo/importers/default_reimporter.py | 9 ++++----- dojo/tag_utils.py | 4 +++- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/dojo/importers/default_importer.py b/dojo/importers/default_importer.py index 0f0c4b7c0cb..6ffddc669b8 100644 --- a/dojo/importers/default_importer.py +++ b/dojo/importers/default_importer.py @@ -19,8 +19,8 @@ Test_Import, ) from dojo.notifications.helper import create_notification -from dojo.utils import get_full_url, perform_product_grading from dojo.tag_utils import bulk_apply_parser_tags +from dojo.utils import get_full_url, perform_product_grading from dojo.validators import clean_tags logger = logging.getLogger(__name__) diff --git a/dojo/importers/default_reimporter.py b/dojo/importers/default_reimporter.py index dafd6f4a00c..2a22da10a35 100644 --- a/dojo/importers/default_reimporter.py +++ b/dojo/importers/default_reimporter.py @@ -1021,11 +1021,10 @@ def finding_post_processing( tag_accumulator.append((finding, cleaned_tags)) elif isinstance(cleaned_tags, str): tag_accumulator.append((finding, [cleaned_tags])) - else: - if isinstance(cleaned_tags, list): - finding.tags.add(*cleaned_tags) - elif isinstance(cleaned_tags, str): - finding.tags.add(cleaned_tags) + elif isinstance(cleaned_tags, list): + finding.tags.add(*cleaned_tags) + elif isinstance(cleaned_tags, str): + finding.tags.add(cleaned_tags) # Process any files if finding_from_report.unsaved_files: finding.unsaved_files = finding_from_report.unsaved_files diff --git a/dojo/tag_utils.py b/dojo/tag_utils.py index e9800965c51..cf92665bbf7 100644 --- a/dojo/tag_utils.py +++ b/dojo/tag_utils.py @@ -165,7 +165,8 @@ def bulk_add_tags_to_instances(tag_or_tags, instances, tag_field_name: str = "ta def bulk_apply_parser_tags(findings_with_tags: list) -> None: - """Bulk-apply per-finding parser tags collected during an import loop. + """ + Bulk-apply per-finding parser tags collected during an import loop. Reduces O(N·T) per-finding ``finding.tags.add()`` calls to O(unique_tags) queries by grouping findings by tag name and calling ``bulk_add_tags_to_instances`` once per tag. @@ -173,6 +174,7 @@ def bulk_apply_parser_tags(findings_with_tags: list) -> None: Args: findings_with_tags: list of ``(finding, [tag_str, ...])`` pairs accumulated during the import loop (only for findings whose parser supplied tags). + """ from collections import defaultdict # noqa: PLC0415 From acd12322ae92ff96ee8dd7753b65d07a2de7fe5e Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Thu, 16 Apr 2026 22:03:10 +0200 Subject: [PATCH 3/7] improve bulk add tags for parsers --- dojo/tag_utils.py | 180 ++++++++++++++++++++++++++++++- unittests/test_tag_utils_bulk.py | 176 +++++++++++++++++++++++++++++- 2 files changed, 350 insertions(+), 6 deletions(-) diff --git a/dojo/tag_utils.py b/dojo/tag_utils.py index cf92665bbf7..340f8fa2804 100644 --- a/dojo/tag_utils.py +++ b/dojo/tag_utils.py @@ -164,12 +164,182 @@ def bulk_add_tags_to_instances(tag_or_tags, instances, tag_field_name: str = "ta return total_created +def bulk_add_tag_mapping( + tag_to_instances: dict[str, list], + tag_field_name: str = "tags", + batch_size: int | None = None, +) -> int: + """ + Add different tags to different sets of instances in ~5 queries regardless of tag count. + + Unlike calling ``bulk_add_tags_to_instances`` once per unique tag — which issues + O(unique_tags) queries — this function batches all work: + + 1. Fetch all existing tag objects in one query. + 2. Bulk-create any missing tag objects (one INSERT + one re-fetch if needed). + 3. Fetch all pre-existing through-model rows for these (instance, tag) pairs in one query. + 4. Bulk-create all new relationships in one query (batched by ``batch_size``). + 5. Update all tag counts in one ``UPDATE … CASE WHEN …`` query. + + Args: + tag_to_instances: mapping of tag_name -> list of instances that should receive + that tag. All instances must be of the same model type. + tag_field_name: name of the TagField on the model (default: ``"tags"``). + batch_size: ``bulk_create`` batch size; defaults to ``TAG_BULK_ADD_BATCH_SIZE`` + setting (1000). + + Returns: + Total number of new tag relationships created. + """ + from collections import defaultdict # noqa: PLC0415 + + from django.db.models import Case, IntegerField, When # noqa: PLC0415 + from django.db.models.functions import Lower # noqa: PLC0415 + + if not tag_to_instances: + return 0 + + if batch_size is None: + batch_size = getattr(settings, "TAG_BULK_ADD_BATCH_SIZE", 1000) + + all_instances = [inst for insts in tag_to_instances.values() for inst in insts] + if not all_instances: + return 0 + + model_class = all_instances[0].__class__ + + if model_class is Product: + msg = "bulk_add_tag_mapping: Product instances are not supported; use Product.tags.add() or a propagation-aware helper" + raise ValueError(msg) + + try: + tag_field = model_class._meta.get_field(tag_field_name) + except Exception: + msg = f"Model {model_class.__name__} does not have field '{tag_field_name}'" + raise ValueError(msg) + + if not hasattr(tag_field, "tag_options"): + msg = f"Field '{tag_field_name}' is not a TagField" + raise ValueError(msg) + + tag_model = tag_field.related_model + through_model = tag_field.remote_field.through + case_sensitive = tag_field.tag_options.case_sensitive + + source_field_name = None + target_field_name = None + for field in through_model._meta.fields: + if hasattr(field, "remote_field") and field.remote_field: + if field.remote_field.model == model_class: + source_field_name = field.name + elif field.remote_field.model == tag_model: + target_field_name = field.name + + all_tag_names = list(tag_to_instances.keys()) + + # --- Query 1: fetch existing tag objects --- + if case_sensitive: + existing_tags: dict[str, object] = { + t.name: t + for t in tag_model.objects.filter(name__in=all_tag_names) + } + missing_names = [n for n in all_tag_names if n not in existing_tags] + else: + # Annotate with lowercased name for a case-insensitive IN lookup + existing_tags = { + t.name_lower: t + for t in tag_model.objects.annotate(name_lower=Lower("name")).filter( + name_lower__in=[n.lower() for n in all_tag_names], + ) + } + missing_names = [n for n in all_tag_names if n.lower() not in existing_tags] + + # --- Query 2: create missing tag objects then re-fetch to get their PKs --- + if missing_names: + tag_model.objects.bulk_create( + [tag_model(name=n, protected=False) for n in missing_names], + ignore_conflicts=True, + ) + if case_sensitive: + existing_tags.update( + {t.name: t for t in tag_model.objects.filter(name__in=missing_names)}, + ) + else: + existing_tags.update( + { + t.name_lower: t + for t in tag_model.objects.annotate(name_lower=Lower("name")).filter( + name_lower__in=[n.lower() for n in missing_names], + ) + }, + ) + + def _key(name: str) -> str: + return name if case_sensitive else name.lower() + + # --- Query 3: fetch all pre-existing (instance, tag) through-model rows --- + all_instance_ids = {inst.pk for inst in all_instances} + all_tag_pks = {tag.pk for tag in existing_tags.values()} + + existing_pairs: set[tuple] = set( + through_model.objects.filter( + **{f"{source_field_name}__in": all_instance_ids}, + **{f"{target_field_name}__in": all_tag_pks}, + ).values_list(source_field_name, target_field_name), + ) + + new_relationships = [] + created_per_tag: dict[int, int] = defaultdict(int) + + for tag_name, instances in tag_to_instances.items(): + tag = existing_tags.get(_key(tag_name)) + if tag is None: + continue + for instance in instances: + if (instance.pk, tag.pk) not in existing_pairs: + new_relationships.append( + through_model(**{source_field_name: instance, target_field_name: tag}), + ) + created_per_tag[tag.pk] += 1 + + if not new_relationships: + return 0 + + # --- Query 4: bulk-create all new relationships (batched for memory) --- + total_created = 0 + with transaction.atomic(): + for i in range(0, len(new_relationships), batch_size): + batch = new_relationships[i : i + batch_size] + actually_created = through_model.objects.bulk_create(batch, ignore_conflicts=True) + total_created += ( + len(actually_created) if hasattr(actually_created, "__len__") else len(batch) + ) + + # --- Query 5: update all tag counts in one UPDATE … CASE WHEN … --- + tag_model.objects.filter(pk__in=list(created_per_tag.keys())).update( + count=Case( + *[ + When(pk=pk, then=models.F("count") + delta) + for pk, delta in created_per_tag.items() + ], + output_field=IntegerField(), + ), + ) + + for instance in all_instances: + prefetch_cache = getattr(instance, "_prefetched_objects_cache", None) + if prefetch_cache is not None: + prefetch_cache.pop(tag_field_name, None) + + return total_created + + def bulk_apply_parser_tags(findings_with_tags: list) -> None: """ Bulk-apply per-finding parser tags collected during an import loop. - Reduces O(N·T) per-finding ``finding.tags.add()`` calls to O(unique_tags) queries - by grouping findings by tag name and calling ``bulk_add_tags_to_instances`` once per tag. + Delegates to ``bulk_add_tag_mapping`` to process all tags in ~5 queries total, + regardless of how many unique tag values the parser produced. Args: findings_with_tags: list of ``(finding, [tag_str, ...])`` pairs accumulated @@ -183,8 +353,8 @@ def bulk_apply_parser_tags(findings_with_tags: list) -> None: for tag in tag_list: if tag: tag_to_findings[tag].append(finding) - for tag_name, findings_for_tag in tag_to_findings.items(): - bulk_add_tags_to_instances(tag_or_tags=tag_name, instances=findings_for_tag) + + bulk_add_tag_mapping(tag_to_findings) def bulk_remove_all_tags(model_class, instance_ids_qs): @@ -249,4 +419,4 @@ def bulk_remove_all_tags(model_class, instance_ids_qs): ) -__all__ = ["bulk_add_tags_to_instances", "bulk_apply_parser_tags", "bulk_remove_all_tags"] +__all__ = ["bulk_add_tag_mapping", "bulk_add_tags_to_instances", "bulk_apply_parser_tags", "bulk_remove_all_tags"] diff --git a/unittests/test_tag_utils_bulk.py b/unittests/test_tag_utils_bulk.py index 63fc86ba95f..1c0f4f831a8 100644 --- a/unittests/test_tag_utils_bulk.py +++ b/unittests/test_tag_utils_bulk.py @@ -4,7 +4,7 @@ from dojo.location.models import Location from dojo.models import Endpoint, Engagement, Finding, Product, Product_Type, Test, Test_Type -from dojo.tag_utils import bulk_add_tags_to_instances +from dojo.tag_utils import bulk_add_tag_mapping, bulk_add_tags_to_instances, bulk_apply_parser_tags from dojo.url.models import URL from unittests.dojo_test_case import DojoAPITestCase, versioned_fixtures @@ -260,6 +260,180 @@ def test_bulk_add_non_tag_field(self): self.assertIn("is not a TagField", str(cm.exception)) +class BulkTagMappingTest(TestCase): + """Tests for bulk_add_tag_mapping — the multi-tag, ~5-query variant.""" + + LOCATION_CLASS = Location if settings.V3_FEATURE_LOCATIONS else Endpoint + + def setUp(self): + self.tag_model = self.LOCATION_CLASS.tags.tag_model + self.product_type = Product_Type.objects.create(name="PT-Mapping") + self.product = Product.objects.create(name="Mapping Product", description="test", prod_type=self.product_type) + + def _make_location(self, hostname): + if not settings.V3_FEATURE_LOCATIONS: + return Endpoint.objects.create(product=self.product, host=hostname) + url = URL.get_or_create_from_values(host=hostname) + url.location.associate_with_product(self.product) + return url.location + + def _make_locations(self, n): + return [self._make_location(f"map-host-{i}.example.com") for i in range(n)] + + def test_basic_different_tags_different_instances(self): + a, b, c = self._make_locations(3) + created = bulk_add_tag_mapping({"alpha": [a, b], "beta": [b, c], "gamma": [c]}) + + self.assertEqual(created, 5) + a.refresh_from_db() + b.refresh_from_db() + c.refresh_from_db() + self.assertEqual([t.name for t in a.tags.all()], ["alpha"]) + self.assertCountEqual([t.name for t in b.tags.all()], ["alpha", "beta"]) + self.assertCountEqual([t.name for t in c.tags.all()], ["beta", "gamma"]) + + self.assertEqual(self.tag_model.objects.get(name="alpha").count, 2) + self.assertEqual(self.tag_model.objects.get(name="beta").count, 2) + self.assertEqual(self.tag_model.objects.get(name="gamma").count, 1) + + def test_same_tag_across_all_instances(self): + instances = self._make_locations(4) + created = bulk_add_tag_mapping({"shared": instances}) + + self.assertEqual(created, 4) + self.assertEqual(self.tag_model.objects.get(name="shared").count, 4) + + def test_skips_existing_relationships(self): + a, b, c = self._make_locations(3) + a.tags.add("existing") + b.tags.add("existing") + + created = bulk_add_tag_mapping({"existing": [a, b, c]}) + + self.assertEqual(created, 1) + self.assertEqual(self.tag_model.objects.get(name="existing").count, 3) + + def test_empty_dict_returns_zero(self): + created = bulk_add_tag_mapping({}) + self.assertEqual(created, 0) + + def test_empty_instance_lists_returns_zero(self): + created = bulk_add_tag_mapping({"tag-a": [], "tag-b": []}) + self.assertEqual(created, 0) + self.assertEqual(self.tag_model.objects.filter(name__in=["tag-a", "tag-b"]).count(), 0) + + def test_case_insensitive_finds_existing_tag(self): + # Pre-create tag in lowercase (simulating force_lowercase storage) + instances = self._make_locations(2) + instances[0].tags.add("mytag") + + # Requesting "MYTAG" should match the existing "mytag" object + created = bulk_add_tag_mapping({"MYTAG": [instances[0], instances[1]]}) + + self.assertEqual(created, 1) + self.assertEqual(self.tag_model.objects.count(), 1) + + def test_creates_new_tags_that_dont_exist(self): + instances = self._make_locations(2) + created = bulk_add_tag_mapping({"brand-new-a": [instances[0]], "brand-new-b": [instances[1]]}) + + self.assertEqual(created, 2) + self.assertTrue(self.tag_model.objects.filter(name="brand-new-a").exists()) + self.assertTrue(self.tag_model.objects.filter(name="brand-new-b").exists()) + + def test_clears_prefetch_cache(self): + instances = list(self.LOCATION_CLASS.objects.filter( + pk__in=[loc.pk for loc in self._make_locations(2)], + ).prefetch_related("tags")) + + for inst in instances: + self.assertEqual(list(inst.tags.all()), []) + + bulk_add_tag_mapping({"cache-map": instances}) + + for inst in instances: + self.assertIn("cache-map", [t.name for t in inst.tags.all()]) + + def test_product_rejected(self): + pt = Product_Type.objects.create(name="PT-Reject") + product = Product.objects.create(name="P-Reject", description="x", prod_type=pt) + with self.assertRaises(ValueError, msg="Product instances are not supported"): + bulk_add_tag_mapping({"tag": [product]}) + + def test_batching_creates_all_relationships(self): + instances = self._make_locations(15) + created = bulk_add_tag_mapping({"batch-tag": instances}, batch_size=4) + + self.assertEqual(created, 15) + self.assertEqual(self.tag_model.objects.get(name="batch-tag").count, 15) + + +class BulkApplyParserTagsTest(TestCase): + """Tests for bulk_apply_parser_tags — the import-loop accumulator path.""" + + def setUp(self): + self.tag_model = Finding.tags.tag_model + pt = Product_Type.objects.create(name="PT-Parser") + product = Product.objects.create(name="Parser Product", description="x", prod_type=pt) + engagement = Engagement.objects.create( + name="E-Parser", product=product, + target_start=timezone.now(), target_end=timezone.now(), + ) + tt = Test_Type.objects.create(name="Parser Test Type") + test = Test.objects.create( + title="T-Parser", engagement=engagement, test_type=tt, + target_start=timezone.now(), target_end=timezone.now(), + ) + self.test = test + + def _make_finding(self, title): + return Finding.objects.create(title=title, severity="Low", test=self.test) + + def test_applies_tags_correctly(self): + f1 = self._make_finding("F1") + f2 = self._make_finding("F2") + f3 = self._make_finding("F3") + + bulk_apply_parser_tags([ + (f1, ["network", "web"]), + (f2, ["network"]), + (f3, ["pci"]), + ]) + + f1.refresh_from_db() + f2.refresh_from_db() + f3.refresh_from_db() + self.assertCountEqual([t.name for t in f1.tags.all()], ["network", "web"]) + self.assertCountEqual([t.name for t in f2.tags.all()], ["network"]) + self.assertCountEqual([t.name for t in f3.tags.all()], ["pci"]) + + self.assertEqual(self.tag_model.objects.get(name="network").count, 2) + self.assertEqual(self.tag_model.objects.get(name="web").count, 1) + self.assertEqual(self.tag_model.objects.get(name="pci").count, 1) + + def test_empty_list_is_noop(self): + bulk_apply_parser_tags([]) + self.assertEqual(self.tag_model.objects.count(), 0) + + def test_filters_empty_tag_strings(self): + f = self._make_finding("F-empty") + bulk_apply_parser_tags([(f, ["", "valid", ""])]) + f.refresh_from_db() + self.assertEqual([t.name for t in f.tags.all()], ["valid"]) + + def test_dynamic_tags_many_unique_values(self): + # Simulate a parser that emits one unique tag per finding (e.g. resource name) + findings = [self._make_finding(f"F-dyn-{i}") for i in range(20)] + pairs = [(f, [f"resource-{i}"]) for i, f in enumerate(findings)] + bulk_apply_parser_tags(pairs) + + for i, f in enumerate(findings): + f.refresh_from_db() + self.assertEqual([t.name for t in f.tags.all()], [f"resource-{i}"]) + + self.assertEqual(self.tag_model.objects.count(), 20) + + @versioned_fixtures class BulkTagUtilsInheritanceTest(DojoAPITestCase): fixtures = ["dojo_testdata.json"] From 837e2ad314a572ad8091d19f014f130024ff5681 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Thu, 16 Apr 2026 22:08:44 +0200 Subject: [PATCH 4/7] ruff --- dojo/tag_utils.py | 1 + unittests/test_tag_utils_bulk.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/dojo/tag_utils.py b/dojo/tag_utils.py index 340f8fa2804..62bb1190fff 100644 --- a/dojo/tag_utils.py +++ b/dojo/tag_utils.py @@ -190,6 +190,7 @@ def bulk_add_tag_mapping( Returns: Total number of new tag relationships created. + """ from collections import defaultdict # noqa: PLC0415 diff --git a/unittests/test_tag_utils_bulk.py b/unittests/test_tag_utils_bulk.py index 1c0f4f831a8..c975c5dac3c 100644 --- a/unittests/test_tag_utils_bulk.py +++ b/unittests/test_tag_utils_bulk.py @@ -261,6 +261,7 @@ def test_bulk_add_non_tag_field(self): class BulkTagMappingTest(TestCase): + """Tests for bulk_add_tag_mapping — the multi-tag, ~5-query variant.""" LOCATION_CLASS = Location if settings.V3_FEATURE_LOCATIONS else Endpoint @@ -369,6 +370,7 @@ def test_batching_creates_all_relationships(self): class BulkApplyParserTagsTest(TestCase): + """Tests for bulk_apply_parser_tags — the import-loop accumulator path.""" def setUp(self): From 84ee6a430c475462b630d3a6daab048cb6126e51 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Fri, 17 Apr 2026 09:22:02 +0200 Subject: [PATCH 5/7] fix tag creation --- dojo/tag_utils.py | 42 ++++++++++++++++-------------------------- 1 file changed, 16 insertions(+), 26 deletions(-) diff --git a/dojo/tag_utils.py b/dojo/tag_utils.py index 62bb1190fff..87ed6845961 100644 --- a/dojo/tag_utils.py +++ b/dojo/tag_utils.py @@ -238,6 +238,9 @@ def bulk_add_tag_mapping( all_tag_names = list(tag_to_instances.keys()) + def _key(name: str) -> str: + return name if case_sensitive else name.lower() + # --- Query 1: fetch existing tag objects --- if case_sensitive: existing_tags: dict[str, object] = { @@ -255,28 +258,16 @@ def bulk_add_tag_mapping( } missing_names = [n for n in all_tag_names if n.lower() not in existing_tags] - # --- Query 2: create missing tag objects then re-fetch to get their PKs --- + # --- Query 2: create missing tag objects --- + # Use get_or_create to call model.save(), which lets tagulous generate the slug field. + # bulk_create bypasses save() so slug is never set, causing unique constraint failures. if missing_names: - tag_model.objects.bulk_create( - [tag_model(name=n, protected=False) for n in missing_names], - ignore_conflicts=True, - ) - if case_sensitive: - existing_tags.update( - {t.name: t for t in tag_model.objects.filter(name__in=missing_names)}, - ) - else: - existing_tags.update( - { - t.name_lower: t - for t in tag_model.objects.annotate(name_lower=Lower("name")).filter( - name_lower__in=[n.lower() for n in missing_names], - ) - }, - ) - - def _key(name: str) -> str: - return name if case_sensitive else name.lower() + for n in missing_names: + if case_sensitive: + tag, _ = tag_model.objects.get_or_create(name=n, defaults={"protected": False}) + else: + tag, _ = tag_model.objects.get_or_create(name__iexact=n, defaults={"name": n, "protected": False}) + existing_tags[_key(n)] = tag # --- Query 3: fetch all pre-existing (instance, tag) through-model rows --- all_instance_ids = {inst.pk for inst in all_instances} @@ -307,14 +298,13 @@ def _key(name: str) -> str: return 0 # --- Query 4: bulk-create all new relationships (batched for memory) --- - total_created = 0 + # Use len(new_relationships) for the count: existing pairs were already filtered out above, + # so every entry here is new. bulk_create return value is unreliable with ignore_conflicts. + total_created = len(new_relationships) with transaction.atomic(): for i in range(0, len(new_relationships), batch_size): batch = new_relationships[i : i + batch_size] - actually_created = through_model.objects.bulk_create(batch, ignore_conflicts=True) - total_created += ( - len(actually_created) if hasattr(actually_created, "__len__") else len(batch) - ) + through_model.objects.bulk_create(batch, ignore_conflicts=True) # --- Query 5: update all tag counts in one UPDATE … CASE WHEN … --- tag_model.objects.filter(pk__in=list(created_per_tag.keys())).update( From 65d52ae7b45fee9a55e2180529879e9fc794b1d0 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Fri, 17 Apr 2026 09:54:04 +0200 Subject: [PATCH 6/7] fix tests --- unittests/test_tag_utils_bulk.py | 4 +++- unittests/test_tags.py | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/unittests/test_tag_utils_bulk.py b/unittests/test_tag_utils_bulk.py index c975c5dac3c..c9e369c0a2e 100644 --- a/unittests/test_tag_utils_bulk.py +++ b/unittests/test_tag_utils_bulk.py @@ -1,4 +1,5 @@ from django.conf import settings +from django.contrib.auth.models import User from django.test import TestCase from django.utils import timezone @@ -375,6 +376,7 @@ class BulkApplyParserTagsTest(TestCase): def setUp(self): self.tag_model = Finding.tags.tag_model + self.reporter = User.objects.create_user(username="parser-test-user", password="x") pt = Product_Type.objects.create(name="PT-Parser") product = Product.objects.create(name="Parser Product", description="x", prod_type=pt) engagement = Engagement.objects.create( @@ -389,7 +391,7 @@ def setUp(self): self.test = test def _make_finding(self, title): - return Finding.objects.create(title=title, severity="Low", test=self.test) + return Finding.objects.create(title=title, severity="Low", test=self.test, reporter=self.reporter) def test_applies_tags_correctly(self): f1 = self._make_finding("F1") diff --git a/unittests/test_tags.py b/unittests/test_tags.py index b9077e1daab..b6661ab12d4 100644 --- a/unittests/test_tags.py +++ b/unittests/test_tags.py @@ -386,6 +386,7 @@ class TagImportTestAPI(DojoAPITestCase, TagImportMixin): def setUp(self): super().setUp() + settings.SECURE_SSL_REDIRECT = False testuser = User.objects.get(username="admin") testuser.usercontactinfo.block_execution = True testuser.usercontactinfo.save() @@ -402,6 +403,7 @@ class TagImportTestUI(DojoAPITestCase, TagImportMixin): def setUp(self): super().setUp() + settings.SECURE_SSL_REDIRECT = False testuser = User.objects.get(username="admin") testuser.usercontactinfo.block_execution = True testuser.usercontactinfo.save() From 3e8b19168a99b89f615c898a8cf8ebb61e3a6f46 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Fri, 17 Apr 2026 10:43:09 +0200 Subject: [PATCH 7/7] fix tests --- unittests/test_tag_utils_bulk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unittests/test_tag_utils_bulk.py b/unittests/test_tag_utils_bulk.py index c9e369c0a2e..3a815041fb4 100644 --- a/unittests/test_tag_utils_bulk.py +++ b/unittests/test_tag_utils_bulk.py @@ -376,7 +376,7 @@ class BulkApplyParserTagsTest(TestCase): def setUp(self): self.tag_model = Finding.tags.tag_model - self.reporter = User.objects.create_user(username="parser-test-user", password="x") + self.reporter = User.objects.create_user(username="parser-test-user") pt = Product_Type.objects.create(name="PT-Parser") product = Product.objects.create(name="Parser Product", description="x", prod_type=pt) engagement = Engagement.objects.create(