Skip to content

Commit b87199b

Browse files
perf: bulk-apply parser-supplied per-finding tags during import
Calling finding.tags.add() once per finding goes through tagulous's add(), which performs:
- reload() → SELECT current tags (1 query)
- _ensure_tags_in_db() → get_or_create per tag (T queries)
- super().add() → INSERT through-table rows (1 query)
- tag.increment() → UPDATE count per tag (T queries)

For N findings with T parser-supplied tags each, that is O(N·T) queries.

Replace this with bulk_apply_parser_tags() in tag_utils, which groups findings by tag name and calls bulk_add_tags_to_instances() once per unique tag: O(unique_tags) queries regardless of N.

Tags are accumulated per batch and applied just before the post_process_findings_batch task is dispatched, so the deduplication and rules tasks see the tags already written to the DB.

Both default_importer and default_reimporter use the same approach. For the reimporter, finding_post_processing accepts an optional tag_accumulator list; when it is supplied, tags are accumulated rather than applied inline, and when it is omitted the previous inline behaviour is kept (backward-compatible for any direct callers).
1 parent 5ca9e64 commit b87199b

3 files changed

Lines changed: 54 additions & 8 deletions

File tree

dojo/importers/default_importer.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
)
2121
from dojo.notifications.helper import create_notification
2222
from dojo.utils import get_full_url, perform_product_grading
23+
from dojo.tag_utils import bulk_apply_parser_tags
2324
from dojo.validators import clean_tags
2425

2526
logger = logging.getLogger(__name__)
@@ -179,6 +180,7 @@ def process_findings(
179180
at import time
180181
"""
181182
new_findings = []
183+
findings_with_parser_tags: list[tuple] = []
182184
logger.debug("starting import of %i parsed findings.", len(parsed_findings) if parsed_findings else 0)
183185
group_names_to_findings_dict = {}
184186

@@ -245,12 +247,13 @@ def process_findings(
245247
# TODO: Delete this after the move to Locations
246248
# Process any endpoints on the finding, or added on the form
247249
self.process_endpoints(finding, self.endpoints_to_add)
248-
# Parsers must use unsaved_tags to store tags, so we can clean them
250+
# Parsers must use unsaved_tags to store tags, so we can clean them.
251+
# Accumulate for bulk application when the batch is dispatched (O(unique_tags) instead of O(N·T)).
249252
cleaned_tags = clean_tags(finding.unsaved_tags)
250253
if isinstance(cleaned_tags, list):
251-
finding.tags.add(*cleaned_tags)
254+
findings_with_parser_tags.append((finding, cleaned_tags))
252255
elif isinstance(cleaned_tags, str):
253-
finding.tags.add(cleaned_tags)
256+
findings_with_parser_tags.append((finding, [cleaned_tags]))
254257
# Process any files
255258
self.process_files(finding)
256259
# Process vulnerability IDs
@@ -268,6 +271,12 @@ def process_findings(
268271
if len(batch_finding_ids) >= batch_max_size or is_final_finding:
269272
if not settings.V3_FEATURE_LOCATIONS:
270273
self.endpoint_manager.persist(user=self.user)
274+
275+
# Apply parser-supplied tags for this batch before post-processing starts,
276+
# so rules/deduplication tasks see the tags already on the findings.
277+
bulk_apply_parser_tags(findings_with_parser_tags)
278+
findings_with_parser_tags.clear()
279+
271280
finding_ids_batch = list(batch_finding_ids)
272281
batch_finding_ids.clear()
273282
logger.debug("process_findings: dispatching batch with push_to_jira=%s (batch_size=%d, is_final=%s)",

dojo/importers/default_reimporter.py

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
Test,
2727
Test_Import,
2828
)
29+
from dojo.tag_utils import bulk_apply_parser_tags
2930
from dojo.utils import perform_product_grading
3031
from dojo.validators import clean_tags
3132

@@ -310,6 +311,7 @@ def process_findings(
310311
cleaned_findings.append(sanitized)
311312

312313
batch_finding_ids: list[int] = []
314+
findings_with_parser_tags: list[tuple] = []
313315
# Batch size for deduplication/post-processing (only new findings)
314316
dedupe_batch_max_size = getattr(settings, "IMPORT_REIMPORT_DEDUPE_BATCH_SIZE", 1000)
315317
# Batch size for candidate matching (all findings, before matching)
@@ -417,6 +419,7 @@ def process_findings(
417419
finding,
418420
unsaved_finding,
419421
is_matched_finding=bool(matched_findings),
422+
tag_accumulator=findings_with_parser_tags,
420423
)
421424
# all data is already saved on the finding, we only need to trigger post processing in batches
422425
push_to_jira = self.push_to_jira and ((not self.findings_groups_enabled or not self.group_by) or not finding_will_be_grouped)
@@ -440,6 +443,12 @@ def process_findings(
440443
if len(batch_finding_ids) >= dedupe_batch_max_size or is_final:
441444
if not settings.V3_FEATURE_LOCATIONS:
442445
self.endpoint_manager.persist(user=self.user)
446+
447+
# Apply parser-supplied tags for this batch before post-processing starts,
448+
# so rules/deduplication tasks see the tags already on the findings.
449+
bulk_apply_parser_tags(findings_with_parser_tags)
450+
findings_with_parser_tags.clear()
451+
443452
finding_ids_batch = list(batch_finding_ids)
444453
batch_finding_ids.clear()
445454
dojo_dispatch_task(
@@ -976,6 +985,7 @@ def finding_post_processing(
976985
finding_from_report: Finding,
977986
*,
978987
is_matched_finding: bool = False,
988+
tag_accumulator: list | None = None,
979989
) -> Finding:
980990
"""
981991
Save all associated objects to the finding after it has been saved
@@ -1006,10 +1016,16 @@ def finding_post_processing(
10061016
finding_from_report.unsaved_tags = merged_tags
10071017
if finding_from_report.unsaved_tags:
10081018
cleaned_tags = clean_tags(finding_from_report.unsaved_tags)
1009-
if isinstance(cleaned_tags, list):
1010-
finding.tags.add(*cleaned_tags)
1011-
elif isinstance(cleaned_tags, str):
1012-
finding.tags.add(cleaned_tags)
1019+
if tag_accumulator is not None:
1020+
if isinstance(cleaned_tags, list):
1021+
tag_accumulator.append((finding, cleaned_tags))
1022+
elif isinstance(cleaned_tags, str):
1023+
tag_accumulator.append((finding, [cleaned_tags]))
1024+
else:
1025+
if isinstance(cleaned_tags, list):
1026+
finding.tags.add(*cleaned_tags)
1027+
elif isinstance(cleaned_tags, str):
1028+
finding.tags.add(cleaned_tags)
10131029
# Process any files
10141030
if finding_from_report.unsaved_files:
10151031
finding.unsaved_files = finding_from_report.unsaved_files

dojo/tag_utils.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,27 @@ def bulk_add_tags_to_instances(tag_or_tags, instances, tag_field_name: str = "ta
164164
return total_created
165165

166166

167+
def bulk_apply_parser_tags(findings_with_tags: list) -> None:
    """Bulk-apply per-finding parser tags collected during an import loop.

    Groups the findings by tag name and calls ``bulk_add_tags_to_instances``
    once per distinct tag, instead of issuing one ``finding.tags.add()`` call
    per finding.

    Args:
        findings_with_tags: list of ``(finding, [tag_str, ...])`` pairs accumulated
            during the import loop (only for findings whose parser supplied tags).
            Falsy tags are skipped, and a tag repeated within one finding's list is
            applied to that finding only once. An empty list is a no-op.
    """
    tag_to_findings: dict = {}
    for finding, tag_list in findings_with_tags:
        # dict.fromkeys() drops repeated tags while keeping their order, so the
        # same finding is never handed to the bulk helper twice for one tag.
        for tag in dict.fromkeys(tag for tag in tag_list if tag):
            tag_to_findings.setdefault(tag, []).append(finding)

    # One bulk call per distinct tag, covering every finding that carries it.
    for tag_name, findings_for_tag in tag_to_findings.items():
        bulk_add_tags_to_instances(tag_or_tags=tag_name, instances=findings_for_tag)
187+
167188
def bulk_remove_all_tags(model_class, instance_ids_qs):
168189
"""
169190
Remove all tags from instances identified by the given ID subquery.
@@ -226,4 +247,4 @@ def bulk_remove_all_tags(model_class, instance_ids_qs):
226247
)
227248

228249

229-
__all__ = ["bulk_add_tags_to_instances", "bulk_remove_all_tags"]
250+
__all__ = ["bulk_add_tags_to_instances", "bulk_apply_parser_tags", "bulk_remove_all_tags"]

0 commit comments

Comments
 (0)