From b8d86e7b1c0cf70852bace4263114d02b730a515 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 5 Apr 2026 09:27:31 +0200 Subject: [PATCH 1/4] Batch-refresh close_old_findings status fields to avoid N refresh_from_db queries. Replace per-finding refresh_from_db(false_p, risk_accepted, out_of_scope) with one values() query for all PKs and assign onto instances, falling back to refresh_from_db when a row is missing. --- dojo/importers/default_reimporter.py | 48 +++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 8 deletions(-) diff --git a/dojo/importers/default_reimporter.py b/dojo/importers/default_reimporter.py index 2828b8cec30..7f3953e64b7 100644 --- a/dojo/importers/default_reimporter.py +++ b/dojo/importers/default_reimporter.py @@ -464,6 +464,38 @@ def process_findings( return self.new_items, self.reactivated_items, self.to_mitigate, self.untouched + def _sync_close_old_finding_status_fields(self, findings: list[Finding]) -> None: + """ + Refresh false_p, risk_accepted, and out_of_scope from the DB for each finding. + + These can change during reimport (e.g. false positive) while the in-memory instances + are stale. A naive refresh_from_db per finding issues one SELECT each; we batch one + query for all primary keys and fall back to refresh_from_db only when needed. 
+ """ + ids = [f.pk for f in findings if f.pk is not None] + if not ids: + for finding in findings: + finding.refresh_from_db(fields=["false_p", "risk_accepted", "out_of_scope"]) + return + + fresh_by_id = { + row["id"]: row + for row in Finding.objects.filter(pk__in=ids).values( + "id", + "false_p", + "risk_accepted", + "out_of_scope", + ) + } + for finding in findings: + row = fresh_by_id.get(finding.pk) + if row is not None: + finding.false_p = row["false_p"] + finding.risk_accepted = row["risk_accepted"] + finding.out_of_scope = row["out_of_scope"] + else: + finding.refresh_from_db(fields=["false_p", "risk_accepted", "out_of_scope"]) + def close_old_findings( self, findings: list[Finding], @@ -477,17 +509,17 @@ def close_old_findings( if self.close_old_findings_toggle is False: return [] logger.debug("REIMPORT_SCAN: Closing findings no longer present in scan report") + # Get any status changes that could have occurred earlier in the process + # for special statuses only. + # An example of such is a finding being reported as false positive, and + # reimport makes this change in the database. However, the findings here + # are calculated based from the original values before the reimport, so + # any updates made during reimport are discarded without first getting the + # state of the finding as it stands at this moment + self._sync_close_old_finding_status_fields(findings) # Determine if pushing to jira or if the finding groups are enabled mitigated_findings = [] for finding in findings: - # Get any status changes that could have occurred earlier in the process - # for special statuses only. - # An example of such is a finding being reported as false positive, and - # reimport makes this change in the database. 
However, the findings here - # are calculated based from the original values before the reimport, so - # any updates made during reimport are discarded without first getting the - # state of the finding as it stands at this moment - finding.refresh_from_db(fields=["false_p", "risk_accepted", "out_of_scope"]) # Ensure the finding is not already closed if not finding.mitigated or not finding.is_mitigated: logger.debug("mitigating finding: %i:%s", finding.id, finding) From 2aaca496e920b1c85eb7f9806ee3685c772c5897 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 5 Apr 2026 09:36:05 +0200 Subject: [PATCH 2/4] docs: cite #12291 for close_old_findings status refresh origin --- dojo/importers/default_reimporter.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/dojo/importers/default_reimporter.py b/dojo/importers/default_reimporter.py index 7f3953e64b7..07e4a6dcbe4 100644 --- a/dojo/importers/default_reimporter.py +++ b/dojo/importers/default_reimporter.py @@ -469,8 +469,10 @@ def _sync_close_old_finding_status_fields(self, findings: list[Finding]) -> None Refresh false_p, risk_accepted, and out_of_scope from the DB for each finding. These can change during reimport (e.g. false positive) while the in-memory instances - are stale. A naive refresh_from_db per finding issues one SELECT each; we batch one - query for all primary keys and fall back to refresh_from_db only when needed. + are stale. Per-finding refresh_from_db in close_old_findings was added in + https://github.com/DefectDojo/django-DefectDojo/pull/12291. A naive refresh per + finding issues one SELECT each; we batch one query for all primary keys and fall + back to refresh_from_db only when needed. """ ids = [f.pk for f in findings if f.pk is not None] if not ids: @@ -515,7 +517,7 @@ def close_old_findings( # reimport makes this change in the database. 
However, the findings here # are calculated based from the original values before the reimport, so # any updates made during reimport are discarded without first getting the - # state of the finding as it stands at this moment + # state of the finding as it stands at this moment (django-DefectDojo #12291). self._sync_close_old_finding_status_fields(findings) # Determine if pushing to jira or if the finding groups are enabled mitigated_findings = [] From 226e284a747e89a5ed394c37fb7fb85e3bad3a7e Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 5 Apr 2026 09:39:42 +0200 Subject: [PATCH 3/4] perf: chunk close_old_findings status sync queries (1000 PKs per SELECT) --- dojo/importers/default_reimporter.py | 56 ++++++++++++++++------------ 1 file changed, 32 insertions(+), 24 deletions(-) diff --git a/dojo/importers/default_reimporter.py b/dojo/importers/default_reimporter.py index 07e4a6dcbe4..f0a4c05028c 100644 --- a/dojo/importers/default_reimporter.py +++ b/dojo/importers/default_reimporter.py @@ -1,4 +1,5 @@ import logging +from itertools import batched from django.conf import settings from django.core.files.uploadedfile import TemporaryUploadedFile @@ -30,6 +31,9 @@ logger = logging.getLogger(__name__) deduplicationLogger = logging.getLogger("dojo.specific-loggers.deduplication") +# Bound IN-list size when bulk-loading status fields for close_old_findings. +_CLOSE_OLD_FINDINGS_STATUS_FIELDS_CHUNK = 1000 + class DefaultReImporterOptions(ImporterOptions): def validate_test( @@ -471,32 +475,36 @@ def _sync_close_old_finding_status_fields(self, findings: list[Finding]) -> None These can change during reimport (e.g. false positive) while the in-memory instances are stale. Per-finding refresh_from_db in close_old_findings was added in https://github.com/DefectDojo/django-DefectDojo/pull/12291. 
A naive refresh per - finding issues one SELECT each; we batch one query for all primary keys and fall + finding issues one SELECT each; we batch one query per chunk of primary keys and fall back to refresh_from_db only when needed. - """ - ids = [f.pk for f in findings if f.pk is not None] - if not ids: - for finding in findings: - finding.refresh_from_db(fields=["false_p", "risk_accepted", "out_of_scope"]) - return - fresh_by_id = { - row["id"]: row - for row in Finding.objects.filter(pk__in=ids).values( - "id", - "false_p", - "risk_accepted", - "out_of_scope", - ) - } - for finding in findings: - row = fresh_by_id.get(finding.pk) - if row is not None: - finding.false_p = row["false_p"] - finding.risk_accepted = row["risk_accepted"] - finding.out_of_scope = row["out_of_scope"] - else: - finding.refresh_from_db(fields=["false_p", "risk_accepted", "out_of_scope"]) + This really should be fixed differently, but for now we at least optimize it to be done in bulk. + """ + findings_without_pk = [f for f in findings if f.pk is None] + findings_with_pk = [f for f in findings if f.pk is not None] + + for finding in findings_without_pk: + finding.refresh_from_db(fields=["false_p", "risk_accepted", "out_of_scope"]) + + for chunk in batched(findings_with_pk, _CLOSE_OLD_FINDINGS_STATUS_FIELDS_CHUNK, strict=False): + ids = [f.pk for f in chunk] + fresh_by_id = { + row["id"]: row + for row in Finding.objects.filter(pk__in=ids).values( + "id", + "false_p", + "risk_accepted", + "out_of_scope", + ) + } + for finding in chunk: + row = fresh_by_id.get(finding.pk) + if row is not None: + finding.false_p = row["false_p"] + finding.risk_accepted = row["risk_accepted"] + finding.out_of_scope = row["out_of_scope"] + else: + finding.refresh_from_db(fields=["false_p", "risk_accepted", "out_of_scope"]) def close_old_findings( self, From a06d5599056daa275d5a34380c01a3a3043e2bcb Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 5 Apr 2026 14:43:43 +0200 Subject: [PATCH 4/4] tests: 
add 4th reimport step (empty scan, close_old_findings) to performance tests - Add step 4 to _import_reimport_performance: reimport with an empty StackHawk scan and close_old_findings=True, verifying findings are actually closed (assertGreater on len_closed_findings). - Fix all reimport steps to pass service="Secured Application" so the reimporter's service filter matches findings produced by the StackHawk parser (which sets service from the scan's application field). Without this, original_items was always empty and no matching/closing occurred. - Add stackhawk_empty.json scan fixture. - Fix update_performance_test_counts.py to handle reimport3 (step 4) by adding reimport3_queries/reimport3_async_tasks to param_map. - Update all expected query/task counts for both Small and Locations test classes to reflect the new step and the batch status-sync fix. --- scripts/update_performance_test_counts.py | 2 + .../scans/stackhawk/stackhawk_empty.json | 32 +++++++++ unittests/test_importers_performance.py | 65 ++++++++++++++++++- 3 files changed, 97 insertions(+), 2 deletions(-) create mode 100644 unittests/scans/stackhawk/stackhawk_empty.json diff --git a/scripts/update_performance_test_counts.py b/scripts/update_performance_test_counts.py index f9fd077e0e8..77a2e629e11 100644 --- a/scripts/update_performance_test_counts.py +++ b/scripts/update_performance_test_counts.py @@ -422,6 +422,8 @@ def _extract_call_span(method_content: str, call_name: str) -> tuple[int, int] | "reimport1_async_tasks": "expected_num_async_tasks2", "reimport2_queries": "expected_num_queries3", "reimport2_async_tasks": "expected_num_async_tasks3", + "reimport3_queries": "expected_num_queries4", + "reimport3_async_tasks": "expected_num_async_tasks4", } param_map_deduplication = { "first_import_queries": "expected_num_queries1", diff --git a/unittests/scans/stackhawk/stackhawk_empty.json b/unittests/scans/stackhawk/stackhawk_empty.json new file mode 100644 index 00000000000..056193baafe --- /dev/null 
+++ b/unittests/scans/stackhawk/stackhawk_empty.json @@ -0,0 +1,32 @@ +{ + "service": "StackHawk", + "scanCompleted": { + "scan": { + "comment defect dojo team": "This is an empty StackHawk scan result", + "id": "e2ff5651-7eef-47e9-b743-0c2f7d861e27", + "hawkscanVersion": "2.1.1", + "env": "Development", + "status": "COMPLETED", + "application": "Secured Application", + "startedTimestamp": "2022-02-16T23:07:19.575Z", + "scanURL": "https://app.stackhawk.com/scans/e2ff5651-7eef-47e9-b743-0c2f7d861e27" + }, + "scanDuration": "21", + "spiderDuration": "45", + "completedScanStats": { + "urlsCount": "31", + "duration": "66", + "scanResultsStats": { + "totalCount": "0", + "lowCount": "0", + "mediumCount": "0", + "highCount": "0", + "lowTriagedCount": "0", + "mediumTriagedCount": "0", + "highTriagedCount": "0" + } + }, + "findings": [ + ] + } +} diff --git a/unittests/test_importers_performance.py b/unittests/test_importers_performance.py index ea3b3b79b40..ef7b4763460 100644 --- a/unittests/test_importers_performance.py +++ b/unittests/test_importers_performance.py @@ -51,6 +51,7 @@ STACK_HAWK_FILENAME = get_unit_tests_scans_path("stackhawk") / "stackhawk_many_vul_without_duplicated_findings.json" STACK_HAWK_SUBSET_FILENAME = get_unit_tests_scans_path("stackhawk") / "stackhawk_many_vul_without_duplicated_findings_subset.json" +STACK_HAWK_EMPTY = get_unit_tests_scans_path("stackhawk") / "stackhawk_empty.json" STACK_HAWK_SCAN_TYPE = "StackHawk HawkScan" @@ -126,12 +127,17 @@ def _import_reimport_performance( expected_num_async_tasks2, expected_num_queries3, expected_num_async_tasks3, + expected_num_queries4, + expected_num_async_tasks4, scan_file1, scan_file2, scan_file3, + scan_file4, scan_type, product_name, engagement_name, + *, + close_old_findings4=False, ): """ Test import/reimport/reimport performance with specified scan files and scan type. 
@@ -195,6 +201,7 @@ def _import_reimport_performance( "verified": True, "sync": True, "scan_type": scan_type, + "service": "Secured Application", "tags": ["performance-test-reimport", "reimport-tag-in-param", "reimport-go-faster"], "apply_tags_to_findings": True, } @@ -224,10 +231,44 @@ def _import_reimport_performance( "verified": True, "sync": True, "scan_type": scan_type, + "service": "Secured Application", } reimporter = DefaultReImporter(**reimport_options) test, _, _len_new_findings, _len_closed_findings, _, _, _ = reimporter.process_scan(scan) + # Fourth import (reimport again, empty report) + # Each assertion context manager is wrapped in its own subTest so that if one fails, the others still run. + # This allows us to see all count mismatches in a single test run, making it easier to fix + # all incorrect expected values at once rather than fixing them one at a time. + # Nested with statements are intentional - each assertion needs its own subTest wrapper. + with ( # noqa: SIM117 + self.subTest("reimport3"), impersonate(Dojo_User.objects.get(username="admin")), + scan_file4.open(encoding="utf-8") as scan, + ): + with self.subTest(step="reimport3", metric="queries"): + with self.assertNumQueries(expected_num_queries4): + with self.subTest(step="reimport3", metric="async_tasks"): + with self._assertNumAsyncTask(expected_num_async_tasks4): + reimport_options = { + "test": test, + "user": lead, + "lead": lead, + "scan_date": None, + "minimum_severity": "Info", + "active": True, + "verified": True, + "sync": True, + "scan_type": scan_type, + # StackHawk parser sets the service field causing close old findings to fail if we do not specify the service field + # This is a big problem that needs fixing. Parsers should not set the service field. 
+ "service": "Secured Application", + "close_old_findings": close_old_findings4, + } + reimporter = DefaultReImporter(**reimport_options) + test, _, len_new_findings4, len_closed_findings4, _, _, _ = reimporter.process_scan(scan) + logger.info("Step 4: new=%s closed=%s", len_new_findings4, len_closed_findings4) + self.assertGreater(len_closed_findings4, 0, "Step 4 (empty reimport with close_old_findings=True) should close findings") + @tag("performance") @skip_unless_v2 @@ -235,7 +276,7 @@ class TestDojoImporterPerformanceSmall(TestDojoImporterPerformanceBase): """Performance tests using small sample files (StackHawk, ~6 findings).""" - def _import_reimport_performance(self, expected_num_queries1, expected_num_async_tasks1, expected_num_queries2, expected_num_async_tasks2, expected_num_queries3, expected_num_async_tasks3): + def _import_reimport_performance(self, expected_num_queries1, expected_num_async_tasks1, expected_num_queries2, expected_num_async_tasks2, expected_num_queries3, expected_num_async_tasks3, expected_num_queries4, expected_num_async_tasks4): """ Log output can be quite large as when the assertNumQueries fails, all queries are printed. 
It could be usefule to capture the output in `less`: @@ -251,12 +292,16 @@ def _import_reimport_performance(self, expected_num_queries1, expected_num_async expected_num_async_tasks2, expected_num_queries3, expected_num_async_tasks3, + expected_num_queries4, + expected_num_async_tasks4, scan_file1=STACK_HAWK_SUBSET_FILENAME, scan_file2=STACK_HAWK_FILENAME, scan_file3=STACK_HAWK_SUBSET_FILENAME, + scan_file4=STACK_HAWK_EMPTY, scan_type=STACK_HAWK_SCAN_TYPE, product_name="TestDojoDefaultImporter", engagement_name="Test Create Engagement", + close_old_findings4=True, ) @override_settings(ENABLE_AUDITLOG=True) @@ -275,6 +320,8 @@ def test_import_reimport_reimport_performance_pghistory_async(self): expected_num_async_tasks2=17, expected_num_queries3=108, expected_num_async_tasks3=16, + expected_num_queries4=155, + expected_num_async_tasks4=6, ) @override_settings(ENABLE_AUDITLOG=True) @@ -297,6 +344,8 @@ def test_import_reimport_reimport_performance_pghistory_no_async(self): expected_num_async_tasks2=17, expected_num_queries3=115, expected_num_async_tasks3=16, + expected_num_queries4=155, + expected_num_async_tasks4=6, ) @override_settings(ENABLE_AUDITLOG=True) @@ -320,6 +369,8 @@ def test_import_reimport_reimport_performance_pghistory_no_async_with_product_gr expected_num_async_tasks2=19, expected_num_queries3=119, expected_num_async_tasks3=18, + expected_num_queries4=162, + expected_num_async_tasks4=8, ) # Deduplication is enabled in the tests above, but to properly test it we must run the same import twice and capture the results. 
@@ -486,7 +537,7 @@ def setUp(self): for model in [Location, LocationFindingReference]: ContentType.objects.get_for_model(model) - def _import_reimport_performance(self, expected_num_queries1, expected_num_async_tasks1, expected_num_queries2, expected_num_async_tasks2, expected_num_queries3, expected_num_async_tasks3): + def _import_reimport_performance(self, expected_num_queries1, expected_num_async_tasks1, expected_num_queries2, expected_num_async_tasks2, expected_num_queries3, expected_num_async_tasks3, expected_num_queries4, expected_num_async_tasks4): r""" Log output can be quite large as when the assertNumQueries fails, all queries are printed. It could be useful to capture the output in `less`: @@ -502,12 +553,16 @@ def _import_reimport_performance(self, expected_num_queries1, expected_num_async expected_num_async_tasks2, expected_num_queries3, expected_num_async_tasks3, + expected_num_queries4, + expected_num_async_tasks4, scan_file1=STACK_HAWK_SUBSET_FILENAME, scan_file2=STACK_HAWK_FILENAME, scan_file3=STACK_HAWK_SUBSET_FILENAME, + scan_file4=STACK_HAWK_EMPTY, scan_type=STACK_HAWK_SCAN_TYPE, product_name="TestDojoDefaultImporterLocations", engagement_name="Test Create Engagement Locations", + close_old_findings4=True, ) @override_settings(ENABLE_AUDITLOG=True) @@ -526,6 +581,8 @@ def test_import_reimport_reimport_performance_pghistory_async(self): expected_num_async_tasks2=17, expected_num_queries3=346, expected_num_async_tasks3=16, + expected_num_queries4=212, + expected_num_async_tasks4=6, ) @override_settings(ENABLE_AUDITLOG=True) @@ -548,6 +605,8 @@ def test_import_reimport_reimport_performance_pghistory_no_async(self): expected_num_async_tasks2=17, expected_num_queries3=355, expected_num_async_tasks3=16, + expected_num_queries4=212, + expected_num_async_tasks4=6, ) @override_settings(ENABLE_AUDITLOG=True) @@ -571,6 +630,8 @@ def test_import_reimport_reimport_performance_pghistory_no_async_with_product_gr expected_num_async_tasks2=19, 
expected_num_queries3=359, expected_num_async_tasks3=18, + expected_num_queries4=222, + expected_num_async_tasks4=8, ) def _deduplication_performance(self, expected_num_queries1, expected_num_async_tasks1, expected_num_queries2, expected_num_async_tasks2, *, check_duplicates=True):