diff --git a/dojo/importers/default_reimporter.py b/dojo/importers/default_reimporter.py
index 2828b8cec30..f0a4c05028c 100644
--- a/dojo/importers/default_reimporter.py
+++ b/dojo/importers/default_reimporter.py
@@ -1,4 +1,5 @@
 import logging
+from itertools import batched
 
 from django.conf import settings
 from django.core.files.uploadedfile import TemporaryUploadedFile
@@ -30,6 +31,9 @@
 logger = logging.getLogger(__name__)
 deduplicationLogger = logging.getLogger("dojo.specific-loggers.deduplication")
 
+# Bound the IN-list size when bulk-loading status fields for close_old_findings.
+_CLOSE_OLD_FINDINGS_STATUS_FIELDS_CHUNK = 1000
+
 
 class DefaultReImporterOptions(ImporterOptions):
     def validate_test(
@@ -464,6 +468,44 @@ def process_findings(
 
         return self.new_items, self.reactivated_items, self.to_mitigate, self.untouched
 
+    def _sync_close_old_finding_status_fields(self, findings: list[Finding]) -> None:
+        """
+        Refresh false_p, risk_accepted, and out_of_scope from the DB for each finding.
+
+        These fields can change during reimport (e.g. a finding marked as a false
+        positive) while the in-memory instances remain stale. The per-finding
+        refresh_from_db in close_old_findings was added in
+        https://github.com/DefectDojo/django-DefectDojo/pull/12291. A naive refresh
+        issues one SELECT per finding; here we issue one query per chunk of primary
+        keys and fall back to refresh_from_db only when needed.
+
+        This should really be fixed differently, but for now we at least do the refresh in bulk.
+        """
+        findings_without_pk = [f for f in findings if f.pk is None]
+        findings_with_pk = [f for f in findings if f.pk is not None]
+
+        # Preserve the previous per-finding behavior for unsaved instances.
+        for finding in findings_without_pk:
+            finding.refresh_from_db(fields=["false_p", "risk_accepted", "out_of_scope"])
+
+        for chunk in batched(findings_with_pk, _CLOSE_OLD_FINDINGS_STATUS_FIELDS_CHUNK, strict=False):
+            ids = [f.pk for f in chunk]
+            fresh_by_id = {
+                row["id"]: row
+                for row in Finding.objects.filter(pk__in=ids).values(
+                    "id",
+                    "false_p",
+                    "risk_accepted",
+                    "out_of_scope",
+                )
+            }
+            for finding in chunk:
+                row = fresh_by_id.get(finding.pk)
+                if row is not None:
+                    finding.false_p = row["false_p"]
+                    finding.risk_accepted = row["risk_accepted"]
+                    finding.out_of_scope = row["out_of_scope"]
+                else:
+                    # The row disappeared between load and now; fall back to the old
+                    # behavior and let refresh_from_db raise as it did before.
+                    finding.refresh_from_db(fields=["false_p", "risk_accepted", "out_of_scope"])
+
     def close_old_findings(
         self,
         findings: list[Finding],
@@ -477,17 +519,17 @@ def close_old_findings(
         if self.close_old_findings_toggle is False:
             return []
         logger.debug("REIMPORT_SCAN: Closing findings no longer present in scan report")
+        # Pick up any status changes that may have occurred earlier in the process,
+        # for special statuses only.
+        # For example, a finding can be reported as a false positive, and reimport
+        # writes that change to the database. The findings here, however, still hold
+        # the values from before the reimport, so any updates made during reimport
+        # would be lost unless we first load the current state of each finding
+        # (django-DefectDojo #12291).
+        self._sync_close_old_finding_status_fields(findings)
         # Determine if pushing to jira or if the finding groups are enabled
         mitigated_findings = []
         for finding in findings:
-            # Get any status changes that could have occurred earlier in the process
-            # for special statuses only.
-            # An example of such is a finding being reported as false positive, and
-            # reimport makes this change in the database. However, the findings here
-            # are calculated based from the original values before the reimport, so
-            # any updates made during reimport are discarded without first getting the
-            # state of the finding as it stands at this moment
-            finding.refresh_from_db(fields=["false_p", "risk_accepted", "out_of_scope"])
             # Ensure the finding is not already closed
             if not finding.mitigated or not finding.is_mitigated:
                 logger.debug("mitigating finding: %i:%s", finding.id, finding)
diff --git a/scripts/update_performance_test_counts.py b/scripts/update_performance_test_counts.py
index f9fd077e0e8..77a2e629e11 100644
--- a/scripts/update_performance_test_counts.py
+++ b/scripts/update_performance_test_counts.py
@@ -422,6 +422,8 @@ def _extract_call_span(method_content: str, call_name: str) -> tuple[int, int] |
     "reimport1_async_tasks": "expected_num_async_tasks2",
     "reimport2_queries": "expected_num_queries3",
     "reimport2_async_tasks": "expected_num_async_tasks3",
+    "reimport3_queries": "expected_num_queries4",
+    "reimport3_async_tasks": "expected_num_async_tasks4",
 }
 param_map_deduplication = {
     "first_import_queries": "expected_num_queries1",
diff --git a/unittests/scans/stackhawk/stackhawk_empty.json b/unittests/scans/stackhawk/stackhawk_empty.json
new file mode 100644
index 00000000000..056193baafe
--- /dev/null
+++ b/unittests/scans/stackhawk/stackhawk_empty.json
@@ -0,0 +1,32 @@
+{
+    "service": "StackHawk",
+    "scanCompleted": {
+        "scan": {
+            "comment defect dojo team": "This is an empty StackHawk scan result",
+            "id": "e2ff5651-7eef-47e9-b743-0c2f7d861e27",
+            "hawkscanVersion": "2.1.1",
+            "env": "Development",
+            "status": "COMPLETED",
+            "application": "Secured Application",
+            "startedTimestamp": "2022-02-16T23:07:19.575Z",
+            "scanURL": "https://app.stackhawk.com/scans/e2ff5651-7eef-47e9-b743-0c2f7d861e27"
+        },
+        "scanDuration": "21",
+        "spiderDuration": "45",
+        "completedScanStats": {
+            "urlsCount": "31",
+            "duration": "66",
+            "scanResultsStats": {
+                "totalCount": "0",
+                "lowCount": "0",
+                "mediumCount": "0",
+                "highCount": "0",
+                "lowTriagedCount": "0",
+                "mediumTriagedCount": "0",
+                "highTriagedCount": "0"
+            }
+        },
+        "findings": [
+        ]
+    }
+}
diff --git a/unittests/test_importers_performance.py b/unittests/test_importers_performance.py
index ea3b3b79b40..ef7b4763460 100644
--- a/unittests/test_importers_performance.py
+++ b/unittests/test_importers_performance.py
@@ -51,6 +51,7 @@
 STACK_HAWK_FILENAME = get_unit_tests_scans_path("stackhawk") / "stackhawk_many_vul_without_duplicated_findings.json"
 STACK_HAWK_SUBSET_FILENAME = get_unit_tests_scans_path("stackhawk") / "stackhawk_many_vul_without_duplicated_findings_subset.json"
+STACK_HAWK_EMPTY = get_unit_tests_scans_path("stackhawk") / "stackhawk_empty.json"
 
 STACK_HAWK_SCAN_TYPE = "StackHawk HawkScan"
 
@@ -126,12 +127,17 @@ def _import_reimport_performance(
         expected_num_async_tasks2,
         expected_num_queries3,
         expected_num_async_tasks3,
+        expected_num_queries4,
+        expected_num_async_tasks4,
         scan_file1,
         scan_file2,
         scan_file3,
+        scan_file4,
         scan_type,
         product_name,
         engagement_name,
+        *,
+        close_old_findings4=False,
     ):
         """
         Test import/reimport/reimport performance with specified scan files and scan type.
@@ -195,6 +201,7 @@ def _import_reimport_performance(
             "verified": True,
             "sync": True,
             "scan_type": scan_type,
+            "service": "Secured Application",
             "tags": ["performance-test-reimport", "reimport-tag-in-param", "reimport-go-faster"],
             "apply_tags_to_findings": True,
         }
@@ -224,10 +231,44 @@ def _import_reimport_performance(
             "verified": True,
             "sync": True,
             "scan_type": scan_type,
+            "service": "Secured Application",
         }
         reimporter = DefaultReImporter(**reimport_options)
         test, _, _len_new_findings, _len_closed_findings, _, _, _ = reimporter.process_scan(scan)
 
+        # Fourth import (reimport again, this time with an empty report).
+        # Each assertion context manager is wrapped in its own subTest so that if one fails, the others still run.
+        # This lets us see all count mismatches in a single test run, making it easier to fix
+        # all incorrect expected values at once rather than one at a time.
+        # The nested with statements are intentional: each assertion needs its own subTest wrapper,
+        # and the count assertions fire on exit, after the nested block has run.
+        with (  # noqa: SIM117
+            self.subTest("reimport3"),
+            impersonate(Dojo_User.objects.get(username="admin")),
+            scan_file4.open(encoding="utf-8") as scan,
+        ):
+            with self.subTest(step="reimport3", metric="queries"):
+                with self.assertNumQueries(expected_num_queries4):
+                    with self.subTest(step="reimport3", metric="async_tasks"):
+                        with self._assertNumAsyncTask(expected_num_async_tasks4):
+                            reimport_options = {
+                                "test": test,
+                                "user": lead,
+                                "lead": lead,
+                                "scan_date": None,
+                                "minimum_severity": "Info",
+                                "active": True,
+                                "verified": True,
+                                "sync": True,
+                                "scan_type": scan_type,
+                                # The StackHawk parser sets the service field, so close_old_findings
+                                # would miss findings unless we pass the same service here.
+                                # This is a real problem that needs fixing: parsers should not set the service field.
+                                "service": "Secured Application",
+                                "close_old_findings": close_old_findings4,
+                            }
+                            reimporter = DefaultReImporter(**reimport_options)
+                            test, _, len_new_findings4, len_closed_findings4, _, _, _ = reimporter.process_scan(scan)
+        logger.info("Step 4: new=%s closed=%s", len_new_findings4, len_closed_findings4)
+        if close_old_findings4:
+            self.assertGreater(len_closed_findings4, 0, "Step 4 (empty reimport with close_old_findings=True) should close findings")
+
 
 @tag("performance")
 @skip_unless_v2
@@ -235,7 +276,7 @@ class TestDojoImporterPerformanceSmall(TestDojoImporterPerformanceBase):
 
     """Performance tests using small sample files (StackHawk, ~6 findings)."""
 
-    def _import_reimport_performance(self, expected_num_queries1, expected_num_async_tasks1, expected_num_queries2, expected_num_async_tasks2, expected_num_queries3, expected_num_async_tasks3):
+    def _import_reimport_performance(self, expected_num_queries1, expected_num_async_tasks1, expected_num_queries2, expected_num_async_tasks2, expected_num_queries3, expected_num_async_tasks3, expected_num_queries4, expected_num_async_tasks4):
         """
         Log output can be quite large as when the assertNumQueries fails, all queries are printed.
         It could be usefule to capture the output in `less`:
@@ -251,12 +292,16 @@ def _import_reimport_performance(self, expected_num_queries1, expected_num_async
             expected_num_async_tasks2,
             expected_num_queries3,
             expected_num_async_tasks3,
+            expected_num_queries4,
+            expected_num_async_tasks4,
             scan_file1=STACK_HAWK_SUBSET_FILENAME,
             scan_file2=STACK_HAWK_FILENAME,
             scan_file3=STACK_HAWK_SUBSET_FILENAME,
+            scan_file4=STACK_HAWK_EMPTY,
             scan_type=STACK_HAWK_SCAN_TYPE,
             product_name="TestDojoDefaultImporter",
             engagement_name="Test Create Engagement",
+            close_old_findings4=True,
         )
 
     @override_settings(ENABLE_AUDITLOG=True)
@@ -275,6 +320,8 @@ def test_import_reimport_reimport_performance_pghistory_async(self):
             expected_num_async_tasks2=17,
             expected_num_queries3=108,
             expected_num_async_tasks3=16,
+            expected_num_queries4=155,
+            expected_num_async_tasks4=6,
         )
 
     @override_settings(ENABLE_AUDITLOG=True)
@@ -297,6 +344,8 @@ def test_import_reimport_reimport_performance_pghistory_no_async(self):
             expected_num_async_tasks2=17,
             expected_num_queries3=115,
             expected_num_async_tasks3=16,
+            expected_num_queries4=155,
+            expected_num_async_tasks4=6,
         )
 
     @override_settings(ENABLE_AUDITLOG=True)
@@ -320,6 +369,8 @@ def test_import_reimport_reimport_performance_pghistory_no_async_with_product_gr
             expected_num_async_tasks2=19,
             expected_num_queries3=119,
             expected_num_async_tasks3=18,
+            expected_num_queries4=162,
+            expected_num_async_tasks4=8,
         )
 
     # Deduplication is enabled in the tests above, but to properly test it we must run the same import twice and capture the results.
@@ -486,7 +537,7 @@ def setUp(self):
         for model in [Location, LocationFindingReference]:
             ContentType.objects.get_for_model(model)
 
-    def _import_reimport_performance(self, expected_num_queries1, expected_num_async_tasks1, expected_num_queries2, expected_num_async_tasks2, expected_num_queries3, expected_num_async_tasks3):
+    def _import_reimport_performance(self, expected_num_queries1, expected_num_async_tasks1, expected_num_queries2, expected_num_async_tasks2, expected_num_queries3, expected_num_async_tasks3, expected_num_queries4, expected_num_async_tasks4):
         r"""
         Log output can be quite large as when the assertNumQueries fails, all queries are printed.
         It could be useful to capture the output in `less`:
@@ -502,12 +553,16 @@ def _import_reimport_performance(self, expected_num_queries1, expected_num_async
             expected_num_async_tasks2,
             expected_num_queries3,
             expected_num_async_tasks3,
+            expected_num_queries4,
+            expected_num_async_tasks4,
             scan_file1=STACK_HAWK_SUBSET_FILENAME,
             scan_file2=STACK_HAWK_FILENAME,
             scan_file3=STACK_HAWK_SUBSET_FILENAME,
+            scan_file4=STACK_HAWK_EMPTY,
             scan_type=STACK_HAWK_SCAN_TYPE,
             product_name="TestDojoDefaultImporterLocations",
             engagement_name="Test Create Engagement Locations",
+            close_old_findings4=True,
         )
 
     @override_settings(ENABLE_AUDITLOG=True)
@@ -526,6 +581,8 @@ def test_import_reimport_reimport_performance_pghistory_async(self):
             expected_num_async_tasks2=17,
             expected_num_queries3=346,
             expected_num_async_tasks3=16,
+            expected_num_queries4=212,
+            expected_num_async_tasks4=6,
         )
 
     @override_settings(ENABLE_AUDITLOG=True)
@@ -548,6 +605,8 @@ def test_import_reimport_reimport_performance_pghistory_no_async(self):
             expected_num_async_tasks2=17,
             expected_num_queries3=355,
             expected_num_async_tasks3=16,
+            expected_num_queries4=212,
+            expected_num_async_tasks4=6,
         )
 
     @override_settings(ENABLE_AUDITLOG=True)
@@ -571,6 +630,8 @@ def test_import_reimport_reimport_performance_pghistory_no_async_with_product_gr
             expected_num_async_tasks2=19,
             expected_num_queries3=359,
             expected_num_async_tasks3=18,
+            expected_num_queries4=222,
+            expected_num_async_tasks4=8,
         )
 
     def _deduplication_performance(self, expected_num_queries1, expected_num_async_tasks1, expected_num_queries2, expected_num_async_tasks2, *, check_duplicates=True):
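
Note on `itertools.batched`, imported at the top of default_reimporter.py above: it only exists on Python 3.12+, and its `strict` keyword only on 3.13+. If this ever needs to run on an older interpreter (an assumption; nothing in this diff says it does), the recipe from the CPython itertools documentation is a drop-in backport:

# Backport sketch of itertools.batched for Python < 3.12, based on the recipe in
# the CPython itertools docs; only needed on interpreters older than 3.12.
from itertools import islice

def batched(iterable, n, *, strict=False):
    # batched("ABCDEFG", 3) -> ("A", "B", "C") ("D", "E", "F") ("G",)
    if n < 1:
        raise ValueError("n must be at least one")
    iterator = iter(iterable)
    while batch := tuple(islice(iterator, n)):
        if strict and len(batch) != n:
            raise ValueError("batched(): incomplete batch")
        yield batch

With `strict=False`, as the reimporter passes, the final short chunk is yielded rather than raising.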
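
To make the query-shape change in `_sync_close_old_finding_status_fields` concrete: instead of one SELECT per stale row, it issues one `SELECT ... WHERE id IN (...)` per chunk of primary keys. A self-contained sketch of the same pattern using only the standard library (the `finding` table and `false_p` column here are illustrative stand-ins, not the real DefectDojo schema):

# Chunked bulk-lookup sketch: one IN-list query per chunk instead of one query per row.
import sqlite3
from itertools import islice

def chunked(ids, size):
    iterator = iter(ids)
    while chunk := list(islice(iterator, size)):
        yield chunk

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE finding (id INTEGER PRIMARY KEY, false_p BOOLEAN)")
conn.executemany("INSERT INTO finding VALUES (?, ?)", [(i, i % 2) for i in range(10)])

fresh_by_id = {}
for chunk in chunked(range(10), 4):  # 3 queries for 10 rows instead of 10
    placeholders = ",".join("?" * len(chunk))
    rows = conn.execute(f"SELECT id, false_p FROM finding WHERE id IN ({placeholders})", chunk)
    fresh_by_id.update({finding_id: bool(false_p) for finding_id, false_p in rows})

print(fresh_by_id)  # {0: False, 1: True, ..., 9: True}

The 1000-entry `_CLOSE_OLD_FINDINGS_STATUS_FIELDS_CHUNK` in the diff plays the same role as `size` here: it bounds the IN-list so a huge reimport cannot produce an unbounded query.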
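
The nested subTest structure in the new fourth import step works because `assertNumQueries` and `_assertNumAsyncTask` assert on exit: the inner block runs to completion first, any inner failure is recorded by the inner `subTest`, and the outer count check still fires and is recorded by the outer `subTest`. A minimal standalone illustration (the `_assert_count` helper is invented for this demo):

# Demo: nested subTests around exit-time assertions report both mismatches in one run.
import unittest
from contextlib import contextmanager

class NestedSubTestDemo(unittest.TestCase):
    @contextmanager
    def _assert_count(self, observed, expected, label):
        # Stand-in for assertNumQueries/_assertNumAsyncTask: the check fires on exit,
        # after the nested block has already run.
        yield
        self.assertEqual(observed, expected, label)

    def test_two_metrics(self):
        with self.subTest(metric="queries"):
            with self._assert_count(12, 10, "queries"):
                with self.subTest(metric="async_tasks"):
                    with self._assert_count(3, 4, "async tasks"):
                        pass  # the measured work would run here

if __name__ == "__main__":
    unittest.main()

Running this reports two separate subTest failures (queries and async_tasks) rather than stopping at the first, which is exactly the property the test file's comment describes.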