58 changes: 50 additions & 8 deletions dojo/importers/default_reimporter.py
@@ -1,4 +1,5 @@
import logging
from itertools import batched
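# Note: itertools.batched requires Python 3.12+; the strict keyword argument used below was added in 3.13.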

from django.conf import settings
from django.core.files.uploadedfile import TemporaryUploadedFile
@@ -30,6 +31,9 @@
logger = logging.getLogger(__name__)
deduplicationLogger = logging.getLogger("dojo.specific-loggers.deduplication")

# Bound IN-list size when bulk-loading status fields for close_old_findings.
_CLOSE_OLD_FINDINGS_STATUS_FIELDS_CHUNK = 1000


class DefaultReImporterOptions(ImporterOptions):
def validate_test(
@@ -464,6 +468,44 @@ def process_findings(

return self.new_items, self.reactivated_items, self.to_mitigate, self.untouched

def _sync_close_old_finding_status_fields(self, findings: list[Finding]) -> None:
"""
Refresh false_p, risk_accepted, and out_of_scope from the DB for each finding.

These can change during reimport (e.g. a finding being marked as a false positive) while the in-memory instances
are stale. Per-finding refresh_from_db in close_old_findings was added in
https://github.com/DefectDojo/django-DefectDojo/pull/12291. A naive refresh per
finding issues one SELECT each; we batch one query per chunk of primary keys and fall
back to refresh_from_db only when needed.

This really should be fixed differently, but for now we at least optimize it to be done in bulk.
"""
findings_without_pk = [f for f in findings if f.pk is None]
findings_with_pk = [f for f in findings if f.pk is not None]

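# Findings without a primary key cannot be matched by the bulk pk__in query below, so keep the per-finding refresh for them.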
for finding in findings_without_pk:
finding.refresh_from_db(fields=["false_p", "risk_accepted", "out_of_scope"])

for chunk in batched(findings_with_pk, _CLOSE_OLD_FINDINGS_STATUS_FIELDS_CHUNK, strict=False):
ids = [f.pk for f in chunk]
fresh_by_id = {
row["id"]: row
for row in Finding.objects.filter(pk__in=ids).values(
"id",
"false_p",
"risk_accepted",
"out_of_scope",
)
}
for finding in chunk:
row = fresh_by_id.get(finding.pk)
if row is not None:
finding.false_p = row["false_p"]
finding.risk_accepted = row["risk_accepted"]
finding.out_of_scope = row["out_of_scope"]
else:
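# Fall back to a per-finding refresh if the bulk query did not return a row for this pk.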
finding.refresh_from_db(fields=["false_p", "risk_accepted", "out_of_scope"])

def close_old_findings(
self,
findings: list[Finding],
@@ -477,17 +519,17 @@ def close_old_findings(
if self.close_old_findings_toggle is False:
return []
logger.debug("REIMPORT_SCAN: Closing findings no longer present in scan report")
# Get any status changes that could have occurred earlier in the process
# for special statuses only.
# An example is a finding being reported as a false positive, which
# reimport records in the database. However, the findings here
# are calculated from the original values before the reimport, so
# any updates made during reimport are discarded without first getting the
# state of the finding as it stands at this moment (django-DefectDojo #12291).
self._sync_close_old_finding_status_fields(findings)
# Determine if pushing to jira or if the finding groups are enabled
mitigated_findings = []
for finding in findings:
# Get any status changes that could have occurred earlier in the process
# for special statuses only.
# An example of such is a finding being reported as false positive, and
# reimport makes this change in the database. However, the findings here
# are calculated based from the original values before the reimport, so
# any updates made during reimport are discarded without first getting the
# state of the finding as it stands at this moment
finding.refresh_from_db(fields=["false_p", "risk_accepted", "out_of_scope"])
# Ensure the finding is not already closed
if not finding.mitigated or not finding.is_mitigated:
logger.debug("mitigating finding: %i:%s", finding.id, finding)
2 changes: 2 additions & 0 deletions scripts/update_performance_test_counts.py
@@ -422,6 +422,8 @@ def _extract_call_span(method_content: str, call_name: str) -> tuple[int, int] |
"reimport1_async_tasks": "expected_num_async_tasks2",
"reimport2_queries": "expected_num_queries3",
"reimport2_async_tasks": "expected_num_async_tasks3",
"reimport3_queries": "expected_num_queries4",
"reimport3_async_tasks": "expected_num_async_tasks4",
}
param_map_deduplication = {
"first_import_queries": "expected_num_queries1",
32 changes: 32 additions & 0 deletions unittests/scans/stackhawk/stackhawk_empty.json
@@ -0,0 +1,32 @@
{
"service": "StackHawk",
"scanCompleted": {
"scan": {
"comment defect dojo team": "This is an empty StackHawk scan results",
"id": "e2ff5651-7eef-47e9-b743-0c2f7d861e27",
"hawkscanVersion": "2.1.1",
"env": "Development",
"status": "COMPLETED",
"application": "Secured Application",
"startedTimestamp": "2022-02-16T23:07:19.575Z",
"scanURL": "https://app.stackhawk.com/scans/e2ff5651-7eef-47e9-b743-0c2f7d861e27"
},
"scanDuration": "21",
"spiderDuration": "45",
"completedScanStats": {
"urlsCount": "31",
"duration": "66",
"scanResultsStats": {
"totalCount": "0",
"lowCount": "0",
"mediumCount": "0",
"highCount": "0",
"lowTriagedCount": "0",
"mediumTriagedCount": "0",
"highTriagedCount": "0"
}
},
"findings": [
]
}
}
65 changes: 63 additions & 2 deletions unittests/test_importers_performance.py
@@ -51,6 +51,7 @@

STACK_HAWK_FILENAME = get_unit_tests_scans_path("stackhawk") / "stackhawk_many_vul_without_duplicated_findings.json"
STACK_HAWK_SUBSET_FILENAME = get_unit_tests_scans_path("stackhawk") / "stackhawk_many_vul_without_duplicated_findings_subset.json"
STACK_HAWK_EMPTY = get_unit_tests_scans_path("stackhawk") / "stackhawk_empty.json"
STACK_HAWK_SCAN_TYPE = "StackHawk HawkScan"


@@ -126,12 +127,17 @@ def _import_reimport_performance(
expected_num_async_tasks2,
expected_num_queries3,
expected_num_async_tasks3,
expected_num_queries4,
expected_num_async_tasks4,
scan_file1,
scan_file2,
scan_file3,
scan_file4,
scan_type,
product_name,
engagement_name,
*,
close_old_findings4=False,
):
"""
Test import/reimport/reimport performance with specified scan files and scan type.
@@ -195,6 +201,7 @@ def _import_reimport_performance(
"verified": True,
"sync": True,
"scan_type": scan_type,
"service": "Secured Application",
"tags": ["performance-test-reimport", "reimport-tag-in-param", "reimport-go-faster"],
"apply_tags_to_findings": True,
}
@@ -224,18 +231,52 @@ def _import_reimport_performance(
"verified": True,
"sync": True,
"scan_type": scan_type,
"service": "Secured Application",
}
reimporter = DefaultReImporter(**reimport_options)
test, _, _len_new_findings, _len_closed_findings, _, _, _ = reimporter.process_scan(scan)

# Fourth import (reimport again, empty report)
# Each assertion context manager is wrapped in its own subTest so that if one fails, the others still run.
# This allows us to see all count mismatches in a single test run, making it easier to fix
# all incorrect expected values at once rather than fixing them one at a time.
# Nested with statements are intentional - each assertion needs its own subTest wrapper.
with ( # noqa: SIM117
self.subTest("reimport3"), impersonate(Dojo_User.objects.get(username="admin")),
scan_file4.open(encoding="utf-8") as scan,
):
with self.subTest(step="reimport3", metric="queries"):
with self.assertNumQueries(expected_num_queries4):
with self.subTest(step="reimport3", metric="async_tasks"):
with self._assertNumAsyncTask(expected_num_async_tasks4):
reimport_options = {
"test": test,
"user": lead,
"lead": lead,
"scan_date": None,
"minimum_severity": "Info",
"active": True,
"verified": True,
"sync": True,
"scan_type": scan_type,
# The StackHawk parser sets the service field, causing close_old_findings to fail if we do not specify the same service here.
# This is a big problem that needs fixing. Parsers should not set the service field.
"service": "Secured Application",
"close_old_findings": close_old_findings4,
}
reimporter = DefaultReImporter(**reimport_options)
test, _, len_new_findings4, len_closed_findings4, _, _, _ = reimporter.process_scan(scan)
logger.info("Step 4: new=%s closed=%s", len_new_findings4, len_closed_findings4)
self.assertGreater(len_closed_findings4, 0, "Step 4 (empty reimport with close_old_findings=True) should close findings")


@tag("performance")
@skip_unless_v2
class TestDojoImporterPerformanceSmall(TestDojoImporterPerformanceBase):

"""Performance tests using small sample files (StackHawk, ~6 findings)."""

def _import_reimport_performance(self, expected_num_queries1, expected_num_async_tasks1, expected_num_queries2, expected_num_async_tasks2, expected_num_queries3, expected_num_async_tasks3):
def _import_reimport_performance(self, expected_num_queries1, expected_num_async_tasks1, expected_num_queries2, expected_num_async_tasks2, expected_num_queries3, expected_num_async_tasks3, expected_num_queries4, expected_num_async_tasks4):
"""
Log output can be quite large because when assertNumQueries fails, all queries are printed.
It could be useful to capture the output in `less`:
@@ -251,12 +292,16 @@ def _import_reimport_performance(self, expected_num_queries1, expected_num_async
expected_num_async_tasks2,
expected_num_queries3,
expected_num_async_tasks3,
expected_num_queries4,
expected_num_async_tasks4,
scan_file1=STACK_HAWK_SUBSET_FILENAME,
scan_file2=STACK_HAWK_FILENAME,
scan_file3=STACK_HAWK_SUBSET_FILENAME,
scan_file4=STACK_HAWK_EMPTY,
scan_type=STACK_HAWK_SCAN_TYPE,
product_name="TestDojoDefaultImporter",
engagement_name="Test Create Engagement",
close_old_findings4=True,
)

@override_settings(ENABLE_AUDITLOG=True)
@@ -275,6 +320,8 @@ def test_import_reimport_reimport_performance_pghistory_async(self):
expected_num_async_tasks2=17,
expected_num_queries3=108,
expected_num_async_tasks3=16,
expected_num_queries4=155,
expected_num_async_tasks4=6,
)

@override_settings(ENABLE_AUDITLOG=True)
@@ -297,6 +344,8 @@ def test_import_reimport_reimport_performance_pghistory_no_async(self):
expected_num_async_tasks2=17,
expected_num_queries3=115,
expected_num_async_tasks3=16,
expected_num_queries4=155,
expected_num_async_tasks4=6,
)

@override_settings(ENABLE_AUDITLOG=True)
@@ -320,6 +369,8 @@ def test_import_reimport_reimport_performance_pghistory_no_async_with_product_gr
expected_num_async_tasks2=19,
expected_num_queries3=119,
expected_num_async_tasks3=18,
expected_num_queries4=162,
expected_num_async_tasks4=8,
)

# Deduplication is enabled in the tests above, but to properly test it we must run the same import twice and capture the results.
@@ -486,7 +537,7 @@ def setUp(self):
for model in [Location, LocationFindingReference]:
ContentType.objects.get_for_model(model)

def _import_reimport_performance(self, expected_num_queries1, expected_num_async_tasks1, expected_num_queries2, expected_num_async_tasks2, expected_num_queries3, expected_num_async_tasks3):
def _import_reimport_performance(self, expected_num_queries1, expected_num_async_tasks1, expected_num_queries2, expected_num_async_tasks2, expected_num_queries3, expected_num_async_tasks3, expected_num_queries4, expected_num_async_tasks4):
r"""
Log output can be quite large because when assertNumQueries fails, all queries are printed.
It could be useful to capture the output in `less`:
@@ -502,12 +553,16 @@ def _import_reimport_performance(self, expected_num_queries1, expected_num_async
expected_num_async_tasks2,
expected_num_queries3,
expected_num_async_tasks3,
expected_num_queries4,
expected_num_async_tasks4,
scan_file1=STACK_HAWK_SUBSET_FILENAME,
scan_file2=STACK_HAWK_FILENAME,
scan_file3=STACK_HAWK_SUBSET_FILENAME,
scan_file4=STACK_HAWK_EMPTY,
scan_type=STACK_HAWK_SCAN_TYPE,
product_name="TestDojoDefaultImporterLocations",
engagement_name="Test Create Engagement Locations",
close_old_findings4=True,
)

@override_settings(ENABLE_AUDITLOG=True)
Expand All @@ -526,6 +581,8 @@ def test_import_reimport_reimport_performance_pghistory_async(self):
expected_num_async_tasks2=17,
expected_num_queries3=346,
expected_num_async_tasks3=16,
expected_num_queries4=212,
expected_num_async_tasks4=6,
)

@override_settings(ENABLE_AUDITLOG=True)
Expand All @@ -548,6 +605,8 @@ def test_import_reimport_reimport_performance_pghistory_no_async(self):
expected_num_async_tasks2=17,
expected_num_queries3=355,
expected_num_async_tasks3=16,
expected_num_queries4=212,
expected_num_async_tasks4=6,
)

@override_settings(ENABLE_AUDITLOG=True)
Expand All @@ -571,6 +630,8 @@ def test_import_reimport_reimport_performance_pghistory_no_async_with_product_gr
expected_num_async_tasks2=19,
expected_num_queries3=359,
expected_num_async_tasks3=18,
expected_num_queries4=222,
expected_num_async_tasks4=8,
)

def _deduplication_performance(self, expected_num_queries1, expected_num_async_tasks1, expected_num_queries2, expected_num_async_tasks2, *, check_duplicates=True):