Skip to content

Commit bf04c3c

Browse files
uid_or_hash_code: fix self/older check
1 parent 92a92ca commit bf04c3c

1 file changed

Lines changed: 43 additions & 2 deletions

File tree

dojo/finding/deduplication.py

Lines changed: 43 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
import logging
2+
from operator import attrgetter
23

34
import hyperlink
45
from django.conf import settings
@@ -216,6 +217,39 @@ def find_candidates_for_deduplication_unique_id(test, findings, *, include_produ
216217
return existing_by_uid
217218

218219

220+
def deduplicate_uid_or_hash_code_old(new_finding):
    """Legacy UID-or-hash-code deduplication for a single finding.

    Collects existing findings that share either the same ``hash_code`` or the
    same ``unique_id_from_tool`` with ``new_finding``, then marks
    ``new_finding`` as a duplicate of the first candidate (by ascending id)
    whose endpoints also match.

    The candidate scope depends on the engagement configuration: when
    ``deduplication_on_engagement`` is set, only findings from the same
    engagement are considered; otherwise the whole product is searched.

    Args:
        new_finding: the Finding instance to deduplicate against existing ones.
    """
    # Both branches below share the same match criteria; only the scope
    # filter (engagement vs. whole product) differs, so build the Q once.
    # unique_id_from_tool can only apply to the same test_type because it is parser dependent
    match_criteria = (
        (Q(hash_code__isnull=False) & Q(hash_code=new_finding.hash_code))
        | (Q(unique_id_from_tool__isnull=False)
           & Q(unique_id_from_tool=new_finding.unique_id_from_tool)
           & Q(test__test_type=new_finding.test.test_type))
    )
    if new_finding.test.engagement.deduplication_on_engagement:
        scope_filter = {"test__engagement": new_finding.test.engagement}
    else:
        # same without "test__engagement=new_finding.test.engagement" condition
        scope_filter = {"test__engagement__product": new_finding.test.engagement.product}
    existing_findings = (
        Finding.objects.filter(match_criteria, **scope_filter)
        .exclude(id=new_finding.id)   # a finding is never a duplicate of itself
        .exclude(duplicate=True)      # findings already marked duplicate cannot be originals
        .order_by("id"))
    deduplicationLogger.debug("Found "
        + str(len(existing_findings)) + " findings with either the same unique_id_from_tool or hash_code")
    for find in existing_findings:
        if is_deduplication_on_engagement_mismatch(new_finding, find):
            deduplicationLogger.debug(
                "deduplication_on_engagement_mismatch, skipping dedupe.")
            continue
        try:
            if are_endpoints_duplicates(new_finding, find):
                set_duplicate(new_finding, find)
                break
        except Exception as e:
            # Best-effort: an endpoint-comparison failure for one candidate
            # must not abort deduplication of the remaining candidates.
            deduplicationLogger.debug(str(e))
            continue
251+
252+
219253
def find_candidates_for_deduplication_uid_or_hash(test, findings, *, include_product_scope_filter):
220254
base_queryset = build_dedupe_scope_queryset(test)
221255
hash_codes = {f.hash_code for f in findings if getattr(f, "hash_code", None) is not None}
@@ -325,8 +359,7 @@ def match_uid_or_hash_candidate(new_finding, candidates_by_uid, candidates_by_ha
325359
deduplicationLogger.debug("UID_OR_HASH: combined candidate ids (sorted)=%s", sorted(combined_by_id.keys()))
326360
for candidate_id in sorted(combined_by_id.keys()):
327361
candidate = combined_by_id[candidate_id]
328-
# Exclude self
329-
if candidate.id == new_finding.id:
362+
if not _is_candidate_older(new_finding, candidate):
330363
continue
331364
if is_deduplication_on_engagement_mismatch(new_finding, candidate):
332365
deduplicationLogger.debug("deduplication_on_engagement_mismatch, skipping dedupe.")
@@ -407,6 +440,7 @@ def _dedupe_batch_hash_code(findings):
407440
if not candidates_by_hash:
408441
return
409442
for new_finding in findings:
443+
deduplicationLogger.debug(f"deduplication start for finding {new_finding.id} with DEDUPE_ALGO_HASH_CODE")
410444
match = match_hash_candidate(new_finding, candidates_by_hash)
411445
if match:
412446
try:
@@ -423,6 +457,7 @@ def _dedupe_batch_unique_id(findings):
423457
if not candidates_by_uid:
424458
return
425459
for new_finding in findings:
460+
deduplicationLogger.debug(f"deduplication start for finding {new_finding.id} with DEDUPE_ALGO_UNIQUE_ID_FROM_TOOL")
426461
match = match_unique_id_candidate(new_finding, candidates_by_uid)
427462
if match:
428463
try:
@@ -440,6 +475,7 @@ def _dedupe_batch_uid_or_hash(findings):
440475
if not (candidates_by_uid or existing_by_hash):
441476
return
442477
for new_finding in findings:
478+
deduplicationLogger.debug(f"deduplication start for finding {new_finding.id} with DEDUPE_ALGO_UNIQUE_ID_FROM_TOOL_OR_HASH_CODE")
443479
if new_finding.duplicate:
444480
continue
445481

@@ -460,6 +496,7 @@ def _dedupe_batch_legacy(findings):
460496
if not (candidates_by_title or candidates_by_cwe):
461497
return
462498
for new_finding in findings:
499+
deduplicationLogger.debug(f"deduplication start for finding {new_finding.id} with DEDUPE_ALGO_LEGACY")
463500
match = match_legacy_candidate(new_finding, candidates_by_title, candidates_by_cwe)
464501
if match:
465502
try:
@@ -471,6 +508,10 @@ def _dedupe_batch_legacy(findings):
471508
def dedupe_batch_of_findings(findings):
472509
if not findings:
473510
return
511+
512+
# sort findings by id to ensure deduplication is deterministic/reproducible
513+
findings = sorted(findings, key=attrgetter("id"))
514+
474515
test = findings[0].test
475516
dedup_alg = test.deduplication_algorithm
476517

0 commit comments

Comments
 (0)