11import logging
2+ from operator import attrgetter
23
34import hyperlink
45from django .conf import settings
@@ -216,6 +217,39 @@ def find_candidates_for_deduplication_unique_id(test, findings, *, include_produ
216217 return existing_by_uid
217218
218219
def deduplicate_uid_or_hash_code_old(new_finding):
    """Mark ``new_finding`` as a duplicate of the first matching existing finding.

    Candidates are non-duplicate findings other than ``new_finding`` that either
    share its non-null ``hash_code`` or share its non-null ``unique_id_from_tool``
    within the same test type. The search is limited to the finding's engagement
    when ``deduplication_on_engagement`` is set on that engagement, and to the
    engagement's product otherwise. Candidates are visited in ascending ``id``
    order; the first one that passes the engagement-mismatch check and whose
    endpoints are considered duplicates is passed to ``set_duplicate`` and the
    search stops.

    Args:
        new_finding: the finding to deduplicate.

    Returns:
        None. Any exception raised while comparing endpoints or setting the
        duplicate is logged at debug level and the candidate is skipped.
    """
    engagement = new_finding.test.engagement

    same_hash = Q(hash_code__isnull=False) & Q(hash_code=new_finding.hash_code)
    # unique_id_from_tool is parser dependent, so it only applies within the same test_type
    same_uid = (
        Q(unique_id_from_tool__isnull=False)
        & Q(unique_id_from_tool=new_finding.unique_id_from_tool)
        & Q(test__test_type=new_finding.test.test_type)
    )

    # Both scopes use the same match expression; only the scoping filter differs.
    if engagement.deduplication_on_engagement:
        scope = {"test__engagement": engagement}
    else:
        scope = {"test__engagement__product": engagement.product}

    existing_findings = (
        Finding.objects.filter(same_hash | same_uid, **scope)
        .exclude(id=new_finding.id)
        .exclude(duplicate=True)
        .order_by("id")
    )

    candidate_count = len(existing_findings)
    deduplicationLogger.debug(
        "Found " + str(candidate_count) + " findings with either the same unique_id_from_tool or hash_code")

    for candidate in existing_findings:
        if is_deduplication_on_engagement_mismatch(new_finding, candidate):
            deduplicationLogger.debug(
                "deduplication_on_engagement_mismatch, skipping dedupe.")
            continue
        try:
            if are_endpoints_duplicates(new_finding, candidate):
                set_duplicate(new_finding, candidate)
                # First match wins; later candidates are not examined.
                break
        except Exception as exc:
            # Best-effort: a failing candidate must not abort the whole search.
            deduplicationLogger.debug(str(exc))
            continue
251+
252+
219253def find_candidates_for_deduplication_uid_or_hash (test , findings , * , include_product_scope_filter ):
220254 base_queryset = build_dedupe_scope_queryset (test )
221255 hash_codes = {f .hash_code for f in findings if getattr (f , "hash_code" , None ) is not None }
@@ -325,8 +359,7 @@ def match_uid_or_hash_candidate(new_finding, candidates_by_uid, candidates_by_ha
325359 deduplicationLogger .debug ("UID_OR_HASH: combined candidate ids (sorted)=%s" , sorted (combined_by_id .keys ()))
326360 for candidate_id in sorted (combined_by_id .keys ()):
327361 candidate = combined_by_id [candidate_id ]
328- # Exclude self
329- if candidate .id == new_finding .id :
362+ if not _is_candidate_older (new_finding , candidate ):
330363 continue
331364 if is_deduplication_on_engagement_mismatch (new_finding , candidate ):
332365 deduplicationLogger .debug ("deduplication_on_engagement_mismatch, skipping dedupe." )
@@ -407,6 +440,7 @@ def _dedupe_batch_hash_code(findings):
407440 if not candidates_by_hash :
408441 return
409442 for new_finding in findings :
443+ deduplicationLogger .debug (f"deduplication start for finding { new_finding .id } with DEDUPE_ALGO_HASH_CODE" )
410444 match = match_hash_candidate (new_finding , candidates_by_hash )
411445 if match :
412446 try :
@@ -423,6 +457,7 @@ def _dedupe_batch_unique_id(findings):
423457 if not candidates_by_uid :
424458 return
425459 for new_finding in findings :
460+ deduplicationLogger .debug (f"deduplication start for finding { new_finding .id } with DEDUPE_ALGO_UNIQUE_ID_FROM_TOOL" )
426461 match = match_unique_id_candidate (new_finding , candidates_by_uid )
427462 if match :
428463 try :
@@ -440,6 +475,7 @@ def _dedupe_batch_uid_or_hash(findings):
440475 if not (candidates_by_uid or existing_by_hash ):
441476 return
442477 for new_finding in findings :
478+ deduplicationLogger .debug (f"deduplication start for finding { new_finding .id } with DEDUPE_ALGO_UNIQUE_ID_FROM_TOOL_OR_HASH_CODE" )
443479 if new_finding .duplicate :
444480 continue
445481
@@ -460,6 +496,7 @@ def _dedupe_batch_legacy(findings):
460496 if not (candidates_by_title or candidates_by_cwe ):
461497 return
462498 for new_finding in findings :
499+ deduplicationLogger .debug (f"deduplication start for finding { new_finding .id } with DEDUPE_ALGO_LEGACY" )
463500 match = match_legacy_candidate (new_finding , candidates_by_title , candidates_by_cwe )
464501 if match :
465502 try :
@@ -471,6 +508,10 @@ def _dedupe_batch_legacy(findings):
471508def dedupe_batch_of_findings (findings ):
472509 if not findings :
473510 return
511+
512+ # sort findings by id to ensure deduplication is deterministic/reproducible
513+ findings = sorted (findings , key = attrgetter ("id" ))
514+
474515 test = findings [0 ].test
475516 dedup_alg = test .deduplication_algorithm
476517
0 commit comments