Skip to content

Commit 84e2f6f

Browse files
UNIQUE_ID_OR_HASH_CODE: dont stop after one candidate (#13513)
* UNIQUE_ID_OR_HASH_CODE: dont stop after one candidate * docs: add upgrade note
1 parent fd5b2fb commit 84e2f6f

File tree

3 files changed

+117
-32
lines changed

3 files changed

+117
-32
lines changed

docs/content/en/open_source/upgrading/2.52.md

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@ weight: -20251006
55
description: MobSF parsers & Helm chart changes.
66
---
77

8+
## Deduplication fix of `UNIQUE_ID_OR_HASH_CODE`
9+
A bug was fixed in the `UNIQUE_ID_OR_HASH_CODE` algorithm where it stopped processing candidate findings with equal `unique_id_from_tool` or `hash_code` value.
10+
Strictly speaking this is not a breaking change, but we wanted to make you aware that you may see more (better) deduplication for parsers using this algorithm.
11+
812
## Merge of MobSF parsers
913

1014
"Mobsfscan Scan" has been merged into the "MobSF Scan" parser. The "Mobsfscan Scan" scan_type has been retained to keep deduplication working for existing Tests, but users are encouraged to move to the "MobSF Scan" scan_type.
@@ -17,16 +21,16 @@ This release introduces more important changes to the Helm chart configuration:
1721

1822
#### Tags
1923

20-
`tag` and `repositoryPrefix` fields have been deprecated. Currently, image tags used in containers are derived by default from the `appVersion` defined in the Chart.
21-
This behavior can be overridden by setting the `tag` value in `images.django` and `images.nginx`.
22-
If fine-tuning is necessary, each container’s image value can also be customized individually (`celery.beat.image`, `celery.worker.image`, `django.nginx.image`, `django.uwsgi.image`, `initializer.image`, and `dbMigrationChecker.image`).
24+
`tag` and `repositoryPrefix` fields have been deprecated. Currently, image tags used in containers are derived by default from the `appVersion` defined in the Chart.
25+
This behavior can be overridden by setting the `tag` value in `images.django` and `images.nginx`.
26+
If fine-tuning is necessary, each container’s image value can also be customized individually (`celery.beat.image`, `celery.worker.image`, `django.nginx.image`, `django.uwsgi.image`, `initializer.image`, and `dbMigrationChecker.image`).
2327
Digest pinning is now supported as well.
2428

2529
#### Security context
2630

2731
This Helm chart extends security context capabilities to all deployed pods and containers.
2832
You can define a default pod and container security context globally using `securityContext.podSecurityContext` and `securityContext.containerSecurityContext` keys.
29-
Additionally, each deployment can specify its own pod and container security contexts, which will override or merge with the global ones.
33+
Additionally, each deployment can specify its own pod and container security contexts, which will override or merge with the global ones.
3034

3135
#### Fine-grained resources
3236

dojo/utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -508,7 +508,7 @@ def deduplicate_uid_or_hash_code(new_finding):
508508
id=new_finding.id).exclude(
509509
duplicate=True).order_by("id")
510510
deduplicationLogger.debug("Found "
511-
+ str(len(existing_findings)) + " findings with either the same unique_id_from_tool or hash_code")
511+
+ str(len(existing_findings)) + " findings with either the same unique_id_from_tool or hash_code: " + str([find.id for find in existing_findings]))
512512
for find in existing_findings:
513513
if is_deduplication_on_engagement_mismatch(new_finding, find):
514514
deduplicationLogger.debug(
@@ -517,10 +517,10 @@ def deduplicate_uid_or_hash_code(new_finding):
517517
try:
518518
if are_endpoints_duplicates(new_finding, find):
519519
set_duplicate(new_finding, find)
520+
break
520521
except Exception as e:
521522
deduplicationLogger.debug(str(e))
522523
continue
523-
break
524524

525525

526526
def set_duplicate(new_finding, existing_finding):

unittests/test_deduplication_logic.py

Lines changed: 107 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1181,55 +1181,136 @@ def test_dedupe_same_id_different_test_type_unique_id_or_hash_code(self):
11811181
# expect not duplicate as the matching finding is from another test_type, hash_code is also different
11821182
self.assert_finding(finding_new, not_pk=224, duplicate=True, duplicate_finding_id=224, hash_code=finding_224.hash_code)
11831183

1184-
def test_identical_different_endpoints_unique_id_or_hash_code(self):
1184+
def test_identical_different_endpoints_unique_id_or_hash_code_dynamic(self):
11851185
# create identical copy, so unique id is the same
1186-
finding_new, finding_224 = self.copy_and_reset_finding(find_id=224)
1186+
finding_new1, finding_224 = self.copy_and_reset_finding(find_id=224)
11871187

1188-
finding_new.save(dedupe_option=False)
1189-
ep1 = Endpoint(product=finding_new.test.engagement.product, finding=finding_new, host="myhost.com", protocol="https")
1188+
finding_new1.save(dedupe_option=False)
1189+
ep1 = Endpoint(product=finding_new1.test.engagement.product, finding=finding_new1, host="myhost.com", protocol="https")
11901190
ep1.save()
1191-
finding_new.endpoints.add(ep1)
1192-
finding_new.save()
1191+
finding_new1.endpoints.add(ep1)
1192+
finding_new1.save()
11931193

11941194
if settings.DEDUPE_ALGO_ENDPOINT_FIELDS == []:
11951195
# expect duplicate, as endpoints shouldn't affect dedupe and hash_code due to unique_id
1196-
self.assert_finding(finding_new, not_pk=224, duplicate=True, duplicate_finding_id=224, hash_code=finding_224.hash_code)
1196+
self.assert_finding(finding_new1, not_pk=224, duplicate=True, duplicate_finding_id=224, hash_code=finding_224.hash_code)
11971197
else:
1198-
self.assert_finding(finding_new, not_pk=224, duplicate=False, duplicate_finding_id=None, hash_code=finding_224.hash_code)
1198+
# endpoints don't match with 224, so not a duplicate
1199+
self.assert_finding(finding_new1, not_pk=224, duplicate=False, duplicate_finding_id=None, hash_code=finding_224.hash_code)
1200+
1201+
# remove the finding to prevent it from being duplicated by the next finding we create
1202+
finding_new1.delete()
11991203

12001204
# same scenario, now with different uid. and different endpoints, but hash will be different due the endpoints because we set dynamic_finding to True
1201-
finding_new, finding_224 = self.copy_and_reset_finding(find_id=224)
1205+
finding_new2, finding_224 = self.copy_and_reset_finding(find_id=224)
12021206

1203-
finding_new.save(dedupe_option=False)
1204-
ep1 = Endpoint(product=finding_new.test.engagement.product, finding=finding_new, host="myhost.com", protocol="https")
1207+
finding_new2.save(dedupe_option=False)
1208+
ep1 = Endpoint(product=finding_new1.test.engagement.product, finding=finding_new2, host="myhost.com", protocol="https")
12051209
ep1.save()
1206-
finding_new.endpoints.add(ep1)
1207-
finding_new.unique_id_from_tool = 1
1208-
finding_new.dynamic_finding = True
1209-
finding_new.save()
1210+
finding_new2.endpoints.add(ep1)
1211+
finding_new2.unique_id_from_tool = 1
1212+
finding_new2.dynamic_finding = True
1213+
finding_new2.save()
12101214

12111215
if settings.DEDUPE_ALGO_ENDPOINT_FIELDS == []:
12121216
# different uid. and different endpoints, but endpoints not used for hash anymore -> duplicate
1213-
self.assert_finding(finding_new, not_pk=224, duplicate=True, hash_code=finding_224.hash_code)
1217+
self.assert_finding(finding_new2, not_pk=224, duplicate=True, duplicate_finding_id=224, hash_code=finding_224.hash_code)
1218+
else:
1219+
# endpoints do not match with 224
1220+
self.assert_finding(finding_new1, not_pk=224, duplicate=False, duplicate_finding_id=None, hash_code=finding_224.hash_code)
1221+
1222+
def test_identical_different_endpoints_unique_id_or_hash_code_static(self):
1223+
# create identical copy, so unique id is the same
1224+
finding_new1, finding_224 = self.copy_and_reset_finding(find_id=224)
1225+
1226+
finding_new1.save(dedupe_option=False)
1227+
ep1 = Endpoint(product=finding_new1.test.engagement.product, finding=finding_new1, host="myhost.com", protocol="https")
1228+
ep1.save()
1229+
finding_new1.endpoints.add(ep1)
1230+
finding_new1.save()
1231+
1232+
if settings.DEDUPE_ALGO_ENDPOINT_FIELDS == []:
1233+
# expect duplicate, as endpoints shouldn't affect dedupe and hash_code due to unique_id
1234+
self.assert_finding(finding_new1, not_pk=224, duplicate=True, duplicate_finding_id=224, hash_code=finding_224.hash_code)
12141235
else:
1215-
self.assert_finding(finding_new, not_pk=224, duplicate=False, hash_code=finding_224.hash_code)
1236+
# endpoints don't match with 224, so not a duplicate
1237+
self.assert_finding(finding_new1, not_pk=224, duplicate=False, duplicate_finding_id=None, hash_code=finding_224.hash_code)
1238+
1239+
# remove the finding to prevent it from being duplicated by the next finding we create
1240+
finding_new1.delete()
12161241

12171242
# same scenario, now with different uid. and different endpoints
1218-
finding_new, finding_224 = self.copy_and_reset_finding(find_id=224)
1243+
finding_new3, finding_224 = self.copy_and_reset_finding(find_id=224)
12191244

1220-
finding_new.save(dedupe_option=False)
1221-
ep1 = Endpoint(product=finding_new.test.engagement.product, finding=finding_new, host="myhost.com", protocol="https")
1245+
finding_new3.save(dedupe_option=False)
1246+
ep1 = Endpoint(product=finding_new3.test.engagement.product, finding=finding_new3, host="myhost.com", protocol="https")
12221247
ep1.save()
1223-
finding_new.endpoints.add(ep1)
1224-
finding_new.unique_id_from_tool = 1
1225-
finding_new.dynamic_finding = False
1226-
finding_new.save()
1248+
finding_new3.endpoints.add(ep1)
1249+
finding_new3.unique_id_from_tool = 1
1250+
finding_new3.dynamic_finding = False
1251+
finding_new3.save()
1252+
1253+
if settings.DEDUPE_ALGO_ENDPOINT_FIELDS == []:
1254+
# different uid. and different endpoints; dynamic_finding is set to False, so hash_code is still not affected by endpoints
1255+
self.assert_finding(finding_new3, not_pk=224, duplicate=True, duplicate_finding_id=224, hash_code=finding_224.hash_code)
1256+
else:
1257+
# endpoints do not match with 224
1258+
self.assert_finding(finding_new1, not_pk=224, duplicate=False, duplicate_finding_id=None, hash_code=finding_224.hash_code)
1259+
1260+
def test_identical_different_endpoints_unique_id_or_hash_code_multiple(self):
1261+
# create identical copy, so unique id is the same
1262+
finding_new1, finding_224 = self.copy_and_reset_finding(find_id=224)
1263+
1264+
finding_new1.save(dedupe_option=False)
1265+
ep1 = Endpoint(product=finding_new1.test.engagement.product, finding=finding_new1, host="myhost.com", protocol="https")
1266+
ep1.save()
1267+
finding_new1.endpoints.add(ep1)
1268+
finding_new1.save()
1269+
1270+
if settings.DEDUPE_ALGO_ENDPOINT_FIELDS == []:
1271+
# expect duplicate, as endpoints shouldn't affect dedupe and hash_code due to unique_id
1272+
self.assert_finding(finding_new1, not_pk=224, duplicate=True, duplicate_finding_id=224, hash_code=finding_224.hash_code)
1273+
else:
1274+
# endpoints don't match with 224, so not a duplicate
1275+
self.assert_finding(finding_new1, not_pk=224, duplicate=False, duplicate_finding_id=None, hash_code=finding_224.hash_code)
1276+
1277+
# same scenario, now with different uid. and different endpoints, but hash will be different due to the endpoints because we set dynamic_finding to True
1278+
finding_new2, finding_224 = self.copy_and_reset_finding(find_id=224)
1279+
1280+
finding_new2.save(dedupe_option=False)
1281+
ep1 = Endpoint(product=finding_new1.test.engagement.product, finding=finding_new2, host="myhost.com", protocol="https")
1282+
ep1.save()
1283+
finding_new2.endpoints.add(ep1)
1284+
finding_new2.unique_id_from_tool = 1
1285+
finding_new2.dynamic_finding = True
1286+
finding_new2.save()
1287+
1288+
if settings.DEDUPE_ALGO_ENDPOINT_FIELDS == []:
1289+
# different uid. and different endpoints, but endpoints not used for hash anymore -> duplicate
1290+
self.assert_finding(finding_new2, not_pk=224, duplicate=True, duplicate_finding_id=224, hash_code=finding_224.hash_code)
1291+
else:
1292+
# endpoints do not match with 224, but they do match with the finding just created. this proves that the dedupe algo considers more than only the first
1293+
# candidate https://github.com/DefectDojo/django-DefectDojo/issues/13497
1294+
self.assert_finding(finding_new2, not_pk=224, duplicate=True, duplicate_finding_id=finding_new1.pk, hash_code=finding_224.hash_code)
1295+
1296+
# same scenario, now with different uid. and different endpoints
1297+
finding_new3, finding_224 = self.copy_and_reset_finding(find_id=224)
1298+
1299+
finding_new3.save(dedupe_option=False)
1300+
ep1 = Endpoint(product=finding_new3.test.engagement.product, finding=finding_new3, host="myhost.com", protocol="https")
1301+
ep1.save()
1302+
finding_new3.endpoints.add(ep1)
1303+
finding_new3.unique_id_from_tool = 1
1304+
finding_new3.dynamic_finding = False
1305+
finding_new3.save()
12271306

12281307
if settings.DEDUPE_ALGO_ENDPOINT_FIELDS == []:
12291308
# different uid. and different endpoints, dynamic_finding is set to False hash_code still not affected by endpoints
1230-
self.assert_finding(finding_new, not_pk=224, duplicate=True, duplicate_finding_id=224, hash_code=finding_224.hash_code)
1309+
self.assert_finding(finding_new3, not_pk=224, duplicate=True, duplicate_finding_id=224, hash_code=finding_224.hash_code)
12311310
else:
1232-
self.assert_finding(finding_new, not_pk=224, duplicate=False, duplicate_finding_id=None, hash_code=finding_224.hash_code)
1311+
# endpoints do not match with 224, but they do match with the finding just created. this proves that the dedupe algo considers more than only the first
1312+
# candidate https://github.com/DefectDojo/django-DefectDojo/issues/13497
1313+
self.assert_finding(finding_new3, not_pk=224, duplicate=True, duplicate_finding_id=finding_new1.pk, hash_code=finding_224.hash_code)
12331314

12341315
# # some extra tests
12351316

0 commit comments

Comments
 (0)