Skip to content

Commit 4083a5b

Browse files
committed
Address review comments
Signed-off-by: Tushar Goel <tushar.goel.dav@gmail.com>
1 parent 309f7fd commit 4083a5b

File tree

8 files changed

+318
-285
lines changed

8 files changed

+318
-285
lines changed

vulnerabilities/importer.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ def __lt__(self, other):
7373

7474
# TODO: Add cache
7575
def _cmp_key(self):
76-
return (str(self.system), self.value, self.scoring_elements, self.published_at)
76+
return (self.system.identifier, self.value, self.scoring_elements, self.published_at)
7777

7878
@classmethod
7979
def from_dict(cls, severity: dict):
@@ -190,7 +190,11 @@ def __lt__(self, other):
190190

191191
# TODO: Add cache
192192
def _cmp_key(self):
193-
return (str(self.package), str(self.affected_version_range), str(self.fixed_version))
193+
return (
194+
str(self.package),
195+
str(self.affected_version_range or ""),
196+
str(self.fixed_version or ""),
197+
)
194198

195199
@classmethod
196200
def merge(

vulnerabilities/migrations/0089_alter_advisory_unique_content_id.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Generated by Django 4.2.16 on 2025-02-06 14:27
1+
# Generated by Django 4.2.16 on 2025-02-12 13:41
22

33
from django.db import migrations, models
44

@@ -13,6 +13,11 @@ class Migration(migrations.Migration):
1313
migrations.AlterField(
1414
model_name="advisory",
1515
name="unique_content_id",
16-
field=models.CharField(blank=True, max_length=64),
16+
field=models.CharField(
17+
blank=True,
18+
db_index=True,
19+
help_text="A 64 character unique identifier for the content of the advisory since we use sha256 as hex",
20+
max_length=64,
21+
),
1722
),
1823
]

vulnerabilities/models.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1317,7 +1317,9 @@ class Advisory(models.Model):
13171317

13181318
unique_content_id = models.CharField(
13191319
max_length=64,
1320+
db_index=True,
13201321
blank=True,
1322+
help_text="A 64 character unique identifier for the content of the advisory since we use sha256 as hex",
13211323
)
13221324
aliases = models.JSONField(blank=True, default=list, help_text="A list of alias strings")
13231325
summary = models.TextField(
@@ -1372,7 +1374,7 @@ def save(self, *args, **kwargs):
13721374

13731375
def save(self, *args, **kwargs):
13741376
advisory_data = self.to_advisory_data()
1375-
self.unique_content_id = compute_content_id(advisory_data, include_metadata=False)[:31]
1377+
self.unique_content_id = compute_content_id(advisory_data, include_metadata=False)
13761378
super().save(*args, **kwargs)
13771379

13781380
def to_advisory_data(self) -> "AdvisoryData":

vulnerabilities/pipelines/remove_duplicate_advisories.py

Lines changed: 43 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@
99

1010
import logging
1111
from itertools import groupby
12-
from operator import attrgetter
1312

13+
from aboutcode.pipeline import LoopProgress
1414
from django.db.models import Count
1515
from django.db.models import Q
1616

@@ -26,54 +26,58 @@ class RemoveDuplicateAdvisoriesPipeline(VulnerableCodePipeline):
2626

2727
@classmethod
2828
def steps(cls):
29-
return (cls.remove_duplicates,)
29+
return (
30+
cls.recompute_content_ids,
31+
cls.remove_duplicates,
32+
)
3033

3134
def remove_duplicates(self):
3235
"""
3336
Find advisories with the same content and keep only the latest one.
3437
"""
35-
# Get all advisories that have duplicates based on content ID
36-
duplicate_content_ids = (
37-
Advisory.objects.values("unique_content_id")
38-
.annotate(count=Count("id"))
39-
.filter(count__gt=1)
40-
.values_list("unique_content_id", flat=True)
41-
)
4238

43-
self.log(
44-
f"Found {len(duplicate_content_ids)} content IDs with duplicates", level=logging.INFO
39+
duplicated_advisories = groupby(
40+
Advisory.objects.order_by("unique_content_id").all().paginated(),
41+
key=lambda x: x.unique_content_id,
4542
)
46-
47-
for content_id in duplicate_content_ids:
48-
# Get all advisories with this content ID
49-
advisories = Advisory.objects.filter(unique_content_id=content_id)
50-
51-
# Find the latest advisory
52-
latest = advisories.latest("date_imported")
53-
54-
# Delete all except the latest
55-
advisories.exclude(id=latest.id).delete()
56-
57-
if self.log:
58-
self.log(
59-
f"Kept advisory {latest.id} and removed "
60-
f"{advisories.count() - 1} duplicates for content ID {content_id}",
61-
level=logging.INFO,
62-
)
63-
64-
def update_content_ids(self):
43+
progress = LoopProgress(total_iterations=Advisory.objects.count(), logger=self.log)
44+
for _content_id, advisories in progress.iter(duplicated_advisories):
45+
advisories = list(advisories)
46+
self.log(
47+
f"Removing duplicates for content ID {_content_id} {len(advisories)}",
48+
level=logging.INFO,
49+
)
50+
oldest = min(advisories, key=lambda x: x.date_imported)
51+
try:
52+
advisory_ids = []
53+
for adv in advisories:
54+
if adv.id != oldest.id:
55+
advisory_ids.append(adv.id)
56+
Advisory.objects.filter(id__in=advisory_ids).delete()
57+
except Exception as e:
58+
self.log(f"Error deleting advisories: {e}", level=logging.ERROR)
59+
60+
self.log(
61+
f"Kept advisory {oldest.id} and removed "
62+
f"{len(list(advisories)) - 1} duplicates for content ID {_content_id}",
63+
level=logging.INFO,
64+
)
65+
66+
def recompute_content_ids(self):
6567
"""
66-
Update content IDs for all advisories that don't have one.
68+
Recompute content IDs for all advisories.
6769
"""
68-
advisories = Advisory.objects.filter(
69-
Q(unique_content_id="") | Q(unique_content_id__isnull=True)
70-
)
7170

72-
self.log(f"Found {advisories.count()} advisories without content ID", level=logging.INFO)
71+
advisories = []
72+
73+
progress = LoopProgress(
74+
total_iterations=Advisory.objects.count(),
75+
progress_step=1,
76+
logger=self.log,
77+
)
7378

74-
for advisory in advisories:
79+
for advisory in progress.iter(Advisory.objects.all().paginated()):
7580
advisory.unique_content_id = compute_content_id(advisory)
76-
advisory.save()
81+
advisories.append(advisory)
7782

78-
if self.log:
79-
self.log(f"Updated content ID for advisory {advisory.id}", level=logging.DEBUG)
83+
Advisory.objects.bulk_update(advisories, ["unique_content_id"], batch_size=1000)

0 commit comments

Comments
 (0)