Skip to content

Commit 0b965cd

Browse files
committed
Enhance grouping algo
Signed-off-by: Tushar Goel <tushar.goel.dav@gmail.com>
1 parent 9cfdd83 commit 0b965cd

3 files changed

Lines changed: 152 additions & 16 deletions

File tree

vulnerabilities/pipelines/v2_improvers/mark_unfurl_version_range.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
from django.db import transaction
1111
from django.db.models import Exists
12+
from django.db.models import Min
1213
from django.db.models import OuterRef
1314
from django.db.models import Q
1415

@@ -51,7 +52,7 @@ def mark_all_impacts_unfurled(self):
5152
impacted_packages=impacted_packages,
5253
)
5354

54-
for advisory_id in advisories_qs.iterator(chunk_size=100):
55+
for advisory_id in advisories_qs.iterator(chunk_size=1000):
5556
batch.append(advisory_id)
5657

5758
if len(batch) >= batch_size:
@@ -104,9 +105,12 @@ def latest_advisories_with_all_impacts_unfurled_attempted(
104105
_all_impacts_unfurled_successfully=False,
105106
is_latest=True,
106107
)
107-
.annotate(has_unattempted_impacts=Exists(impacts_not_attempted))
108+
.annotate(
109+
has_unattempted_impacts=Exists(impacts_not_attempted),
110+
first_base_purl=Min("impacted_packages__base_purl"),
111+
)
108112
.filter(has_unattempted_impacts=False)
109-
.order_by("_all_impacts_unfurled", "datasource_id")
113+
.order_by("_all_impacts_unfurled", "first_base_purl")
110114
.values_list("id", flat=True)
111115
)
112116

vulnerabilities/pipes/group_advisories.py

Lines changed: 123 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
# See https://aboutcode.org for more information about nexB OSS projects.
88
#
99

10+
import hashlib
11+
import json
1012
from collections import defaultdict
1113
from typing import List
1214

@@ -16,6 +18,7 @@
1618
from vulnerabilities.models import AdvisorySetMember
1719
from vulnerabilities.models import AdvisoryV2
1820
from vulnerabilities.models import Group
21+
from vulnerabilities.utils import normalize_list
1922

2023

2124
@transaction.atomic
@@ -138,25 +141,137 @@ def group_advisory_for_package(package, logger=None):
138141
return
139142

140143

144+
def compute_advisory_content_hash(adv, version_less_purl_str: str):
    """
    Return a SHA-256 hex digest of what ``adv`` says about a single package.

    Only the impacts of ``adv`` whose ``base_purl`` equals
    ``version_less_purl_str`` are considered. For those impacts, the
    non-empty ``package_url`` values of the affecting packages and of the
    fixed-by packages are gathered, passed through ``normalize_list``,
    serialized as compact JSON with sorted keys and then hashed.

    ``version_less_purl_str`` is expected to be built once by the caller and
    reused for every advisory of the same package.

    NOTE: this calls ``.all()`` on ``impacted_packages``, and on
    ``affecting_packages`` / ``fixed_by_packages`` of each impact, so callers
    should prefetch those relations to avoid per-advisory queries.
    """

    def non_empty_purls(related_packages):
        # Packages without a package_url carry no comparable content.
        return [pkg.package_url for pkg in related_packages.all() if pkg.package_url]

    affected_purls = []
    fixed_purls = []

    for impact in adv.impacted_packages.all():
        # Impacts recorded for other packages must not influence this hash.
        if impact.base_purl == version_less_purl_str:
            affected_purls += non_empty_purls(impact.affecting_packages)
            fixed_purls += non_empty_purls(impact.fixed_by_packages)

    # Sorted keys and fixed separators keep the serialized form stable.
    payload = json.dumps(
        {
            "affected_packages": normalize_list(affected_purls),
            "fixed_packages": normalize_list(fixed_purls),
        },
        separators=(",", ":"),
        sort_keys=True,
    )
    digest = hashlib.sha256(payload.encode("utf-8"))
    return digest.hexdigest()
172+
173+
174+
def get_merged_identifier_groups(advisories, alias_map: dict):
    """
    Merge advisories that share an ``advisory_id`` or an alias.

    ``alias_map`` maps an advisory ``id`` to the list of its alias objects
    (each exposing an ``alias`` attribute). It is built by the caller, so
    this function performs no alias lookups of its own; advisories missing
    from the map are treated as having no aliases.

    Advisories are linked through a union-find structure (union by rank with
    path halving): two advisories end up in the same group when they are
    connected, directly or transitively, by a common identifier.

    Return a list of ``Group`` objects, one per connected set. The primary of
    a group is the member with the highest ``precedence`` (``None`` counts as
    -1; the earliest member wins a tie), every other member is a secondary,
    and ``aliases`` is the set of all alias objects of the members.
    """
    from vulnerabilities.models import Group

    advisories = list(advisories)
    if not advisories:
        return []

    total = len(advisories)
    parent = list(range(total))
    rank = [0] * total

    def find(node):
        # Path halving: point each visited node at its grandparent.
        while parent[node] != node:
            parent[node] = parent[parent[node]]
            node = parent[node]
        return node

    def union(left, right):
        left_root, right_root = find(left), find(right)
        if left_root == right_root:
            return
        # Always attach the shallower tree below the deeper one.
        if rank[left_root] < rank[right_root]:
            left_root, right_root = right_root, left_root
        parent[right_root] = left_root
        if rank[left_root] == rank[right_root]:
            rank[left_root] += 1

    # Index of the first advisory seen carrying each identifier.
    first_owner: dict[str, int] = {}

    for index, advisory in enumerate(advisories):
        identifiers = [advisory.advisory_id]
        identifiers.extend(alias.alias for alias in alias_map.get(advisory.id, []))
        for identifier in identifiers:
            owner = first_owner.setdefault(identifier, index)
            if owner != index:
                union(index, owner)

    # Bucket advisories by the root of their set, keeping input order.
    components: dict[int, list] = defaultdict(list)
    for index, advisory in enumerate(advisories):
        components[find(index)].append(advisory)

    def precedence_or_lowest(advisory):
        # Advisories without a precedence rank below every explicit value.
        return -1 if advisory.precedence is None else advisory.precedence

    merged_groups: list[Group] = []

    for members in components.values():
        aliases = {
            alias
            for member in members
            for alias in alias_map.get(member.id, [])
        }
        primary = max(members, key=precedence_or_lowest)
        secondaries = [member for member in members if member is not primary]
        merged_groups.append(
            Group(aliases=aliases, primary=primary, secondaries=secondaries)
        )

    return merged_groups
238+
239+
141240
def merge_advisories(advisories, package):
    """
    Merge the advisories of ``package`` by content hash, then by identifiers.

    Advisories are first bucketed by ``compute_advisory_content_hash``,
    computed against the version-less purl of ``package``. Each bucket is
    then split into identifier groups by ``get_merged_identifier_groups``.

    The alias map (advisory ``id`` -> list of its aliases) is built once here
    and shared by every bucket. NOTE: it is filled by calling
    ``adv.aliases.all()`` on each advisory, so ``aliases`` should be
    prefetched by the caller to avoid one query per advisory.

    Return the list of all resulting groups; an empty input gives ``[]``.
    """
    from packageurl import PackageURL

    advisories = list(advisories)
    if not advisories:
        return []

    # The version is left out on purpose: impacts are matched on base purl.
    base_purl = PackageURL(
        type=package.type,
        namespace=package.namespace,
        name=package.name,
        qualifiers=package.qualifiers,
        subpath=package.subpath,
    )
    version_less_purl_str = str(base_purl)

    alias_map: dict[int, list] = defaultdict(list)
    alias_map.update((adv.id, list(adv.aliases.all())) for adv in advisories)

    buckets: dict[str, list] = defaultdict(list)
    for adv in advisories:
        buckets[compute_advisory_content_hash(adv, version_less_purl_str)].append(adv)

    return [
        merged_group
        for bucket in buckets.values()
        for merged_group in get_merged_identifier_groups(bucket, alias_map)
    ]

vulnerabilities/tests/test_advisory_merge.py

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#
99

1010
import hashlib
11+
from collections import defaultdict
1112

1213
import pytest
1314

@@ -19,10 +20,10 @@
1920
from vulnerabilities.models import ImpactedPackage
2021
from vulnerabilities.models import PackageV2
2122
from vulnerabilities.pipes.group_advisories import delete_and_save_advisory_set
23+
from vulnerabilities.pipes.group_advisories import get_merged_identifier_groups
2224
from vulnerabilities.pipes.group_advisories import merge_advisories
2325
from vulnerabilities.utils import compute_advisory_content_hash
2426
from vulnerabilities.utils import get_advisories_from_groups
25-
from vulnerabilities.utils import get_merged_identifier_groups
2627

2728

2829
@pytest.mark.django_db
@@ -91,7 +92,11 @@ def test_identifier_merging(self):
9192
adv1.aliases.add(alias)
9293
adv2.aliases.add(alias)
9394

94-
groups = get_merged_identifier_groups([adv1, adv2])
95+
alias_map: dict[int, list] = defaultdict(list)
96+
for adv in [adv1, adv2]:
97+
alias_map[adv.id] = list(adv.aliases.all())
98+
99+
groups = get_merged_identifier_groups([adv1, adv2], alias_map=alias_map)
95100

96101
assert len(groups) == 1
97102
identifiers, primary, secondary = groups[0]
@@ -112,7 +117,11 @@ def test_transitive_merge(self):
112117
a2.aliases.add(alias_2)
113118
a3.aliases.add(alias_2)
114119

115-
groups = get_merged_identifier_groups([a1, a2, a3])
120+
alias_map: dict[int, list] = defaultdict(list)
121+
for adv in [a1, a2, a3]:
122+
alias_map[adv.id] = list(adv.aliases.all())
123+
124+
groups = get_merged_identifier_groups([a1, a2, a3], alias_map=alias_map)
116125

117126
assert len(groups) == 1
118127

@@ -125,7 +134,11 @@ def test_primary_selection_by_precedence(self):
125134
a1.aliases.add(alias_1)
126135
a2.aliases.add(alias_1)
127136

128-
groups = get_merged_identifier_groups([a1, a2])
137+
alias_map: dict[int, list] = defaultdict(list)
138+
for adv in [a1, a2]:
139+
alias_map[adv.id] = list(adv.aliases.all())
140+
141+
groups = get_merged_identifier_groups([a1, a2], alias_map=alias_map)
129142
_, primary, _ = groups[0]
130143

131144
assert primary == a2
@@ -134,7 +147,11 @@ def test_get_advisories_from_groups(self):
134147
adv = self.create_advisory("GHSA-ABC-123", ["1.0"])
135148
adv.aliases.create(alias="CVE-999")
136149

137-
groups = get_merged_identifier_groups([adv])
150+
alias_map: dict[int, list] = defaultdict(list)
151+
for adv in [adv]:
152+
alias_map[adv.id] = list(adv.aliases.all())
153+
154+
groups = get_merged_identifier_groups([adv], alias_map=alias_map)
138155
result = get_advisories_from_groups(groups)
139156

140157
assert result[0].identifier == "GHSA-ABC-123"

0 commit comments

Comments
 (0)