|
7 | 7 | # See https://aboutcode.org for more information about nexB OSS projects. |
8 | 8 | # |
9 | 9 |
|
| 10 | +import hashlib |
| 11 | +import json |
10 | 12 | from collections import defaultdict |
11 | 13 | from typing import List |
12 | 14 |
|
|
16 | 18 | from vulnerabilities.models import AdvisorySetMember |
17 | 19 | from vulnerabilities.models import AdvisoryV2 |
18 | 20 | from vulnerabilities.models import Group |
| 21 | +from vulnerabilities.utils import normalize_list |
19 | 22 |
|
20 | 23 |
|
21 | 24 | @transaction.atomic |
@@ -138,25 +141,137 @@ def group_advisory_for_package(package, logger=None): |
138 | 141 | return |
139 | 142 |
|
140 | 143 |
|
def compute_advisory_content_hash(adv, version_less_purl_str: str):
    """
    Return a SHA-256 hex digest of the packages ``adv`` affects and fixes.

    Only impacts whose ``base_purl`` equals ``version_less_purl_str`` are
    taken into account. That string is passed in ready-made so the caller
    can build it once and reuse it for every advisory.

    The digest covers the ``package_url`` of each affecting and fixed-by
    package of those impacts (packages with a falsy ``package_url`` are
    ignored), after both lists have gone through ``normalize_list``.

    This walks ``adv.impacted_packages.all()`` and, for each impact,
    ``affecting_packages.all()`` and ``fixed_by_packages.all()``; callers are
    expected to have prefetched these relations beforehand.
    """
    matching_impacts = [
        impact
        for impact in adv.impacted_packages.all()
        if impact.base_purl == version_less_purl_str
    ]

    affected_purls = [
        package.package_url
        for impact in matching_impacts
        for package in impact.affecting_packages.all()
        if package.package_url
    ]
    fixed_purls = [
        package.package_url
        for impact in matching_impacts
        for package in impact.fixed_by_packages.all()
        if package.package_url
    ]

    # Compact separators and sorted keys keep the serialisation, and therefore
    # the digest, stable for equal content.
    payload = json.dumps(
        {
            "affected_packages": normalize_list(affected_purls),
            "fixed_packages": normalize_list(fixed_purls),
        },
        separators=(",", ":"),
        sort_keys=True,
    )
    digest = hashlib.sha256(payload.encode("utf-8"))
    return digest.hexdigest()
| 172 | + |
| 173 | + |
def get_merged_identifier_groups(advisories, alias_map: dict):
    """
    Partition ``advisories`` into groups sharing an advisory_id or an alias.

    Two advisories end up in the same group when an identifier of one (its
    ``advisory_id`` or the ``alias`` value of one of its alias objects) equals
    an identifier of the other, either directly or through a chain of other
    advisories.

    ``alias_map`` maps ``adv.id`` to the list of alias objects of that
    advisory. It is looked up with ``.get``, so an advisory missing from the
    map is treated as having no aliases and no query is issued here.

    Return a list of ``Group`` objects, one per set of connected advisories,
    in order of first appearance. For each group:

    - ``primary`` is the member with the highest ``precedence``; a ``None``
      precedence counts as -1 and the earliest member wins a tie.
    - ``secondaries`` holds the remaining members in input order.
    - ``aliases`` is the set of alias objects of all members.

    An empty input returns an empty list.
    """
    advisories = list(advisories)
    if not advisories:
        return []

    # Disjoint-set forest over positions in ``advisories``.
    parent = list(range(len(advisories)))
    rank = [0] * len(advisories)

    def find(x):
        # Path halving: point every visited node at its grandparent.
        while parent[x] != x:
            parent[x] = parent[parent[x]]
            x = parent[x]
        return x

    def union(x, y):
        rx, ry = find(x), find(y)
        if rx == ry:
            return
        # Union by rank: attach the shallower tree under the deeper one.
        if rank[rx] < rank[ry]:
            rx, ry = ry, rx
        parent[ry] = rx
        if rank[rx] == rank[ry]:
            rank[rx] += 1

    # First advisory position seen for each identifier; any later advisory
    # carrying the same identifier is merged with that position.
    identifier_to_idx: dict[str, int] = {}

    for i, adv in enumerate(advisories):
        identifiers = [adv.advisory_id]
        identifiers.extend(alias.alias for alias in alias_map.get(adv.id, []))
        for ident in identifiers:
            if ident in identifier_to_idx:
                union(i, identifier_to_idx[ident])
            else:
                identifier_to_idx[ident] = i

    # Collect members per root; dict insertion order keeps groups and their
    # members in input order.
    root_to_group: dict[int, list] = defaultdict(list)
    for i, adv in enumerate(advisories):
        root_to_group[find(i)].append(adv)

    final_groups: list[Group] = []

    for group_members in root_to_group.values():
        aliases = set()
        for adv in group_members:
            aliases.update(alias_map.get(adv.id, []))

        primary = max(
            group_members,
            key=lambda a: a.precedence if a.precedence is not None else -1,
        )
        # Identity comparison: exactly one member is the primary.
        secondaries = [a for a in group_members if a is not primary]
        final_groups.append(Group(aliases=aliases, primary=primary, secondaries=secondaries))

    return final_groups
| 238 | + |
| 239 | + |
def merge_advisories(advisories, package):
    """
    Group the advisories of ``package`` by content hash, then by identifiers.

    Advisories are first bucketed by ``compute_advisory_content_hash``, using
    the version-less purl string of ``package`` (type, namespace, name,
    qualifiers and subpath). Each bucket is then split further by
    ``get_merged_identifier_groups``.

    The aliases of every advisory are read once, via ``adv.aliases.all()``,
    into a single mapping keyed by advisory id that is shared by all buckets.

    Return the flat list of resulting groups, or an empty list when there are
    no advisories.
    """
    from packageurl import PackageURL

    advisories = list(advisories)
    if not advisories:
        return []

    # Built once here and reused for every advisory's content hash.
    base_purl = PackageURL(
        type=package.type,
        namespace=package.namespace,
        name=package.name,
        qualifiers=package.qualifiers,
        subpath=package.subpath,
    )
    version_less_purl_str = str(base_purl)

    alias_map: dict[int, list] = defaultdict(list)
    alias_map.update((adv.id, list(adv.aliases.all())) for adv in advisories)

    buckets: dict[str, list] = defaultdict(list)
    for adv in advisories:
        digest = compute_advisory_content_hash(adv, version_less_purl_str)
        buckets[digest].append(adv)

    return [
        merged_group
        for bucket in buckets.values()
        for merged_group in get_merged_identifier_groups(bucket, alias_map)
    ]
0 commit comments