Merge pull request #2207 from aboutcode-org/store_advisory_content_hash

TG1999 · web-flow · commit 281ae60c40e5 · 2026-03-16T15:34:36.000+05:30
Store advisory content hash
diff --git a/vulnerabilities/improvers/__init__.py b/vulnerabilities/improvers/__init__.py
@@ -20,6 +20,7 @@
 from vulnerabilities.pipelines import populate_vulnerability_summary_pipeline
 from vulnerabilities.pipelines import remove_duplicate_advisories
 from vulnerabilities.pipelines.v2_improvers import collect_ssvc_trees
+from vulnerabilities.pipelines.v2_improvers import compute_advisory_content_hash
 from vulnerabilities.pipelines.v2_improvers import compute_advisory_todo as compute_advisory_todo_v2
 from vulnerabilities.pipelines.v2_improvers import compute_package_risk as compute_package_risk_v2
 from vulnerabilities.pipelines.v2_improvers import (
@@ -74,5 +75,6 @@
         compute_advisory_todo.ComputeToDo,
         collect_ssvc_trees.CollectSSVCPipeline,
         relate_severities.RelateSeveritiesPipeline,
+        compute_advisory_content_hash.ComputeAdvisoryContentHash,
     ]
 )
diff --git a/vulnerabilities/migrations/0116_advisoryv2_advisory_content_hash.py b/vulnerabilities/migrations/0116_advisoryv2_advisory_content_hash.py
@@ -0,0 +1,23 @@
+# Generated by Django 5.2.11 on 2026-03-11 08:46
+
+from django.db import migrations, models
+
+
+# Schema migration: adds the optional ``advisory_content_hash`` column to the
+# ``advisoryv2`` model.
+class Migration(migrations.Migration):
+
+    # Must be applied after migration 0115 of the same app.
+    dependencies = [
+        ("vulnerabilities", "0115_impactedpackageaffecting_and_more"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="advisoryv2",
+            name="advisory_content_hash",
+            field=models.CharField(
+                blank=True,
+                help_text="A unique hash computed from the content of the advisory used to identify advisories with the same content.",
+                # NOTE(review): 64 is the length of a SHA-256 hex digest; confirm
+                # that this is what vulnerabilities.utils.compute_advisory_content returns.
+                max_length=64,
+                # Nullable, so the field is added without supplying a default for
+                # existing rows.
+                null=True,
+            ),
+        ),
+    ]
diff --git a/vulnerabilities/models.py b/vulnerabilities/models.py
@@ -3010,6 +3010,13 @@ class AdvisoryV2(models.Model):
         help_text="Related advisories that are used to calculate the severity of this advisory.",
     )
 
+    advisory_content_hash = models.CharField(
+        max_length=64,
+        blank=True,
+        null=True,
+        help_text="A unique hash computed from the content of the advisory used to identify advisories with the same content.",
+    )
+
     @property
     def risk_score(self):
         """
@@ -3078,35 +3085,6 @@ def get_aliases(self):
         """
         return self.aliases.all()
 
-    def compute_advisory_content(self):
-        """
-        Compute a unique content hash for an advisory by normalizing its data and hashing it.
-
-        :param advisory: An Advisory object
-        :return: SHA-256 hash digest as content hash
-        """
-        normalized_data = {
-            "summary": normalize_text(self.summary),
-            "impacted_packages": sorted(
-                [impact.to_dict() for impact in self.impacted_packages.all()],
-                key=lambda x: json.dumps(x, sort_keys=True),
-            ),
-            "patches": sorted(
-                [patch.to_patch_data().to_dict() for patch in self.patches.all()],
-                key=lambda x: json.dumps(x, sort_keys=True),
-            ),
-            "severities": sorted(
-                [sev.to_vulnerability_severity_data().to_dict() for sev in self.severities.all()],
-                key=lambda x: (x.get("system"), x.get("value")),
-            ),
-            "weaknesses": normalize_list([weakness.cwe_id for weakness in self.weaknesses.all()]),
-        }
-
-        normalized_json = json.dumps(normalized_data, separators=(",", ":"), sort_keys=True)
-        content_hash = hashlib.sha256(normalized_json.encode("utf-8")).hexdigest()
-
-        return content_hash
-
     alias = get_aliases
 
 
diff --git a/vulnerabilities/pipelines/v2_importers/alpine_linux_importer.py b/vulnerabilities/pipelines/v2_importers/alpine_linux_importer.py
@@ -193,7 +193,8 @@ def load_advisories(
 
             fixed_version_range = None
             try:
-                fixed_version_range = AlpineLinuxVersionRange.from_versions([version])
+                if version:
+                    fixed_version_range = AlpineLinuxVersionRange.from_versions([version])
             except InvalidVersion as e:
                 logger(
                     f"{version!r} is not a valid AlpineVersion {e!r}",
diff --git a/vulnerabilities/pipelines/v2_importers/apache_httpd_importer.py b/vulnerabilities/pipelines/v2_importers/apache_httpd_importer.py
@@ -330,19 +330,20 @@ def to_version_ranges(self, versions_data, fixed_versions):
                 "=": "=",
             }
             comparator = comparator_by_range_expression.get(range_expression)
-            if comparator:
+            if comparator and version_value and version_value not in self.ignorable_versions:
                 constraints.append(
                     VersionConstraint(comparator=comparator, version=SemverVersion(version_value))
                 )
 
         for fixed_version in fixed_versions:
             # The VersionConstraint method `invert()` inverts the fixed_version's comparator,
             # enabling inclusion of multiple fixed versions with the `affected_version_range` values.
-            constraints.append(
-                VersionConstraint(
-                    comparator="=",
-                    version=SemverVersion(fixed_version),
-                ).invert()
-            )
+            if fixed_version and fixed_version not in self.ignorable_versions:
+                constraints.append(
+                    VersionConstraint(
+                        comparator="=",
+                        version=SemverVersion(fixed_version),
+                    ).invert()
+                )
 
         return ApacheVersionRange(constraints=constraints)
diff --git a/vulnerabilities/pipelines/v2_importers/elixir_security_importer.py b/vulnerabilities/pipelines/v2_importers/elixir_security_importer.py
@@ -35,6 +35,7 @@ class ElixirSecurityImporterPipeline(VulnerableCodeBaseImporterPipelineV2):
     spdx_license_expression = "CC0-1.0"
     license_url = "https://github.com/dependabot/elixir-security-advisories/blob/master/LICENSE.txt"
     repo_url = "git+https://github.com/dependabot/elixir-security-advisories"
+    run_once = True
 
     precedence = 200
 
diff --git a/vulnerabilities/pipelines/v2_importers/gitlab_importer.py b/vulnerabilities/pipelines/v2_importers/gitlab_importer.py
@@ -252,6 +252,7 @@ def parse_gitlab_advisory(
             original_advisory_text=json.dumps(gitlab_advisory, indent=2, ensure_ascii=False),
         )
     affected_version_range = None
+    fixed_version_range = None
     fixed_versions = gitlab_advisory.get("fixed_versions") or []
     affected_range = gitlab_advisory.get("affected_range")
     gitlab_native_schemes = set(["pypi", "gem", "npm", "go", "packagist", "conan"])
@@ -285,7 +286,8 @@ def parse_gitlab_advisory(
     if affected_version_range:
         vrc = affected_version_range.__class__
 
-    fixed_version_range = vrc.from_versions(parsed_fixed_versions)
+    if parsed_fixed_versions:
+        fixed_version_range = vrc.from_versions(parsed_fixed_versions)
     if not fixed_version_range and not affected_version_range:
         return
 
diff --git a/vulnerabilities/pipelines/v2_importers/ruby_importer.py b/vulnerabilities/pipelines/v2_importers/ruby_importer.py
@@ -162,7 +162,9 @@ def get_affected_packages(record, purl):
     affected_packages = []
     for unaffected_version in record.get("unaffected_versions", []):
         try:
-            affected_version_range = GemVersionRange.from_native(unaffected_version).invert()
+            if unaffected_version:
+                unaffected_version = unaffected_version.strip()
+                affected_version_range = GemVersionRange.from_native(unaffected_version).invert()
             validate_comparators(affected_version_range.constraints)
             affected_packages.append(
                 AffectedPackageV2(
diff --git a/vulnerabilities/pipelines/v2_improvers/compute_advisory_content_hash.py b/vulnerabilities/pipelines/v2_improvers/compute_advisory_content_hash.py
@@ -0,0 +1,62 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# VulnerableCode is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/aboutcode-org/vulnerablecode for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+
+from aboutcode.pipeline import LoopProgress
+
+from vulnerabilities.models import AdvisoryV2
+from vulnerabilities.pipelines import VulnerableCodePipeline
+from vulnerabilities.utils import compute_advisory_content
+
+
+class ComputeAdvisoryContentHash(VulnerableCodePipeline):
+    """
+    Backfill ``advisory_content_hash`` on AdvisoryV2 rows that do not have one yet.
+
+    Only rows whose ``advisory_content_hash`` is NULL are selected, so rows that
+    already carry a value are not recomputed.
+    """
+
+    pipeline_id = "compute_advisory_content_hash_v2"
+
+    @classmethod
+    def steps(cls):
+        """Return the pipeline steps: the single hash-computation step."""
+        return (cls.compute_advisory_content_hash,)
+
+    def compute_advisory_content_hash(self):
+        """
+        Compute and store the content hash of every AdvisoryV2 without one.
+
+        Advisories are streamed from the database, their hash is computed with
+        ``compute_advisory_content`` and the modified objects are buffered and
+        written back with ``bulk_update`` in groups of ``batch_size``.
+        """
+
+        # Select only the advisories that still lack a hash.
+        advisories = AdvisoryV2.objects.filter(advisory_content_hash__isnull=True)
+
+        advisories_count = advisories.count()
+
+        # NOTE(review): progress_step is presumably the reporting interval of
+        # LoopProgress -- confirm its unit against aboutcode.pipeline.
+        progress = LoopProgress(
+            total_iterations=advisories_count,
+            logger=self.log,
+            progress_step=1,
+        )
+
+        # Buffer of modified advisories waiting to be written back.
+        to_update = []
+        batch_size = 5000
+
+        for advisory in progress.iter(advisories.iterator(chunk_size=batch_size)):
+            # NOTE(review): the helper receives an AdvisoryV2 instance positionally
+            # here, while vulnerabilities/pipes/advisory.py calls it with the
+            # ``advisory_data=`` keyword -- confirm it supports both inputs.
+            advisory.advisory_content_hash = compute_advisory_content(advisory)
+            to_update.append(advisory)
+
+            # Flush as soon as the buffer holds a full batch.
+            if len(to_update) >= batch_size:
+                AdvisoryV2.objects.bulk_update(
+                    to_update,
+                    ["advisory_content_hash"],
+                    batch_size=batch_size,
+                )
+                to_update.clear()
+
+        # Flush the remaining advisories that did not fill a whole batch.
+        if to_update:
+            AdvisoryV2.objects.bulk_update(
+                to_update,
+                ["advisory_content_hash"],
+                batch_size=batch_size,
+            )
+
+        self.log("Finished computing advisory_content_hash")
diff --git a/vulnerabilities/pipes/advisory.py b/vulnerabilities/pipes/advisory.py
@@ -48,6 +48,7 @@
 from vulnerabilities.models import VulnerabilitySeverity
 from vulnerabilities.models import Weakness
 from vulnerabilities.pipes.univers_utils import get_exact_purls_v2
+from vulnerabilities.utils import compute_advisory_content
 
 
 def get_or_create_aliases(aliases: List) -> QuerySet:
@@ -301,6 +302,7 @@ def insert_advisory_v2(
     advisory_obj = None
     created = False
     content_id = compute_content_id_v2(advisory_data=advisory)
+    advisory_content_hash = compute_advisory_content(advisory_data=advisory)
     try:
         default_data = {
             "datasource_id": pipeline_id,
@@ -311,6 +313,7 @@ def insert_advisory_v2(
             "original_advisory_text": advisory.original_advisory_text,
             "url": advisory.url,
             "precedence": precedence,
+            "advisory_content_hash": advisory_content_hash,
         }
 
         advisory_obj, created = AdvisoryV2.objects.get_or_create(
diff --git a/vulnerabilities/tests/pipelines/v2_improvers/test_compute_advisory_content_hash.py b/vulnerabilities/tests/pipelines/v2_improvers/test_compute_advisory_content_hash.py
@@ -0,0 +1,88 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# VulnerableCode is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/aboutcode-org/vulnerablecode for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+from unittest.mock import patch
+
+import pytest
+
+from vulnerabilities.models import AdvisoryV2
+from vulnerabilities.pipelines.v2_improvers.compute_advisory_content_hash import (
+    ComputeAdvisoryContentHash,
+)
+
+pytestmark = pytest.mark.django_db
+
+
+@pytest.fixture
+def advisory_factory():
+    """Return a ``_create`` callable that bulk-creates AdvisoryV2 rows for tests."""
+
+    def _create(count, with_hash=False, start=0):
+        """
+        Bulk-create ``count`` advisories numbered from ``start`` and return the
+        result of ``bulk_create``.
+
+        When ``with_hash`` is true each advisory gets the hash "existing_hash",
+        otherwise its ``advisory_content_hash`` is None.
+        """
+        objs = []
+        for i in range(start, start + count):
+            objs.append(
+                AdvisoryV2(
+                    summary=f"summary {i}",
+                    advisory_content_hash="existing_hash" if with_hash else None,
+                    # The index is embedded in the identifying fields so that each
+                    # created advisory gets distinct values.
+                    unique_content_id=f"unique_id_{i}",
+                    advisory_id=f"ADV-{i}",
+                    datasource_id="ds",
+                    avid=f"ds/ADV-{i}",
+                    url=f"https://example.com/ADV-{i}",
+                )
+            )
+        return AdvisoryV2.objects.bulk_create(objs)
+
+    return _create
+
+
+def run_pipeline():
+    """Instantiate the pipeline and call its hash-computation step directly."""
+    pipeline = ComputeAdvisoryContentHash()
+    pipeline.compute_advisory_content_hash()
+
+
+# Patch the name as imported into the pipeline module, so the step under test
+# calls the mock instead of the real helper.
+@patch(
+    "vulnerabilities.pipelines.v2_improvers.compute_advisory_content_hash.compute_advisory_content"
+)
+def test_pipeline_updates_only_missing_hash(mock_compute, advisory_factory):
+    """Only advisories without a hash are recomputed; existing hashes are kept."""
+    # Three advisories without a hash, two that already have "existing_hash".
+    advisory_factory(3, with_hash=False, start=0)
+    advisory_factory(2, with_hash=True, start=100)
+
+    mock_compute.return_value = "new_hash"
+
+    run_pipeline()
+
+    updated = AdvisoryV2.objects.filter(advisory_content_hash="new_hash").count()
+    untouched = AdvisoryV2.objects.filter(advisory_content_hash="existing_hash").count()
+
+    assert updated == 3
+    assert untouched == 2
+    # The helper is called once per advisory that lacked a hash.
+    assert mock_compute.call_count == 3
+
+
+# Patch the name as imported into the pipeline module, so the step under test
+# calls the mock instead of the real helper.
+@patch(
+    "vulnerabilities.pipelines.v2_improvers.compute_advisory_content_hash.compute_advisory_content"
+)
+def test_pipeline_bulk_update_batches(mock_compute, advisory_factory):
+    """All advisories are updated when their number exceeds one batch."""
+    # 6000 is more than the pipeline's batch_size of 5000, so both the in-loop
+    # flush and the final flush of the remainder are exercised.
+    advisory_factory(6000, with_hash=False)
+
+    mock_compute.return_value = "batch_hash"
+
+    run_pipeline()
+
+    assert AdvisoryV2.objects.filter(advisory_content_hash="batch_hash").count() == 6000
+
+    assert mock_compute.call_count == 6000
+
+
+# Patch the name as imported into the pipeline module, so the step under test
+# calls the mock instead of the real helper.
+@patch(
+    "vulnerabilities.pipelines.v2_improvers.compute_advisory_content_hash.compute_advisory_content"
+)
+def test_pipeline_no_advisories(mock_compute):
+    """With no advisories in the database the helper is never called."""
+    run_pipeline()
+
+    assert mock_compute.call_count == 0
diff --git a/vulnerabilities/tests/test_api_v2.py b/vulnerabilities/tests/test_api_v2.py
@@ -859,7 +859,7 @@ def setUp(self):
 
     def test_list_with_purl_filter(self):
         url = reverse("package-v3-list")
-        with self.assertNumQueries(29):
+        with self.assertNumQueries(31):
             response = self.client.get(url, {"purl": "pkg:pypi/sample@1.0.0"})
         assert response.status_code == 200
         assert "packages" in response.data["results"]
@@ -868,7 +868,7 @@ def test_list_with_purl_filter(self):
 
     def test_bulk_lookup(self):
         url = reverse("package-v3-bulk-lookup")
-        with self.assertNumQueries(28):
+        with self.assertNumQueries(30):
             response = self.client.post(url, {"purls": ["pkg:pypi/sample@1.0.0"]}, format="json")
         assert response.status_code == 200
         assert "packages" in response.data
@@ -878,7 +878,7 @@ def test_bulk_lookup(self):
     def test_bulk_search_plain(self):
         url = reverse("package-v3-bulk-search")
         payload = {"purls": ["pkg:pypi/sample@1.0.0"], "plain_purl": True, "purl_only": False}
-        with self.assertNumQueries(28):
+        with self.assertNumQueries(30):
             response = self.client.post(url, payload, format="json")
         assert response.status_code == 200
         assert "packages" in response.data
@@ -894,7 +894,7 @@ def test_bulk_search_purl_only(self):
 
     def test_lookup_single_package(self):
         url = reverse("package-v3-lookup")
-        with self.assertNumQueries(21):
+        with self.assertNumQueries(23):
             response = self.client.post(url, {"purl": "pkg:pypi/sample@1.0.0"}, format="json")
         assert response.status_code == 200
         assert any(pkg["purl"] == "pkg:pypi/sample@1.0.0" for pkg in response.data)
diff --git a/vulnerabilities/utils.py b/vulnerabilities/utils.py

Original file line number	Diff line number	Diff line change
`@@ -20,6 +20,7 @@`
`20`	`20`	`from vulnerabilities.pipelines import populate_vulnerability_summary_pipeline`
`21`	`21`	`from vulnerabilities.pipelines import remove_duplicate_advisories`
`22`	`22`	`from vulnerabilities.pipelines.v2_improvers import collect_ssvc_trees`
	`23`	`+from vulnerabilities.pipelines.v2_improvers import compute_advisory_content_hash`
`23`	`24`	`from vulnerabilities.pipelines.v2_improvers import compute_advisory_todo as compute_advisory_todo_v2`
`24`	`25`	`from vulnerabilities.pipelines.v2_improvers import compute_package_risk as compute_package_risk_v2`
`25`	`26`	`from vulnerabilities.pipelines.v2_improvers import (`
`@@ -74,5 +75,6 @@`
`74`	`75`	`compute_advisory_todo.ComputeToDo,`
`75`	`76`	`collect_ssvc_trees.CollectSSVCPipeline,`
`76`	`77`	`relate_severities.RelateSeveritiesPipeline,`
	`78`	`+ compute_advisory_content_hash.ComputeAdvisoryContentHash,`
`77`	`79`	`]`
`78`	`80`	`)`