Skip to content

Commit 6b61e7e

Browse files
committed
Separate pipelines for recomputing and removing duplicates
Signed-off-by: Tushar Goel <tushar.goel.dav@gmail.com>
1 parent 060af18 commit 6b61e7e

File tree

6 files changed

+612
-137
lines changed

6 files changed

+612
-137
lines changed

vulnerabilities/improvers/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@
1818
from vulnerabilities.pipelines import enhance_with_kev
1919
from vulnerabilities.pipelines import enhance_with_metasploit
2020
from vulnerabilities.pipelines import flag_ghost_packages
21+
from vulnerabilities.pipelines import recompute_content_ids
2122
from vulnerabilities.pipelines import remove_duplicate_advisories
22-
2323
IMPROVERS_REGISTRY = [
2424
valid_versions.GitHubBasicImprover,
2525
valid_versions.GitLabBasicImprover,
@@ -46,6 +46,7 @@
4646
compute_package_version_rank.ComputeVersionRankPipeline,
4747
collect_commits.CollectFixCommitsPipeline,
4848
add_cvss31_to_CVEs.CVEAdvisoryMappingPipeline,
49+
recompute_content_ids.RecomputeContentIDPipeline,
4950
remove_duplicate_advisories.RemoveDuplicateAdvisoriesPipeline,
5051
]
5152

Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,197 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# VulnerableCode is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
10+
import logging
11+
import multiprocessing
12+
import os
13+
import warnings
14+
from concurrent import futures
15+
16+
from aboutcode.pipeline import LoopProgress
17+
from django.core.paginator import Paginator
18+
from django.db import transaction
19+
20+
from vulnerabilities.models import Advisory
21+
from vulnerabilities.pipelines import VulnerableCodePipeline
22+
from vulnerabilities.utils import compute_content_id
23+
from vulnerablecode import settings
24+
25+
# Module-level logger.
# NOTE(review): the "scanpipe.pipes" logger name looks copy-pasted from
# ScanCode.io — confirm this is the intended logging namespace for
# VulnerableCode pipelines.
logger = logging.getLogger("scanpipe.pipes")
26+
27+
28+
def get_max_workers(keep_available=4):
    """
    Return the number of parallel workers to use.

    Return the `VULNERABLECODE_PROCESSES` setting directly when it is one of
    the sentinel values -1, 0, or 1 (-1 disables multiprocessing and
    threading, 0 disables multiprocessing).

    On operating systems where the multiprocessing start method is not "fork",
    but for example "spawn", such as on macOS, multiprocessing and threading
    are disabled by default, returning -1 `max_workers`.

    Otherwise return the available CPU count minus `keep_available`, capped
    below by 1, and capped above by `VULNERABLECODE_PROCESSES` when that
    setting is defined and small enough.
    """
    processes_from_settings = settings.VULNERABLECODE_PROCESSES
    if processes_from_settings in [-1, 0, 1]:
        return processes_from_settings

    if multiprocessing.get_start_method() != "fork":
        return -1

    max_workers = os.cpu_count() - keep_available
    if max_workers < 1:
        return 1

    if processes_from_settings is not None:
        if processes_from_settings <= max_workers:
            return processes_from_settings
        else:
            # Bug fix: the warning previously named SCANCODEIO_PROCESSES, but
            # the setting actually read above is VULNERABLECODE_PROCESSES.
            msg = (
                f"The value {processes_from_settings} specified in VULNERABLECODE_PROCESSES"
                f" exceeds the number of available CPUs on this machine."
                f" {max_workers} CPUs will be used instead for multiprocessing."
            )
            warnings.warn(msg, ResourceWarning)

    return max_workers
61+
62+
63+
class InsufficientResourcesError(Exception):
    """Raised when a worker pool breaks, likely from resource exhaustion."""
65+
66+
67+
def process_advisories(
    advisories,
    advisory_func,
    progress_logger=None,
    batch_size=1000,
):
    """
    Run the `advisory_func` on the advisories of the provided `advisories`.

    Multiprocessing is enabled by default on this pipe, the number of processes can be
    controlled through the `VULNERABLECODE_PROCESSES` setting.
    Multiprocessing can be disabled using `VULNERABLECODE_PROCESSES=0`,
    and threading can also be disabled `VULNERABLECODE_PROCESSES=-1`

    The advisories QuerySet is chunked in `batch_size` results at the time,
    this can result in a significant reduction in memory usage.
    """
    advisories_count = advisories.count()
    logger.info(f"Process {advisories_count} advisories with {advisory_func.__name__}")
    progress = LoopProgress(advisories_count, logger=progress_logger)
    max_workers = get_max_workers(keep_available=4)

    advisory_batches = get_advisory_batches(advisories, batch_size)

    # Sequential fallback: get_max_workers() returns -1 or 0 when
    # multiprocessing/threading is disabled or "fork" is unavailable.
    if max_workers <= 0:
        for advisory_ids in progress.iter(advisory_batches):
            progress.log_progress()
            logger.debug(f"{advisory_func.__name__} len={len(advisory_ids)}")
            # NOTE: advisory_func is always invoked with logger=None here
            # (and below); it must tolerate a missing logger.
            advisory_func(advisory_ids=advisory_ids, logger=None)
        return

    logger.info(f"Starting ProcessPoolExecutor with {max_workers} max_workers")

    with futures.ProcessPoolExecutor(max_workers) as executor:
        # Map each submitted future back to its batch of ids so progress and
        # debug logging can name the batch when the future completes.
        future_to_advisories = {
            executor.submit(advisory_func, advisory_ids, None): advisory_ids
            for advisory_ids in advisory_batches
        }

        future_as_completed = futures.as_completed(future_to_advisories)

        for future in progress.iter(future_as_completed):
            advisory_ids = future_to_advisories[future]
            progress.log_progress()
            logger.debug(f"{advisory_func.__name__} len={len(advisory_ids)}")
            try:
                # Re-raise any exception raised inside the worker process.
                future.result()
            except futures.process.BrokenProcessPool as broken_pool_error:
                # A broken pool usually means a worker was killed — commonly
                # by the OS out-of-memory killer — so surface a resource hint.
                message = (
                    "You may not have enough resources to complete this operation. "
                    "Please ensure that there is at least 2 GB of available memory per "
                    "CPU core for successful execution."
                )
                raise broken_pool_error from InsufficientResourcesError(message)
121+
122+
123+
def get_advisory_batches(advisories, batch_size=1000):
    """Yield lists of advisory ids, each at most `batch_size` long."""
    pages = Paginator(advisories, per_page=batch_size)
    for page_number in pages.page_range:
        yield [advisory.id for advisory in pages.page(page_number).object_list]
131+
132+
133+
def recompute_content_ids(advisory_ids, logger):
    """
    Recompute and store the unique content ID for the advisories in
    `advisory_ids` whose stored ID is not already a 64-character value.

    `logger`, when provided, is a callable accepting `(message, level=...)`
    (a pipeline `log` method), not a `logging.Logger`. It may be None:
    process_advisories() always calls this function with logger=None.
    """
    advisories = Advisory.objects.exclude(unique_content_id__length=64).filter(id__in=advisory_ids)
    total_count = advisories.count()

    if not total_count:
        # Bug fix: guard the callable — this function is invoked with
        # logger=None by process_advisories(), which previously raised
        # TypeError here.
        if logger:
            logger("No advisories need content ID recomputation", level=logging.INFO)
        return

    if logger:
        logger(f"Recomputing content IDs for {total_count} advisories", level=logging.INFO)

    progress = LoopProgress(
        total_iterations=total_count,
        # Keep the step at least 1 so small batches don't produce a zero
        # progress step.
        progress_step=max(total_count // 100, 1),
        logger=logger,
    )

    with transaction.atomic():
        # Bug fix: nowait and skip_locked are mutually exclusive in Django's
        # select_for_update() and raise ValueError when combined. skip_locked
        # alone lets concurrent workers pass over rows locked elsewhere.
        advisories = advisories.select_for_update(skip_locked=True)
        if not advisories.exists():
            return
        advisories_to_update = []
        for advisory in progress.iter(advisories):
            advisory.unique_content_id = compute_content_id(advisory.to_advisory_data())
            advisories_to_update.append(advisory)

        if advisories_to_update:
            Advisory.objects.bulk_update(
                advisories_to_update,
                ["unique_content_id"],
                batch_size=len(advisories_to_update),
            )
            if logger:
                logger(
                    f"Updated content IDs for {len(advisories_to_update)} advisories",
                    level=logging.INFO,
                )
172+
173+
174+
class RecomputeContentIDPipeline(VulnerableCodePipeline):
    # Bug fix: the docstring previously described duplicate removal — a
    # copy-paste from RemoveDuplicateAdvisoriesPipeline.
    """Pipeline to recompute the unique content ID of advisories."""

    pipeline_id = "recompute_content_ids"
    BATCH_SIZE = 1000

    @classmethod
    def steps(cls):
        return (cls.recompute_content_ids,)

    def recompute_content_ids(self):
        """
        Recompute content IDs for all advisories whose stored ID is not a
        64-character value, looping until none remain or a pass makes no
        progress.
        """
        previous_remaining = None
        while True:
            advisories = Advisory.objects.exclude(unique_content_id__length=64)
            remaining = advisories.count()
            if not remaining:
                break

            # Guard against spinning forever when no row can be fixed, e.g.
            # rows skipped because another worker holds their locks, or
            # recomputation still yielding a non-64-character ID.
            if previous_remaining is not None and remaining >= previous_remaining:
                self.log(
                    f"Stopping: {remaining} advisories could not be recomputed",
                    level=logging.WARNING,
                )
                break
            previous_remaining = remaining

            process_advisories(
                advisories=advisories,
                # The module-level recompute_content_ids() worker function,
                # not this method.
                advisory_func=recompute_content_ids,
                progress_logger=self.log,
                # Consistency fix: use the class constant instead of a
                # hard-coded 1000.
                batch_size=self.BATCH_SIZE,
            )

vulnerabilities/pipelines/remove_duplicate_advisories.py

Lines changed: 77 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -11,84 +11,104 @@
1111
from itertools import groupby
1212

1313
from aboutcode.pipeline import LoopProgress
14+
from django.db import transaction
1415
from django.db.models import Count
15-
from django.db.models import Q
1616

1717
from vulnerabilities.models import Advisory
1818
from vulnerabilities.pipelines import VulnerableCodePipeline
19-
from vulnerabilities.utils import compute_content_id
19+
from vulnerabilities.pipelines.recompute_content_ids import process_advisories
20+
21+
22+
def remove_duplicates_batch(advisory_ids, logger=None):
    """
    Process a batch of advisories to remove duplicates.
    Keep only the oldest advisory (by `date_imported`) for each content ID.

    `logger`, when provided, is a callable accepting `(message, level=...)`
    (a pipeline `log` method), not a `logging.Logger`. It may be None:
    process_advisories() always calls this function with logger=None.
    """
    try:
        with transaction.atomic():
            # Bug fix: nowait and skip_locked are mutually exclusive in
            # Django's select_for_update() and raise ValueError when
            # combined. skip_locked alone lets concurrent workers pass over
            # rows locked elsewhere.
            advisories = Advisory.objects.filter(id__in=advisory_ids).select_for_update(
                skip_locked=True
            )
            if not advisories.exists():
                return

            # groupby() only groups consecutive rows, so order by the
            # grouping key first.
            advisories_by_content_id = groupby(
                advisories.order_by("unique_content_id").paginated(),
                key=lambda x: x.unique_content_id,
            )

            progress = LoopProgress(total_iterations=advisories.count(), logger=logger)

            for content_id, group_advisories in progress.iter(advisories_by_content_id):
                group_advisories = list(group_advisories)

                if len(group_advisories) <= 1:
                    continue

                if logger:
                    # Bug fix: `logger` is a callable, not a logging.Logger;
                    # the previous `logger.info(...)` calls would raise
                    # AttributeError when a logger is actually provided.
                    logger(
                        f"Found {len(group_advisories)} duplicates for content ID {content_id}",
                        level=logging.INFO,
                    )

                oldest = min(group_advisories, key=lambda x: x.date_imported)

                advisory_ids_to_delete = [adv.id for adv in group_advisories if adv.id != oldest.id]
                if advisory_ids_to_delete:
                    Advisory.objects.filter(id__in=advisory_ids_to_delete).delete()
                    if logger:
                        logger(
                            f"Kept advisory {oldest.id} and removed "
                            f"{len(advisory_ids_to_delete)} duplicates for content ID {content_id}",
                            level=logging.INFO,
                        )

    except Exception as e:
        if logger:
            logger(
                f"Error processing batch of advisories: {e}",
                level=logging.ERROR,
            )
2070

2171

2272
class RemoveDuplicateAdvisoriesPipeline(VulnerableCodePipeline):
    """Pipeline to remove duplicate advisories based on their content."""

    pipeline_id = "remove_duplicate_advisories"
    BATCH_SIZE = 1000

    @classmethod
    def steps(cls):
        return (cls.remove_duplicates,)

    def remove_duplicates(self):
        """
        Find advisories with the same content and keep only the oldest one.
        Process in parallel batches with proper transaction management.

        Loops until no duplicate content IDs remain, or until a pass makes
        no progress (preventing an infinite loop when duplicates cannot be
        deleted, e.g. rows locked elsewhere or failures swallowed inside
        remove_duplicates_batch()).
        """
        previous_count = None
        while True:
            duplicate_content_ids = (
                Advisory.objects.values("unique_content_id")
                .annotate(count=Count("id"))
                .filter(count__gt=1)
                .values_list("unique_content_id", flat=True)
            )

            # Bug fix: a leftover debug print() of the queryset was removed.
            advisories = Advisory.objects.filter(unique_content_id__in=duplicate_content_ids)

            advisories_count = advisories.count()
            if not advisories_count:
                break

            # Stall guard: stop when the previous pass removed nothing.
            if previous_count is not None and advisories_count >= previous_count:
                self.log(
                    f"Stopping: {advisories_count} duplicated advisories could not be removed",
                    level=logging.WARNING,
                )
                break
            previous_count = advisories_count

            # Bug fix: the previous message said "content IDs" while the
            # value counted is advisories.
            self.log(
                f"Processing {advisories_count} advisories with duplicate content IDs",
                level=logging.INFO,
            )

            process_advisories(
                advisories=advisories,
                advisory_func=remove_duplicates_batch,
                progress_logger=self.log,
                batch_size=self.BATCH_SIZE,
            )

            self.log("Completed duplicate removal batch", level=logging.INFO)

0 commit comments

Comments
 (0)