|
| 1 | +# |
| 2 | +# Copyright (c) nexB Inc. and others. All rights reserved. |
| 3 | +# VulnerableCode is a trademark of nexB Inc. |
| 4 | +# SPDX-License-Identifier: Apache-2.0 |
| 5 | +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. |
| 6 | +# See https://github.com/aboutcode-org/vulnerablecode for support or download. |
| 7 | +# See https://aboutcode.org for more information about nexB OSS projects. |
| 8 | +# |
| 9 | + |
| 10 | +import logging |
| 11 | +import multiprocessing |
| 12 | +import os |
| 13 | +import warnings |
| 14 | +from concurrent import futures |
| 15 | + |
| 16 | +from aboutcode.pipeline import LoopProgress |
| 17 | +from django.core.paginator import Paginator |
| 18 | +from django.db import transaction |
| 19 | + |
| 20 | +from vulnerabilities.models import Advisory |
| 21 | +from vulnerabilities.pipelines import VulnerableCodePipeline |
| 22 | +from vulnerabilities.utils import compute_content_id |
| 23 | +from vulnerablecode import settings |
| 24 | + |
# NOTE(review): the logger name "scanpipe.pipes" appears copied from ScanCode.io;
# confirm whether this module should log under a vulnerablecode namespace instead.
logger = logging.getLogger("scanpipe.pipes")
| 26 | + |
| 27 | + |
def get_max_workers(keep_available=4):
    """
    Return the number of worker processes to use for multiprocessing.

    Return the ``VULNERABLECODE_PROCESSES`` value if defined in the settings,
    or a default based on the number of available CPUs minus the provided
    `keep_available` value.

    On operating systems where the multiprocessing start method is not "fork",
    but for example "spawn", such as on macOS, multiprocessing and threading
    are disabled by default, returning -1 `max_workers`.
    """
    processes_from_settings = settings.VULNERABLECODE_PROCESSES
    # -1 (no threading), 0 (no multiprocessing), and 1 (single worker) are
    # explicit modes honored as-is, regardless of available CPUs.
    if processes_from_settings in [-1, 0, 1]:
        return processes_from_settings

    if multiprocessing.get_start_method() != "fork":
        return -1

    max_workers = os.cpu_count() - keep_available
    if max_workers < 1:
        return 1

    if processes_from_settings is not None:
        if processes_from_settings <= max_workers:
            return processes_from_settings
        else:
            msg = (
                # Fixed: message previously named SCANCODEIO_PROCESSES, which is
                # not the setting this function reads.
                f"The value {processes_from_settings} specified in VULNERABLECODE_PROCESSES"
                f" exceeds the number of available CPUs on this machine."
                f" {max_workers} CPUs will be used instead for multiprocessing."
            )
            warnings.warn(msg, ResourceWarning)

    return max_workers
| 61 | + |
| 62 | + |
class InsufficientResourcesError(Exception):
    """Raised when the machine lacks the resources to complete an operation."""
| 65 | + |
| 66 | + |
def process_advisories(
    advisories,
    advisory_func,
    progress_logger=None,
    batch_size=1000,
):
    """
    Run the `advisory_func` on the advisories of the provided `advisories`
    QuerySet, in batches of `batch_size` ids.

    Multiprocessing is enabled by default on this pipe, the number of processes
    can be controlled through the `VULNERABLECODE_PROCESSES` setting.
    Multiprocessing can be disabled using `VULNERABLECODE_PROCESSES=0`,
    and threading can also be disabled `VULNERABLECODE_PROCESSES=-1`.

    The advisories QuerySet is chunked in `batch_size` results at the time,
    this can result in a significant reduction in memory usage.

    Raise the original `BrokenProcessPool` error, chained from an
    `InsufficientResourcesError`, when a worker process dies unexpectedly.
    """
    advisories_count = advisories.count()
    logger.info(f"Process {advisories_count} advisories with {advisory_func.__name__}")
    max_workers = get_max_workers(keep_available=4)

    # Progress is tracked per batch, not per advisory: each loop iteration
    # below consumes one batch of up to `batch_size` ids. Using the advisory
    # count as the total would under-report progress by a factor of batch_size.
    batch_count = (advisories_count + batch_size - 1) // batch_size
    progress = LoopProgress(batch_count, logger=progress_logger)

    advisory_batches = get_advisory_batches(advisories, batch_size)

    if max_workers <= 0:
        # Sequential mode: multiprocessing/threading disabled by settings or
        # unsupported start method.
        for advisory_ids in progress.iter(advisory_batches):
            progress.log_progress()
            logger.debug(f"{advisory_func.__name__} len={len(advisory_ids)}")
            advisory_func(advisory_ids=advisory_ids, logger=None)
        return

    logger.info(f"Starting ProcessPoolExecutor with {max_workers} max_workers")

    with futures.ProcessPoolExecutor(max_workers) as executor:
        # logger is passed as None: a bound pipeline logger cannot travel to
        # worker processes.
        future_to_advisories = {
            executor.submit(advisory_func, advisory_ids, None): advisory_ids
            for advisory_ids in advisory_batches
        }

        future_as_completed = futures.as_completed(future_to_advisories)

        for future in progress.iter(future_as_completed):
            advisory_ids = future_to_advisories[future]
            progress.log_progress()
            logger.debug(f"{advisory_func.__name__} len={len(advisory_ids)}")
            try:
                future.result()
            except futures.process.BrokenProcessPool as broken_pool_error:
                # A worker was killed, most likely by the OOM killer.
                message = (
                    "You may not have enough resources to complete this operation. "
                    "Please ensure that there is at least 2 GB of available memory per "
                    "CPU core for successful execution."
                )
                raise broken_pool_error from InsufficientResourcesError(message)
| 121 | + |
| 122 | + |
def get_advisory_batches(advisories, batch_size=1000):
    """Yield lists of advisory ids, each of up to `batch_size` length."""
    pages = Paginator(advisories, per_page=batch_size)
    for page_number in pages.page_range:
        yield [advisory.id for advisory in pages.page(page_number).object_list]
| 131 | + |
| 132 | + |
def recompute_content_ids(advisory_ids, logger):
    """
    Recompute the `unique_content_id` of the `advisory_ids` advisories whose
    current content id is not 64 characters long.

    `logger` is an optional logging callable taking a message and a `level`
    keyword; it may be None, for example when running in a worker process
    where the pipeline logger is not available.
    """
    advisories = Advisory.objects.exclude(unique_content_id__length=64).filter(id__in=advisory_ids)
    total_count = advisories.count()

    if not total_count:
        # Guard every logger call: process_advisories passes logger=None.
        if logger:
            logger("No advisories need content ID recomputation", level=logging.INFO)
        return

    if logger:
        logger(f"Recomputing content IDs for {total_count} advisories", level=logging.INFO)

    progress = LoopProgress(
        total_iterations=total_count,
        # Keep progress_step >= 1: total_count // 100 is 0 when fewer than
        # 100 advisories remain.
        progress_step=max(total_count // 100, 1),
        logger=logger,
    )

    with transaction.atomic():
        # nowait and skip_locked are mutually exclusive in Django and raise a
        # ValueError when combined; skip_locked alone lets concurrent runs
        # skip rows that another transaction is already updating.
        advisories = advisories.select_for_update(skip_locked=True)
        if not advisories.exists():
            return
        advisories_to_update = []
        for advisory in progress.iter(advisories):
            advisory.unique_content_id = compute_content_id(advisory.to_advisory_data())
            advisories_to_update.append(advisory)

        if advisories_to_update:
            Advisory.objects.bulk_update(
                advisories_to_update,
                ["unique_content_id"],
                batch_size=len(advisories_to_update),
            )
            if logger:
                logger(
                    f"Updated content IDs for {len(advisories_to_update)} advisories",
                    level=logging.INFO,
                )
| 172 | + |
| 173 | + |
class RecomputeContentIDPipeline(VulnerableCodePipeline):
    """Pipeline to recompute the unique content ID of advisories."""

    pipeline_id = "recompute_content_ids"
    # Number of advisory ids processed per batch.
    BATCH_SIZE = 1000

    @classmethod
    def steps(cls):
        return (cls.recompute_content_ids,)

    def recompute_content_ids(self):
        """
        Recompute content IDs for all advisories whose content id is not
        64 characters long, in batches, looping until none remain.
        """
        # NOTE(review): if some advisories can never get a 64-char content id
        # (e.g. compute_content_id persistently fails or rows stay locked),
        # this loop will not terminate — TODO confirm an upper bound is not needed.
        while True:
            advisories = Advisory.objects.exclude(unique_content_id__length=64)
            if not advisories.exists():
                break
            process_advisories(
                advisories=advisories,
                advisory_func=recompute_content_ids,
                progress_logger=self.log,
                # Use the class constant instead of a duplicated literal.
                batch_size=self.BATCH_SIZE,
            )
0 commit comments