|
8 | 8 | # |
9 | 9 |
|
10 | 10 | import logging |
| 11 | +import re |
11 | 12 | import traceback |
| 13 | +from abc import abstractmethod |
| 14 | +from collections import defaultdict |
12 | 15 | from datetime import datetime |
13 | 16 | from datetime import timezone |
14 | 17 | from timeit import default_timer as timer |
15 | 18 | from traceback import format_exc as traceback_format_exc |
16 | 19 | from typing import Iterable |
17 | 20 | from typing import List |
| 21 | +from urllib.parse import urlparse |
18 | 22 |
|
| 23 | +import gitlab |
19 | 24 | from aboutcode.pipeline import LoopProgress |
20 | 25 | from aboutcode.pipeline import PipelineDefinition |
21 | 26 | from aboutcode.pipeline import humanize_time |
| 27 | +from github import Github |
22 | 28 |
|
23 | 29 | from vulnerabilities.importer import AdvisoryData |
| 30 | +from vulnerabilities.importer import ReferenceV2 |
24 | 31 | from vulnerabilities.improver import MAX_CONFIDENCE |
25 | 32 | from vulnerabilities.models import Advisory |
26 | 33 | from vulnerabilities.models import PipelineRun |
27 | 34 | from vulnerabilities.pipes.advisory import import_advisory |
28 | 35 | from vulnerabilities.pipes.advisory import insert_advisory |
29 | 36 | from vulnerabilities.pipes.advisory import insert_advisory_v2 |
| 37 | +from vulnerablecode.settings import env |
30 | 38 |
|
31 | 39 | module_logger = logging.getLogger(__name__) |
32 | 40 |
|
@@ -321,3 +329,99 @@ def collect_and_store_advisories(self): |
321 | 329 | continue |
322 | 330 |
|
323 | 331 | self.log(f"Successfully collected {collected_advisory_count:,d} advisories") |
| 332 | + |
| 333 | + |
class VCSCollector(VulnerableCodeBaseImporterPipeline):
    """
    Base pipeline to collect GitHub/GitLab issues and PRs related to vulnerabilities.

    Subclasses must implement ``fetch_entries`` and ``collect_items`` to populate
    ``self.collected_items`` as ``{vuln_id: [(reference_type, url), ...]}``.
    """

    # URL of the repository to scan; parsed by configure_target().
    # NOTE(review): was declared as ``vcs_url`` but every use in this class is
    # ``self.repo_url`` — renamed the annotation to match actual usage.
    repo_url: str
    # Matches CVE identifiers such as "CVE-2024-12345" (case-insensitive).
    CVE_PATTERN = re.compile(r"(CVE-\d{4}-\d+)", re.IGNORECASE)
    # Identifier prefixes used to build the host-side search queries.
    SUPPORTED_IDENTIFIERS = ["CVE-"]

    # vuln_id -> list of (reference_type, url). Reassigned per run by
    # collect_items(); the class-level default keeps collect_advisories()
    # safe (yields nothing) when no items were collected.
    collected_items: dict = {}

    def advisories_count(self) -> int:
        # The count cannot be known ahead of time for issue/PR searches.
        return 0

    @classmethod
    def steps(cls):
        return (
            cls.configure_target,
            cls.fetch_entries,
            cls.collect_items,
            cls.collect_and_store_advisories,
        )

    def configure_target(self):
        """
        Derive ``self.repo_name`` ("owner/project") from ``self.repo_url``.

        Raises ValueError when the URL path has fewer than two segments.
        """
        parsed_url = urlparse(self.repo_url)
        parts = parsed_url.path.strip("/").split("/")
        if len(parts) < 2:
            raise ValueError(f"Invalid URL: {self.repo_url}")

        self.repo_name = f"{parts[0]}/{parts[1]}"

    @abstractmethod
    def fetch_entries(self):
        """Fetch raw issue/PR entries from the VCS host. Implemented by subclasses."""
        raise NotImplementedError

    @abstractmethod
    def collect_items(self):
        """Group fetched entries by vulnerability id into ``self.collected_items``."""
        raise NotImplementedError

    def collect_advisories(self):
        """
        Generate AdvisoryData objects for each vulnerability ID grouped with its
        related GitHub/Gitlab issues and PRs.
        """
        self.log("Generating AdvisoryData objects from GitHub/Gitlab issues and PRs.")
        # Removed a leftover debug ``print(vuln_id, refs)``; self.log above
        # already announces progress.
        for vuln_id, refs in self.collected_items.items():
            references = [ReferenceV2(reference_type=ref_id, url=url) for ref_id, url in refs]
            yield AdvisoryData(
                advisory_id=vuln_id,
                aliases=[],
                references_v2=references,
                url=self.repo_url,
            )
| 387 | + |
| 388 | + |
class GitHubCollector(VCSCollector):
    """Collect CVE-related issues and PRs from a GitHub repository."""

    def fetch_entries(self):
        """Search GitHub for issues and PRs mentioning the supported identifiers."""
        github_token = env.str("GITHUB_TOKEN")
        g = Github(login_or_token=github_token)
        base_query = f"repo:{self.repo_name} ({' OR '.join(self.SUPPORTED_IDENTIFIERS)})"
        self.issues = g.search_issues(f"{base_query} is:issue")
        self.prs = g.search_issues(f"{base_query} is:pr")

    def collect_items(self):
        """Group issue/PR URLs by the CVE ids found in their title and body."""
        self.collected_items = defaultdict(list)

        for item_type, items in [("Issue", self.issues), ("PR", self.prs)]:
            for item in items:
                text = item.title + " " + (item.body or "")
                for cve_id in self.CVE_PATTERN.findall(text):
                    # Bug fix: was hard-coded to "Issue", mislabeling PR
                    # references; use the actual item type like GitLabCollector.
                    self.collected_items[cve_id].append((item_type, item.html_url))
| 406 | + |
| 407 | + |
class GitLabCollector(VCSCollector):
    """Collect CVE-related issues and merge requests from a GitLab project."""

    def fetch_entries(self):
        """Fetch GitLab Data Entries"""
        gitlab_token = env.str("GITLAB_TOKEN")
        gl = gitlab.Gitlab("https://gitlab.com/", private_token=gitlab_token)
        project = gl.projects.get(self.repo_name)
        base_query = " ".join(self.SUPPORTED_IDENTIFIERS)
        self.issues = project.search(scope="issues", search=base_query)
        self.prs = project.search(scope="merge_requests", search=base_query)

    def collect_items(self):
        """Group issue/MR web URLs by the CVE ids mentioned in title or description."""
        grouped = defaultdict(list)
        for label, entries in (("Issue", self.issues), ("PR", self.prs)):
            for entry in entries:
                title = entry.get("title") or ""
                description = entry.get("description") or ""
                for cve_id in self.CVE_PATTERN.findall(title + " " + description):
                    grouped[cve_id].append((label, entry.get("web_url")))
        self.collected_items = grouped
0 commit comments