Skip to content

Commit 359de5b

Browse files
committed
Refactor the pipeline for collecting GitHub/GitLab issues and PRs
Signed-off-by: ziad hany <ziadhany2016@gmail.com>
1 parent d199875 commit 359de5b

File tree

12 files changed

+258
-232
lines changed

12 files changed

+258
-232
lines changed

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,3 +125,4 @@ websocket-client==0.59.0
125125
yarl==1.7.2
126126
zipp==3.19.1
127127
PyGithub==2.6.1
128+
python-gitlab~=7.1.0

vulnerabilities/importers/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@
4747
from vulnerabilities.pipelines.v2_importers import apache_kafka_importer as apache_kafka_importer_v2
4848
from vulnerabilities.pipelines.v2_importers import apache_tomcat_importer as apache_tomcat_v2
4949
from vulnerabilities.pipelines.v2_importers import archlinux_importer as archlinux_importer_v2
50+
from vulnerabilities.pipelines.v2_importers import collect_issue_pr as collect_issue_pr_v2
5051
from vulnerabilities.pipelines.v2_importers import collect_fix_commits as collect_fix_commits_v2
5152
from vulnerabilities.pipelines.v2_importers import curl_importer as curl_importer_v2
5253
from vulnerabilities.pipelines.v2_importers import debian_importer as debian_importer_v2
@@ -114,6 +115,7 @@
114115
nginx_importer_v2.NginxImporterPipeline,
115116
debian_importer_v2.DebianImporterPipeline,
116117
mattermost_importer_v2.MattermostImporterPipeline,
118+
collect_issue_pr_v2.CollectIssuePRPipeline,
117119
github_issue_pr_v2.GithubPipelineIssuePRPipeline,
118120
apache_tomcat_v2.ApacheTomcatImporterPipeline,
119121
suse_score_importer_v2.SUSESeverityScoreImporterPipeline,
@@ -156,6 +158,8 @@
156158
ubuntu_usn.UbuntuUSNImporter,
157159
fireeye.FireyeImporter,
158160
oss_fuzz.OSSFuzzImporter,
161+
collect_issue_pr_v2.CollectKubernetesPRSIssues,
162+
collect_issue_pr_v2.CollectWiresharkPRSIssues,
159163
github_issue_pr_v2.GithubPipelineIssuePR,
160164
collect_fix_commits_v2.CollectLinuxFixCommitsPipeline,
161165
collect_fix_commits_v2.CollectBusyBoxFixCommitsPipeline,

vulnerabilities/pipelines/__init__.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,25 +8,33 @@
88
#
99

1010
import logging
11+
import re
1112
import traceback
13+
from abc import abstractmethod
14+
from collections import defaultdict
1215
from datetime import datetime
1316
from datetime import timezone
1417
from timeit import default_timer as timer
1518
from traceback import format_exc as traceback_format_exc
1619
from typing import Iterable
1720
from typing import List
21+
from urllib.parse import urlparse
1822

23+
import gitlab
1924
from aboutcode.pipeline import LoopProgress
2025
from aboutcode.pipeline import PipelineDefinition
2126
from aboutcode.pipeline import humanize_time
27+
from github import Github
2228

2329
from vulnerabilities.importer import AdvisoryData
30+
from vulnerabilities.importer import ReferenceV2
2431
from vulnerabilities.improver import MAX_CONFIDENCE
2532
from vulnerabilities.models import Advisory
2633
from vulnerabilities.models import PipelineRun
2734
from vulnerabilities.pipes.advisory import import_advisory
2835
from vulnerabilities.pipes.advisory import insert_advisory
2936
from vulnerabilities.pipes.advisory import insert_advisory_v2
37+
from vulnerablecode.settings import env
3038

3139
module_logger = logging.getLogger(__name__)
3240

@@ -334,3 +342,99 @@ def collect_and_store_advisories(self):
334342
continue
335343

336344
self.log(f"Successfully collected {collected_advisory_count:,d} advisories")
345+
346+
347+
class VCSCollector(VulnerableCodeBaseImporterPipeline):
    """
    Pipeline to collect GitHub/GitLab issues and PRs related to vulnerabilities.

    Concrete subclasses set ``repo_url`` and implement ``fetch_entries`` and
    ``collect_items``. ``collect_items`` is expected to fill ``collected_items``
    with a mapping of ``vulnerability_id -> [(reference_type, url), ...]``,
    which ``collect_advisories`` then turns into ``AdvisoryData`` objects.
    """

    # URL of the repository to scan; every method below reads ``repo_url``.
    repo_url: str
    CVE_PATTERN = re.compile(r"(CVE-\d{4}-\d+)", re.IGNORECASE)
    SUPPORTED_IDENTIFIERS = ["CVE-"]

    # Class-level fallback so the attribute always exists. It must never be
    # mutated in place: ``configure_target`` rebinds a fresh mapping on the
    # instance so state is not shared between pipelines.
    collected_items: dict = {}

    def advisories_count(self) -> int:
        """Return the number of vulnerability ids collected so far."""
        return len(self.collected_items)

    @classmethod
    def steps(cls):
        """Return the ordered pipeline steps."""
        return (
            cls.configure_target,
            cls.fetch_entries,
            cls.collect_items,
            cls.collect_and_store_advisories,
        )

    def configure_target(self):
        """
        Derive ``repo_name`` (``<owner>/<project>``) from ``repo_url``.

        Raises ``ValueError`` when the URL path has fewer than two segments.
        """
        parsed_url = urlparse(self.repo_url)
        parts = parsed_url.path.strip("/").split("/")
        if len(parts) < 2:
            raise ValueError(f"Invalid URL: {self.repo_url}")

        # Only the first two path segments are used.
        self.repo_name = f"{parts[0]}/{parts[1]}"
        # Start every run from an empty, instance-owned mapping.
        self.collected_items = defaultdict(list)

    @abstractmethod
    def fetch_entries(self):
        """Fetch the raw issues and PRs from the hosting service."""
        raise NotImplementedError

    @abstractmethod
    def collect_items(self):
        """Populate ``collected_items`` from the fetched entries."""
        raise NotImplementedError

    def collect_advisories(self):
        """
        Generate AdvisoryData objects for each vulnerability ID grouped with its
        related GitHub/GitLab issues and PRs.
        """
        self.log("Generating AdvisoryData objects from GitHub/Gitlab issues and PRs.")
        for vuln_id, refs in self.collected_items.items():
            # dict.fromkeys drops repeated (type, url) pairs while keeping the
            # order in which they were collected.
            references = [
                ReferenceV2(reference_type=ref_type, url=url)
                for ref_type, url in dict.fromkeys(refs)
            ]
            yield AdvisoryData(
                advisory_id=vuln_id,
                aliases=[],
                references_v2=references,
                url=self.repo_url,
            )
400+
401+
402+
class GitHubCollector(VCSCollector):
    """Collect issues and pull requests that mention a CVE from a GitHub repository."""

    def fetch_entries(self):
        """
        Fetch GitHub Data Entries

        Runs two GitHub searches scoped to ``repo_name`` (one for issues, one
        for pull requests) and stores the results on ``issues`` and ``prs``.
        The token is read from the ``GITHUB_TOKEN`` setting.
        """
        github_token = env.str("GITHUB_TOKEN")
        g = Github(login_or_token=github_token)
        base_query = f"repo:{self.repo_name} ({' OR '.join(self.SUPPORTED_IDENTIFIERS)})"
        self.issues = g.search_issues(f"{base_query} is:issue")
        self.prs = g.search_issues(f"{base_query} is:pr")

    def collect_items(self):
        """
        Group the fetched entries by the CVE ids found in their title or body.

        Fills ``collected_items`` with ``{cve_id: [(kind, html_url), ...]}``
        where ``kind`` is ``"Issue"`` or ``"PR"``.
        """
        self.collected_items = defaultdict(list)

        for i_type, items in [("Issue", self.issues), ("PR", self.prs)]:
            for item in items:
                text = f"{item.title or ''} {item.body or ''}"
                # The pattern is case-insensitive, so normalise to upper case to
                # keep "cve-2024-1" and "CVE-2024-1" under one key. dict.fromkeys
                # drops repeated mentions within the same entry, keeping order.
                cve_ids = dict.fromkeys(
                    match.upper() for match in self.CVE_PATTERN.findall(text)
                )
                for cve_id in cve_ids:
                    # Record the real entry kind; PRs must not be tagged "Issue".
                    self.collected_items[cve_id].append((i_type, item.html_url))
419+
420+
421+
class GitLabCollector(VCSCollector):
    """Collect issues and merge requests that mention a CVE from a gitlab.com project."""

    def fetch_entries(self):
        """
        Fetch GitLab Data Entries

        Searches the project named by ``repo_name`` for issues and merge
        requests matching the supported identifiers and stores the results on
        ``issues`` and ``prs``. The token is read from the ``GITLAB_TOKEN``
        setting.
        """
        gitlab_token = env.str("GITLAB_TOKEN")
        gl = gitlab.Gitlab("https://gitlab.com/", private_token=gitlab_token)
        project = gl.projects.get(self.repo_name)
        base_query = " ".join(self.SUPPORTED_IDENTIFIERS)
        # List calls are paginated and return only the first page by default;
        # get_all=True walks every page so no result is silently dropped.
        self.issues = project.search(scope="issues", search=base_query, get_all=True)
        self.prs = project.search(scope="merge_requests", search=base_query, get_all=True)

    def collect_items(self):
        """
        Group the fetched entries by the CVE ids found in their title or description.

        Fills ``collected_items`` with ``{cve_id: [(kind, web_url), ...]}``
        where ``kind`` is ``"Issue"`` or ``"PR"``.
        """
        self.collected_items = defaultdict(list)
        for i_type, items in [("Issue", self.issues), ("PR", self.prs)]:
            for item in items:
                title = item.get("title") or ""
                description = item.get("description") or ""
                url = item.get("web_url")
                # The pattern is case-insensitive, so normalise to upper case to
                # keep one key per CVE. dict.fromkeys drops repeated mentions
                # within the same entry, keeping order.
                cve_ids = dict.fromkeys(
                    match.upper()
                    for match in self.CVE_PATTERN.findall(title + " " + description)
                )
                for cve_id in cve_ids:
                    self.collected_items[cve_id].append((i_type, url))
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# VulnerableCode is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
from vulnerabilities.pipelines import GitHubCollector
10+
from vulnerabilities.pipelines import GitLabCollector
11+
12+
13+
class CollectKubernetesPRSIssues(GitHubCollector):
    """GitHub collector configured for the kubernetes/kubernetes repository."""

    repo_url = "https://github.com/kubernetes/kubernetes"
    pipeline_id = "collect-kubernetes-prs-issues"
16+
17+
18+
class CollectWiresharkPRSIssues(GitLabCollector):
    """GitLab collector configured for the wireshark/wireshark project."""

    repo_url = "https://gitlab.com/wireshark/wireshark"
    pipeline_id = "collect-wireshark-prs-issues"

vulnerabilities/pipelines/v2_importers/github_issue_pr.py

Lines changed: 0 additions & 92 deletions
This file was deleted.

0 commit comments

Comments
 (0)