-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdataset.py
More file actions
83 lines (65 loc) · 3.01 KB
/
dataset.py
File metadata and controls
83 lines (65 loc) · 3.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
"""Defines the CVEfixes dataset for evaluating SAST tools on real-world vulnerabilities.
This module provides the classes and logic to load the CVEfixes dataset, which
is composed of Git repositories at commits just before a CVE-related fix.
It reads repository information from a CSV file.
"""
import ast
import csv
from typing import Self

from codesectools.datasets.core.dataset import File, GitRepo, GitRepoDataset
from codesectools.shared.cwe import CWEs
from codesectools.utils import DATA_DIR
class CVEfixes(GitRepoDataset):
    """Represents the CVEfixes dataset.

    This class handles loading the dataset of Git repositories linked to specific
    CVEs. It filters repositories based on size.

    Attributes:
        name (str): The name of the dataset, "CVEfixes".
        supported_languages (list[str]): A list of supported languages.
        max_repo_size (int): The maximum size of a repository (in bytes) to be
            included in the analysis.
    """

    name = "CVEfixes"
    supported_languages = ["java"]
    license = "CC BY 4.0"
    license_url = "https://creativecommons.org/licenses/by/4.0/"

    def __init__(self, lang: str = "") -> None:
        """Initialize the CVEfixes dataset.

        Args:
            lang: The programming language of the dataset to load.
        """
        # 100 MB cap; repositories at or above this size are dropped in
        # load_dataset().
        self.max_repo_size = 100 * 10**6
        super().__init__(lang)

    def download_files(self: Self, test: bool = False) -> None:
        """Copy the dataset files from the package data directory to the user cache.

        Copies the LICENSE file and every ``CVEfixes_*.csv`` under
        ``DATA_DIR / self.name`` into ``self.directory``.

        Args:
            test: Unused here; presumably part of the shared dataset
                interface — TODO confirm against GitRepoDataset.
        """
        self.directory.mkdir(exist_ok=True, parents=True)
        license_file = DATA_DIR / self.name / "LICENSE"
        (self.directory / license_file.name).write_bytes(license_file.read_bytes())
        for dataset_file in (DATA_DIR / self.name).glob("CVEfixes_*.csv"):
            (self.directory / dataset_file.name).write_bytes(dataset_file.read_bytes())

    def load_dataset(self) -> list[GitRepo]:
        """Load the CVEfixes dataset from its source CSV file.

        Parses a CSV file containing information about CVEs, repositories,
        commits, and vulnerable files to create a list of `GitRepo` objects.

        Returns:
            A list of `GitRepo` objects representing the dataset, filtered by
            repository size.
        """
        dataset_path = self.directory / f"CVEfixes_{self.lang}.csv"
        repos = []
        with open(dataset_path, newline="", encoding="utf-8") as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                name = row["cve_id"]
                url = row["repo_url"]
                # The "parents" column holds a Python list literal of parent
                # commit hashes; the first entry is the pre-fix commit.
                # ast.literal_eval instead of eval: the CSV is data, and
                # eval would execute arbitrary code embedded in it.
                commit = ast.literal_eval(row["parents"])[0]
                size = int(row["repo_size"])
                cwes = [
                    CWEs.from_string(cwe_id) for cwe_id in row["cwe_ids"].split(";")
                ]
                files = row["filenames"].split(";")
                repo = GitRepo(name, url, commit, size, cwes, files, has_vuln=True)
                # Skip oversized repositories to keep analysis tractable.
                if repo.size < self.max_repo_size:
                    repos.append(repo)
        return repos