Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 14 additions & 5 deletions codesectools/datasets/BenchmarkJava/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ class TestCode(File):

def __init__(
self,
filename: str,
filepath: Path,
content: str | bytes,
cwes: list[CWE],
vuln_type: str,
Expand All @@ -38,15 +38,15 @@ def __init__(
"""Initialize a TestCode instance.

Args:
filename: The name of the file.
filepath: The path to the file.
content: The content of the file, as a string or bytes.
cwes: A list of CWEs associated with the file.
vuln_type: The type of vulnerability.
has_vuln: A boolean indicating if the vulnerability is real or a false positive test case.

"""
super().__init__(
filename=filename, content=content, cwes=cwes, has_vuln=has_vuln
filepath=filepath, content=content, cwes=cwes, has_vuln=has_vuln
)

self.vuln_type = vuln_type
Expand Down Expand Up @@ -148,10 +148,19 @@ def load_dataset(self) -> list[TestCode]:
next(reader)
for row in reader:
filename = f"{row[0]}.java"
content = (testcode_dir / filename).read_text()
filepath = testcode_dir / filename
content = filepath.read_text()
cwes = [CWEs.from_id(int(row[3]))]
vuln_type = row[1]
has_vuln = True if row[2] == "true" else False
files.append(TestCode(filename, content, cwes, vuln_type, has_vuln))
files.append(
TestCode(
filepath.relative_to(self.directory),
content,
cwes,
vuln_type,
has_vuln,
)
)

return files
53 changes: 28 additions & 25 deletions codesectools/datasets/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from __future__ import annotations

from abc import ABC, abstractmethod
from pathlib import Path
from typing import TYPE_CHECKING

import git
Expand All @@ -21,7 +22,6 @@
from codesectools.utils import USER_CACHE_DIR

if TYPE_CHECKING:
from pathlib import Path
from typing import Self

from codesectools.sasts.core.parser import AnalysisResult, Defect
Expand Down Expand Up @@ -197,7 +197,7 @@ class File(DatasetUnit):
"""Represent a single file in a dataset.

Attributes:
filename (str): The name of the file.
filepath (Path): The relative path to the file.
content (bytes): The byte content of the file.
cwes (list[CWE]): A list of CWEs associated with the file.
has_vuln (bool): True if the vulnerability is real, False if it's
Expand All @@ -206,20 +206,21 @@ class File(DatasetUnit):
"""

def __init__(
self, filename: str, content: str | bytes, cwes: list[CWE], has_vuln: bool
self, filepath: Path, content: str | bytes, cwes: list[CWE], has_vuln: bool
) -> None:
"""Initialize a File instance.

Args:
filename: The name of the file.
filepath: The relative path of the file.
content: The content of the file, as a string or bytes. It will be
converted to bytes if provided as a string.
cwes: A list of CWEs associated with the file.
has_vuln: True if the vulnerability is real, False if it's
intended to be a false positive test case.

"""
self.filename = filename
self.filepath = filepath
self.filename = self.filepath.name
self.content = content
self.cwes = cwes
self.has_vuln = has_vuln
Expand All @@ -231,29 +232,29 @@ def __repr__(self) -> str:
"""Return a developer-friendly string representation of the File.

Returns:
A string showing the class name, filename, and CWE IDs.
A string showing the class name, filepath, and CWE IDs.

"""
return f"""{self.__class__.__name__}(
filename: \t{self.filename}
filepath: \t{self.filepath}
cwes: \t{self.cwes}
)"""

def __eq__(self, other: str | Self) -> bool:
"""Compare this File with another object for equality based on filename.
def __eq__(self, other: str | Path | Self) -> bool:
"""Compare this File with another object for equality based on filepath.

Args:
other: The object to compare with. Can be a string (filename) or
other: The object to compare with. Can be a string/Path (filepath) or
another File instance.

Returns:
True if the filenames are equal, False otherwise.
True if the filepaths are equal, False otherwise.

"""
if isinstance(other, str):
return self.filename == other
if isinstance(other, (str, Path)):
return self.filepath == Path(other)
elif isinstance(other, self.__class__):
return self.filename == other.filename
return self.filepath == other.filepath
else:
return False

Expand All @@ -264,7 +265,9 @@ def save(self, dir: Path) -> None:
dir: The path to the directory where the file should be saved.

"""
(dir / self.filename).write_bytes(self.content)
target_path = dir / self.filepath
target_path.parent.mkdir(parents=True, exist_ok=True)
target_path.write_bytes(self.content)


class FileDataset(Dataset):
Expand Down Expand Up @@ -303,7 +306,7 @@ def validate(self, analysis_result: AnalysisResult) -> FileDatasetData:
"""
# 1. Prepare ground truth from all files in the dataset
ground_truth: dict[str, tuple[bool, set[CWE]]] = {
file.filename: (file.has_vuln, set(file.cwes)) for file in self.files
str(file.filepath): (file.has_vuln, set(file.cwes)) for file in self.files
}

# 2. Process reported defects to get unique (file, cwe) pairs
Expand All @@ -313,32 +316,32 @@ def validate(self, analysis_result: AnalysisResult) -> FileDatasetData:
if not defect.cwe or defect.cwe.id == -1:
continue

file_cwe_pair = (defect.filename, defect.cwe)
file_cwe_pair = (str(defect.filepath), defect.cwe)
if file_cwe_pair not in unique_reported_defects:
unique_reported_defects[file_cwe_pair] = defect

# 3. Classify unique reported vulnerabilities as TP or FP
tp_defects_map: dict[tuple[str, CWE], Defect] = {}
fp_defects_map: dict[tuple[str, CWE], Defect] = {}

for (filename, cwe), defect in unique_reported_defects.items():
has_vuln, expected_cwes = ground_truth.get(filename, (False, set()))
for (filepath, cwe), defect in unique_reported_defects.items():
has_vuln, expected_cwes = ground_truth.get(filepath, (False, set()))

if has_vuln and cwe in expected_cwes:
# Correctly identified a vulnerability
tp_defects_map[(filename, cwe)] = defect
tp_defects_map[(filepath, cwe)] = defect
else:
# Reported a vuln in a non-vulnerable file, with wrong CWE,
# or in a file not part of the dataset.
fp_defects_map[(filename, cwe)] = defect
fp_defects_map[(filepath, cwe)] = defect

# 4. Determine False Negatives by finding what was missed from the ground truth.
fn_defects_set: set[tuple[str, CWE]] = set()
for filename, (has_vuln, expected_cwes) in ground_truth.items():
for filepath, (has_vuln, expected_cwes) in ground_truth.items():
if has_vuln:
for expected_cwe in expected_cwes:
if (filename, expected_cwe) not in tp_defects_map:
fn_defects_set.add((filename, expected_cwe))
if (filepath, expected_cwe) not in tp_defects_map:
fn_defects_set.add((filepath, expected_cwe))

# 5. Convert maps and sets to lists of objects for downstream use
tp_defects = list(tp_defects_map.values())
Expand All @@ -354,7 +357,7 @@ def validate(self, analysis_result: AnalysisResult) -> FileDatasetData:
fp_cwes = [cwe for _, cwe in fp_defects_map.keys()]
fn_cwes = [cwe for _, cwe in fn_defects_set]

unique_correct_number = len({filename for filename, _ in tp_defects_map.keys()})
unique_correct_number = len({filepath for filepath, _ in tp_defects_map.keys()})

return FileDatasetData(
dataset=self,
Expand Down
10 changes: 8 additions & 2 deletions codesectools/sasts/all/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,13 @@ def benchmark(
dataset: Annotated[
str,
typer.Argument(
click_type=Choice([d.name for d in all_sast.sasts_by_dataset]),
click_type=Choice(
[
f"{d.name}_{lang}"
for d in all_sast.sasts_by_dataset
for lang in d.supported_languages
]
),
metavar="DATASET",
),
],
Expand All @@ -140,7 +146,7 @@ def benchmark(
) -> None:
"""Run a benchmark on a dataset using all available SAST tools."""
dataset_name, lang = dataset.split("_")
for sast in all_sast.sasts_by_dataset.get(lang, []):
for sast in all_sast.sasts_by_dataset.get(DATASETS_ALL[dataset_name], []):
dataset = DATASETS_ALL[dataset_name](lang)
if isinstance(dataset, FileDataset):
sast.analyze_files(dataset, overwrite, testing)
Expand Down
16 changes: 8 additions & 8 deletions codesectools/sasts/all/graphics.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import shutil
import tempfile
from pathlib import Path

import matplotlib
import matplotlib.pyplot as plt
Expand All @@ -11,6 +10,7 @@
from rich import print

from codesectools.sasts.all.sast import AllSAST
from codesectools.utils import shorten_path

## Matplotlib config
matplotlib.rcParams.update(
Expand Down Expand Up @@ -107,7 +107,7 @@ def __init__(self, project_name: str) -> None:
def plot_overview(self) -> Figure:
"""Generate an overview plot with stats by files, SAST tools, and categories."""
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, layout="constrained")
by_files = {Path(k).name: v for k, v in self.result.stats_by_files().items()}
by_files = self.result.stats_by_files()
by_sasts = self.result.stats_by_sasts()
by_categories = self.result.stats_by_categories()

Expand All @@ -117,7 +117,7 @@ def plot_overview(self) -> Figure:
list(by_files.items()), key=lambda e: e[1]["count"], reverse=True
)
for k, v in sorted_files[: self.limit]:
X_files.append(k)
X_files.append(shorten_path(k))
Y_files.append(v["count"])

COLORS_COUNT = {v: 0 for k, v in self.color_mapping.items()}
Expand All @@ -130,11 +130,11 @@ def plot_overview(self) -> Figure:
current_height = 0
for color, height in COLORS_COUNT.items():
if height > 0:
bars.append((k, current_height + height, color))
bars.append((shorten_path(k), current_height + height, color))
current_height += height

for k, height, color in bars[::-1]:
ax1.bar(k, height, color=color)
for k_short, height, color in bars[::-1]:
ax1.bar(k_short, height, color=color)

ax1.set_xticks(X_files, X_files, rotation=45, ha="right")
ax1.set_title(f"Stats by files (limit to {self.limit})")
Expand Down Expand Up @@ -231,7 +231,7 @@ def plot_top_cwes(self) -> Figure:
def plot_top_scores(self) -> Figure:
"""Generate a stacked bar plot for files with the highest scores."""
fig, ax = plt.subplots(1, 1, layout="constrained")
by_scores = {Path(k).name: v for k, v in self.result.stats_by_scores().items()}
by_scores = self.result.stats_by_scores()

for file, data in by_scores.items():
by_scores[file]["total_score"] = sum(data["score"].values())
Expand All @@ -244,7 +244,7 @@ def plot_top_scores(self) -> Figure:

X_files, score_data = [], []
for file, data in sorted_files[: self.limit]:
X_files.append(file)
X_files.append(shorten_path(file))
score_data.append(data["score"])

score_keys = score_data[0].keys()
Expand Down
12 changes: 6 additions & 6 deletions codesectools/sasts/core/graphics.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@

import shutil
import tempfile
from pathlib import Path

import matplotlib
import matplotlib.pyplot as plt
Expand All @@ -19,6 +18,7 @@
from codesectools.datasets.core.dataset import FileDataset, GitRepoDataset
from codesectools.sasts.core.sast import SAST
from codesectools.shared.cwe import CWE
from codesectools.utils import shorten_path

## Matplotlib config
matplotlib.rcParams.update(
Expand Down Expand Up @@ -153,7 +153,7 @@ def plot_overview(self) -> Figure:
project_name = self.result.name

fig, (ax1, ax2, ax3) = plt.subplots(1, 3, layout="constrained")
by_files = {Path(k).name: v for k, v in self.result.stats_by_files().items()}
by_files = self.result.stats_by_files()
by_checkers = self.result.stats_by_checkers()
by_categories = self.result.stats_by_categories()

Expand All @@ -163,7 +163,7 @@ def plot_overview(self) -> Figure:
list(by_files.items()), key=lambda e: e[1]["count"], reverse=True
)
for k, v in sorted_files[: self.limit]:
X_files.append(k)
X_files.append(shorten_path(k))
Y_files.append(v["count"])

COLORS_COUNT = {v: 0 for k, v in self.color_mapping.items()}
Expand All @@ -177,11 +177,11 @@ def plot_overview(self) -> Figure:
current_height = 0
for color, height in COLORS_COUNT.items():
if height > 0:
bars.append((k, current_height + height, color))
bars.append((shorten_path(k), current_height + height, color))
current_height += height

for k, height, color in bars[::-1]:
ax1.bar(k, height, color=color)
for k_short, height, color in bars[::-1]:
ax1.bar(k_short, height, color=color)

ax1.set_xticks(X_files, X_files, rotation=45, ha="right")
ax1.set_title(f"Stats by files (limit to {self.limit})")
Expand Down
13 changes: 12 additions & 1 deletion codesectools/sasts/core/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def __repr__(self) -> str:

"""
return f"""{self.__class__.__name__}(
file: \t{self.file}
filepath: \t{self.filepath}
checker: \t{self.checker}
category: \t{self.category}
cwe: \t{self.cwe}
Expand Down Expand Up @@ -127,6 +127,17 @@ def __init__(
self.loc = loc
self.data = data

# Ensure all defect filepaths are relative to the source path
for defect in self.defects:
if defect.filepath.is_absolute():
try:
defect.filepath = defect.filepath.relative_to(self.source_path)
defect.filepath_str = str(defect.filepath)
except ValueError:
# This can happen if the path is outside the source_path tree.
# We leave it as is, but it will likely not match during validation.
pass

def __repr__(self) -> str:
"""Return a developer-friendly string representation of the AnalysisResult.

Expand Down
Loading