Skip to content

Commit a500ae0

Browse files
committed
Add initial improver for collect repo fix commits
Signed-off-by: ziad hany <ziadhany2016@gmail.com>
1 parent 6255cb2 commit a500ae0

File tree

3 files changed

+463
-0
lines changed

3 files changed

+463
-0
lines changed
Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# VulnerableCode is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
import bisect
10+
import re
11+
from collections import defaultdict
12+
from typing import List
13+
from typing import Optional
14+
from typing import Tuple
15+
16+
from git import Commit
17+
from git import Repo
18+
19+
from vulnerabilities.models import AdvisoryV2
20+
from vulnerabilities.models import CodeFixV2
21+
from vulnerabilities.pipelines import VulnerableCodePipeline
22+
23+
24+
class CollectRepoFixCommitPipeline(VulnerableCodePipeline):
    """
    Pipeline to collect fix commits from any git repository.

    ``collect_fix_commits`` walks every commit of a local checkout and records,
    per CVE id mentioned in a commit message, the URLs of the matching commits.
    ``store_fix_commits`` then creates ``CodeFixV2`` rows for the advisories
    whose ``advisory_id`` ends with one of those CVE ids.
    """

    pipeline_id = "repo_fix_commit_pipeline"
    repositories_url = "git+https://github.com/the-tcpdump-group/tcpdump"
    # Local checkout scanned by ``collect_fix_commits``. Kept as a class
    # attribute so it can be overridden without editing the step itself.
    # TODO: clone ``repositories_url`` instead of relying on a local path.
    repo_path = "/home/ziad-hany/PycharmProjects/tcpdump"

    @classmethod
    def steps(cls):
        return (
            cls.collect_fix_commits,
            cls.store_fix_commits,
        )

    def classify_commit_type(self, commit) -> str:
        """
        Return "root", "normal" or "merge" for a commit with zero, one or
        several parents respectively.
        """
        num_parents = len(commit.parents)
        if num_parents == 0:
            return "root"
        if num_parents == 1:
            return "normal"
        return "merge"

    def detect_fix_commit(self, commit) -> str:
        """
        Return "vulnerability_fix" when the commit message mentions a CVE
        identifier (case-insensitive), otherwise "other".
        """
        msg = commit.message.lower()
        if re.search(r"\bcve-[0-9]{4}-[0-9]{4,19}\b", msg):
            return "vulnerability_fix"
        return "other"

    def extract_cves(self, text: str) -> List[str]:
        """
        Return the unique CVE ids found in ``text``, upper-cased, in order of
        first appearance. Empty or missing text yields an empty list.
        """
        if not text:
            return []
        cves = re.findall(r"cve-[0-9]{4}-[0-9]{4,19}", text, flags=re.IGNORECASE)
        # dict.fromkeys de-duplicates while keeping a deterministic order,
        # unlike a set whose iteration order varies between runs.
        return list(dict.fromkeys(cve.upper() for cve in cves))

    def get_previous_releases(
        self,
        release_tags_sorted: List[Tuple[str, int]],
        dates: List[int],
        commit_date: int,
    ) -> List[str]:
        """
        Return the names of all tags dated strictly before ``commit_date``.

        ``release_tags_sorted`` holds (tag name, date) pairs sorted by date and
        ``dates`` is the parallel, ascending list of those dates.
        """
        index = bisect.bisect_left(dates, commit_date)
        return [tag for tag, _ in release_tags_sorted[:index]]

    def get_current_or_next_release(
        self,
        release_tags_sorted: List[Tuple[str, int]],
        dates: List[int],
        commit_date: int,
    ) -> Optional[str]:
        """
        Return the name of the first tag dated at or after ``commit_date``,
        or None when every tag is older than the commit.
        """
        index = bisect.bisect_left(dates, commit_date)
        # bisect_left already lands on an equal date when there is one, so the
        # "same date" and "next release" cases share a single lookup.
        if index < len(dates):
            return release_tags_sorted[index][0]
        return None

    def get_current_release(
        self, repo: Repo, commit: Commit, prev_release_by_date: Optional[str]
    ) -> str:
        """
        Return a release tag name for ``commit``, trying in order:

        1. ``git describe --tags --exact-match``
        2. ``git describe --tags --abbrev=0 --first-parent``
        3. ``prev_release_by_date`` when it is not empty
        4. the literal "NO_TAGS_AVAILABLE"
        """
        try:
            return repo.git.describe("--tags", "--exact-match", commit.hexsha)
        except Exception:
            # Best effort: a failing describe just means "try the next option".
            pass

        try:
            return repo.git.describe("--tags", "--abbrev=0", "--first-parent", commit.hexsha)
        except Exception:
            pass

        if prev_release_by_date:
            return prev_release_by_date

        return "NO_TAGS_AVAILABLE"

    def collect_fix_commits(self):
        """
        Scan all commits of the checkout at ``repo_path`` and store in
        ``self.fix_commits`` a mapping of CVE id to a sorted list of commit URLs.

        Only non-root commits whose message mentions a CVE are considered.
        """
        self.log("Processing git repository fix commits.")

        # The browsable repository URL is the pipeline URL without its
        # "git+" scheme prefix.
        repo_url = self.repositories_url
        if repo_url.startswith("git+"):
            repo_url = repo_url[len("git+") :]

        repo = Repo(self.repo_path)
        cve_list = defaultdict(set)

        for commit in repo.iter_commits("--all"):
            if self.detect_fix_commit(commit) != "vulnerability_fix":
                continue
            if self.classify_commit_type(commit) == "root":
                continue

            # Release metadata is not stored by this pipeline, so no
            # ``git describe`` is run here; the release helper methods above
            # remain available for callers that need it.
            commit_url = f"{repo_url}/commit/{commit.hexsha}"
            for cve_id in self.extract_cves(commit.message.strip()):
                cve_list[cve_id].add(commit_url)

        # Save results into pipeline state
        self.fix_commits = {cve: sorted(commits) for cve, commits in cve_list.items()}
        self.log(f"Found {len(self.fix_commits)} unique CVEs with fix commits.")

    def store_fix_commits(self):
        """
        Create a ``CodeFixV2`` row for each (advisory, affected package, commit
        URL) combination derived from ``self.fix_commits`` and log how many rows
        were newly created.
        """
        if not hasattr(self, "fix_commits"):
            self.log("No fix commits collected. Run collect_fix_commits() first.")
            return

        created_fix_count = 0

        # FIXME
        for vulnerability_id, commit_urls in self.fix_commits.items():
            advisories = AdvisoryV2.objects.filter(advisory_id__iendswith=vulnerability_id)

            if not advisories.exists():
                self.log(f"No advisories found for vulnerability_id: {vulnerability_id}")
                continue

            for adv in advisories:
                for impact in adv.impacted_packages.all():
                    for package in impact.affecting_packages.all():
                        for vcs_url in commit_urls:
                            code_fix, created = CodeFixV2.objects.get_or_create(
                                commits=[vcs_url],
                                advisory=adv,
                                affected_package=package,
                            )
                            if created:
                                created_fix_count += 1

        self.log(f"Stored {created_fix_count} new CodeFixV2 entries.")
Lines changed: 202 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,202 @@
1+
import bisect
2+
import json
3+
import os
4+
import re
5+
from collections import defaultdict
6+
from typing import List
7+
from typing import Optional
8+
from typing import Tuple
9+
10+
from git import Commit
11+
from git import Repo
12+
13+
14+
def clone_repo(repo_url: str, clone_dir: str) -> str:
    """
    Clone ``repo_url`` into ``clone_dir`` (created when missing).

    Return the working tree directory of the clone, or an empty string when
    cloning fails for any reason; progress and failures are printed.
    """
    os.makedirs(clone_dir, exist_ok=True)
    try:
        print(f"Cloning {repo_url} into {clone_dir}...")
        cloned = Repo.clone_from(repo_url, clone_dir)
        print("Clone successful.")
        working_dir = cloned.working_tree_dir
    except Exception as error:
        print(f"Failed to clone repository: {error}")
        working_dir = ""
    return working_dir
24+
25+
26+
def classify_commit_type(commit) -> str:
    """
    Classify a commit by its number of parents.

    Return "root" for no parent, "normal" for exactly one parent and "merge"
    for two or more.
    """
    parent_count = len(commit.parents)
    if parent_count > 1:
        return "merge"
    return "normal" if parent_count == 1 else "root"
34+
35+
36+
_SECURITY_PATTERNS = (
    # CVE identifiers
    r"\bcve-\d{4}-\d{4,}\b",
    # Explicitly marked security fixes
    r"\bsecurity fix\b",
    r"\bfix security issue\b",
    r"\bfix(?:es)? for security\b",
    # Permission / privilege escalation
    r"\bprivilege escalation\b",
    r"\bprivesc\b",
    r"\bescalat(?:e|ion) of privilege\b",
    # No New Privileges / unsafe exec
    r"\bno[- ]new[- ]privs\b",
    r"\bunsafe exec\b",
    # Refcount / UAF (classic kernel vulns, almost always security)
    r"\buse[- ]after[- ]free\b",
    r"\buaf\b",
    r"\brefcount (?:leak|error|overflow|underflow)\b",
    r"\bdouble free\b",
    # Out-of-bounds (OOB)
    r"\bout[- ]of[- ]bounds\b",
    r"\boob\b",
    # Info leaks (security-relevant, not generic leaks)
    r"\binformation leak\b",
    r"\binfo leak\b",
    r"\bleak (?:kernel|userns|credentials?|mnt_idmap)\b",
    # Bypass
    r"\bsecurity bypass\b",
    r"\baccess control bypass\b",
    r"\bpermission check (?:bug|fix|error)\b",
)

# Built once at import time: this function runs for every commit of the
# repository, so the alternation must not be re-joined on each call.
_SECURITY_REGEX = re.compile("|".join(_SECURITY_PATTERNS), re.IGNORECASE)


def detect_fix_commit(commit) -> str:
    """
    Detect whether a commit looks like a vulnerability-fix commit.

    The commit message is searched, case-insensitively, for a CVE identifier
    or one of the security-related phrases listed in ``_SECURITY_PATTERNS``.

    Returns: "vulnerability_fix" when something matches, otherwise "other".
    """
    # The regex is compiled with IGNORECASE, so the message is searched as-is.
    if _SECURITY_REGEX.search(commit.message):
        return "vulnerability_fix"
    return "other"
80+
81+
82+
def extract_cves(text: str) -> List[str]:
    """
    Return the unique CVE ids mentioned in ``text``.

    Matching is case-insensitive; ids are returned upper-cased, in order of
    first appearance. Empty or missing text yields an empty list.
    """
    if not text:
        return []
    matches = re.findall(r"cve-[0-9]{4}-[0-9]{4,19}", text, flags=re.IGNORECASE)
    # dict.fromkeys de-duplicates while keeping a deterministic order, unlike
    # a set whose iteration order can change from one run to the next.
    return list(dict.fromkeys(match.upper() for match in matches))
87+
88+
89+
def get_previous_releases(
    release_tags_sorted: List[Tuple[str, int]], dates: List[int], commit_date: int
) -> List[str]:
    """
    Return the names of every release tag dated strictly before ``commit_date``.

    release_tags_sorted: (tag_name, committed_date) pairs, sorted by committed_date
    dates: the committed dates of ``release_tags_sorted``, in the same order
    """
    # bisect_left gives the position of the first date >= commit_date, so
    # everything to its left is strictly older than the commit.
    cutoff = bisect.bisect_left(dates, commit_date)
    older_names = []
    for name, _date in release_tags_sorted[:cutoff]:
        older_names.append(name)
    return older_names
99+
100+
101+
def get_current_or_next_release(
    release_tags_sorted: List[Tuple[str, int]], dates: List[int], commit_date: int
) -> Optional[str]:
    """
    Return the name of the first release tag dated at or after ``commit_date``.

    release_tags_sorted: (tag_name, committed_date) pairs, sorted by committed_date
    dates: the committed dates of ``release_tags_sorted``, in the same order

    A tag with the same date as the commit is returned as the "current" release;
    this is a date comparison only and does not prove the commit itself is
    tagged. Return None when every tag is older than the commit.
    """
    # bisect_left lands on an equal date when there is one, otherwise on the
    # next later date, so one lookup covers both cases.
    index = bisect.bisect_left(dates, commit_date)
    if index < len(dates):
        return release_tags_sorted[index][0]

    # No release at or after this commit
    return None
120+
121+
122+
def get_current_release(repo: Repo, commit: Commit, prev_release_by_date: Optional[str]) -> str:
    """
    Return a non-null release tag for the given commit, trying in order:
    1) ``git describe --tags --exact-match`` on the commit
    2) ``git describe --tags --abbrev=0 --first-parent`` on the commit
    3) ``prev_release_by_date`` when it is not empty
    4) "NO_TAGS_AVAILABLE"
    """
    describe_attempts = (
        ("--tags", "--exact-match"),
        ("--tags", "--abbrev=0", "--first-parent"),
    )
    for describe_options in describe_attempts:
        try:
            return repo.git.describe(*describe_options, commit.hexsha)
        except Exception:
            # Best effort: any failure simply moves on to the next fallback.
            continue

    # Neither describe call produced a tag: use the date-based fallback, or
    # the sentinel when there is none.
    return prev_release_by_date or "NO_TAGS_AVAILABLE"
148+
149+
150+
def main(
    repo_url: str = "https://github.com/torvalds/linux",
    repo_path: str = "/home/ziad-hany/PycharmProjects/linux",
) -> None:
    """
    Scan every commit of the local checkout at ``repo_path`` for vulnerability
    fixes.

    Each non-root commit flagged by ``detect_fix_commit`` is printed as a dict
    (hash, url, message, current release, previous releases, fix type). A JSON
    mapping of CVE id to sorted commit URLs is printed at the end.

    ``repo_url`` is only used to build the commit URLs.
    """
    repo = Repo(repo_path)
    cve_list = defaultdict(set)

    # Precompute (tag name, commit date) pairs sorted by commit date
    release_tags = []
    for tag in repo.tags:
        try:
            release_tags.append((tag.name, tag.commit.committed_date))
        except Exception:
            # Best effort: skip tags whose commit cannot be resolved.
            continue

    release_tags_sorted = sorted(release_tags, key=lambda item: item[1])
    dates_array = [date for _, date in release_tags_sorted]

    for commit in repo.iter_commits("--all"):
        fix_type = detect_fix_commit(commit)
        if fix_type != "vulnerability_fix":
            continue
        # Root commits (no parents) are never reported as fixes.
        if classify_commit_type(commit) == "root":
            continue

        # Compute "previous by date" first so we can feed it as a fallback
        prev_release_list = get_previous_releases(
            release_tags_sorted, dates_array, commit.committed_date
        )
        prev_release_by_date = prev_release_list[-1] if prev_release_list else None
        curr_release = get_current_release(repo, commit, prev_release_by_date)

        commit_url = repo_url + "/commit/" + commit.hexsha
        commit_info = {
            "hash": commit.hexsha,
            "url": commit_url,
            "message": commit.message.strip(),
            "curr_release": curr_release,
            "prev_release": prev_release_list,
            "fix_type": fix_type,
        }
        print(commit_info)

        # Optional CVE collection
        for cve_id in extract_cves(commit.message.strip()):
            cve_list[cve_id].add(commit_url)

    # Sorted so that the JSON output is stable between runs.
    result = {cve: sorted(commits) for cve, commits in cve_list.items()}
    print(f"Found {len(result)} unique CVEs")
    print(json.dumps(result, indent=2))


if __name__ == "__main__":
    import sys

    # Optional overrides: <script> [repo_url [repo_path]]; with no arguments
    # the defaults of main() are used.
    main(*sys.argv[1:3])

0 commit comments

Comments
 (0)