Skip to content

Commit c64ab56

Browse files
committed
Create pipeline for symbol reachability
Signed-off-by: ziad hany <ziadhany2016@gmail.com>
1 parent 904373a commit c64ab56

4 files changed

Lines changed: 264 additions & 0 deletions

File tree

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,7 @@ collect_strings_gettext = "scanpipe.pipelines.collect_strings_gettext:CollectStr
141141
collect_symbols_ctags = "scanpipe.pipelines.collect_symbols_ctags:CollectSymbolsCtags"
142142
collect_symbols_pygments = "scanpipe.pipelines.collect_symbols_pygments:CollectSymbolsPygments"
143143
collect_symbols_tree_sitter = "scanpipe.pipelines.collect_symbols_tree_sitter:CollectSymbolsTreeSitter"
144+
collect_symbols_patches = "scanpipe.pipelines.collect_patch_symbols:CollectPatchSymbols"
144145
enrich_with_purldb = "scanpipe.pipelines.enrich_with_purldb:EnrichWithPurlDB"
145146
fetch_scores = "scanpipe.pipelines.fetch_scores:FetchScores"
146147
find_vulnerabilities = "scanpipe.pipelines.find_vulnerabilities:FindVulnerabilities"
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# VulnerableCode is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
10+
from scanpipe.pipelines import Pipeline
11+
from scanpipe.pipes import symbols
12+
13+
14+
class CollectPatchSymbols(Pipeline):
    """
    Collect symbols from vulnerability patches and compare them with the
    symbols of the project codebase resources.

    Add-on pipeline made of a single step that delegates to
    ``scanpipe.pipes.symbols.collect_and_store_patch_symbols``. The symbol
    collector is currently fixed to ``tree_sitter``.
    """

    download_inputs = False
    is_addon = True
    # NOTE(review): confirm that the ``patch_symbols`` filter value matches the
    # extra_data keys actually written by the pipe.
    results_url = "/project/{slug}/resources/?extra_data=patch_symbols"

    @classmethod
    def steps(cls):
        """Return the tuple of steps executed by this pipeline."""
        return (cls.collect_and_store_patch_symbols_and_strings,)

    def collect_and_store_patch_symbols_and_strings(self):
        """
        Collect the vulnerable and fixed symbols of the patches for the
        project packages, using the ``tree_sitter`` collector, and pass the
        pipeline logger along for progress reporting.
        """
        symbols.collect_and_store_patch_symbols(
            project=self.project,
            symbol_type="tree_sitter",
            logger=self.log,
        )

scanpipe/pipes/symbols.py

Lines changed: 233 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,23 @@
2020
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
2121
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
2222

23+
import os
24+
import tempfile
25+
2326
from django.db.models import Q
2427

28+
from source_inspector import symbols_ctags
29+
from source_inspector import symbols_pygments
30+
from source_inspector import symbols_tree_sitter
31+
2532
from aboutcode.pipeline import LoopProgress
33+
from scanpipe.pipes.fetch import fetch_http
34+
from scanpipe.pipes.pathmap import build_index
35+
from scanpipe.pipes.pathmap import find_paths
36+
from scanpipe.pipes.symbolmap import MATCHING_RATIO_JAVASCRIPT
37+
from scanpipe.pipes.symbolmap import MATCHING_RATIO_JAVASCRIPT_SMALL_FILE
38+
from scanpipe.pipes.symbolmap import SMALL_FILE_SYMBOLS_THRESHOLD_JAVASCRIPT
39+
from scanpipe.pipes.symbolmap import get_similarity_between_source_and_deployed_symbols
2640

2741

2842
class UniversalCtagsNotFound(Exception):
@@ -171,3 +185,222 @@ def _collect_and_store_tree_sitter_symbols_and_strings(resource):
171185
"source_strings": result.get("source_strings"),
172186
}
173187
)
188+
189+
190+
# Supported symbol collectors: maps the collector name, as passed in the
# ``symbol_type``/``symbols_type`` arguments of the patch helpers, to the
# ``source_inspector`` function called with the path of the file to inspect.
SYMBOLS_TYPE_SUPPORTED = {
    "ctags": symbols_ctags.get_symbols,
    "tree_sitter": symbols_tree_sitter.get_treesitter_symbols,
    "pygments": symbols_pygments.get_pygments_symbols,
}
195+
196+
# Lower-case file extensions of patched files that are ignored when collecting
# patch symbols (see ``_should_skip``).
DOC_EXTENSIONS = {
    # Documentation and markup.
    ".md",
    ".rst",
    ".txt",
    ".html",
    ".pdf",
    ".wiki",
    # Structured data and configuration.
    ".json",
    ".yaml",
    ".yml",
    ".toml",
}
208+
209+
210+
def get_vulnerability_patch_text(vuln):
    """
    Yield the text of each patch to analyze for the ``vuln`` vulnerability.

    TODO: this is a mock. The ``vuln`` argument is currently ignored and the
    same hard-coded Flask patch is downloaded for every call. Delete this
    function once we migrate to the VulnerableCode v2 API.
    """
    # NOTE(review): presumably the package scanned while exercising this mock:
    # https://files.pythonhosted.org/packages/99/ab/eedb921f26adf7057ade1291f9c1bfa35a506d64894f58546457ef658772/Flask-1.0.tar.gz
    patch_urls = [
        # VCID-z6fe-2j8a-aaak
        # "https://github.com/pallets/flask/commit/70f906c51ce49c485f1d355703e9cc3386b1cc2b.patch",
        # "https://github.com/pallets/flask/commit/afd63b16170b7c047f5758eb910c416511e9c965.patch",
        # VCID-e8hf-2zj4-1qhv
        "https://github.com/pallets/flask/commit/089cb86dd22bff589a4eafb7ab8e42dc357623b4.patch",
    ]

    for patch_url in patch_urls:
        file_path = fetch_http(patch_url).path
        # Decode explicitly as UTF-8 rather than with the locale default
        # encoding, so the result does not depend on the host configuration.
        with open(file_path, encoding="utf-8") as patch_file:
            patch_text = patch_file.read()
        # Yield only once the file is closed so that no handle stays open
        # while the consumer processes the patch.
        yield patch_text
227+
228+
229+
def parse_patch_symbols(raw_code: str, path: str, symbols_type="tree_sitter") -> dict:
    """
    Return the symbols collected from the ``raw_code`` snippet of a patch.

    The snippet is written to a temporary file that keeps the extension of
    ``path``, then handed to the collector registered under ``symbols_type``
    in ``SYMBOLS_TYPE_SUPPORTED``. The temporary file is always removed.

    Return an empty dict when ``raw_code`` is empty or blank, when
    ``symbols_type`` is not supported, or when the collector returns nothing.
    """
    if not raw_code or not raw_code.strip():
        return {}

    # Resolve the collector first: no temporary file is needed when the
    # requested type is unknown.
    parser_func = SYMBOLS_TYPE_SUPPORTED.get(symbols_type)
    if parser_func is None:
        return {}

    _, file_suffix = os.path.splitext(path)

    # ``delete=False`` because the collector re-opens the file by name, which
    # requires our own handle to be closed first on some platforms.
    temp_file = tempfile.NamedTemporaryFile(
        mode="w",
        suffix=file_suffix,
        delete=False,
        # Explicit encoding: patch content must not depend on the locale.
        encoding="utf-8",
    )
    temp_name = temp_file.name
    try:
        with temp_file:
            temp_file.write(raw_code)
        return parser_func(temp_name) or {}
    finally:
        # Covers a failed write as well as a failed parse.
        os.remove(temp_name)
245+
246+
247+
def get_patch_symbols(vulnerable_files: dict, fixed_files: dict, symbol_type) -> dict:
    """
    Return a mapping of patched file path to its vulnerable and fixed symbols.

    ``vulnerable_files`` and ``fixed_files`` map a file path to the code
    removed and added by a patch. Every path present in either mapping gets
    an entry with the "vulnerable_symbols", "vulnerable_strings",
    "fixed_symbols" and "fixed_strings" keys, each defaulting to an empty
    list. Symbols are collected with the ``symbol_type`` collector.
    """
    results = {}

    for path in set(vulnerable_files) | set(fixed_files):
        # A path missing on one side is parsed as empty code, which yields
        # an empty result for that side.
        before = parse_patch_symbols(vulnerable_files.get(path, ""), path, symbol_type)
        after = parse_patch_symbols(fixed_files.get(path, ""), path, symbol_type)

        entry = {}
        entry["vulnerable_symbols"] = before.get("source_symbols", [])
        entry["vulnerable_strings"] = before.get("source_strings", [])
        entry["fixed_symbols"] = after.get("source_symbols", [])
        entry["fixed_strings"] = after.get("source_strings", [])
        results[path] = entry

    return results
264+
265+
266+
def _should_skip(file_path: str):
267+
file_name = os.path.basename(file_path)
268+
_, ext = os.path.splitext(file_name)
269+
270+
if ext.lower() in DOC_EXTENSIONS:
271+
return True
272+
273+
lower_name = file_name.lower()
274+
if (
275+
lower_name.startswith("test_")
276+
or lower_name.startswith("test")
277+
or "_test." in lower_name
278+
):
279+
return True
280+
281+
lower_path = file_path.lower()
282+
if "test/" in lower_path or "tests/" in lower_path or "/testdata/" in lower_path:
283+
return True
284+
285+
return False
286+
287+
288+
def extract_patch_details(patch_text: str):
    """
    Return a ``(vulnerable_files, fixed_files)`` tuple built from ``patch_text``.

    Both items are dicts keyed by patched file path. ``vulnerable_files``
    holds the concatenated lines removed by the patch and ``fixed_files``
    the concatenated lines added by it. A file only gets an entry on a side
    where it has at least one such line, and files rejected by
    ``_should_skip`` are left out entirely.
    """
    from unidiff import PatchSet

    vulnerable_files, fixed_files = {}, {}

    for patched_file in PatchSet(patch_text):
        target_path = patched_file.path
        if _should_skip(target_path):
            continue

        changed_lines = [line for hunk in patched_file for line in hunk]
        removed = [line.value for line in changed_lines if line.is_removed]
        added = [
            line.value
            for line in changed_lines
            if not line.is_removed and line.is_added
        ]

        if removed:
            vulnerable_files[target_path] = "".join(removed)
        if added:
            fixed_files[target_path] = "".join(added)

    return vulnerable_files, fixed_files
314+
315+
316+
def collect_and_store_patch_symbols(project, symbol_type, logger=None):
    """
    Collect and store the patch symbols for each discovered package of the
    ``project`` using the ``symbol_type`` collector.

    When provided, ``logger`` is called with a start message and is also
    handed to the progress tracker. A failure on one package is recorded as
    a project error and does not stop the processing of the other packages.
    """
    discovered_packages = project.discoveredpackages.all()
    total = discovered_packages.count()

    if logger:
        start_message = (
            f"Collecting patch symbols for {total:,d} discovered packages "
            "and computing reachability."
        )
        logger(start_message)

    progress = LoopProgress(total, logger)
    package_stream = discovered_packages.iterator(chunk_size=2000)

    for package in progress.iter(package_stream):
        try:
            _collect_and_store_patch_symbols(project, package, symbol_type)
        except Exception as error:
            # Deliberately broad: report the failure on the project and carry
            # on with the next package.
            project.add_error(
                description=f"Cannot collect patch symbols for package {package.name}",
                exception=error,
                model="collect_and_store_patch_symbols",
                details={"package_uuid": str(package.uuid)},
            )
337+
338+
339+
def calculate_reachability(source_symbols, vulnerable_symbols, fixed_symbols):
    """
    Compare ``source_symbols`` with the vulnerable and the fixed symbols of a
    patch and return the results as a dict.

    The returned dict holds the match flag and the similarity for each side
    ("is_vulnerable_matched", "vulnerable_similarity", "is_fixed_matched",
    "fixed_similarity") along with "is_reachable", which is True when the
    vulnerable similarity is greater than or equal to the fixed similarity.
    """
    # The same JavaScript thresholds are applied to both comparisons.
    thresholds = {
        "matching_ratio": MATCHING_RATIO_JAVASCRIPT,
        "matching_ratio_small_file": MATCHING_RATIO_JAVASCRIPT_SMALL_FILE,
        "small_file_threshold": SMALL_FILE_SYMBOLS_THRESHOLD_JAVASCRIPT,
    }

    vulnerable_matched, vulnerable_score = (
        get_similarity_between_source_and_deployed_symbols(
            source_symbols=source_symbols,
            deployed_symbols=vulnerable_symbols,
            **thresholds,
        )
    )
    fixed_matched, fixed_score = get_similarity_between_source_and_deployed_symbols(
        source_symbols=source_symbols,
        deployed_symbols=fixed_symbols,
        **thresholds,
    )

    # NOTE(review): equal similarities count as reachable, including the case
    # where neither side matched at all - confirm this is intended.
    reachable = vulnerable_score >= fixed_score

    return {
        "is_vulnerable_matched": vulnerable_matched,
        "vulnerable_similarity": vulnerable_score,
        "is_fixed_matched": fixed_matched,
        "fixed_similarity": fixed_score,
        "is_reachable": reachable,
    }
365+
366+
367+
def _collect_and_store_patch_symbols(project, package, symbol_type):
    """
    Collect the patch symbols for the vulnerabilities affecting ``package``
    and store them, with a reachability result, on the matching resources.

    For each patch of each vulnerability, the patched file paths are looked
    up in an index of the ``project`` codebase resource paths. Every matched
    resource gets its ``extra_data`` updated with the "vulnerable_symbols",
    "fixed_symbols" and "reachability" keys, where the reachability is
    computed against the resource "source_symbols" extra data.
    Patched files without any matching resource are logged and skipped.
    """
    import logging

    logger = logging.getLogger(__name__)

    vulnerabilities = package.affected_by_vulnerabilities
    if not vulnerabilities:
        # Nothing to do: avoid indexing every resource path for no reason.
        return

    resource_data = project.codebaseresources.values_list("id", "path")
    path_index = build_index(resource_data, with_subpaths=True)

    for vuln in vulnerabilities:
        # TODO fix this to after done with vulnerablecode migration to
        # advisories and merge patch API
        for patch_text in get_vulnerability_patch_text(vuln):
            if not patch_text or not patch_text.strip():
                continue

            vulnerable_files, fixed_files = extract_patch_details(patch_text)
            patch_symbols_data = get_patch_symbols(
                vulnerable_files, fixed_files, symbol_type
            )
            for file_path, patch_symbols in patch_symbols_data.items():
                # ``find_paths`` may return None when no indexed path matches;
                # dereferencing it blindly would abort the whole package.
                match = find_paths(file_path, path_index)
                matched_resources = None
                if match:
                    matched_resources = project.codebaseresources.filter(
                        id__in=match.resource_ids
                    )
                if not matched_resources:
                    logger.warning(
                        "Failed to get the code base resources: %s", file_path
                    )
                    continue

                # Identical for every resource matched by this patched file.
                vulnerable_symbols = patch_symbols.get("vulnerable_symbols", [])
                fixed_symbols = patch_symbols.get("fixed_symbols", [])

                for resource in matched_resources:
                    resource_symbols = resource.extra_data.get("source_symbols", [])
                    reachability = calculate_reachability(
                        resource_symbols, vulnerable_symbols, fixed_symbols
                    )
                    resource.update_extra_data(
                        {
                            "vulnerable_symbols": vulnerable_symbols,
                            "fixed_symbols": fixed_symbols,
                            "reachability": reachability,
                        }
                    )

scanpipe/tests/pipes/test_patch_symbols.py

Whitespace-only changes.

0 commit comments

Comments
 (0)