|
20 | 20 | # ScanCode.io is a free software code scanning tool from nexB Inc. and others. |
21 | 21 | # Visit https://github.com/aboutcode-org/scancode.io for support and download. |
22 | 22 |
|
| 23 | +import os |
| 24 | +import tempfile |
| 25 | + |
23 | 26 | from django.db.models import Q |
24 | 27 |
|
| 28 | +from source_inspector import symbols_ctags |
| 29 | +from source_inspector import symbols_pygments |
| 30 | +from source_inspector import symbols_tree_sitter |
| 31 | + |
25 | 32 | from aboutcode.pipeline import LoopProgress |
| 33 | +from scanpipe.pipes.fetch import fetch_http |
| 34 | +from scanpipe.pipes.pathmap import build_index |
| 35 | +from scanpipe.pipes.pathmap import find_paths |
| 36 | +from scanpipe.pipes.symbolmap import MATCHING_RATIO_JAVASCRIPT |
| 37 | +from scanpipe.pipes.symbolmap import MATCHING_RATIO_JAVASCRIPT_SMALL_FILE |
| 38 | +from scanpipe.pipes.symbolmap import SMALL_FILE_SYMBOLS_THRESHOLD_JAVASCRIPT |
| 39 | +from scanpipe.pipes.symbolmap import get_similarity_between_source_and_deployed_symbols |
26 | 40 |
|
27 | 41 |
|
28 | 42 | class UniversalCtagsNotFound(Exception): |
@@ -171,3 +185,222 @@ def _collect_and_store_tree_sitter_symbols_and_strings(resource): |
171 | 185 | "source_strings": result.get("source_strings"), |
172 | 186 | } |
173 | 187 | ) |
| 188 | + |
| 189 | + |
# Symbol extraction backends, keyed by the ``symbols_type`` name accepted by
# ``parse_patch_symbols``. Each value is called with the location of a file on
# disk and its result is read as a mapping (``source_symbols`` and
# ``source_strings`` keys are the ones consumed in this module).
SYMBOLS_TYPE_SUPPORTED = {
    "ctags": symbols_ctags.get_symbols,
    "tree_sitter": symbols_tree_sitter.get_treesitter_symbols,
    "pygments": symbols_pygments.get_pygments_symbols,
}
| 195 | + |
# Lowercase file suffixes (leading dot included) of patched files that are
# ignored when extracting symbols from a patch: they hold documentation or
# data rather than code.
DOC_EXTENSIONS = {
    # Documentation and markup formats.
    ".md",
    ".rst",
    ".txt",
    ".html",
    ".pdf",
    ".wiki",
    # Structured data and configuration formats.
    ".json",
    ".yaml",
    ".yml",
    ".toml",
}
| 208 | + |
| 209 | + |
def get_vulnerability_patch_text(vuln):
    """
    Yield the text of each patch associated with the ``vuln`` vulnerability.

    This is a temporary mock: ``vuln`` is currently ignored and a hard-coded
    list of Flask commit patches is downloaded with ``fetch_http`` instead.
    One string is yielded per patch URL.
    """
    # TODO: This is a mock. Delete this function once we migrate to the
    # VulnerableCode v2 API.
    # NOTE(review): presumably the archive used to exercise this mock:
    # https://files.pythonhosted.org/packages/99/ab/eedb921f26adf7057ade1291f9c1bfa35a506d64894f58546457ef658772/Flask-1.0.tar.gz

    patch_urls = [
        # VCID-z6fe-2j8a-aaak
        # "https://github.com/pallets/flask/commit/70f906c51ce49c485f1d355703e9cc3386b1cc2b.patch",
        # "https://github.com/pallets/flask/commit/afd63b16170b7c047f5758eb910c416511e9c965.patch",
        # VCID-e8hf-2zj4-1qhv
        "https://github.com/pallets/flask/commit/089cb86dd22bff589a4eafb7ab8e42dc357623b4.patch"
    ]

    for patch_url in patch_urls:
        file_path = fetch_http(patch_url).path
        # Decode explicitly instead of relying on the platform default
        # encoding, and replace undecodable bytes rather than failing: a patch
        # may carry content in arbitrary encodings.
        with open(file_path, encoding="utf-8", errors="replace") as patch_file:
            patch_text = patch_file.read()
        # Yield only once the file is closed so that no handle stays open
        # while the generator is suspended.
        yield patch_text
| 227 | + |
| 228 | + |
def parse_patch_symbols(raw_code: str, path: str, symbols_type="tree_sitter") -> dict:
    """
    Return the symbols data extracted from the ``raw_code`` snippet.

    The snippet is written to a temporary file that keeps the suffix of
    ``path``, and that file is handed to the ``symbols_type`` backend
    registered in ``SYMBOLS_TYPE_SUPPORTED``. The temporary file is always
    removed before returning.

    Return an empty dict when ``raw_code`` is empty or whitespace-only, when
    ``symbols_type`` is not a registered backend, or when the backend returns
    a falsy result.
    """
    if not raw_code or not raw_code.strip():
        return {}

    parser_func = SYMBOLS_TYPE_SUPPORTED.get(symbols_type)
    if parser_func is None:
        # Unknown backend: nothing can be extracted, so do not bother creating
        # a temporary file at all.
        return {}

    _, file_suffix = os.path.splitext(path)

    # ``delete=False`` because the backend re-opens the file by name after it
    # has been closed here. The encoding is explicit so that non-ASCII source
    # code is written the same way on every platform.
    temp_file = tempfile.NamedTemporaryFile(
        mode="w",
        suffix=file_suffix,
        encoding="utf-8",
        delete=False,
    )
    try:
        # The write happens inside the ``try`` so that a failure while writing
        # (e.g. disk full, unencodable text) still removes the file below.
        with temp_file:
            temp_file.write(raw_code)
        return parser_func(temp_file.name) or {}
    finally:
        os.remove(temp_file.name)
| 245 | + |
| 246 | + |
def get_patch_symbols(vulnerable_files: dict, fixed_files: dict, symbol_type) -> dict:
    """
    Return a mapping of file path to the symbols and strings found in the
    vulnerable and fixed code of that file.

    ``vulnerable_files`` and ``fixed_files`` map a file path to a code
    snippet. Every path present in either mapping gets an entry with the
    ``vulnerable_symbols``, ``vulnerable_strings``, ``fixed_symbols`` and
    ``fixed_strings`` keys; a side missing from its mapping is parsed as an
    empty snippet.
    """

    def parse_side(code_by_path, file_path):
        # A path absent from one side is parsed as an empty snippet.
        snippet = code_by_path.get(file_path, "")
        return parse_patch_symbols(snippet, file_path, symbol_type)

    results_by_path = {}
    for file_path in set(vulnerable_files) | set(fixed_files):
        before = parse_side(vulnerable_files, file_path)
        after = parse_side(fixed_files, file_path)
        results_by_path[file_path] = {
            "vulnerable_symbols": before.get("source_symbols", []),
            "vulnerable_strings": before.get("source_strings", []),
            "fixed_symbols": after.get("source_symbols", []),
            "fixed_strings": after.get("source_strings", []),
        }

    return results_by_path
| 264 | + |
| 265 | + |
def _should_skip(file_path: str):
    """
    Return True when the patched file at ``file_path`` should be ignored.

    ``file_path`` is a "/"-separated path. A file is skipped, case
    insensitively, when:

    - its extension is listed in ``DOC_EXTENSIONS``,
    - its name starts with "test" or contains "_test.",
    - one of its parent directories is named "test", "tests" or "testdata".
    """
    *directories, lower_name = file_path.lower().split("/")

    _, ext = os.path.splitext(lower_name)
    if ext in DOC_EXTENSIONS:
        return True

    # ``startswith("test")`` already covers the "test_" prefix.
    if lower_name.startswith("test") or "_test." in lower_name:
        return True

    # Compare whole directory names rather than searching for substrings, so
    # that "latest/" or "contest/" are not mistaken for a "test/" directory
    # and a top-level "testdata/" directory is recognized too.
    test_directories = ("test", "tests", "testdata")
    return any(directory in test_directories for directory in directories)
| 286 | + |
| 287 | + |
def extract_patch_details(patch_text: str):
    """
    Return a ``(vulnerable_files, fixed_files)`` tuple of dicts built from the
    unified diff ``patch_text``.

    Each dict maps the path of a patched file to a code snippet:
    ``vulnerable_files`` holds the concatenated removed lines and
    ``fixed_files`` the concatenated added lines. Files rejected by
    ``_should_skip`` are left out, and a file only appears in a dict when it
    has at least one line of the matching kind.
    """
    from unidiff import PatchSet

    removed_by_path = {}
    added_by_path = {}

    for patched_file in PatchSet(patch_text):
        target_path = patched_file.path
        if _should_skip(target_path):
            continue

        removed_chunks = []
        added_chunks = []
        diff_lines = (line for hunk in patched_file for line in hunk)
        for diff_line in diff_lines:
            if diff_line.is_removed:
                removed_chunks.append(diff_line.value)
            elif diff_line.is_added:
                added_chunks.append(diff_line.value)

        # Context lines are ignored: only the changed lines are kept.
        if removed_chunks:
            removed_by_path[target_path] = "".join(removed_chunks)
        if added_chunks:
            added_by_path[target_path] = "".join(added_chunks)

    return removed_by_path, added_by_path
| 314 | + |
| 315 | + |
def collect_and_store_patch_symbols(project, symbol_type, logger=None):
    """
    Collect the patch symbols for every discovered package of ``project``
    using the ``symbol_type`` backend.

    ``logger``, when provided, is called with a summary message and is also
    handed to ``LoopProgress``. A failure while processing one package is
    recorded on the project with ``project.add_error`` and does not stop the
    remaining packages from being processed.
    """
    discovered_packages = project.discoveredpackages.all()
    total = discovered_packages.count()

    if logger:
        logger(
            f"Collecting patch symbols for {total:,d} discovered packages "
            "and computing reachability."
        )

    loop_progress = LoopProgress(total, logger)
    package_iterator = discovered_packages.iterator(chunk_size=2000)

    for package in loop_progress.iter(package_iterator):
        try:
            _collect_and_store_patch_symbols(project, package, symbol_type)
        except Exception as error:
            # Keep going: one broken package must not abort the whole run.
            project.add_error(
                description=f"Cannot collect patch symbols for package {package.name}",
                exception=error,
                model="collect_and_store_patch_symbols",
                details={"package_uuid": str(package.uuid)},
            )
| 337 | + |
| 338 | + |
def calculate_reachability(source_symbols, vulnerable_symbols, fixed_symbols):
    """
    Return a dict comparing ``source_symbols`` with the symbols of the
    vulnerable and of the fixed side of a patch.

    The dict holds the match flag and similarity for each side
    (``is_vulnerable_matched``, ``vulnerable_similarity``,
    ``is_fixed_matched``, ``fixed_similarity``) and ``is_reachable``, which is
    True when the vulnerable similarity is greater than or equal to the fixed
    similarity.
    """
    # Both comparisons share the same thresholds.
    # NOTE(review): the JavaScript thresholds are used whatever the language
    # of the compared files is -- confirm this is intended.
    thresholds = {
        "matching_ratio": MATCHING_RATIO_JAVASCRIPT,
        "matching_ratio_small_file": MATCHING_RATIO_JAVASCRIPT_SMALL_FILE,
        "small_file_threshold": SMALL_FILE_SYMBOLS_THRESHOLD_JAVASCRIPT,
    }

    vulnerable_matched, vulnerable_score = (
        get_similarity_between_source_and_deployed_symbols(
            source_symbols=source_symbols,
            deployed_symbols=vulnerable_symbols,
            **thresholds,
        )
    )
    fixed_matched, fixed_score = get_similarity_between_source_and_deployed_symbols(
        source_symbols=source_symbols,
        deployed_symbols=fixed_symbols,
        **thresholds,
    )

    return {
        "is_vulnerable_matched": vulnerable_matched,
        "vulnerable_similarity": vulnerable_score,
        "is_fixed_matched": fixed_matched,
        "fixed_similarity": fixed_score,
        "is_reachable": vulnerable_score >= fixed_score,
    }
| 365 | + |
| 366 | + |
def _collect_and_store_patch_symbols(project, package, symbol_type):
    """
    Compute and store patch-based reachability data for the codebase
    resources of ``project`` touched by the vulnerabilities of ``package``.

    For each patch of each vulnerability in
    ``package.affected_by_vulnerabilities``, the symbols of the vulnerable and
    fixed code are extracted with the ``symbol_type`` backend. Each patched
    file path is matched against the project resource paths, and the
    ``vulnerable_symbols``, ``fixed_symbols`` and ``reachability`` keys are
    written to the ``extra_data`` of every matched resource.

    NOTE(review): a resource touched by several patches only keeps the data
    of the last patch processed, since the same keys are overwritten.
    """
    import logging

    vulnerabilities = package.affected_by_vulnerabilities
    if not vulnerabilities:
        # Nothing to analyze: avoid indexing every resource path for nothing.
        return

    logger = logging.getLogger(__name__)

    # NOTE(review): this index only depends on the project, yet it is rebuilt
    # for each package; consider building it once in the caller.
    resource_data = project.codebaseresources.values_list("id", "path")
    path_index = build_index(resource_data, with_subpaths=True)

    for vuln in vulnerabilities:
        # TODO: Replace this mock once the VulnerableCode migration to
        # advisories is done and the patch API is merged.
        for patch_text in get_vulnerability_patch_text(vuln):
            if not patch_text or not patch_text.strip():
                continue

            vulnerable_files, fixed_files = extract_patch_details(patch_text)
            patch_symbols_data = get_patch_symbols(
                vulnerable_files, fixed_files, symbol_type
            )
            for file_path, patch_symbols in patch_symbols_data.items():
                match = find_paths(file_path, path_index)

                # Guard against a lookup that found nothing before reading
                # ``resource_ids``, so that one unmatched file does not abort
                # the processing of the whole package.
                matched_resources = ()
                if match and match.resource_ids:
                    matched_resources = project.codebaseresources.filter(
                        id__in=match.resource_ids
                    )
                if not matched_resources:
                    logger.warning(
                        "Failed to get the code base resources: %s", file_path
                    )
                    continue

                # These only depend on the patched file, not on the resource.
                vulnerable_symbols = patch_symbols.get("vulnerable_symbols", [])
                fixed_symbols = patch_symbols.get("fixed_symbols", [])

                for resource in matched_resources:
                    # ``or []`` also covers a key stored with a None value.
                    resource_symbols = resource.extra_data.get("source_symbols") or []

                    reachability = calculate_reachability(
                        resource_symbols, vulnerable_symbols, fixed_symbols
                    )
                    resource.update_extra_data(
                        {
                            "vulnerable_symbols": vulnerable_symbols,
                            "fixed_symbols": fixed_symbols,
                            "reachability": reachability,
                        }
                    )
0 commit comments