Skip to content

Commit 30ddcbe

Browse files
Do not walk the codebase outside input paths
For multiple inputs, do not walk the whole tree from their common prefix, as that walks outside the input paths. Instead, create only the directory relationships between the common prefix and the input paths, and start the codebase walk from every input path. Also deprecate the --include option, so that only ignoring paths through path patterns is supported. Signed-off-by: Ayan Sinha Mahapatra <asmahapatra@aboutcode.org>
1 parent a1c1a0e commit 30ddcbe

5 files changed

Lines changed: 83 additions & 49 deletions

File tree

src/commoncode/resource.py

Lines changed: 61 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@
6363

6464
# Tracing flags
6565
TRACE = False
66-
TRACE_DEEP = True
66+
TRACE_DEEP = False
6767

6868

6969
def logger_debug(*args):
@@ -110,7 +110,7 @@ def skip_ignored(location):
110110
return is_special(location) or ignored(location)
111111

112112

113-
def is_ignored(location, includes=None, excludes=None):
113+
def is_ignored(location, includes=tuple(), excludes=tuple()):
114114

115115
excludes = {
116116
pattern: 'User ignore: Supplied by --ignore' for pattern in excludes
@@ -339,8 +339,8 @@ def __init__(
339339

340340
# finally populate
341341
self.paths = self._prepare_clean_paths(paths)
342+
self.includes = self._prepare_clean_paths(includes)
342343
self.ignores = ignores
343-
self.includes = includes
344344
self._populate()
345345

346346
def _prepare_clean_paths(self, paths=tuple()):
@@ -593,30 +593,48 @@ def err(_error):
593593
f"ERROR: cannot populate codebase: {_error}\n{traceback.format_exc()}"
594594
)
595595

596-
skip_ignored = partial(is_ignored, includes=includes, excludes=ignores)
596+
# ignore creating resources based on path patterns
597+
skip_ignored = partial(is_ignored, excludes=ignores)
597598

598599
if TRACE_DEEP:
599600
logger_debug(f"parents_by_loc: {parents_by_loc}, ignores: {ignores}, includes: {includes}")
600601

601-
# Walk over the directory and build the resource tree
602-
for top, dirs, files in depth_walk(
603-
root_location=root.location,
604-
skip_ignored=skip_ignored,
605-
max_depth=self.max_depth,
606-
error_handler=err,
607-
):
608-
parent = parents_by_loc.pop(top)
609-
for created in self._create_resources(
610-
parent=parent,
611-
top=top,
612-
dirs=dirs,
613-
files=files,
614-
skip_ignored=skip_ignored,
602+
# in the case of a single input location, walking starts from
603+
# the root and only the root location
604+
if not includes:
605+
includes = [root.location]
606+
else:
607+
# create the directory resources between the common
608+
# prefix and the included locations so that they are
609+
# connected to the root
610+
for created in self._create_resources_common_prefix_to_inputs(
611+
root=root,
612+
includes=includes,
615613
):
616-
# on the plain, bare FS, files cannot be parents
617614
if not created.is_file:
618615
parents_by_loc[created.location] = created
619616

617+
# we start walking through all the input locations
618+
for included_location in includes:
619+
# Walk over the directory and build the resource tree
620+
for top, dirs, files in depth_walk(
621+
root_location=included_location,
622+
skip_ignored=skip_ignored,
623+
max_depth=self.max_depth,
624+
error_handler=err,
625+
):
626+
parent = parents_by_loc.pop(top)
627+
for created in self._create_resources(
628+
parent=parent,
629+
top=top,
630+
dirs=dirs,
631+
files=files,
632+
skip_ignored=skip_ignored,
633+
):
634+
# on the plain, bare FS, files cannot be parents
635+
if not created.is_file:
636+
parents_by_loc[created.location] = created
637+
620638
def _create_resources(self, parent, top, dirs, files, skip_ignored=skip_ignored):
621639
"""
622640
Create and yield ``files`` and ``dirs`` children Resources of a
@@ -641,6 +659,28 @@ def _create_resources(self, parent, top, dirs, files, skip_ignored=skip_ignored)
641659
logger_debug("Codebase.create_resources:", res)
642660
yield res
643661

662+
def _create_resources_common_prefix_to_inputs(self, root, includes):
663+
664+
if TRACE_DEEP:
665+
logger_debug(f"_create_resources_common_prefix_to_inputs: root:{root.location}, includes: {includes}")
666+
667+
for included_path in includes:
668+
_, _, extra_dir_path = included_path.rpartition(root.location)
669+
extra_dirs = extra_dir_path.strip("/").split("/")
670+
if TRACE_DEEP:
671+
logger_debug(f"_create_resources_common_prefix_to_inputs: root:{root.location}, includes: {includes}")
672+
673+
dir_resource = root
674+
for dir_segment in extra_dirs:
675+
dir_resource = self._get_or_create_resource(
676+
name=dir_segment,
677+
parent=dir_resource,
678+
is_file=False,
679+
)
680+
if TRACE:
681+
logger_debug("Codebase.create_resources:", dir_resource)
682+
yield dir_resource
683+
644684
def _create_root_resource(self):
645685
"""
646686
Create and return the root Resource of this codebase.
@@ -1606,8 +1646,8 @@ def clean_path(path):
16061646
Return a cleaned and normalized POSIX ``path``.
16071647
"""
16081648
path = path or ""
1609-
# convert to posix and ensure we have no slash at both ends
1610-
path = posixpath_normpath(path.replace("\\", "/").strip("/"))
1649+
# convert to posix and ensure we have no slash at the end
1650+
path = posixpath_normpath(path.replace("\\", "/").rstrip("/"))
16111651
if path == ".":
16121652
path = ""
16131653
return path

src/scancode/cli.py

Lines changed: 10 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -221,16 +221,6 @@ def default_processes():
221221
callback=validate_input_path,
222222
type=click.Path(exists=True, readable=True, path_type=str))
223223

224-
@click.option('--include',
225-
multiple=True,
226-
default=None,
227-
metavar='<pattern>',
228-
help='Include files matching <pattern>.',
229-
sort_order=11,
230-
help_group=cliutils.CORE_GROUP,
231-
cls=PluggableCommandLineOption,
232-
)
233-
234224
@click.option('--ignore',
235225
multiple=True,
236226
default=None,
@@ -415,7 +405,6 @@ def default_processes():
415405
def scancode(
416406
ctx,
417407
input, # NOQA
418-
include,
419408
ignore,
420409
strip_root,
421410
full_root,
@@ -527,7 +516,6 @@ def scancode(
527516
# run proper
528517
success, _results = run_scan(
529518
input=input,
530-
include=include,
531519
ignore=ignore,
532520
from_json=from_json,
533521
strip_root=strip_root,
@@ -570,7 +558,6 @@ def scancode(
570558

571559
def run_scan(
572560
input, #
573-
include=[],
574561
ignore=[],
575562
from_json=False,
576563
strip_root=False,
@@ -623,6 +610,9 @@ def echo_func(*_args, **_kwargs):
623610
msg = 'At least one input path is required.'
624611
raise ScancodeError(msg)
625612

613+
# To support multiple path inputs
614+
include = []
615+
626616
if not isinstance(input, (list, tuple)):
627617
if not isinstance(input, str):
628618
msg = 'Unknown <input> format: "{}".'.format(repr(input))
@@ -637,8 +627,6 @@ def echo_func(*_args, **_kwargs):
637627
# VirtualCodebase; otherwise we have to process `input` to make it a single
638628
# root with excludes.
639629
elif not from_json:
640-
# FIXME: support the multiple root better. This is quirky at best
641-
642630
# This is the case where we have a list of input path and the
643631
# `from_json` option is not selected: we can handle this IFF they share
644632
# a common root directory and none is an absolute path
@@ -650,13 +638,15 @@ def echo_func(*_args, **_kwargs):
650638
)
651639
raise ScancodeError(msg)
652640

641+
abs_input = [os.path.abspath(i) for i in input]
642+
653643
# find the common prefix directory (note that this is a pre string
654644
# operation hence it may return non-existing paths
655-
common_prefix = os.path.commonprefix(input)
645+
common_prefix = os.path.commonprefix(abs_input)
656646

657647
if not common_prefix:
658648
# we have no common prefix, but all relative. therefore the
659-
# parent/root is the current ddirectory
649+
# parent/root is the current directory
660650
common_prefix = str('.')
661651

662652
elif not os.path.isdir(common_prefix):
@@ -667,13 +657,9 @@ def echo_func(*_args, **_kwargs):
667657

668658
raise ScancodeError(msg)
669659

670-
# and we craft a list of synthetic --include path pattern options from
671-
# the input list of paths
672-
included_paths = [as_posixpath(path).rstrip('/') for path in input]
673-
include.extend(included_paths)
674-
675-
# ... and use the common prefix as our new input
676-
# FIXME: we should not walk outside inputs
660+
# and we craft a list of include paths where the codebase walks
661+
# will start from, even though the root is the common prefix
662+
include = [as_posixpath(path).rstrip('/') for path in abs_input]
677663
input = common_prefix # NOQA
678664

679665
# build mappings of all options to pass down to plugins

tests/commoncode/test_resource.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -353,7 +353,7 @@ def test_get_resource_for_multiple_resource_codebase(self):
353353

354354
codebase = Codebase(test_codebase)
355355
assert codebase.get_resource("resource/a").path == "resource/a"
356-
assert codebase.get_resource("/resource/c").path == "resource/c"
356+
assert codebase.get_resource("resource/c").path == "resource/c"
357357
assert codebase.get_resource("resource/dsasda/../b/").path == "resource/b"
358358

359359
def test_Resource_build_path(self):

tests/scancode/data/help/help_linux.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,6 @@ Options:
144144
-n, --processes INT Set the number of parallel processes to use. Disable
145145
parallel processing if 0. Also disable threading if
146146
-1. [default: (number of CPUs)-1]
147-
--include <pattern> Include files matching <pattern>.
148147
-q, --quiet Do not print summary or progress.
149148
-v, --verbose Print progress as file-by-file path instead of a
150149
progress bar. Print verbose scan counters.

tests/scancode/test_cli.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,7 @@ def test_scan_info_returns_full_root():
168168
file_paths = [f['path'] for f in result_data['files']]
169169
assert len(file_paths) == 12
170170
# note that we strip paths from leading and trailing slashes
171-
root = fileutils.as_posixpath(test_dir).strip('/')
171+
root = fileutils.as_posixpath(test_dir)
172172
assert all(p.startswith(root) for p in file_paths)
173173

174174

@@ -184,7 +184,7 @@ def test_scan_info_returns_correct_full_root_with_single_file():
184184
scanned_file = files[0]
185185
# and we check that the path is the full path without repeating the file name
186186
# note that the path never contain leading and trailing slashes
187-
assert scanned_file['path'] == fileutils.as_posixpath(test_file).strip('/')
187+
assert scanned_file['path'] == fileutils.as_posixpath(test_file)
188188

189189

190190
def test_scan_info_returns_does_not_strip_root_with_single_file():
@@ -837,6 +837,15 @@ def test_scan_should_not_fail_with_low_max_in_memory_setting_when_ignoring_files
837837
run_scan_click(args, expected_rc=0)
838838

839839

840+
def test_scan_supports_multiple_input_paths():
841+
test_file_1 = test_env.get_test_loc('summaries/client')
842+
test_file_2 = test_env.get_test_loc('summaries/counts')
843+
result_file = test_env.get_temp_file('json')
844+
args = ['--info', '-n', '1', test_file_1, test_file_2, '--json', result_file]
845+
run_scan_click(args, expected_rc=0)
846+
847+
848+
840849
def test_get_displayable_summary():
841850
from scancode.cli import get_displayable_summary
842851
from commoncode.resource import Codebase

0 commit comments

Comments
 (0)