diff --git a/datashuttle/utils/folders.py b/datashuttle/utils/folders.py index 4cb98319c..2db3c773d 100644 --- a/datashuttle/utils/folders.py +++ b/datashuttle/utils/folders.py @@ -571,7 +571,12 @@ def search_for_folders( verbose: bool = True, return_full_path: bool = False, ) -> Tuple[List[Any], List[Any]]: - """Determine the method used to search for search prefix folders in the search path. + """Search for files and folders in the search path. + + If searching the local filesystem, use a separate function that does not call `rclone`, + which is faster as a new process does not have to be created. This slowness + is mostly a problem on Windows. This does mean there are two duplicate functions, + these are tested against each-other in unit tests. Parameters ---------- @@ -602,34 +607,72 @@ def search_for_folders( if ( local_or_central == "local" or cfg["connection_method"] == "local_filesystem" - ) and not search_path.exists(): - if verbose: - utils.log_and_message(f"No file found at {search_path.as_posix()}") - return [], [] + ): + if not search_path.exists(): + if verbose: + utils.log_and_message( + f"No file found at {search_path.as_posix()}" + ) + return [], [] - if local_or_central == "local": - rclone_config_name = None - else: - rclone_config_name = cfg.get_rclone_config_name( - cfg["connection_method"] + all_folder_names, all_filenames = search_local_filesystem( + search_path, search_prefix, return_full_path ) - all_folder_names, all_filenames = search_local_or_remote( - search_path, - search_prefix, - rclone_config_name, - return_full_path, - ) + else: + all_folder_names, all_filenames = search_central_via_connection( + cfg, + search_path, + search_prefix, + return_full_path, + ) return all_folder_names, all_filenames -def search_local_or_remote( +def search_local_filesystem( + search_path: Path, search_prefix: str, return_full_path: bool = False +) -> tuple[List[Any], List[Any]]: + """Search local filesystem recursively. + + Partner function to `search_central_via_connection()` for use + locally as is much faster than calling `rclone` through a new process. + + Parameters + ---------- + search_path + The path to search (relative to the local or remote drive). For example, + for "local_filesystem" this is the path on the local machine. For any other + connection to central, this is the path on the central storage that has been + connected to. + + search_prefix + The search string e.g. "sub-*". + + return_full_path + If `True`, return the full filepath, otherwise return only the folder/file name. + + """ + all_folder_names = [] + all_filenames = [] + + for item in search_path.glob(search_prefix): + to_append = item if return_full_path else item.name + + if item.is_dir(): + all_folder_names.append(to_append) + elif item.is_file(): + all_filenames.append(to_append) + + return sorted(all_folder_names), sorted(all_filenames) + + +def search_central_via_connection( + cfg: Configs, search_path: Path, search_prefix: str, - rclone_config_name: str | None, return_full_path: bool = False, -) -> Tuple[List[Any], List[Any]]: +) -> tuple[List[Any], List[Any]]: """Search for files and folders in central path using `rclone lsjson` command. This command lists all the files and folders in the central path in a json format. @@ -637,29 +680,30 @@ def search_local_or_remote( Parameters ---------- + cfg + datashuttle Configs class. search_path The path to search (relative to the local or remote drive). For example, - for "local_filesystem" this is the path on the local machine. For "ssh", this - is the path on the machine that has been connected to. + for "local_filesystem" this is the path on the local machine. For any other + connection to central, this is the path on the central storage that has been + connected to. + search_prefix The search string e.g. "sub-*". - rclone_config_name - Name of the rclone config for the remote (not set for local). `rclone config` - can be used in the terminal to see how rclone has stored these. In datashuttle, - these are managed by `Configs`. + return_full_path If `True`, return the full filepath, otherwise return only the folder/file name. """ - config_prefix = "" if not rclone_config_name else f"{rclone_config_name}:" + rclone_config_name = cfg.get_rclone_config_name(cfg["connection_method"]) output = rclone.call_rclone( - f'lsjson {config_prefix}"{search_path.as_posix()}"', + f'lsjson {rclone_config_name}:"{search_path.as_posix()}"', pipe_std=True, ) - all_folder_names: List[str] = [] - all_filenames: List[str] = [] + all_folder_names: list = [] + all_filenames: list = [] if output.returncode != 0: utils.log_and_message( @@ -685,4 +729,4 @@ def search_local_or_remote( else: all_filenames.append(to_append) - return all_folder_names, all_filenames + return sorted(all_folder_names), sorted(all_filenames) diff --git a/tests/tests_integration/test_search_methods.py b/tests/tests_integration/test_search_methods.py new file mode 100644 index 000000000..a53953146 --- /dev/null +++ b/tests/tests_integration/test_search_methods.py @@ -0,0 +1,103 @@ +from pathlib import Path + +import pytest + +from datashuttle.utils.folders import ( + search_central_via_connection, + search_local_filesystem, +) +from datashuttle.utils.rclone import call_rclone + +from .. import test_utils +from ..base import BaseTest + +# ----------------------------------------------------------------------------- +# Inconsistent sub or ses value lengths +# ----------------------------------------------------------------------------- + + +class TestSubSesSearches(BaseTest): + @pytest.mark.parametrize("return_full_path", [True, False]) + def test_local_vs_central_search_methods( + self, project, monkeypatch, return_full_path + ): + """ + Test the `search_local_filesystem` and `search_central_via_connection` + functions. These functions should have the same outputs but `search_local_filesystem` + is used for local filesystem for speed. Here we check the outputs of these + functions match. + + These functions are individually tested in many places, primarily in + transfer tests. Local filesystem transfer tests are very extensive, + because they are quicker, while central connection tests are less + thorough. We test the outputs of these two functions directly + under a range of test cases to ensure they are matched under many conditions. + + """ + central_path = project.get_central_path() + + # Create a project of folders and files + # fmt: off + paths_to_make = [] + for i in range(1, 4): + paths_to_make.append(Path(f"rawdata/sub-00{i}/ses-001/behav")) + paths_to_make.append(Path(f"rawdata/sub-00{i}/ses-002_date-20250402/behav")) + paths_to_make.append(Path(f"rawdata/sub-00{i}/ses-002_date-20250402/anat")) + + paths_to_make.append(Path("rawdata/sub-003_condition-test/ses-001_hello-world/ephys")) + paths_to_make.append(Path("rawdata/sub-004_condition-test/ses-002_hello-world/funcimg")) + + for path_ in paths_to_make: + (central_path / path_).mkdir(parents=True) + test_utils.write_file(central_path / path_ / f"{path_.name}_file.md", contents="hello_world",) + test_utils.write_file(central_path / path_.parent / f"{path_.name}.md", contents="hello_world",) + test_utils.write_file(central_path / path_.parent.parent / f"{path_.parent.name}.md", contents="hello_world",) + test_utils.write_file(central_path / path_.parent.parent.parent / f"{path_.parent.parent.name}.md", contents="hello_world",) + # fmt: on + + # Monkeycatch `get_rclone_config_name` to return `local` and set `local` + # as a rclone config entry associated with the local filesystem. By this + # method we can hijack `search_central_via_connection` to run locally + # (though it is set up in practice to run via ssh, gdrive or aws). + monkeypatch.setattr( + project.cfg, + "get_rclone_config_name", + lambda connection_method: "local", + ) + + call_rclone("config create local local nounc true") + + # Perform a range of checks across folders and files + # and check the outputs of both approaches match. + # fmt: off + for search_path, search_str in ( + (central_path / "rawdata", "*"), + (central_path / "rawdata", "sub-*"), + (central_path / "rawdata" / "sub-003_condition-test", "ses-*"), + (central_path / "rawdata/sub-001/ses-002_date-20250402", "behav"), + (central_path / "rawdata/sub-001/ses-002_date-20250402/behav", "behav_file.md",), + (central_path / "rawdata/sub-001/ses-002_date-20250402/behav", "*",), + (central_path / "rawdata/sub-002", "*"), + (central_path / "rawdata/sub-002/ses-002_date-20250402", "*"), + (central_path / "rawdata", "sub-003*"), + ): + # fmt: on + + central_method_folders, central_method_files = ( + search_central_via_connection( + project.cfg, + search_path, + search_str, + return_full_path=return_full_path, + ) + ) + local_method_folders, local_method_files = search_local_filesystem( + search_path, search_str, return_full_path=return_full_path + ) + + assert central_method_folders == local_method_folders, ( + f"Failed folders, search_str: {search_str}, search_path: {search_path}" + ) + assert central_method_files == local_method_files, ( + f"Failed files, search_str: {search_str}, search_path: {search_path}" + )