From bc9cb9d92b6d5b69ab02c49a9b3ff6e9a190396c Mon Sep 17 00:00:00 2001 From: JoeZiminski Date: Mon, 4 Aug 2025 21:55:49 +0100 Subject: [PATCH 01/10] Revert lsjson for local filesystem searches. --- datashuttle/utils/folders.py | 95 +++++++++++++++++++++++++----------- 1 file changed, 66 insertions(+), 29 deletions(-) diff --git a/datashuttle/utils/folders.py b/datashuttle/utils/folders.py index 4cb98319c..a840036a8 100644 --- a/datashuttle/utils/folders.py +++ b/datashuttle/utils/folders.py @@ -573,6 +573,10 @@ def search_for_folders( ) -> Tuple[List[Any], List[Any]]: """Determine the method used to search for search prefix folders in the search path. + If local filesystem, use a bespoke function because it is faster. `rclone` + can be used to search local filesystem, but it is slow because a new process + must be created. These functions are tested against each-other. + Parameters ---------- cfg @@ -602,34 +606,69 @@ def search_for_folders( if ( local_or_central == "local" or cfg["connection_method"] == "local_filesystem" - ) and not search_path.exists(): - if verbose: - utils.log_and_message(f"No file found at {search_path.as_posix()}") - return [], [] + ): + if not search_path.exists(): + if verbose: + utils.log_and_message( + f"No file found at {search_path.as_posix()}" + ) + return [], [] + + all_folder_names, all_filenames = search_local_filesystem( + search_path, search_prefix, return_full_path + ) - if local_or_central == "local": - rclone_config_name = None else: - rclone_config_name = cfg.get_rclone_config_name( - cfg["connection_method"] + all_folder_names, all_filenames = search_central_via_connection( + cfg, + search_path, + search_prefix, + return_full_path, ) - all_folder_names, all_filenames = search_local_or_remote( - search_path, - search_prefix, - rclone_config_name, - return_full_path, - ) + return all_folder_names, all_filenames + + +def search_local_filesystem( + search_path: Path, search_prefix: str, return_full_path: bool = False +) -> tuple[List[Any], List[Any]]: + """Search local filesystem recursively. + + Partner function to `search_central_via_connection()` for use + locally as is much faster than calling `rclone` through a new process. + + Parameters + ---------- + search_path + The path to search (relative to the local or remote drive). For example, + for "local_filesystem" this is the path on the local machine. For "ssh", this + is the path on the machine that has been connected to. + search_prefix + The search string e.g. "sub-*". + return_full_path + If `True`, return the full filepath, otherwise return only the folder/file name. + + """ + all_folder_names = [] + all_filenames = [] + + for item in search_path.glob(search_prefix): + to_append = item if return_full_path else item.name + + if item.is_dir(): + all_folder_names.append(to_append) + elif item.is_file(): + all_filenames.append(to_append) return all_folder_names, all_filenames -def search_local_or_remote( +def search_central_via_connection( + cfg: Configs, search_path: Path, search_prefix: str, - rclone_config_name: str | None, return_full_path: bool = False, -) -> Tuple[List[Any], List[Any]]: +) -> tuple[List[Any], List[Any]]: """Search for files and folders in central path using `rclone lsjson` command. This command lists all the files and folders in the central path in a json format. @@ -637,39 +676,37 @@ def search_local_or_remote( Parameters ---------- + cfg + datashuttle Configs class. search_path The path to search (relative to the local or remote drive). For example, for "local_filesystem" this is the path on the local machine. For "ssh", this is the path on the machine that has been connected to. search_prefix The search string e.g. "sub-*". - rclone_config_name - Name of the rclone config for the remote (not set for local). `rclone config` - can be used in the terminal to see how rclone has stored these. In datashuttle, - these are managed by `Configs`. return_full_path If `True`, return the full filepath, otherwise return only the folder/file name. """ - config_prefix = "" if not rclone_config_name else f"{rclone_config_name}:" + rclone_config_name = cfg.get_rclone_config_name(cfg["connection_method"]) output = rclone.call_rclone( - f'lsjson {config_prefix}"{search_path.as_posix()}"', + f'lsjson {rclone_config_name}:"{search_path.as_posix()}"', pipe_std=True, ) - all_folder_names: List[str] = [] - all_filenames: List[str] = [] - if output.returncode != 0: - utils.log_and_message( + utils.log_and_raise_error( f"Error searching files at {search_path.as_posix()}\n" - f"{output.stderr.decode('utf-8') if output.stderr else ''}" + f"{output.stderr.decode('utf-8') if output.stderr else ''}", + RuntimeError, ) - return all_folder_names, all_filenames files_and_folders = json.loads(output.stdout) + all_folder_names = [] + all_filenames = [] + for file_or_folder in files_and_folders: name = file_or_folder["Name"] From c39317343f8626faa985f24caba161cd44ad977d Mon Sep 17 00:00:00 2001 From: JoeZiminski Date: Tue, 5 Aug 2025 01:27:56 +0100 Subject: [PATCH 02/10] working on tests. --- .../tests_integration/test_search_methods.py | 76 +++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 tests/tests_integration/test_search_methods.py diff --git a/tests/tests_integration/test_search_methods.py b/tests/tests_integration/test_search_methods.py new file mode 100644 index 000000000..2cef381b0 --- /dev/null +++ b/tests/tests_integration/test_search_methods.py @@ -0,0 +1,76 @@ +from pathlib import Path + +from .. import test_utils +from ..base import BaseTest + +# ----------------------------------------------------------------------------- +# Inconsistent sub or ses value lengths +# ----------------------------------------------------------------------------- + + +class TestSubSesSearches(BaseTest): + def test_local_vs_central_search_methods(self, project, monkeypatch): + """ """ + central_path = project.get_central_path() + + paths_to_make = [] + for i in range(1, 4): + paths_to_make.append(Path(f"rawdata/sub-00{i}/ses-001/behav")) + paths_to_make.append( + Path(f"rawdata/sub-00{i}/ses-002_date-20250402/behav") + ) + paths_to_make.append( + Path(f"rawdata/sub-00{i}/ses-002_date-20250402/anat") + ) + + paths_to_make.append( + Path("rawdata/sub-003_condition-test/ses-001_hello-world/ephys") + ) + paths_to_make.append( + Path("rawdata/sub-004_condition-test/ses-002_hello-world/funcimg") + ) + + for path_ in paths_to_make: + (central_path / path_).mkdir(parents=True) + breakpoint() + test_utils.write_file( + central_path / path_ / f"{path_.name}.md", + contents="hello_world", + ) + test_utils.write_file( + central_path / path_.parent / f"{path_.parent.name}.md", + contents="hello_world", + ) + test_utils.write_file( + central_path + / path_.parent.parent + / f"{path_.parent.parent.name}.md", + contents="hello_world", + ) + + from datashuttle.utils.folders import ( + search_central_via_connection, + ) + + # -- monkeypatch cfg.get_rclone_config to return dummy config + monkeypatch.setattr( + project.cfg, + "get_rclone_config_name", + lambda connection_method: "local", + ) + + import subprocess + + subprocess.run( + ["rclone", "config", "create", "local", "local", "nounc", "true"], + shell=True, + ) + + hello, world = search_central_via_connection( + project.cfg, + central_path / "rawdata", + "sub-*", + return_full_path=True, + ) + + breakpoint() From ae944191d2539522be6e1259a6f7db5d0feeb7c5 Mon Sep 17 00:00:00 2001 From: JoeZiminski Date: Wed, 6 Aug 2025 00:38:52 +0100 Subject: [PATCH 03/10] Still working on test. --- .../tests_integration/test_search_methods.py | 87 +++++++++++-------- 1 file changed, 52 insertions(+), 35 deletions(-) diff --git a/tests/tests_integration/test_search_methods.py b/tests/tests_integration/test_search_methods.py index 2cef381b0..87af6a045 100644 --- a/tests/tests_integration/test_search_methods.py +++ b/tests/tests_integration/test_search_methods.py @@ -1,5 +1,7 @@ from pathlib import Path +import pytest + from .. import test_utils from ..base import BaseTest @@ -9,47 +11,35 @@ class TestSubSesSearches(BaseTest): - def test_local_vs_central_search_methods(self, project, monkeypatch): + # TODO: return full path + @pytest.mark.parametrize("return_full_path", [True, False]) + def test_local_vs_central_search_methods( + self, project, monkeypatch, return_full_path + ): """ """ central_path = project.get_central_path() + # fmt: off paths_to_make = [] for i in range(1, 4): paths_to_make.append(Path(f"rawdata/sub-00{i}/ses-001/behav")) - paths_to_make.append( - Path(f"rawdata/sub-00{i}/ses-002_date-20250402/behav") - ) - paths_to_make.append( - Path(f"rawdata/sub-00{i}/ses-002_date-20250402/anat") - ) + paths_to_make.append(Path(f"rawdata/sub-00{i}/ses-002_date-20250402/behav")) + paths_to_make.append(Path(f"rawdata/sub-00{i}/ses-002_date-20250402/anat")) - paths_to_make.append( - Path("rawdata/sub-003_condition-test/ses-001_hello-world/ephys") - ) - paths_to_make.append( - Path("rawdata/sub-004_condition-test/ses-002_hello-world/funcimg") - ) + paths_to_make.append(Path("rawdata/sub-003_condition-test/ses-001_hello-world/ephys")) + paths_to_make.append(Path("rawdata/sub-004_condition-test/ses-002_hello-world/funcimg")) for path_ in paths_to_make: (central_path / path_).mkdir(parents=True) - breakpoint() - test_utils.write_file( - central_path / path_ / f"{path_.name}.md", - contents="hello_world", - ) - test_utils.write_file( - central_path / path_.parent / f"{path_.parent.name}.md", - contents="hello_world", - ) - test_utils.write_file( - central_path - / path_.parent.parent - / f"{path_.parent.parent.name}.md", - contents="hello_world", - ) + test_utils.write_file(central_path / path_ / f"{path_.name}_file.md", contents="hello_world",) + test_utils.write_file(central_path / path_.parent / f"{path_.name}.md", contents="hello_world",) + test_utils.write_file(central_path / path_.parent.parent / f"{path_.parent.name}.md", contents="hello_world",) + test_utils.write_file(central_path / path_.parent.parent.parent / f"{path_.parent.parent.name}.md", contents="hello_world",) + # fmt: on from datashuttle.utils.folders import ( search_central_via_connection, + search_local_filesystem, ) # -- monkeypatch cfg.get_rclone_config to return dummy config @@ -66,11 +56,38 @@ def test_local_vs_central_search_methods(self, project, monkeypatch): shell=True, ) - hello, world = search_central_via_connection( - project.cfg, - central_path / "rawdata", - "sub-*", - return_full_path=True, - ) + for search_path, search_str in ( + (central_path / "rawdata", "*"), + (central_path / "rawdata", "sub-*"), + (central_path / "rawdata" / "sub-003_condition-test", "ses-*"), + (central_path / "rawdata/sub-001/ses-002_date-20250402", "behav"), + ( + central_path / "rawdata/sub-001/ses-002_date-20250402/behav", + "behav_file.md", + ), # ses-002_date-20250402 + ( + central_path / "rawdata/sub-001/ses-002_date-20250402/behav", + "*", + ), + (central_path / "rawdata/sub-002", "*"), + (central_path / "rawdata/sub-002/ses-002_date-20250402", "*"), + (central_path / "rawdata", "sub-003*"), + ): + central_method_folders, central_method_files = ( + search_central_via_connection( + project.cfg, + search_path, + search_str, + return_full_path=return_full_path, + ) + ) + local_method_folders, local_method_files = search_local_filesystem( + search_path, search_str, return_full_path=return_full_path + ) - breakpoint() + assert central_method_folders == local_method_folders, ( + f"Failed folders, search_str: {search_str}, search_path: {search_path}" + ) + assert central_method_files == local_method_files, ( + f"Failed files, search_str: {search_str}, search_path: {search_path}" + ) From 0a83429ac407080ae7a802f2439498da75a81e30 Mon Sep 17 00:00:00 2001 From: JoeZiminski Date: Wed, 6 Aug 2025 01:27:04 +0100 Subject: [PATCH 04/10] Document tests. --- .../tests_integration/test_search_methods.py | 48 ++++++++++++------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/tests/tests_integration/test_search_methods.py b/tests/tests_integration/test_search_methods.py index 87af6a045..18933abb4 100644 --- a/tests/tests_integration/test_search_methods.py +++ b/tests/tests_integration/test_search_methods.py @@ -1,7 +1,13 @@ +import subprocess from pathlib import Path import pytest +from datashuttle.utils.folders import ( + search_central_via_connection, + search_local_filesystem, +) + from .. import test_utils from ..base import BaseTest @@ -11,14 +17,26 @@ class TestSubSesSearches(BaseTest): - # TODO: return full path @pytest.mark.parametrize("return_full_path", [True, False]) def test_local_vs_central_search_methods( self, project, monkeypatch, return_full_path ): - """ """ + """ + Test the `search_local_filesystem` and `search_central_via_connection` + functions. These functions perform the same function but `search_local_filesystem` + is used for local filesystem for speed. Here we check the outputs of these + functions match. + + These functions are individually tested in many places, primarily in + transfer tests. Local filesystem transfer tests are very extensive, + because they are quicker, while central connection tests are less + thorough. We test the outputs of these two functions directly + under a range of test cases to ensure they are matched under many conditions. + + """ central_path = project.get_central_path() + # Create a set of folders and files # fmt: off paths_to_make = [] for i in range(1, 4): @@ -37,42 +55,36 @@ def test_local_vs_central_search_methods( test_utils.write_file(central_path / path_.parent.parent.parent / f"{path_.parent.parent.name}.md", contents="hello_world",) # fmt: on - from datashuttle.utils.folders import ( - search_central_via_connection, - search_local_filesystem, - ) - - # -- monkeypatch cfg.get_rclone_config to return dummy config + # Monkeycatch `get_rclone_config_name` to return `local` and set `local` + # as a rclone config entry associated with the local filesystem. By this + # method we can hijack `search_central_via_connection` to run locally + # (though it is set up in practice to run via ssh, gdrive or aws). monkeypatch.setattr( project.cfg, "get_rclone_config_name", lambda connection_method: "local", ) - import subprocess - subprocess.run( ["rclone", "config", "create", "local", "local", "nounc", "true"], shell=True, ) + # Perform a range of checks across folders and files + # and check the outputs of both approaches match. + # fmt: off for search_path, search_str in ( (central_path / "rawdata", "*"), (central_path / "rawdata", "sub-*"), (central_path / "rawdata" / "sub-003_condition-test", "ses-*"), (central_path / "rawdata/sub-001/ses-002_date-20250402", "behav"), - ( - central_path / "rawdata/sub-001/ses-002_date-20250402/behav", - "behav_file.md", - ), # ses-002_date-20250402 - ( - central_path / "rawdata/sub-001/ses-002_date-20250402/behav", - "*", - ), + (central_path / "rawdata/sub-001/ses-002_date-20250402/behav", "behav_file.md",), + (central_path / "rawdata/sub-001/ses-002_date-20250402/behav", "*",), (central_path / "rawdata/sub-002", "*"), (central_path / "rawdata/sub-002/ses-002_date-20250402", "*"), (central_path / "rawdata", "sub-003*"), ): + # fmt: on central_method_folders, central_method_files = ( search_central_via_connection( project.cfg, From 6d7f92b790355fc85dc1ac1e9beacf19fc0b756a Mon Sep 17 00:00:00 2001 From: JoeZiminski Date: Wed, 6 Aug 2025 01:52:12 +0100 Subject: [PATCH 05/10] Fix tests. --- datashuttle/utils/folders.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/datashuttle/utils/folders.py b/datashuttle/utils/folders.py index a840036a8..c80da2c3d 100644 --- a/datashuttle/utils/folders.py +++ b/datashuttle/utils/folders.py @@ -695,18 +695,18 @@ def search_central_via_connection( pipe_std=True, ) + all_folder_names: list = [] + all_filenames: list = [] + if output.returncode != 0: - utils.log_and_raise_error( + utils.log_and_message( f"Error searching files at {search_path.as_posix()}\n" - f"{output.stderr.decode('utf-8') if output.stderr else ''}", - RuntimeError, + f"{output.stderr.decode('utf-8') if output.stderr else ''}" ) + return all_folder_names, all_filenames files_and_folders = json.loads(output.stdout) - all_folder_names = [] - all_filenames = [] - for file_or_folder in files_and_folders: name = file_or_folder["Name"] From d7472a9e8a349b27189bdeb8cdc9771a76a50ec0 Mon Sep 17 00:00:00 2001 From: JoeZiminski Date: Wed, 6 Aug 2025 16:15:58 +0100 Subject: [PATCH 06/10] Try sorting outputs to fix tests. --- datashuttle/utils/folders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datashuttle/utils/folders.py b/datashuttle/utils/folders.py index c80da2c3d..8c3fce233 100644 --- a/datashuttle/utils/folders.py +++ b/datashuttle/utils/folders.py @@ -660,7 +660,7 @@ def search_local_filesystem( elif item.is_file(): all_filenames.append(to_append) - return all_folder_names, all_filenames + return sorted(all_folder_names), sorted(all_filenames) def search_central_via_connection( From 35b553434371650834ede7bc2bfb276e9f76b4f7 Mon Sep 17 00:00:00 2001 From: JoeZiminski Date: Wed, 6 Aug 2025 17:28:07 +0100 Subject: [PATCH 07/10] Sort all outputs for complete cross-platform equivalence. --- datashuttle/utils/folders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datashuttle/utils/folders.py b/datashuttle/utils/folders.py index 8c3fce233..203e77917 100644 --- a/datashuttle/utils/folders.py +++ b/datashuttle/utils/folders.py @@ -722,4 +722,4 @@ def search_central_via_connection( else: all_filenames.append(to_append) - return all_folder_names, all_filenames + return sorted(all_folder_names), sorted(all_filenames) From c42dd418b40f5087a46e78e86c9af4826935d752 Mon Sep 17 00:00:00 2001 From: JoeZiminski Date: Thu, 7 Aug 2025 15:25:59 +0100 Subject: [PATCH 08/10] Fix tests on linux. --- tests/tests_integration/test_search_methods.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/tests_integration/test_search_methods.py b/tests/tests_integration/test_search_methods.py index 18933abb4..6cf87bbe2 100644 --- a/tests/tests_integration/test_search_methods.py +++ b/tests/tests_integration/test_search_methods.py @@ -66,8 +66,7 @@ def test_local_vs_central_search_methods( ) subprocess.run( - ["rclone", "config", "create", "local", "local", "nounc", "true"], - shell=True, + "rclone config create local local nounc true", shell=True ) # Perform a range of checks across folders and files From 8bcec2127ea266e26e4a36ce5ebb2df7f99d5ef4 Mon Sep 17 00:00:00 2001 From: JoeZiminski Date: Thu, 21 Aug 2025 10:49:04 +0100 Subject: [PATCH 09/10] Minor changes to docstrings. --- datashuttle/utils/folders.py | 11 +++++++---- tests/tests_integration/test_search_methods.py | 5 +++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/datashuttle/utils/folders.py b/datashuttle/utils/folders.py index 203e77917..c53167299 100644 --- a/datashuttle/utils/folders.py +++ b/datashuttle/utils/folders.py @@ -571,11 +571,12 @@ def search_for_folders( verbose: bool = True, return_full_path: bool = False, ) -> Tuple[List[Any], List[Any]]: - """Determine the method used to search for search prefix folders in the search path. + """Search for files and folders in the search path. - If local filesystem, use a bespoke function because it is faster. `rclone` - can be used to search local filesystem, but it is slow because a new process - must be created. These functions are tested against each-other. + If searching the local filesystem, use a separate function that does not call `rclone`, + which is faster as a new process does not have to be created. This slowness + is mostly a problem on Windows. This does mean there are two duplicate functions, + these are tested against each-other in unit tests. Parameters ---------- @@ -643,8 +644,10 @@ def search_local_filesystem( The path to search (relative to the local or remote drive). For example, for "local_filesystem" this is the path on the local machine. For "ssh", this is the path on the machine that has been connected to. + search_prefix The search string e.g. "sub-*". + return_full_path If `True`, return the full filepath, otherwise return only the folder/file name. diff --git a/tests/tests_integration/test_search_methods.py b/tests/tests_integration/test_search_methods.py index 6cf87bbe2..cfca5a5a0 100644 --- a/tests/tests_integration/test_search_methods.py +++ b/tests/tests_integration/test_search_methods.py @@ -23,7 +23,7 @@ def test_local_vs_central_search_methods( ): """ Test the `search_local_filesystem` and `search_central_via_connection` - functions. These functions perform the same function but `search_local_filesystem` + functions. These functions should have the same outputs but `search_local_filesystem` is used for local filesystem for speed. Here we check the outputs of these functions match. @@ -36,7 +36,7 @@ def test_local_vs_central_search_methods( """ central_path = project.get_central_path() - # Create a set of folders and files + # Create a project of folders and files # fmt: off paths_to_make = [] for i in range(1, 4): @@ -84,6 +84,7 @@ def test_local_vs_central_search_methods( (central_path / "rawdata", "sub-003*"), ): # fmt: on + central_method_folders, central_method_files = ( search_central_via_connection( project.cfg, From cbd4feebe32a94fca0136282d2143508fd1a2194 Mon Sep 17 00:00:00 2001 From: JoeZiminski Date: Mon, 25 Aug 2025 14:47:52 +0200 Subject: [PATCH 10/10] Apply feedback from review. --- datashuttle/utils/folders.py | 12 ++++++++---- tests/tests_integration/test_search_methods.py | 6 ++---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/datashuttle/utils/folders.py b/datashuttle/utils/folders.py index c53167299..2db3c773d 100644 --- a/datashuttle/utils/folders.py +++ b/datashuttle/utils/folders.py @@ -642,8 +642,9 @@ def search_local_filesystem( ---------- search_path The path to search (relative to the local or remote drive). For example, - for "local_filesystem" this is the path on the local machine. For "ssh", this - is the path on the machine that has been connected to. + for "local_filesystem" this is the path on the local machine. For any other + connection to central, this is the path on the central storage that has been + connected to. search_prefix The search string e.g. "sub-*". @@ -683,10 +684,13 @@ def search_central_via_connection( datashuttle Configs class. search_path The path to search (relative to the local or remote drive). For example, - for "local_filesystem" this is the path on the local machine. For "ssh", this - is the path on the machine that has been connected to. + for "local_filesystem" this is the path on the local machine. For any other + connection to central, this is the path on the central storage that has been + connected to. + search_prefix The search string e.g. "sub-*". + return_full_path If `True`, return the full filepath, otherwise return only the folder/file name. diff --git a/tests/tests_integration/test_search_methods.py b/tests/tests_integration/test_search_methods.py index cfca5a5a0..a53953146 100644 --- a/tests/tests_integration/test_search_methods.py +++ b/tests/tests_integration/test_search_methods.py @@ -1,4 +1,3 @@ -import subprocess from pathlib import Path import pytest @@ -7,6 +6,7 @@ search_central_via_connection, search_local_filesystem, ) +from datashuttle.utils.rclone import call_rclone from .. import test_utils from ..base import BaseTest @@ -65,9 +65,7 @@ def test_local_vs_central_search_methods( lambda connection_method: "local", ) - subprocess.run( - "rclone config create local local nounc true", shell=True - ) + call_rclone("config create local local nounc true") # Perform a range of checks across folders and files # and check the outputs of both approaches match.