Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 68 additions & 28 deletions datashuttle/utils/folders.py
Original file line number Diff line number Diff line change
Expand Up @@ -571,7 +571,12 @@ def search_for_folders(
verbose: bool = True,
return_full_path: bool = False,
) -> Tuple[List[Any], List[Any]]:
"""Determine the method used to search for search prefix folders in the search path.
"""Search for files and folders in the search path.

If searching the local filesystem, use a separate function that does not call `rclone`,
which is faster as a new process does not have to be created. This slowness
is mostly a problem on Windows. This does mean there are two duplicate functions,
these are tested against each-other in unit tests.

Parameters
----------
Expand Down Expand Up @@ -602,64 +607,99 @@ def search_for_folders(
if (
local_or_central == "local"
or cfg["connection_method"] == "local_filesystem"
) and not search_path.exists():
if verbose:
utils.log_and_message(f"No file found at {search_path.as_posix()}")
return [], []
):
if not search_path.exists():
if verbose:
utils.log_and_message(
f"No file found at {search_path.as_posix()}"
)
return [], []

if local_or_central == "local":
rclone_config_name = None
else:
rclone_config_name = cfg.get_rclone_config_name(
cfg["connection_method"]
all_folder_names, all_filenames = search_local_filesystem(
search_path, search_prefix, return_full_path
)

all_folder_names, all_filenames = search_local_or_remote(
search_path,
search_prefix,
rclone_config_name,
return_full_path,
)
else:
all_folder_names, all_filenames = search_central_via_connection(
cfg,
search_path,
search_prefix,
return_full_path,
)

return all_folder_names, all_filenames


def search_local_or_remote(
def search_local_filesystem(
search_path: Path, search_prefix: str, return_full_path: bool = False
) -> tuple[List[Any], List[Any]]:
"""Search local filesystem recursively.

Partner function to `search_central_via_connection()` for use
locally as is much faster than calling `rclone` through a new process.

Parameters
----------
search_path
The path to search (relative to the local or remote drive). For example,
for "local_filesystem" this is the path on the local machine. For "ssh", this
is the path on the machine that has been connected to.
Comment thread
JoeZiminski marked this conversation as resolved.
Outdated

search_prefix
The search string e.g. "sub-*".

return_full_path
If `True`, return the full filepath, otherwise return only the folder/file name.

"""
all_folder_names = []
all_filenames = []

for item in search_path.glob(search_prefix):
to_append = item if return_full_path else item.name

if item.is_dir():
all_folder_names.append(to_append)
elif item.is_file():
all_filenames.append(to_append)

return sorted(all_folder_names), sorted(all_filenames)


def search_central_via_connection(
cfg: Configs,
search_path: Path,
search_prefix: str,
rclone_config_name: str | None,
return_full_path: bool = False,
) -> Tuple[List[Any], List[Any]]:
) -> tuple[List[Any], List[Any]]:
"""Search for files and folders in central path using `rclone lsjson` command.

This command lists all the files and folders in the central path in a json format.
The json contains file/folder info about each file/folder like name, type, etc.

Parameters
----------
cfg
datashuttle Configs class.
search_path
The path to search (relative to the local or remote drive). For example,
for "local_filesystem" this is the path on the local machine. For "ssh", this
is the path on the machine that has been connected to.
search_prefix
The search string e.g. "sub-*".
rclone_config_name
Name of the rclone config for the remote (not set for local). `rclone config`
can be used in the terminal to see how rclone has stored these. In datashuttle,
these are managed by `Configs`.
return_full_path
If `True`, return the full filepath, otherwise return only the folder/file name.

"""
config_prefix = "" if not rclone_config_name else f"{rclone_config_name}:"
rclone_config_name = cfg.get_rclone_config_name(cfg["connection_method"])

output = rclone.call_rclone(
f'lsjson {config_prefix}"{search_path.as_posix()}"',
f'lsjson {rclone_config_name}:"{search_path.as_posix()}"',
pipe_std=True,
)

all_folder_names: List[str] = []
all_filenames: List[str] = []
all_folder_names: list = []
all_filenames: list = []

if output.returncode != 0:
utils.log_and_message(
Expand All @@ -685,4 +725,4 @@ def search_local_or_remote(
else:
all_filenames.append(to_append)

return all_folder_names, all_filenames
return sorted(all_folder_names), sorted(all_filenames)
105 changes: 105 additions & 0 deletions tests/tests_integration/test_search_methods.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
import subprocess
from pathlib import Path

import pytest

from datashuttle.utils.folders import (
search_central_via_connection,
search_local_filesystem,
)

from .. import test_utils
from ..base import BaseTest

# -----------------------------------------------------------------------------
# Inconsistent sub or ses value lengths
# -----------------------------------------------------------------------------


class TestSubSesSearches(BaseTest):
@pytest.mark.parametrize("return_full_path", [True, False])
def test_local_vs_central_search_methods(
self, project, monkeypatch, return_full_path
):
"""
Test the `search_local_filesystem` and `search_central_via_connection`
functions. These functions should have the same outputs but `search_local_filesystem`
is used for local filesystem for speed. Here we check the outputs of these
functions match.

These functions are individually tested in many places, primarily in
transfer tests. Local filesystem transfer tests are very extensive,
because they are quicker, while central connection tests are less
thorough. We test the outputs of these two functions directly
under a range of test cases to ensure they are matched under many conditions.

"""
central_path = project.get_central_path()

# Create a project of folders and files
# fmt: off
paths_to_make = []
for i in range(1, 4):
paths_to_make.append(Path(f"rawdata/sub-00{i}/ses-001/behav"))
paths_to_make.append(Path(f"rawdata/sub-00{i}/ses-002_date-20250402/behav"))
paths_to_make.append(Path(f"rawdata/sub-00{i}/ses-002_date-20250402/anat"))

paths_to_make.append(Path("rawdata/sub-003_condition-test/ses-001_hello-world/ephys"))
paths_to_make.append(Path("rawdata/sub-004_condition-test/ses-002_hello-world/funcimg"))

for path_ in paths_to_make:
(central_path / path_).mkdir(parents=True)
test_utils.write_file(central_path / path_ / f"{path_.name}_file.md", contents="hello_world",)
test_utils.write_file(central_path / path_.parent / f"{path_.name}.md", contents="hello_world",)
test_utils.write_file(central_path / path_.parent.parent / f"{path_.parent.name}.md", contents="hello_world",)
test_utils.write_file(central_path / path_.parent.parent.parent / f"{path_.parent.parent.name}.md", contents="hello_world",)
# fmt: on

# Monkeycatch `get_rclone_config_name` to return `local` and set `local`
# as a rclone config entry associated with the local filesystem. By this
# method we can hijack `search_central_via_connection` to run locally
# (though it is set up in practice to run via ssh, gdrive or aws).
monkeypatch.setattr(
project.cfg,
"get_rclone_config_name",
lambda connection_method: "local",
)

subprocess.run(
"rclone config create local local nounc true", shell=True
)
Comment thread
JoeZiminski marked this conversation as resolved.
Outdated

# Perform a range of checks across folders and files
# and check the outputs of both approaches match.
# fmt: off
for search_path, search_str in (
(central_path / "rawdata", "*"),
(central_path / "rawdata", "sub-*"),
(central_path / "rawdata" / "sub-003_condition-test", "ses-*"),
(central_path / "rawdata/sub-001/ses-002_date-20250402", "behav"),
(central_path / "rawdata/sub-001/ses-002_date-20250402/behav", "behav_file.md",),
(central_path / "rawdata/sub-001/ses-002_date-20250402/behav", "*",),
(central_path / "rawdata/sub-002", "*"),
(central_path / "rawdata/sub-002/ses-002_date-20250402", "*"),
(central_path / "rawdata", "sub-003*"),
):
# fmt: on

central_method_folders, central_method_files = (
search_central_via_connection(
project.cfg,
search_path,
search_str,
return_full_path=return_full_path,
)
)
local_method_folders, local_method_files = search_local_filesystem(
search_path, search_str, return_full_path=return_full_path
)

assert central_method_folders == local_method_folders, (
f"Failed folders, search_str: {search_str}, search_path: {search_path}"
)
assert central_method_files == local_method_files, (
f"Failed files, search_str: {search_str}, search_path: {search_path}"
)
Loading