Skip to content

Commit bef1d3d

Browse files
JoeZiminski authored and cs7-shrey committed
Use lsjson for searches (neuroinformatics-unit#551)
* Add get_rclone_config_name_local and rename get_rclone_config_name to get_rclone_config_name_central. * Add new transfer function. * Fix rename in tests. * Tidy up the search functions. * Fix circular import. * Adding more tests and fixing an edge case. * Add wildcards to local filesystem transfer tests. * Added tests to ssh. * Remove unused function. * Move teardown to fix tests on macos. * Revert get_rclone_config_name_central name change. * Add documentation to tests. * Remove unnecessary sorted. * Fix cyclical import. * Remove unnecessary wildcard. * Remove unused functions.
1 parent 37e7960 commit bef1d3d

8 files changed

Lines changed: 304 additions & 245 deletions

File tree

datashuttle/configs/config_class.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -250,7 +250,7 @@ def get_base_folder(
250250
def get_rclone_config_name(
251251
self, connection_method: Optional[str] = None
252252
) -> str:
253-
"""Generate the rclone configuration name for the project.
253+
"""Generate the rclone configuration name for the central project.
254254
255255
These configs are created by datashuttle but managed and stored by rclone.
256256
"""

datashuttle/utils/folders.py

Lines changed: 54 additions & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,11 @@
1717
from datashuttle.configs.config_class import Configs
1818
from datashuttle.utils.custom_types import TopLevelFolder
1919

20-
import glob
20+
import fnmatch
2121
from pathlib import Path
2222

2323
from datashuttle.configs import canonical_folders, canonical_tags
24-
from datashuttle.utils import rclone, ssh, utils, validation
24+
from datashuttle.utils import rclone, utils, validation
2525
from datashuttle.utils.custom_exceptions import NeuroBlueprintError
2626

2727
# -----------------------------------------------------------------------------
@@ -599,56 +599,62 @@ def search_for_folders(
599599
Discovered folders (`all_folder_names`) and files (`all_filenames`).
600600
601601
"""
602-
if local_or_central == "central" and cfg["connection_method"] in [
603-
"ssh",
604-
"gdrive",
605-
"aws",
606-
]:
607-
if cfg["connection_method"] == "ssh":
608-
all_folder_names, all_filenames = (
609-
ssh.search_ssh_central_for_folders(
610-
search_path,
611-
search_prefix,
612-
cfg,
613-
verbose,
614-
return_full_path,
615-
)
616-
)
617-
618-
else:
619-
all_folder_names, all_filenames = search_gdrive_or_aws_for_folders(
620-
search_path, search_prefix, cfg, return_full_path
621-
)
622-
602+
if (
603+
local_or_central == "local"
604+
or cfg["connection_method"] == "local_filesystem"
605+
) and not search_path.exists():
606+
if verbose:
607+
utils.log_and_message(f"No file found at {search_path.as_posix()}")
608+
return [], []
609+
610+
if local_or_central == "local":
611+
rclone_config_name = None
623612
else:
624-
if not search_path.exists():
625-
if verbose:
626-
utils.log_and_message(
627-
f"No file found at {search_path.as_posix()}"
628-
)
629-
return [], []
630-
631-
all_folder_names, all_filenames = search_filesystem_path_for_folders(
632-
search_path / search_prefix, return_full_path
613+
rclone_config_name = cfg.get_rclone_config_name(
614+
cfg["connection_method"]
633615
)
616+
617+
all_folder_names, all_filenames = search_local_or_remote(
618+
search_path,
619+
search_prefix,
620+
rclone_config_name,
621+
return_full_path,
622+
)
623+
634624
return all_folder_names, all_filenames
635625

636626

637-
def search_gdrive_or_aws_for_folders(
627+
def search_local_or_remote(
638628
search_path: Path,
639629
search_prefix: str,
640-
cfg: Configs,
630+
rclone_config_name: str | None,
641631
return_full_path: bool = False,
642632
) -> Tuple[List[Any], List[Any]]:
643633
"""Search for files and folders in a local or central path using the `rclone lsjson` command.
644634
645635
This command lists all the files and folders in the searched path in a json format.
646636
The json contains file/folder info about each file/folder like name, type, etc.
637+
638+
Parameters
639+
----------
640+
search_path
641+
The path to search (relative to the local or remote drive). For example,
642+
for "local_filesystem" this is the path on the local machine. For "ssh", this
643+
is the path on the machine that has been connected to.
644+
search_prefix
645+
The search string e.g. "sub-*".
646+
rclone_config_name
647+
Name of the rclone config for the remote (not set for local). `rclone config`
648+
can be used in the terminal to see how rclone has stored these. In datashuttle,
649+
these are managed by `Configs`.
650+
return_full_path
651+
If `True`, return the full filepath, otherwise return only the folder/file name.
652+
647653
"""
654+
config_prefix = "" if not rclone_config_name else f"{rclone_config_name}:"
655+
648656
output = rclone.call_rclone(
649-
"lsjson "
650-
f"{cfg.get_rclone_config_name()}:{search_path.as_posix()} "
651-
f'--include "{search_prefix}"',
657+
f'lsjson {config_prefix}"{search_path.as_posix()}"',
652658
pipe_std=True,
653659
)
654660

@@ -657,73 +663,26 @@ def search_gdrive_or_aws_for_folders(
657663

658664
if output.returncode != 0:
659665
utils.log_and_message(
660-
f"Error searching files at {search_path.as_posix()} \n {output.stderr.decode('utf-8') if output.stderr else ''}"
666+
f"Error searching files at {search_path.as_posix()}\n"
667+
f"{output.stderr.decode('utf-8') if output.stderr else ''}"
661668
)
662669
return all_folder_names, all_filenames
663670

664671
files_and_folders = json.loads(output.stdout)
665672

666-
try:
667-
for file_or_folder in files_and_folders:
668-
name = file_or_folder["Name"]
669-
is_dir = file_or_folder.get("IsDir", False)
670-
671-
to_append = (
672-
(search_path / name).as_posix() if return_full_path else name
673-
)
674-
675-
if is_dir:
676-
all_folder_names.append(to_append)
677-
else:
678-
all_filenames.append(to_append)
679-
680-
except Exception:
681-
utils.log_and_message(
682-
f"Error searching files at {search_path.as_posix()}"
683-
)
684-
685-
return all_folder_names, all_filenames
686-
687-
688-
# Actual function implementation
689-
def search_filesystem_path_for_folders(
690-
search_path_with_prefix: Path, return_full_path: bool = False
691-
) -> Tuple[List[Path | str], List[Path | str]]:
692-
r"""Search a folder through the local filesystem.
693-
694-
Use glob to search the full search path (including prefix) with glob.
695-
Files are filtered out of results, returning folders only.
696-
697-
Parameters
698-
----------
699-
search_path_with_prefix
700-
Path to search along with search prefix e.g. "C:\drive\project\sub-*"
701-
702-
return_full_path
703-
If `True` returns the path to the discovered folder or file,
704-
otherwise just the name.
705-
706-
Returns
707-
-------
708-
Discovered folders (`all_folder_names`) and files (`all_filenames`).
673+
for file_or_folder in files_and_folders:
674+
name = file_or_folder["Name"]
709675

710-
"""
711-
all_folder_names = []
712-
all_filenames = []
676+
if not fnmatch.fnmatch(name, search_prefix):
677+
continue
713678

714-
all_files_and_folders = list(glob.glob(search_path_with_prefix.as_posix()))
715-
sorter_files_and_folders = sorted(all_files_and_folders)
679+
is_dir = file_or_folder.get("IsDir", False)
716680

717-
for file_or_folder_str in sorter_files_and_folders:
718-
file_or_folder = Path(file_or_folder_str)
681+
to_append = search_path / name if return_full_path else name
719682

720-
if file_or_folder.is_dir():
721-
all_folder_names.append(
722-
file_or_folder if return_full_path else file_or_folder.name
723-
)
683+
if is_dir:
684+
all_folder_names.append(to_append)
724685
else:
725-
all_filenames.append(
726-
file_or_folder if return_full_path else file_or_folder.name
727-
)
686+
all_filenames.append(to_append)
728687

729688
return all_folder_names, all_filenames

datashuttle/utils/formatting.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,13 @@ def check_and_format_names(
6464
names_to_format, reserved_keywords = [], []
6565
for name in names:
6666
if name in canonical_reserved_keywords() or tags("*") in name:
67-
reserved_keywords.append(name)
67+
if tags("to") in name:
68+
# handle an edge case where the user searches with both tags
69+
reserved_keywords += update_names_with_range_to_flag(
70+
[name], prefix
71+
)
72+
else:
73+
reserved_keywords.append(name)
6874
else:
6975
names_to_format.append(name)
7076

datashuttle/utils/getters.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -293,9 +293,9 @@ def get_existing_project_paths() -> List[Path]:
293293
"""
294294
datashuttle_path = canonical_folders.get_datashuttle_path()
295295

296-
all_folders, _ = folders.search_filesystem_path_for_folders(
297-
datashuttle_path / "*"
298-
)
296+
all_folders = [
297+
path_ for path_ in datashuttle_path.glob("*") if path_.is_dir()
298+
]
299299

300300
existing_project_paths = []
301301
for folder_name in all_folders:

datashuttle/utils/rclone.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -325,11 +325,7 @@ def check_successful_connection_and_raise_error_on_fail(cfg: Configs) -> None:
325325
If the command fails, it raises a ConnectionError. The created file is
326326
deleted thereafter.
327327
"""
328-
if cfg["central_path"] is None:
329-
tempfile_path = "temp.txt"
330-
else:
331-
tempfile_path = (cfg["central_path"] / "temp.txt").as_posix()
332-
328+
tempfile_path = (cfg["central_path"] / "temp.txt").as_posix()
333329
output = call_rclone(
334330
f"touch {cfg.get_rclone_config_name()}:{tempfile_path}", pipe_std=True
335331
)

datashuttle/utils/ssh.py

Lines changed: 1 addition & 119 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,8 @@
55
if TYPE_CHECKING:
66
from datashuttle.configs.config_class import Configs
77

8-
import fnmatch
9-
import stat
108
from pathlib import Path
11-
from typing import Any, List, Optional, Tuple
9+
from typing import Optional
1210

1311
import paramiko
1412

@@ -306,119 +304,3 @@ def verify_ssh_central_host(
306304
utils.log("Host not accepted. No connection made.")
307305

308306
return success
309-
310-
311-
# -----------------------------------------------------------------------------
312-
# Search over SSH
313-
# -----------------------------------------------------------------------------
314-
315-
316-
def search_ssh_central_for_folders(
317-
search_path: Path,
318-
search_prefix: str,
319-
cfg: Configs,
320-
verbose: bool = True,
321-
return_full_path: bool = False,
322-
) -> Tuple[List[Any], List[Any]]:
323-
"""Search for the search prefix in the search path over SSH.
324-
325-
Parameters
326-
----------
327-
search_path
328-
Path to search for folders in.
329-
330-
search_prefix
331-
Search prefix for folder names e.g. "sub-*".
332-
333-
cfg
334-
See connect_client_with_logging().
335-
336-
verbose
337-
If `True`, if a search folder cannot be found, a message
338-
will be printed with the un-found path.
339-
340-
return_full_path
341-
include the search_path in the returned paths
342-
343-
Returns
344-
-------
345-
Discovered folders (`all_folder_names`) and files (`all_filenames`).
346-
347-
"""
348-
client: paramiko.SSHClient
349-
with paramiko.SSHClient() as client:
350-
connect_client_with_logging(
351-
client, cfg, message_on_sucessful_connection=verbose
352-
)
353-
354-
sftp = client.open_sftp()
355-
356-
all_folder_names, all_filenames = get_list_of_folder_names_over_sftp(
357-
sftp,
358-
search_path,
359-
search_prefix,
360-
verbose,
361-
return_full_path,
362-
)
363-
364-
return all_folder_names, all_filenames
365-
366-
367-
def get_list_of_folder_names_over_sftp(
368-
sftp: paramiko.sftp_client.SFTPClient,
369-
search_path: Path,
370-
search_prefix: str,
371-
verbose: bool = True,
372-
return_full_path: bool = False,
373-
) -> Tuple[List[Any], List[Any]]:
374-
"""Use paramiko's sftp to search a path over ssh for folders.
375-
376-
Return the folder names.
377-
378-
Parameters
379-
----------
380-
sftp
381-
Connected paramiko sftp object
382-
(see search_ssh_central_for_folders()).
383-
384-
search_path
385-
Path to search for folders in.
386-
387-
search_prefix
388-
Prefix (can include wildcards)
389-
to search folder names.
390-
391-
verbose
392-
If `True`, if a search folder cannot be found, a message
393-
will be printed with the un-found path.
394-
395-
return_full_path
396-
include the search_path in the returned paths.
397-
398-
Returns
399-
-------
400-
Discovered folders (`all_folder_names`) and files (`all_filenames`).
401-
402-
"""
403-
all_folder_names = []
404-
all_filenames = []
405-
try:
406-
for file_or_folder in sftp.listdir_attr(search_path.as_posix()):
407-
if file_or_folder.st_mode is not None and fnmatch.fnmatch(
408-
file_or_folder.filename, search_prefix
409-
):
410-
to_append = (
411-
search_path / file_or_folder.filename
412-
if return_full_path
413-
else file_or_folder.filename
414-
)
415-
if stat.S_ISDIR(file_or_folder.st_mode):
416-
all_folder_names.append(to_append)
417-
else:
418-
all_filenames.append(to_append)
419-
420-
except FileNotFoundError:
421-
if verbose:
422-
utils.log_and_message(f"No file found at {search_path.as_posix()}")
423-
424-
return all_folder_names, all_filenames

0 commit comments

Comments
 (0)