Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 23 additions & 9 deletions pooch/processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,16 +124,30 @@ def __call__(self, fname, action, pooch):

# Get a list of all file names (including subdirectories) in our folder
# of unzipped files, filtered by the given members list
fnames = []
for path, _, files in os.walk(self.extract_dir):
for filename in files:
relpath = os.path.normpath(
os.path.join(os.path.relpath(path, self.extract_dir), filename)
)
if self.members is None or any(
relpath.startswith(os.path.normpath(m)) for m in self.members
):
if self.members is None:
# No filter: collect all extracted files in walk order
fnames = []
for path, _, files in os.walk(self.extract_dir):
for filename in files:
fnames.append(os.path.join(path, filename))
else:
# Build a mapping from each requested member to its extracted path.
# Walking the directory only once keeps this O(N) in the number of
# extracted files regardless of how many members were requested.
extracted = {}
for path, _, files in os.walk(self.extract_dir):
for filename in files:
relpath = os.path.normpath(
os.path.join(
os.path.relpath(path, self.extract_dir), filename
)
)
for m in self.members:
if relpath.startswith(os.path.normpath(m)):
extracted[m] = os.path.join(path, filename)
# Return files in the same order as self.members so callers can
# rely on the position of each file in the returned list.
fnames = [extracted[m] for m in self.members if m in extracted]

return fnames

Expand Down
44 changes: 44 additions & 0 deletions pooch/tests/test_processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,3 +287,47 @@ def test_unpacking_wrong_members_then_no_members(processor_class, extension):
processor2 = processor_class()
filenames2 = pup.fetch("store" + extension, processor=processor2)
assert len(filenames2) > 0


@pytest.mark.network
@pytest.mark.parametrize(
"processor_class,extension",
[(Unzip, ".zip"), (Untar, ".tar.gz")],
)
def test_unpacking_members_order_preserved(processor_class, extension):
"""
Returned file list must follow the order of the *members* argument.

Previously, os.walk() determined the order of the results, which is
filesystem-dependent and unrelated to the caller-supplied order.
https://github.com/fatiando/pooch/issues/457
"""
members_forward = [
"store/tiny-data.txt",
"store/subdir/tiny-data.txt",
]
members_reversed = list(reversed(members_forward))

with TemporaryDirectory() as local_store:
path = Path(local_store)
pup = Pooch(path=path, base_url=BASEURL, registry=REGISTRY)

# Forward order
processor_fwd = processor_class(members=members_forward)
fnames_fwd = pup.fetch("store" + extension, processor=processor_fwd)
assert len(fnames_fwd) == 2
# The last component of each path should match the member order
assert fnames_fwd[0].endswith("tiny-data.txt") or "subdir" not in fnames_fwd[0]
assert "subdir" in fnames_fwd[1]

# Reversed order — result must mirror the reversed members list
with TemporaryDirectory() as local_store:
path = Path(local_store)
pup = Pooch(path=path, base_url=BASEURL, registry=REGISTRY)

processor_rev = processor_class(members=members_reversed)
fnames_rev = pup.fetch("store" + extension, processor=processor_rev)
assert len(fnames_rev) == 2
# First result should be the subdirectory file (reversed order)
assert "subdir" in fnames_rev[0]
assert "subdir" not in fnames_rev[1]
Loading