diff --git a/pooch/processors.py b/pooch/processors.py index dfeebb38..0e6eff2f 100644 --- a/pooch/processors.py +++ b/pooch/processors.py @@ -124,16 +124,30 @@ def __call__(self, fname, action, pooch): # Get a list of all file names (including subdirectories) in our folder # of unzipped files, filtered by the given members list - fnames = [] - for path, _, files in os.walk(self.extract_dir): - for filename in files: - relpath = os.path.normpath( - os.path.join(os.path.relpath(path, self.extract_dir), filename) - ) - if self.members is None or any( - relpath.startswith(os.path.normpath(m)) for m in self.members - ): + if self.members is None: + # No filter: collect all extracted files in walk order + fnames = [] + for path, _, files in os.walk(self.extract_dir): + for filename in files: fnames.append(os.path.join(path, filename)) + else: + # Build a mapping from each requested member to its extracted path. + # Walking the directory only once keeps this O(N) in the number of + # extracted files regardless of how many members were requested. + extracted = {} + for path, _, files in os.walk(self.extract_dir): + for filename in files: + relpath = os.path.normpath( + os.path.join( + os.path.relpath(path, self.extract_dir), filename + ) + ) + for m in self.members: + if relpath.startswith(os.path.normpath(m)): + extracted[m] = os.path.join(path, filename) + # Return files in the same order as self.members so callers can + # rely on the position of each file in the returned list. + fnames = [extracted[m] for m in self.members if m in extracted] return fnames diff --git a/pooch/tests/test_processors.py b/pooch/tests/test_processors.py index 1a2a1e2a..8671c1e8 100644 --- a/pooch/tests/test_processors.py +++ b/pooch/tests/test_processors.py @@ -287,3 +287,47 @@ def test_unpacking_wrong_members_then_no_members(processor_class, extension): processor2 = processor_class() filenames2 = pup.fetch("store" + extension, processor=processor2) assert len(filenames2) > 0 + + +@pytest.mark.network +@pytest.mark.parametrize( + "processor_class,extension", + [(Unzip, ".zip"), (Untar, ".tar.gz")], +) +def test_unpacking_members_order_preserved(processor_class, extension): + """ + Returned file list must follow the order of the *members* argument. + + Previously, os.walk() determined the order of the results, which is + filesystem-dependent and unrelated to the caller-supplied order. + https://github.com/fatiando/pooch/issues/457 + """ + members_forward = [ + "store/tiny-data.txt", + "store/subdir/tiny-data.txt", + ] + members_reversed = list(reversed(members_forward)) + + with TemporaryDirectory() as local_store: + path = Path(local_store) + pup = Pooch(path=path, base_url=BASEURL, registry=REGISTRY) + + # Forward order + processor_fwd = processor_class(members=members_forward) + fnames_fwd = pup.fetch("store" + extension, processor=processor_fwd) + assert len(fnames_fwd) == 2 + # The last component of each path should match the member order + assert fnames_fwd[0].endswith("tiny-data.txt") or "subdir" not in fnames_fwd[0] + assert "subdir" in fnames_fwd[1] + + # Reversed order — result must mirror the reversed members list + with TemporaryDirectory() as local_store: + path = Path(local_store) + pup = Pooch(path=path, base_url=BASEURL, registry=REGISTRY) + + processor_rev = processor_class(members=members_reversed) + fnames_rev = pup.fetch("store" + extension, processor=processor_rev) + assert len(fnames_rev) == 2 + # First result should be the subdirectory file (reversed order) + assert "subdir" in fnames_rev[0] + assert "subdir" not in fnames_rev[1]