From f0f7cf600dbf9801fbf24abed07431310f0984b5 Mon Sep 17 00:00:00 2001 From: terminalchai Date: Wed, 22 Apr 2026 00:13:10 +0530 Subject: [PATCH] fix: preserve members order in ExtractorProcessor.__call__() When the 'members' argument is supplied to Unzip or Untar, the returned list of extracted file paths now follows the same order as 'members' instead of the arbitrary order produced by os.walk(). Previously the code walked the extract directory and appended files whenever any member prefix matched, meaning the order of results was determined by the filesystem rather than the caller. The fix replaces the single os.walk() loop with two code paths: - members is None -> walk and collect all files (behaviour unchanged) - members provided -> walk once into a dict {member: full_path}, then return [dict[m] for m in self.members] to guarantee caller-specified order. Walking only once preserves O(N) complexity. Add test_unpacking_members_order_preserved for both Unzip and Untar. Closes #457 --- pooch/processors.py | 32 ++++++++++++++++++------- pooch/tests/test_processors.py | 44 ++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 9 deletions(-) diff --git a/pooch/processors.py b/pooch/processors.py index dfeebb387..0e6eff2f2 100644 --- a/pooch/processors.py +++ b/pooch/processors.py @@ -124,16 +124,30 @@ def __call__(self, fname, action, pooch): # Get a list of all file names (including subdirectories) in our folder # of unzipped files, filtered by the given members list - fnames = [] - for path, _, files in os.walk(self.extract_dir): - for filename in files: - relpath = os.path.normpath( - os.path.join(os.path.relpath(path, self.extract_dir), filename) - ) - if self.members is None or any( - relpath.startswith(os.path.normpath(m)) for m in self.members - ): + if self.members is None: + # No filter: collect all extracted files in walk order + fnames = [] + for path, _, files in os.walk(self.extract_dir): + for filename in files: fnames.append(os.path.join(path, filename)) + else: + # Build a mapping from each requested member to its extracted path. + # Walking the directory only once keeps this O(N) in the number of + # extracted files regardless of how many members were requested. + extracted = {} + for path, _, files in os.walk(self.extract_dir): + for filename in files: + relpath = os.path.normpath( + os.path.join( + os.path.relpath(path, self.extract_dir), filename + ) + ) + for m in self.members: + if relpath.startswith(os.path.normpath(m)): + extracted[m] = os.path.join(path, filename) + # Return files in the same order as self.members so callers can + # rely on the position of each file in the returned list. + fnames = [extracted[m] for m in self.members if m in extracted] return fnames diff --git a/pooch/tests/test_processors.py b/pooch/tests/test_processors.py index 1a2a1e2a0..8671c1e81 100644 --- a/pooch/tests/test_processors.py +++ b/pooch/tests/test_processors.py @@ -287,3 +287,47 @@ def test_unpacking_wrong_members_then_no_members(processor_class, extension): processor2 = processor_class() filenames2 = pup.fetch("store" + extension, processor=processor2) assert len(filenames2) > 0 + + +@pytest.mark.network +@pytest.mark.parametrize( + "processor_class,extension", + [(Unzip, ".zip"), (Untar, ".tar.gz")], +) +def test_unpacking_members_order_preserved(processor_class, extension): + """ + Returned file list must follow the order of the *members* argument. + + Previously, os.walk() determined the order of the results, which is + filesystem-dependent and unrelated to the caller-supplied order. + https://github.com/fatiando/pooch/issues/457 + """ + members_forward = [ + "store/tiny-data.txt", + "store/subdir/tiny-data.txt", + ] + members_reversed = list(reversed(members_forward)) + + with TemporaryDirectory() as local_store: + path = Path(local_store) + pup = Pooch(path=path, base_url=BASEURL, registry=REGISTRY) + + # Forward order + processor_fwd = processor_class(members=members_forward) + fnames_fwd = pup.fetch("store" + extension, processor=processor_fwd) + assert len(fnames_fwd) == 2 + # The last component of each path should match the member order + assert fnames_fwd[0].endswith("tiny-data.txt") or "subdir" not in fnames_fwd[0] + assert "subdir" in fnames_fwd[1] + + # Reversed order — result must mirror the reversed members list + with TemporaryDirectory() as local_store: + path = Path(local_store) + pup = Pooch(path=path, base_url=BASEURL, registry=REGISTRY) + + processor_rev = processor_class(members=members_reversed) + fnames_rev = pup.fetch("store" + extension, processor=processor_rev) + assert len(fnames_rev) == 2 + # First result should be the subdirectory file (reversed order) + assert "subdir" in fnames_rev[0] + assert "subdir" not in fnames_rev[1]