Skip to content
This repository was archived by the owner on Mar 31, 2026. It is now read-only.

Commit 0cc2884

Browse files
committed
feat(storage): support returning skipped items as UserWarning in download_many_to_path
1 parent 14cfd61 commit 0cc2884

3 files changed

Lines changed: 168 additions & 16 deletions

File tree

google/cloud/storage/transfer_manager.py

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -805,43 +805,59 @@ def download_many_to_path(
805805
806806
:raises: :exc:`concurrent.futures.TimeoutError` if deadline is exceeded.
807807
808-
:rtype: list
808+
:rtype: List[None|Exception|UserWarning]
809809
:returns: A list of results corresponding to, in order, each item in the
810810
input list. If an exception was received, it will be the result
811811
for that operation. Otherwise, the return value from the successful
812812
download method is used (which will be None).
813813
"""
814+
results = [None] * len(blob_names)
814815
blob_file_pairs = []
816+
indices_to_process = []
815817

816-
for blob_name in blob_names:
818+
for i, blob_name in enumerate(blob_names):
817819
full_blob_name = blob_name_prefix + blob_name
818820
resolved_path = _resolve_path(destination_directory, blob_name)
819821
if not resolved_path.parent.is_relative_to(
820822
Path(destination_directory).resolve()
821823
):
822-
warnings.warn(
824+
msg = (
823825
f"The blob {blob_name} will **NOT** be downloaded. "
824826
f"The resolved destination_directory - {resolved_path.parent} - is either invalid or "
825827
f"escapes user provided {Path(destination_directory).resolve()} . Please download this file separately using `download_to_filename`"
826828
)
829+
warnings.warn(msg)
830+
results[i] = UserWarning(msg)
827831
continue
828832

829833
resolved_path = str(resolved_path)
834+
if skip_if_exists and os.path.isfile(resolved_path):
835+
msg = f"The blob {blob_name} is skipped because destination file already exists"
836+
results[i] = UserWarning(msg)
837+
continue
838+
830839
if create_directories:
831840
directory, _ = os.path.split(resolved_path)
832841
os.makedirs(directory, exist_ok=True)
833842
blob_file_pairs.append((bucket.blob(full_blob_name), resolved_path))
843+
indices_to_process.append(i)
834844

835-
return download_many(
845+
many_results = download_many(
836846
blob_file_pairs,
837847
download_kwargs=download_kwargs,
838848
deadline=deadline,
839849
raise_exception=raise_exception,
840850
worker_type=worker_type,
841851
max_workers=max_workers,
842-
skip_if_exists=skip_if_exists,
852+
skip_if_exists=False,
843853
)
844854

855+
for meta_index, result in zip(indices_to_process, many_results):
856+
results[meta_index] = result
857+
858+
return results
859+
860+
845861

846862
def download_chunks_concurrently(
847863
blob,

tests/system/test_transfer_manager.py

Lines changed: 72 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -187,8 +187,9 @@ def test_download_many_to_path_skips_download(
187187
[str(warning.message) for warning in w]
188188
)
189189

190-
# 1 total - 1 skipped = 0 results
191-
assert len(results) == 0
190+
# 1 total - 1 skipped = 1 result (containing Warning)
191+
assert len(results) == 1
192+
assert isinstance(results[0], UserWarning)
192193

193194

194195
@pytest.mark.parametrize(
@@ -266,6 +267,75 @@ def test_download_many_to_path_downloads_within_dest_dir(
266267
assert downloaded_contents == source_contents
267268

268269

270+
271+
def test_download_many_to_path_mixed_results(
272+
shared_bucket, file_data, blobs_to_delete
273+
):
274+
"""
275+
Test download_many_to_path with successful downloads, skip_if_exists skips, and path traversal skips.
276+
"""
277+
PREFIX = "mixed_results/"
278+
BLOBNAMES = [
279+
"success1.txt",
280+
"success2.txt",
281+
"exists.txt",
282+
"../escape.txt"
283+
]
284+
285+
FILE_BLOB_PAIRS = [
286+
(
287+
file_data["simple"]["path"],
288+
shared_bucket.blob(PREFIX + name),
289+
)
290+
for name in BLOBNAMES
291+
]
292+
293+
results = transfer_manager.upload_many(
294+
FILE_BLOB_PAIRS,
295+
skip_if_exists=True,
296+
deadline=DEADLINE,
297+
)
298+
for result in results:
299+
assert result is None
300+
301+
blobs = list(shared_bucket.list_blobs(prefix=PREFIX))
302+
blobs_to_delete.extend(blobs)
303+
assert len(blobs) == 4
304+
305+
# Actual Test
306+
with tempfile.TemporaryDirectory() as tempdir:
307+
existing_file_path = os.path.join(tempdir, "exists.txt")
308+
with open(existing_file_path, "w") as f:
309+
f.write("already here")
310+
311+
import warnings
312+
with warnings.catch_warnings(record=True) as w:
313+
warnings.simplefilter("always")
314+
results = transfer_manager.download_many_to_path(
315+
shared_bucket,
316+
BLOBNAMES,
317+
destination_directory=tempdir,
318+
blob_name_prefix=PREFIX,
319+
deadline=DEADLINE,
320+
create_directories=True,
321+
skip_if_exists=True,
322+
)
323+
324+
assert len(results) == 4
325+
assert results[0] is None
326+
assert results[1] is None
327+
assert isinstance(results[2], UserWarning)
328+
assert "skipped because destination file already exists" in str(results[2])
329+
assert isinstance(results[3], UserWarning)
330+
assert "will **NOT** be downloaded" in str(results[3])
331+
332+
assert os.path.exists(os.path.join(tempdir, "success1.txt"))
333+
assert os.path.exists(os.path.join(tempdir, "success2.txt"))
334+
335+
with open(existing_file_path, "r") as f:
336+
assert f.read() == "already here"
337+
338+
269339
def test_download_many(listable_bucket):
270340
blobs = list(listable_bucket.list_blobs())
271341
with tempfile.TemporaryDirectory() as tempdir:

tests/unit/test_transfer_manager.py

Lines changed: 75 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -530,9 +530,10 @@ def test_download_many_to_path():
530530
]
531531

532532
with mock.patch(
533-
"google.cloud.storage.transfer_manager.download_many"
533+
"google.cloud.storage.transfer_manager.download_many",
534+
return_value=[FAKE_RESULT] * len(BLOBNAMES),
534535
) as mock_download_many:
535-
transfer_manager.download_many_to_path(
536+
results = transfer_manager.download_many_to_path(
536537
bucket,
537538
BLOBNAMES,
538539
destination_directory=PATH_ROOT,
@@ -553,11 +554,71 @@ def test_download_many_to_path():
553554
raise_exception=True,
554555
max_workers=MAX_WORKERS,
555556
worker_type=WORKER_TYPE,
556-
skip_if_exists=True,
557+
skip_if_exists=False,
557558
)
559+
assert results == [FAKE_RESULT] * len(BLOBNAMES)
558560
for blobname in BLOBNAMES:
559561
bucket.blob.assert_any_call(BLOB_NAME_PREFIX + blobname)
560562

563+
def test_download_many_to_path_with_skip_if_exists():
564+
bucket = mock.Mock()
565+
566+
BLOBNAMES = ["file_a.txt", "file_b.txt", "dir_a/file_c.txt"]
567+
PATH_ROOT = "mypath/"
568+
BLOB_NAME_PREFIX = "myprefix/"
569+
DOWNLOAD_KWARGS = {"accept-encoding": "fake-gzip"}
570+
MAX_WORKERS = 7
571+
DEADLINE = 10
572+
WORKER_TYPE = transfer_manager.THREAD
573+
574+
from google.cloud.storage.transfer_manager import _resolve_path
575+
576+
existing_file = str(_resolve_path(PATH_ROOT, "file_a.txt"))
577+
578+
def isfile_side_effect(path):
579+
return path == existing_file
580+
581+
EXPECTED_BLOB_FILE_PAIRS = [
582+
(mock.ANY, str(_resolve_path(PATH_ROOT, "file_b.txt"))),
583+
(mock.ANY, str(_resolve_path(PATH_ROOT, "dir_a/file_c.txt"))),
584+
]
585+
586+
with mock.patch("os.path.isfile", side_effect=isfile_side_effect):
587+
with mock.patch(
588+
"google.cloud.storage.transfer_manager.download_many",
589+
return_value=[FAKE_RESULT, FAKE_RESULT],
590+
) as mock_download_many:
591+
results = transfer_manager.download_many_to_path(
592+
bucket,
593+
BLOBNAMES,
594+
destination_directory=PATH_ROOT,
595+
blob_name_prefix=BLOB_NAME_PREFIX,
596+
download_kwargs=DOWNLOAD_KWARGS,
597+
deadline=DEADLINE,
598+
create_directories=False,
599+
raise_exception=True,
600+
max_workers=MAX_WORKERS,
601+
worker_type=WORKER_TYPE,
602+
skip_if_exists=True,
603+
)
604+
605+
mock_download_many.assert_called_once_with(
606+
EXPECTED_BLOB_FILE_PAIRS,
607+
download_kwargs=DOWNLOAD_KWARGS,
608+
deadline=DEADLINE,
609+
raise_exception=True,
610+
max_workers=MAX_WORKERS,
611+
worker_type=WORKER_TYPE,
612+
skip_if_exists=False,
613+
)
614+
615+
assert len(results) == 3
616+
assert isinstance(results[0], UserWarning)
617+
assert str(results[0]) == "The blob file_a.txt is skipped because destination file already exists"
618+
assert results[1] == FAKE_RESULT
619+
assert results[2] == FAKE_RESULT
620+
621+
561622

562623
@pytest.mark.parametrize(
563624
"blobname",
@@ -584,9 +645,10 @@ def test_download_many_to_path_skips_download(blobname):
584645
with warnings.catch_warnings(record=True) as w:
585646
warnings.simplefilter("always")
586647
with mock.patch(
587-
"google.cloud.storage.transfer_manager.download_many"
648+
"google.cloud.storage.transfer_manager.download_many",
649+
return_value=[],
588650
) as mock_download_many:
589-
transfer_manager.download_many_to_path(
651+
results = transfer_manager.download_many_to_path(
590652
bucket,
591653
BLOBNAMES,
592654
destination_directory=PATH_ROOT,
@@ -614,8 +676,10 @@ def test_download_many_to_path_skips_download(blobname):
614676
raise_exception=True,
615677
max_workers=MAX_WORKERS,
616678
worker_type=WORKER_TYPE,
617-
skip_if_exists=True,
679+
skip_if_exists=False,
618680
)
681+
assert len(results) == 1
682+
assert isinstance(results[0], UserWarning)
619683

620684

621685
@pytest.mark.parametrize(
@@ -649,9 +713,10 @@ def test_download_many_to_path_downloads_within_dest_dir(blobname):
649713
]
650714

651715
with mock.patch(
652-
"google.cloud.storage.transfer_manager.download_many"
716+
"google.cloud.storage.transfer_manager.download_many",
717+
return_value=[FAKE_RESULT],
653718
) as mock_download_many:
654-
transfer_manager.download_many_to_path(
719+
results = transfer_manager.download_many_to_path(
655720
bucket,
656721
BLOBNAMES,
657722
destination_directory=PATH_ROOT,
@@ -672,8 +737,9 @@ def test_download_many_to_path_downloads_within_dest_dir(blobname):
672737
raise_exception=True,
673738
max_workers=MAX_WORKERS,
674739
worker_type=WORKER_TYPE,
675-
skip_if_exists=True,
740+
skip_if_exists=False,
676741
)
742+
assert results == [FAKE_RESULT]
677743
bucket.blob.assert_any_call(BLOB_NAME_PREFIX + blobname)
678744

679745

0 commit comments

Comments
 (0)