Skip to content
This repository was archived by the owner on Mar 31, 2026. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions google/cloud/storage/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,12 @@
DataCorruptionDynamicParent = Exception


class InvalidPathError(Exception):
"""Raised when the provided path string is malformed."""

pass


class InvalidResponse(InvalidResponseDynamicParent):
"""Error class for responses which are not in the correct state.

Expand Down
44 changes: 34 additions & 10 deletions google/cloud/storage/transfer_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@

from google.cloud.storage._media.requests.upload import XMLMPUContainer
from google.cloud.storage._media.requests.upload import XMLMPUPart
from google.cloud.storage.exceptions import DataCorruption
from google.cloud.storage.exceptions import DataCorruption, InvalidPathError

TM_DEFAULT_CHUNK_SIZE = 32 * 1024 * 1024
DEFAULT_MAX_WORKERS = 8
Expand Down Expand Up @@ -263,6 +263,8 @@ def upload_many(


def _resolve_path(target_dir, blob_path):
if os.name == "nt" and ":" in blob_path:
raise InvalidPathError(f"{blob_path} cannot be downloaded into {target_dir}")
Comment thread
chandra-siri marked this conversation as resolved.
target_dir = Path(target_dir)
blob_path = Path(blob_path)
# blob_path.anchor will be '/' if `blob_path` is full path else it'll empty.
Expand Down Expand Up @@ -805,43 +807,65 @@ def download_many_to_path(

:raises: :exc:`concurrent.futures.TimeoutError` if deadline is exceeded.

:rtype: list
:rtype: List[None|Exception|UserWarning]
:returns: A list of results corresponding to, in order, each item in the
input list. If an exception was received, it will be the result
for that operation. Otherwise, the return value from the successful
download method is used (which will be None).
input list. If an exception was received or a download was skipped
(e.g., due to existing file or path traversal), it will be the result
for that operation (as an Exception or UserWarning, respectively).
Otherwise, the result will be None for a successful download.
"""
results = [None] * len(blob_names)
blob_file_pairs = []
indices_to_process = []

for blob_name in blob_names:
for i, blob_name in enumerate(blob_names):
full_blob_name = blob_name_prefix + blob_name
resolved_path = _resolve_path(destination_directory, blob_name)
try:
resolved_path = _resolve_path(destination_directory, blob_name)
except InvalidPathError as e:
msg = f"The blob {blob_name} will **NOT** be downloaded. {e}"
warnings.warn(msg)
results[i] = UserWarning(msg)
Comment thread
chandra-siri marked this conversation as resolved.
continue
if not resolved_path.parent.is_relative_to(
Path(destination_directory).resolve()
):
warnings.warn(
msg = (
f"The blob {blob_name} will **NOT** be downloaded. "
f"The resolved destination_directory - {resolved_path.parent} - is either invalid or "
f"escapes user provided {Path(destination_directory).resolve()} . Please download this file separately using `download_to_filename`"
)
warnings.warn(msg)
results[i] = UserWarning(msg)
continue

resolved_path = str(resolved_path)
if skip_if_exists and os.path.isfile(resolved_path):
msg = f"The blob {blob_name} is skipped because destination file already exists"
results[i] = UserWarning(msg)
continue

if create_directories:
directory, _ = os.path.split(resolved_path)
os.makedirs(directory, exist_ok=True)
blob_file_pairs.append((bucket.blob(full_blob_name), resolved_path))
indices_to_process.append(i)

return download_many(
many_results = download_many(
blob_file_pairs,
download_kwargs=download_kwargs,
deadline=deadline,
raise_exception=raise_exception,
worker_type=worker_type,
max_workers=max_workers,
skip_if_exists=skip_if_exists,
skip_if_exists=False,
)

for meta_index, result in zip(indices_to_process, many_results):
results[meta_index] = result

return results


def download_chunks_concurrently(
blob,
Expand Down
86 changes: 84 additions & 2 deletions tests/system/test_transfer_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,8 +187,9 @@ def test_download_many_to_path_skips_download(
[str(warning.message) for warning in w]
)

# 1 total - 1 skipped = 0 results
assert len(results) == 0
# 1 total - 1 skipped = 1 result (containing Warning)
assert len(results) == 1
assert isinstance(results[0], UserWarning)


@pytest.mark.parametrize(
Expand Down Expand Up @@ -266,6 +267,87 @@ def test_download_many_to_path_downloads_within_dest_dir(
assert downloaded_contents == source_contents



def test_download_many_to_path_mixed_results(
shared_bucket, file_data, blobs_to_delete
):
"""
Test download_many_to_path with successful downloads, skip_if_exists skips, and path traversal skips.
"""
PREFIX = "mixed_results/"
BLOBNAMES = [
"success1.txt",
"success2.txt",
"exists.txt",
"../escape.txt"
]

FILE_BLOB_PAIRS = [
(
file_data["simple"]["path"],
shared_bucket.blob(PREFIX + name),
)
for name in BLOBNAMES
]

results = transfer_manager.upload_many(
FILE_BLOB_PAIRS,
skip_if_exists=True,
deadline=DEADLINE,
)
for result in results:
assert result is None

blobs = list(shared_bucket.list_blobs(prefix=PREFIX))
blobs_to_delete.extend(blobs)
assert len(blobs) == 4

# Actual Test
with tempfile.TemporaryDirectory() as tempdir:
existing_file_path = os.path.join(tempdir, "exists.txt")
with open(existing_file_path, "w") as f:
f.write("already here")

import warnings
with warnings.catch_warnings(record=True) as w:
warnings.simplefilter("always")
results = transfer_manager.download_many_to_path(
shared_bucket,
BLOBNAMES,
destination_directory=tempdir,
blob_name_prefix=PREFIX,
deadline=DEADLINE,
create_directories=True,
skip_if_exists=True,
)

assert len(results) == 4

path_traversal_warnings = [
warning
for warning in w
if str(warning.message).startswith("The blob ")
and "will **NOT** be downloaded. The resolved destination_directory"
in str(warning.message)
]
assert len(path_traversal_warnings) == 1, "---".join(
[str(warning.message) for warning in w]
)

assert results[0] is None
assert results[1] is None
assert isinstance(results[2], UserWarning)
assert "skipped because destination file already exists" in str(results[2])
assert isinstance(results[3], UserWarning)
assert "will **NOT** be downloaded" in str(results[3])

assert os.path.exists(os.path.join(tempdir, "success1.txt"))
assert os.path.exists(os.path.join(tempdir, "success2.txt"))

with open(existing_file_path, "r") as f:
assert f.read() == "already here"


def test_download_many(listable_bucket):
blobs = list(listable_bucket.list_blobs())
with tempfile.TemporaryDirectory() as tempdir:
Expand Down
135 changes: 126 additions & 9 deletions tests/unit/test_transfer_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -513,6 +513,57 @@ def test_upload_many_from_filenames_additional_properties():
assert getattr(blob, attrib) == value



def test__resolve_path_raises_invalid_path_error_on_windows():
from google.cloud.storage.transfer_manager import _resolve_path, InvalidPathError

with mock.patch("os.name", "nt"):
with pytest.raises(InvalidPathError) as exc_info:
_resolve_path("C:\\target", "C:\\target\\file.txt")
assert "cannot be downloaded into" in str(exc_info.value)
Comment on lines +517 to +523
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The assertion message cannot be downloaded into is too specific and duplicates the implementation detail. It would be better to assert a more general message indicating that the path is invalid, which is less prone to break if the implementation changes.

        with pytest.raises(InvalidPathError) as exc_info:
            _resolve_path("C:\\target", "C:\\target\\file.txt")
        assert "Invalid path" in str(exc_info.value)


# Test that it DOES NOT raise on non-windows
with mock.patch("os.name", "posix"):
# Should not raise
_resolve_path("/target", "C:\\target\\file.txt")


def test_download_many_to_path_raises_invalid_path_error():
bucket = mock.Mock()

BLOBNAMES = ["C:\\target\\file.txt"]
PATH_ROOT = "mypath/"
BLOB_NAME_PREFIX = "myprefix/"
DOWNLOAD_KWARGS = {"accept-encoding": "fake-gzip"}
MAX_WORKERS = 7
DEADLINE = 10
WORKER_TYPE = transfer_manager.THREAD

with mock.patch("os.name", "nt"):
import warnings

with warnings.catch_warnings(record=True) as w:
warnings.simplefilter("always")
results = transfer_manager.download_many_to_path(
bucket,
BLOBNAMES,
destination_directory=PATH_ROOT,
blob_name_prefix=BLOB_NAME_PREFIX,
download_kwargs=DOWNLOAD_KWARGS,
deadline=DEADLINE,
create_directories=False,
raise_exception=True,
max_workers=MAX_WORKERS,
worker_type=WORKER_TYPE,
skip_if_exists=True,
)

assert len(w) == 1
assert "will **NOT** be downloaded" in str(w[0].message)
assert len(results) == 1
assert isinstance(results[0], UserWarning)
Comment thread
chandra-siri marked this conversation as resolved.


def test_download_many_to_path():
bucket = mock.Mock()

Expand All @@ -530,9 +581,10 @@ def test_download_many_to_path():
]

with mock.patch(
"google.cloud.storage.transfer_manager.download_many"
"google.cloud.storage.transfer_manager.download_many",
return_value=[FAKE_RESULT] * len(BLOBNAMES),
) as mock_download_many:
transfer_manager.download_many_to_path(
results = transfer_manager.download_many_to_path(
bucket,
BLOBNAMES,
destination_directory=PATH_ROOT,
Expand All @@ -553,11 +605,71 @@ def test_download_many_to_path():
raise_exception=True,
max_workers=MAX_WORKERS,
worker_type=WORKER_TYPE,
skip_if_exists=True,
skip_if_exists=False,
)
assert results == [FAKE_RESULT] * len(BLOBNAMES)
for blobname in BLOBNAMES:
bucket.blob.assert_any_call(BLOB_NAME_PREFIX + blobname)

def test_download_many_to_path_with_skip_if_exists():
bucket = mock.Mock()

BLOBNAMES = ["file_a.txt", "file_b.txt", "dir_a/file_c.txt"]
PATH_ROOT = "mypath/"
BLOB_NAME_PREFIX = "myprefix/"
DOWNLOAD_KWARGS = {"accept-encoding": "fake-gzip"}
MAX_WORKERS = 7
DEADLINE = 10
WORKER_TYPE = transfer_manager.THREAD

from google.cloud.storage.transfer_manager import _resolve_path

existing_file = str(_resolve_path(PATH_ROOT, "file_a.txt"))

def isfile_side_effect(path):
return path == existing_file

EXPECTED_BLOB_FILE_PAIRS = [
(mock.ANY, str(_resolve_path(PATH_ROOT, "file_b.txt"))),
(mock.ANY, str(_resolve_path(PATH_ROOT, "dir_a/file_c.txt"))),
]

with mock.patch("os.path.isfile", side_effect=isfile_side_effect):
with mock.patch(
"google.cloud.storage.transfer_manager.download_many",
return_value=[FAKE_RESULT, FAKE_RESULT],
) as mock_download_many:
results = transfer_manager.download_many_to_path(
bucket,
BLOBNAMES,
destination_directory=PATH_ROOT,
blob_name_prefix=BLOB_NAME_PREFIX,
download_kwargs=DOWNLOAD_KWARGS,
deadline=DEADLINE,
create_directories=False,
raise_exception=True,
max_workers=MAX_WORKERS,
worker_type=WORKER_TYPE,
skip_if_exists=True,
)

mock_download_many.assert_called_once_with(
EXPECTED_BLOB_FILE_PAIRS,
download_kwargs=DOWNLOAD_KWARGS,
deadline=DEADLINE,
raise_exception=True,
max_workers=MAX_WORKERS,
worker_type=WORKER_TYPE,
skip_if_exists=False,
)

assert len(results) == 3
assert isinstance(results[0], UserWarning)
assert str(results[0]) == "The blob file_a.txt is skipped because destination file already exists"
assert results[1] == FAKE_RESULT
assert results[2] == FAKE_RESULT



@pytest.mark.parametrize(
"blobname",
Expand All @@ -584,9 +696,10 @@ def test_download_many_to_path_skips_download(blobname):
with warnings.catch_warnings(record=True) as w:
warnings.simplefilter("always")
with mock.patch(
"google.cloud.storage.transfer_manager.download_many"
"google.cloud.storage.transfer_manager.download_many",
return_value=[],
) as mock_download_many:
transfer_manager.download_many_to_path(
results = transfer_manager.download_many_to_path(
bucket,
BLOBNAMES,
destination_directory=PATH_ROOT,
Expand Down Expand Up @@ -614,8 +727,10 @@ def test_download_many_to_path_skips_download(blobname):
raise_exception=True,
max_workers=MAX_WORKERS,
worker_type=WORKER_TYPE,
skip_if_exists=True,
skip_if_exists=False,
)
assert len(results) == 1
assert isinstance(results[0], UserWarning)


@pytest.mark.parametrize(
Expand Down Expand Up @@ -649,9 +764,10 @@ def test_download_many_to_path_downloads_within_dest_dir(blobname):
]

with mock.patch(
"google.cloud.storage.transfer_manager.download_many"
"google.cloud.storage.transfer_manager.download_many",
return_value=[FAKE_RESULT],
) as mock_download_many:
transfer_manager.download_many_to_path(
results = transfer_manager.download_many_to_path(
bucket,
BLOBNAMES,
destination_directory=PATH_ROOT,
Expand All @@ -672,8 +788,9 @@ def test_download_many_to_path_downloads_within_dest_dir(blobname):
raise_exception=True,
max_workers=MAX_WORKERS,
worker_type=WORKER_TYPE,
skip_if_exists=True,
skip_if_exists=False,
)
assert results == [FAKE_RESULT]
bucket.blob.assert_any_call(BLOB_NAME_PREFIX + blobname)


Expand Down