Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 25 additions & 8 deletions google/cloud/storage/transfer_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -805,43 +805,60 @@ def download_many_to_path(

:raises: :exc:`concurrent.futures.TimeoutError` if deadline is exceeded.

:rtype: list
:rtype: List[None|Exception|UserWarning]
:returns: A list of results corresponding to, in order, each item in the
input list. If an exception was received, it will be the result
for that operation. Otherwise, the return value from the successful
download method is used (which will be None).
input list. If an exception was received or a download was skipped
(e.g., due to existing file or path traversal), it will be the result
for that operation (as an Exception or UserWarning, respectively).
Otherwise, the result will be None for a successful download.
"""
results = [None] * len(blob_names)
blob_file_pairs = []
indices_to_process = []

for blob_name in blob_names:
for i, blob_name in enumerate(blob_names):
full_blob_name = blob_name_prefix + blob_name
resolved_path = _resolve_path(destination_directory, blob_name)
if not resolved_path.parent.is_relative_to(
Path(destination_directory).resolve()
):
warnings.warn(
msg = (
f"The blob {blob_name} will **NOT** be downloaded. "
f"The resolved destination_directory - {resolved_path.parent} - is either invalid or "
f"escapes user provided {Path(destination_directory).resolve()} . Please download this file separately using `download_to_filename`"
)
warnings.warn(msg)
results[i] = UserWarning(msg)
continue

resolved_path = str(resolved_path)
if skip_if_exists and os.path.isfile(resolved_path):
msg = f"The blob {blob_name} is skipped because destination file already exists"
results[i] = UserWarning(msg)
continue

if create_directories:
directory, _ = os.path.split(resolved_path)
os.makedirs(directory, exist_ok=True)
blob_file_pairs.append((bucket.blob(full_blob_name), resolved_path))
indices_to_process.append(i)

return download_many(
many_results = download_many(
blob_file_pairs,
download_kwargs=download_kwargs,
deadline=deadline,
raise_exception=raise_exception,
worker_type=worker_type,
max_workers=max_workers,
skip_if_exists=skip_if_exists,
skip_if_exists=False, # skip_if_exists is handled in the loop above
)

for meta_index, result in zip(indices_to_process, many_results):
results[meta_index] = result

return results



def download_chunks_concurrently(
blob,
Expand Down
86 changes: 84 additions & 2 deletions tests/system/test_transfer_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,8 +187,9 @@ def test_download_many_to_path_skips_download(
[str(warning.message) for warning in w]
)

# 1 total - 1 skipped = 0 results
assert len(results) == 0
# 1 total - 1 skipped = 1 result (containing Warning)
assert len(results) == 1
assert isinstance(results[0], UserWarning)


@pytest.mark.parametrize(
Expand Down Expand Up @@ -266,6 +267,87 @@ def test_download_many_to_path_downloads_within_dest_dir(
assert downloaded_contents == source_contents



def test_download_many_to_path_mixed_results(
    shared_bucket, file_data, blobs_to_delete
):
    """
    Test download_many_to_path with successful downloads, skip_if_exists
    skips, and path traversal skips.

    Verifies that the returned list is positional: None for each successful
    download, a UserWarning for a skip_if_exists skip, and a UserWarning for
    a path-traversal skip.
    """
    # Hoisted from mid-function; stdlib imports belong at the top of the
    # smallest scope we can edit here. NOTE(review): prefer a module-level
    # import if this test file's header is available.
    import warnings

    PREFIX = "mixed_results/"
    BLOBNAMES = [
        "success1.txt",
        "success2.txt",
        "exists.txt",
        "../escape.txt",
    ]

    FILE_BLOB_PAIRS = [
        (
            file_data["simple"]["path"],
            shared_bucket.blob(PREFIX + name),
        )
        for name in BLOBNAMES
    ]

    # Seed the bucket with all four blobs; every upload must succeed.
    results = transfer_manager.upload_many(
        FILE_BLOB_PAIRS,
        skip_if_exists=True,
        deadline=DEADLINE,
    )
    for result in results:
        assert result is None

    blobs = list(shared_bucket.list_blobs(prefix=PREFIX))
    blobs_to_delete.extend(blobs)
    assert len(blobs) == 4

    # Actual Test
    with tempfile.TemporaryDirectory() as tempdir:
        # Pre-create "exists.txt" so skip_if_exists has something to skip.
        existing_file_path = os.path.join(tempdir, "exists.txt")
        with open(existing_file_path, "w") as f:
            f.write("already here")

        with warnings.catch_warnings(record=True) as w:
            warnings.simplefilter("always")
            results = transfer_manager.download_many_to_path(
                shared_bucket,
                BLOBNAMES,
                destination_directory=tempdir,
                blob_name_prefix=PREFIX,
                deadline=DEADLINE,
                create_directories=True,
                skip_if_exists=True,
            )

        # One result per input blob name, in order.
        assert len(results) == 4

        # Exactly one warning must be the path-traversal refusal.
        path_traversal_warnings = [
            warning
            for warning in w
            if str(warning.message).startswith("The blob ")
            and "will **NOT** be downloaded. The resolved destination_directory"
            in str(warning.message)
        ]
        assert len(path_traversal_warnings) == 1, "---".join(
            [str(warning.message) for warning in w]
        )

        # Positional results: successes are None, skips carry UserWarnings.
        assert results[0] is None
        assert results[1] is None
        assert isinstance(results[2], UserWarning)
        assert "skipped because destination file already exists" in str(results[2])
        assert isinstance(results[3], UserWarning)
        assert "will **NOT** be downloaded" in str(results[3])

        assert os.path.exists(os.path.join(tempdir, "success1.txt"))
        assert os.path.exists(os.path.join(tempdir, "success2.txt"))

        # The skipped destination file must be left untouched.
        with open(existing_file_path, "r") as f:
            assert f.read() == "already here"


def test_download_many(listable_bucket):
blobs = list(listable_bucket.list_blobs())
with tempfile.TemporaryDirectory() as tempdir:
Expand Down
84 changes: 75 additions & 9 deletions tests/unit/test_transfer_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -530,9 +530,10 @@ def test_download_many_to_path():
]

with mock.patch(
"google.cloud.storage.transfer_manager.download_many"
"google.cloud.storage.transfer_manager.download_many",
return_value=[FAKE_RESULT] * len(BLOBNAMES),
) as mock_download_many:
transfer_manager.download_many_to_path(
results = transfer_manager.download_many_to_path(
bucket,
BLOBNAMES,
destination_directory=PATH_ROOT,
Expand All @@ -553,11 +554,71 @@ def test_download_many_to_path():
raise_exception=True,
max_workers=MAX_WORKERS,
worker_type=WORKER_TYPE,
skip_if_exists=True,
skip_if_exists=False,
)
assert results == [FAKE_RESULT] * len(BLOBNAMES)
for blobname in BLOBNAMES:
bucket.blob.assert_any_call(BLOB_NAME_PREFIX + blobname)

def test_download_many_to_path_with_skip_if_exists():
    """skip_if_exists must drop blobs whose destination file already exists,
    place a UserWarning in that slot of the results, and forward only the
    remaining pairs to download_many with skip_if_exists disabled."""
    from google.cloud.storage.transfer_manager import _resolve_path

    bucket = mock.Mock()

    BLOBNAMES = ["file_a.txt", "file_b.txt", "dir_a/file_c.txt"]
    PATH_ROOT = "mypath/"
    BLOB_NAME_PREFIX = "myprefix/"
    DOWNLOAD_KWARGS = {"accept-encoding": "fake-gzip"}
    MAX_WORKERS = 7
    DEADLINE = 10
    WORKER_TYPE = transfer_manager.THREAD

    # Only file_a.txt is reported as already present on disk.
    already_present = {str(_resolve_path(PATH_ROOT, "file_a.txt"))}

    expected_pairs = [
        (mock.ANY, str(_resolve_path(PATH_ROOT, name)))
        for name in ("file_b.txt", "dir_a/file_c.txt")
    ]

    with mock.patch(
        "os.path.isfile", side_effect=lambda path: path in already_present
    ), mock.patch(
        "google.cloud.storage.transfer_manager.download_many",
        return_value=[FAKE_RESULT, FAKE_RESULT],
    ) as mock_download_many:
        results = transfer_manager.download_many_to_path(
            bucket,
            BLOBNAMES,
            destination_directory=PATH_ROOT,
            blob_name_prefix=BLOB_NAME_PREFIX,
            download_kwargs=DOWNLOAD_KWARGS,
            deadline=DEADLINE,
            create_directories=False,
            raise_exception=True,
            max_workers=MAX_WORKERS,
            worker_type=WORKER_TYPE,
            skip_if_exists=True,
        )

    # The existing file was filtered out before the delegation call.
    mock_download_many.assert_called_once_with(
        expected_pairs,
        download_kwargs=DOWNLOAD_KWARGS,
        deadline=DEADLINE,
        raise_exception=True,
        max_workers=MAX_WORKERS,
        worker_type=WORKER_TYPE,
        skip_if_exists=False,
    )

    # Results remain positional: warning for the skip, then the fakes.
    assert len(results) == 3
    assert isinstance(results[0], UserWarning)
    assert (
        str(results[0])
        == "The blob file_a.txt is skipped because destination file already exists"
    )
    assert results[1] == FAKE_RESULT
    assert results[2] == FAKE_RESULT



@pytest.mark.parametrize(
"blobname",
Expand All @@ -584,9 +645,10 @@ def test_download_many_to_path_skips_download(blobname):
with warnings.catch_warnings(record=True) as w:
warnings.simplefilter("always")
with mock.patch(
"google.cloud.storage.transfer_manager.download_many"
"google.cloud.storage.transfer_manager.download_many",
return_value=[],
) as mock_download_many:
transfer_manager.download_many_to_path(
results = transfer_manager.download_many_to_path(
bucket,
BLOBNAMES,
destination_directory=PATH_ROOT,
Expand Down Expand Up @@ -614,8 +676,10 @@ def test_download_many_to_path_skips_download(blobname):
raise_exception=True,
max_workers=MAX_WORKERS,
worker_type=WORKER_TYPE,
skip_if_exists=True,
skip_if_exists=False,
)
assert len(results) == 1
assert isinstance(results[0], UserWarning)


@pytest.mark.parametrize(
Expand Down Expand Up @@ -649,9 +713,10 @@ def test_download_many_to_path_downloads_within_dest_dir(blobname):
]

with mock.patch(
"google.cloud.storage.transfer_manager.download_many"
"google.cloud.storage.transfer_manager.download_many",
return_value=[FAKE_RESULT],
) as mock_download_many:
transfer_manager.download_many_to_path(
results = transfer_manager.download_many_to_path(
bucket,
BLOBNAMES,
destination_directory=PATH_ROOT,
Expand All @@ -672,8 +737,9 @@ def test_download_many_to_path_downloads_within_dest_dir(blobname):
raise_exception=True,
max_workers=MAX_WORKERS,
worker_type=WORKER_TYPE,
skip_if_exists=True,
skip_if_exists=False,
)
assert results == [FAKE_RESULT]
bucket.blob.assert_any_call(BLOB_NAME_PREFIX + blobname)


Expand Down
Loading