diff --git a/google/cloud/storage/transfer_manager.py b/google/cloud/storage/transfer_manager.py
index c655244b0..b72c9ab24 100644
--- a/google/cloud/storage/transfer_manager.py
+++ b/google/cloud/storage/transfer_manager.py
@@ -805,43 +805,60 @@ def download_many_to_path(
     :raises: :exc:`concurrent.futures.TimeoutError` if deadline is exceeded.
 
-    :rtype: list
+    :rtype: List[None|Exception|UserWarning]
     :returns: A list of results corresponding to, in order, each item in the
-        input list. If an exception was received, it will be the result
-        for that operation. Otherwise, the return value from the successful
-        download method is used (which will be None).
+        input list. If an exception was received or a download was skipped
+        (e.g., due to existing file or path traversal), it will be the result
+        for that operation (as an Exception or UserWarning, respectively).
+        Otherwise, the result will be None for a successful download.
     """
+    results = [None] * len(blob_names)
     blob_file_pairs = []
+    indices_to_process = []
 
-    for blob_name in blob_names:
+    for i, blob_name in enumerate(blob_names):
         full_blob_name = blob_name_prefix + blob_name
         resolved_path = _resolve_path(destination_directory, blob_name)
         if not resolved_path.parent.is_relative_to(
             Path(destination_directory).resolve()
         ):
-            warnings.warn(
+            msg = (
                 f"The blob {blob_name} will **NOT** be downloaded. "
                 f"The resolved destination_directory - {resolved_path.parent} - is either invalid or "
                 f"escapes user provided {Path(destination_directory).resolve()} . Please download this file separately using `download_to_filename`"
             )
+            warnings.warn(msg)
+            results[i] = UserWarning(msg)
             continue
         resolved_path = str(resolved_path)
 
+        if skip_if_exists and os.path.isfile(resolved_path):
+            msg = f"The blob {blob_name} is skipped because destination file already exists"
+            results[i] = UserWarning(msg)
+            continue
+
         if create_directories:
             directory, _ = os.path.split(resolved_path)
             os.makedirs(directory, exist_ok=True)
 
         blob_file_pairs.append((bucket.blob(full_blob_name), resolved_path))
+        indices_to_process.append(i)
 
-    return download_many(
+    many_results = download_many(
         blob_file_pairs,
         download_kwargs=download_kwargs,
         deadline=deadline,
         raise_exception=raise_exception,
         worker_type=worker_type,
         max_workers=max_workers,
-        skip_if_exists=skip_if_exists,
+        skip_if_exists=False,  # skip_if_exists is handled in the loop above
     )
+    for meta_index, result in zip(indices_to_process, many_results):
+        results[meta_index] = result
+
+    return results
+
 
 def download_chunks_concurrently(
     blob,
diff --git a/tests/system/test_transfer_manager.py b/tests/system/test_transfer_manager.py
index 844562c90..6bb0e03fd 100644
--- a/tests/system/test_transfer_manager.py
+++ b/tests/system/test_transfer_manager.py
@@ -187,8 +187,9 @@ def test_download_many_to_path_skips_download(
         [str(warning.message) for warning in w]
     )
 
-    # 1 total - 1 skipped = 0 results
-    assert len(results) == 0
+    # 1 total - 1 skipped = 1 result (containing Warning)
+    assert len(results) == 1
+    assert isinstance(results[0], UserWarning)
 
 
 @pytest.mark.parametrize(
@@ -266,6 +267,87 @@ def test_download_many_to_path_downloads_within_dest_dir(
         assert downloaded_contents == source_contents
 
 
+def test_download_many_to_path_mixed_results(
+    shared_bucket, file_data, blobs_to_delete
+):
+    """
+    Test download_many_to_path with successful downloads, skip_if_exists skips, and path traversal skips.
+    """
+    PREFIX = "mixed_results/"
+    BLOBNAMES = [
+        "success1.txt",
+        "success2.txt",
+        "exists.txt",
+        "../escape.txt"
+    ]
+
+    FILE_BLOB_PAIRS = [
+        (
+            file_data["simple"]["path"],
+            shared_bucket.blob(PREFIX + name),
+        )
+        for name in BLOBNAMES
+    ]
+
+    results = transfer_manager.upload_many(
+        FILE_BLOB_PAIRS,
+        skip_if_exists=True,
+        deadline=DEADLINE,
+    )
+    for result in results:
+        assert result is None
+
+    blobs = list(shared_bucket.list_blobs(prefix=PREFIX))
+    blobs_to_delete.extend(blobs)
+    assert len(blobs) == 4
+
+    # Actual Test
+    with tempfile.TemporaryDirectory() as tempdir:
+        existing_file_path = os.path.join(tempdir, "exists.txt")
+        with open(existing_file_path, "w") as f:
+            f.write("already here")
+
+        import warnings
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter("always")
+            results = transfer_manager.download_many_to_path(
+                shared_bucket,
+                BLOBNAMES,
+                destination_directory=tempdir,
+                blob_name_prefix=PREFIX,
+                deadline=DEADLINE,
+                create_directories=True,
+                skip_if_exists=True,
+            )
+
+        assert len(results) == 4
+
+        path_traversal_warnings = [
+            warning
+            for warning in w
+            if str(warning.message).startswith("The blob ")
+            and "will **NOT** be downloaded. The resolved destination_directory"
+            in str(warning.message)
+        ]
+        assert len(path_traversal_warnings) == 1, "---".join(
+            [str(warning.message) for warning in w]
+        )
+
+        assert results[0] is None
+        assert results[1] is None
+        assert isinstance(results[2], UserWarning)
+        assert "skipped because destination file already exists" in str(results[2])
+        assert isinstance(results[3], UserWarning)
+        assert "will **NOT** be downloaded" in str(results[3])
+
+        assert os.path.exists(os.path.join(tempdir, "success1.txt"))
+        assert os.path.exists(os.path.join(tempdir, "success2.txt"))
+
+        with open(existing_file_path, "r") as f:
+            assert f.read() == "already here"
+
+
 def test_download_many(listable_bucket):
     blobs = list(listable_bucket.list_blobs())
     with tempfile.TemporaryDirectory() as tempdir:
diff --git a/tests/unit/test_transfer_manager.py b/tests/unit/test_transfer_manager.py
index 85ffd9eaa..799bfd314 100644
--- a/tests/unit/test_transfer_manager.py
+++ b/tests/unit/test_transfer_manager.py
@@ -530,9 +530,10 @@ def test_download_many_to_path():
     ]
 
     with mock.patch(
-        "google.cloud.storage.transfer_manager.download_many"
+        "google.cloud.storage.transfer_manager.download_many",
+        return_value=[FAKE_RESULT] * len(BLOBNAMES),
     ) as mock_download_many:
-        transfer_manager.download_many_to_path(
+        results = transfer_manager.download_many_to_path(
             bucket,
             BLOBNAMES,
             destination_directory=PATH_ROOT,
@@ -553,11 +554,71 @@ def test_download_many_to_path():
         raise_exception=True,
         max_workers=MAX_WORKERS,
         worker_type=WORKER_TYPE,
-        skip_if_exists=True,
+        skip_if_exists=False,
     )
+    assert results == [FAKE_RESULT] * len(BLOBNAMES)
     for blobname in BLOBNAMES:
         bucket.blob.assert_any_call(BLOB_NAME_PREFIX + blobname)
 
 
+def test_download_many_to_path_with_skip_if_exists():
+    bucket = mock.Mock()
+
+    BLOBNAMES = ["file_a.txt", "file_b.txt", "dir_a/file_c.txt"]
+    PATH_ROOT = "mypath/"
+    BLOB_NAME_PREFIX = "myprefix/"
+    DOWNLOAD_KWARGS = {"accept-encoding": "fake-gzip"}
+    MAX_WORKERS = 7
+    DEADLINE = 10
+    WORKER_TYPE = transfer_manager.THREAD
+
+    from google.cloud.storage.transfer_manager import _resolve_path
+
+    existing_file = str(_resolve_path(PATH_ROOT, "file_a.txt"))
+
+    def isfile_side_effect(path):
+        return path == existing_file
+
+    EXPECTED_BLOB_FILE_PAIRS = [
+        (mock.ANY, str(_resolve_path(PATH_ROOT, "file_b.txt"))),
+        (mock.ANY, str(_resolve_path(PATH_ROOT, "dir_a/file_c.txt"))),
+    ]
+
+    with mock.patch("os.path.isfile", side_effect=isfile_side_effect):
+        with mock.patch(
+            "google.cloud.storage.transfer_manager.download_many",
+            return_value=[FAKE_RESULT, FAKE_RESULT],
+        ) as mock_download_many:
+            results = transfer_manager.download_many_to_path(
+                bucket,
+                BLOBNAMES,
+                destination_directory=PATH_ROOT,
+                blob_name_prefix=BLOB_NAME_PREFIX,
+                download_kwargs=DOWNLOAD_KWARGS,
+                deadline=DEADLINE,
+                create_directories=False,
+                raise_exception=True,
+                max_workers=MAX_WORKERS,
+                worker_type=WORKER_TYPE,
+                skip_if_exists=True,
+            )
+
+    mock_download_many.assert_called_once_with(
+        EXPECTED_BLOB_FILE_PAIRS,
+        download_kwargs=DOWNLOAD_KWARGS,
+        deadline=DEADLINE,
+        raise_exception=True,
+        max_workers=MAX_WORKERS,
+        worker_type=WORKER_TYPE,
+        skip_if_exists=False,
+    )
+
+    assert len(results) == 3
+    assert isinstance(results[0], UserWarning)
+    assert str(results[0]) == "The blob file_a.txt is skipped because destination file already exists"
+    assert results[1] == FAKE_RESULT
+    assert results[2] == FAKE_RESULT
+
+
 @pytest.mark.parametrize(
     "blobname",
@@ -584,9 +645,10 @@ def test_download_many_to_path_skips_download(blobname):
     with warnings.catch_warnings(record=True) as w:
         warnings.simplefilter("always")
         with mock.patch(
-            "google.cloud.storage.transfer_manager.download_many"
+            "google.cloud.storage.transfer_manager.download_many",
+            return_value=[],
         ) as mock_download_many:
-            transfer_manager.download_many_to_path(
+            results = transfer_manager.download_many_to_path(
                 bucket,
                 BLOBNAMES,
                 destination_directory=PATH_ROOT,
@@ -614,8 +676,10 @@ def test_download_many_to_path_skips_download(blobname):
         raise_exception=True,
         max_workers=MAX_WORKERS,
         worker_type=WORKER_TYPE,
-        skip_if_exists=True,
+        skip_if_exists=False,
     )
+    assert len(results) == 1
+    assert isinstance(results[0], UserWarning)
 
 
 @pytest.mark.parametrize(
@@ -649,9 +713,10 @@ def test_download_many_to_path_downloads_within_dest_dir(blobname):
     ]
 
     with mock.patch(
-        "google.cloud.storage.transfer_manager.download_many"
+        "google.cloud.storage.transfer_manager.download_many",
+        return_value=[FAKE_RESULT],
     ) as mock_download_many:
-        transfer_manager.download_many_to_path(
+        results = transfer_manager.download_many_to_path(
             bucket,
             BLOBNAMES,
             destination_directory=PATH_ROOT,
@@ -672,8 +737,9 @@ def test_download_many_to_path_downloads_within_dest_dir(blobname):
         raise_exception=True,
         max_workers=MAX_WORKERS,
         worker_type=WORKER_TYPE,
-        skip_if_exists=True,
+        skip_if_exists=False,
     )
+    assert results == [FAKE_RESULT]
     bucket.blob.assert_any_call(BLOB_NAME_PREFIX + blobname)