Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 24 additions & 22 deletions crawl4ai/async_crawler_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
Attributes:
browser_config (BrowserConfig): Configuration object containing browser settings.
logger (AsyncLogger): Logger instance for recording events and errors.
_downloaded_files (List[str]): List of downloaded file paths.
_downloads_by_crawl (Dict[str, List[str]]): Downloaded file paths keyed by crawl invocation ID.
hooks (Dict[str, Callable]): Dictionary of hooks for custom behavior.
browser_manager (BrowserManager): Manager for browser creation and management.

Expand Down Expand Up @@ -95,7 +95,7 @@ def __init__(
self.adapter = browser_adapter or PlaywrightAdapter()

# Initialize session management
self._downloaded_files = []
self._downloads_by_crawl: Dict[str, List[str]] = {}

# Initialize hooks system
self.hooks = {
Expand Down Expand Up @@ -532,8 +532,9 @@ async def _crawl_web(
redirected_url = url
redirected_status_code = None

# Reset downloaded files list for new crawl
self._downloaded_files = []
# Scope downloaded files to this crawl invocation
crawl_id = uuid.uuid4().hex
self._downloads_by_crawl[crawl_id] = []

# Initialize capture lists
captured_requests = []
Expand Down Expand Up @@ -711,12 +712,12 @@ async def handle_request_failed_capture(request):
if config.fetch_ssl_certificate:
ssl_cert = SSLCertificate.from_url(url)

# Set up download handling
# Set up download handling scoped to this crawl invocation
if self.browser_config.accept_downloads:
page.on(
"download",
lambda download: asyncio.create_task(
self._handle_download(download)
lambda download, _cid=crawl_id: asyncio.create_task(
self._handle_download(download, _cid)
),
)

Expand Down Expand Up @@ -1155,7 +1156,7 @@ async def get_delayed_content(delay: float = 5.0) -> str:
get_delayed_content=get_delayed_content,
ssl_certificate=ssl_cert,
downloaded_files=(
self._downloaded_files if self._downloaded_files else None
self._downloads_by_crawl.pop(crawl_id, []) or None
),
redirected_url=redirected_url,
redirected_status_code=redirected_status_code,
Expand Down Expand Up @@ -1454,23 +1455,13 @@ async def _handle_virtual_scroll(self, page: Page, config: "VirtualScrollConfig"
)
# Continue with normal flow even if virtual scroll fails

async def _handle_download(self, download):
async def _handle_download(self, download, crawl_id: str):
"""
Handle file downloads.

How it works:
1. Get the suggested filename.
2. Get the download path.
3. Log the download.
4. Start the download.
5. Save the downloaded file.
6. Log the completion.
Handle file downloads, scoped to a specific crawl invocation.

Args:
download (Download): The Playwright download object

Returns:
None
crawl_id (str): Unique ID of the crawl invocation that triggered this download
"""
try:
suggested_filename = download.suggested_filename
Expand All @@ -1485,7 +1476,18 @@ async def _handle_download(self, download):
start_time = time.perf_counter()
await download.save_as(download_path)
end_time = time.perf_counter()
self._downloaded_files.append(download_path)

# Append to the correct crawl's list (if it still exists).
# If the crawl already returned and popped its list, the download
# was too late — log it but don't corrupt another crawl's results.
if crawl_id in self._downloads_by_crawl:
self._downloads_by_crawl[crawl_id].append(download_path)
else:
self.logger.warning(
message="Download {filename} completed after crawl finished, not attached to any result",
tag="DOWNLOAD",
params={"filename": suggested_filename},
)

self.logger.success(
message="Downloaded {filename} successfully",
Expand Down
193 changes: 193 additions & 0 deletions tests/test_download_race_condition_1889.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
"""
Tests for #1889: downloaded_files race condition — cross-contamination between CrawlResults.

Verifies that downloads are scoped per crawl invocation using crawl_id,
and that late-completing downloads don't leak to subsequent results.
"""
import asyncio
import os
import uuid
import pytest
from unittest.mock import AsyncMock, MagicMock, patch

import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))

from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
from crawl4ai.async_configs import BrowserConfig


class FakeDownload:
"""Mock Playwright Download object."""
def __init__(self, filename, delay=0):
self.suggested_filename = filename
self._delay = delay
self._saved = False

async def save_as(self, path):
if self._delay:
await asyncio.sleep(self._delay)
self._saved = True


@pytest.fixture
def strategy():
"""Create a strategy with downloads enabled."""
config = BrowserConfig(
accept_downloads=True,
downloads_path="/tmp/test-downloads",
)
s = AsyncPlaywrightCrawlerStrategy(browser_config=config)
return s


class TestDownloadScoping:
"""Verify downloads are scoped per crawl invocation."""

def test_downloads_by_crawl_initialized_as_dict(self, strategy):
"""_downloads_by_crawl should be a dict, not a list."""
assert isinstance(strategy._downloads_by_crawl, dict)
assert len(strategy._downloads_by_crawl) == 0

@pytest.mark.asyncio
async def test_handle_download_appends_to_correct_crawl(self, strategy):
"""Download should go to the crawl_id it was associated with."""
crawl_id = uuid.uuid4().hex
strategy._downloads_by_crawl[crawl_id] = []

download = FakeDownload("test.pdf")
await strategy._handle_download(download, crawl_id)

assert len(strategy._downloads_by_crawl[crawl_id]) == 1
assert strategy._downloads_by_crawl[crawl_id][0].endswith("test.pdf")

@pytest.mark.asyncio
async def test_two_crawls_isolated(self, strategy):
"""Downloads from crawl A must not appear in crawl B."""
crawl_a = uuid.uuid4().hex
crawl_b = uuid.uuid4().hex
strategy._downloads_by_crawl[crawl_a] = []
strategy._downloads_by_crawl[crawl_b] = []

await strategy._handle_download(FakeDownload("file_a.pdf"), crawl_a)
await strategy._handle_download(FakeDownload("file_b.xlsx"), crawl_b)

assert len(strategy._downloads_by_crawl[crawl_a]) == 1
assert "file_a.pdf" in strategy._downloads_by_crawl[crawl_a][0]
assert len(strategy._downloads_by_crawl[crawl_b]) == 1
assert "file_b.xlsx" in strategy._downloads_by_crawl[crawl_b][0]

@pytest.mark.asyncio
async def test_late_download_does_not_contaminate(self, strategy):
"""If crawl A finishes (pops its list), a late download for A should not go to B."""
crawl_a = uuid.uuid4().hex
crawl_b = uuid.uuid4().hex
strategy._downloads_by_crawl[crawl_a] = []
strategy._downloads_by_crawl[crawl_b] = []

# Simulate crawl A finishing — pop its list
result_a = strategy._downloads_by_crawl.pop(crawl_a, [])
assert result_a == []

# Late download for crawl A completes
await strategy._handle_download(FakeDownload("late_file.pdf"), crawl_a)

# Crawl B's list must be untouched
assert len(strategy._downloads_by_crawl[crawl_b]) == 0

@pytest.mark.asyncio
async def test_late_download_logged_not_lost(self, strategy):
"""Late download should still be saved to disk, just not attached to results."""
crawl_id = uuid.uuid4().hex
# Don't create the crawl entry — simulate it already popped
download = FakeDownload("orphan.pdf")
await strategy._handle_download(download, crawl_id)

# File was still saved (download.save_as was called)
assert download._saved
# But no crawl list was contaminated
assert crawl_id not in strategy._downloads_by_crawl

@pytest.mark.asyncio
async def test_multiple_downloads_same_crawl(self, strategy):
"""Multiple downloads for the same crawl should all be captured."""
crawl_id = uuid.uuid4().hex
strategy._downloads_by_crawl[crawl_id] = []

for i in range(5):
await strategy._handle_download(FakeDownload(f"doc_{i}.pdf"), crawl_id)

assert len(strategy._downloads_by_crawl[crawl_id]) == 5

@pytest.mark.asyncio
async def test_pop_returns_files_and_cleans_up(self, strategy):
"""Popping the crawl_id list should return files and remove the entry."""
crawl_id = uuid.uuid4().hex
strategy._downloads_by_crawl[crawl_id] = []

await strategy._handle_download(FakeDownload("report.pdf"), crawl_id)

files = strategy._downloads_by_crawl.pop(crawl_id, [])
assert len(files) == 1
assert "report.pdf" in files[0]
assert crawl_id not in strategy._downloads_by_crawl


class TestConcurrentRace:
"""Simulate the actual race condition described in #1889."""

@pytest.mark.asyncio
async def test_concurrent_downloads_no_crosscontamination(self, strategy):
"""
Simulate: crawl A starts slow download, crawl B starts and finishes,
then A's download completes. B must not get A's file.
"""
crawl_a = uuid.uuid4().hex
crawl_b = uuid.uuid4().hex
strategy._downloads_by_crawl[crawl_a] = []

# Start slow download for A (won't complete immediately)
slow_download = FakeDownload("big_file.xlsx", delay=0.2)
task_a = asyncio.create_task(
strategy._handle_download(slow_download, crawl_a)
)

# Crawl B starts and finishes before A's download completes
strategy._downloads_by_crawl[crawl_b] = []
await strategy._handle_download(FakeDownload("page_b.html"), crawl_b)
result_b = strategy._downloads_by_crawl.pop(crawl_b, [])

# Wait for A's download to finish
await task_a
result_a = strategy._downloads_by_crawl.pop(crawl_a, [])

# A has its file, B has its file — no cross-contamination
assert len(result_a) == 1
assert "big_file.xlsx" in result_a[0]
assert len(result_b) == 1
assert "page_b.html" in result_b[0]

@pytest.mark.asyncio
async def test_late_download_after_pop_no_crash(self, strategy):
"""
Simulate: crawl A starts download, crawl A finishes (pops),
then download completes. Should not crash or contaminate.
"""
crawl_a = uuid.uuid4().hex
strategy._downloads_by_crawl[crawl_a] = []

# Start slow download
slow_download = FakeDownload("late.pdf", delay=0.1)
task = asyncio.create_task(
strategy._handle_download(slow_download, crawl_a)
)

# Crawl A finishes before download completes
result_a = strategy._downloads_by_crawl.pop(crawl_a, [])
assert result_a == [] # download hadn't completed yet

# Download finishes — should not crash
await task

# Dict is clean
assert crawl_a not in strategy._downloads_by_crawl