diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index b9de25f6b..42ecc8c28 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -50,7 +50,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): Attributes: browser_config (BrowserConfig): Configuration object containing browser settings. logger (AsyncLogger): Logger instance for recording events and errors. - _downloaded_files (List[str]): List of downloaded file paths. + _downloads_by_crawl (Dict[str, List[str]]): Downloaded file paths keyed by crawl invocation ID. hooks (Dict[str, Callable]): Dictionary of hooks for custom behavior. browser_manager (BrowserManager): Manager for browser creation and management. @@ -95,7 +95,7 @@ def __init__( self.adapter = browser_adapter or PlaywrightAdapter() # Initialize session management - self._downloaded_files = [] + self._downloads_by_crawl: Dict[str, List[str]] = {} # Initialize hooks system self.hooks = { @@ -532,8 +532,9 @@ async def _crawl_web( redirected_url = url redirected_status_code = None - # Reset downloaded files list for new crawl - self._downloaded_files = [] + # Scope downloaded files to this crawl invocation + crawl_id = uuid.uuid4().hex + self._downloads_by_crawl[crawl_id] = [] # Initialize capture lists captured_requests = [] @@ -711,12 +712,12 @@ async def handle_request_failed_capture(request): if config.fetch_ssl_certificate: ssl_cert = SSLCertificate.from_url(url) - # Set up download handling + # Set up download handling scoped to this crawl invocation if self.browser_config.accept_downloads: page.on( "download", - lambda download: asyncio.create_task( - self._handle_download(download) + lambda download, _cid=crawl_id: asyncio.create_task( + self._handle_download(download, _cid) ), ) @@ -1155,7 +1156,7 @@ async def get_delayed_content(delay: float = 5.0) -> str: get_delayed_content=get_delayed_content, ssl_certificate=ssl_cert, downloaded_files=( - self._downloaded_files if self._downloaded_files else None + self._downloads_by_crawl.pop(crawl_id, []) or None ), redirected_url=redirected_url, redirected_status_code=redirected_status_code, @@ -1454,23 +1455,13 @@ async def _handle_virtual_scroll(self, page: Page, config: "VirtualScrollConfig" ) # Continue with normal flow even if virtual scroll fails - async def _handle_download(self, download): + async def _handle_download(self, download, crawl_id: str): """ - Handle file downloads. - - How it works: - 1. Get the suggested filename. - 2. Get the download path. - 3. Log the download. - 4. Start the download. - 5. Save the downloaded file. - 6. Log the completion. + Handle file downloads, scoped to a specific crawl invocation. Args: download (Download): The Playwright download object - - Returns: - None + crawl_id (str): Unique ID of the crawl invocation that triggered this download """ try: suggested_filename = download.suggested_filename @@ -1485,7 +1476,18 @@ async def _handle_download(self, download): start_time = time.perf_counter() await download.save_as(download_path) end_time = time.perf_counter() - self._downloaded_files.append(download_path) + + # Append to the correct crawl's list (if it still exists). + # If the crawl already returned and popped its list, the download + # was too late — log it but don't corrupt another crawl's results. + if crawl_id in self._downloads_by_crawl: + self._downloads_by_crawl[crawl_id].append(download_path) + else: + self.logger.warning( + message="Download {filename} completed after crawl finished, not attached to any result", + tag="DOWNLOAD", + params={"filename": suggested_filename}, + ) self.logger.success( message="Downloaded {filename} successfully", diff --git a/tests/test_download_race_condition_1889.py b/tests/test_download_race_condition_1889.py new file mode 100644 index 000000000..03a10f9c2 --- /dev/null +++ b/tests/test_download_race_condition_1889.py @@ -0,0 +1,193 @@ +""" +Tests for #1889: downloaded_files race condition — cross-contamination between CrawlResults. + +Verifies that downloads are scoped per crawl invocation using crawl_id, +and that late-completing downloads don't leak to subsequent results. +""" +import asyncio +import os +import uuid +import pytest +from unittest.mock import AsyncMock, MagicMock, patch + +import sys +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..')) + +from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy +from crawl4ai.async_configs import BrowserConfig + + +class FakeDownload: + """Mock Playwright Download object.""" + def __init__(self, filename, delay=0): + self.suggested_filename = filename + self._delay = delay + self._saved = False + + async def save_as(self, path): + if self._delay: + await asyncio.sleep(self._delay) + self._saved = True + + +@pytest.fixture +def strategy(): + """Create a strategy with downloads enabled.""" + config = BrowserConfig( + accept_downloads=True, + downloads_path="/tmp/test-downloads", + ) + s = AsyncPlaywrightCrawlerStrategy(browser_config=config) + return s + + +class TestDownloadScoping: + """Verify downloads are scoped per crawl invocation.""" + + def test_downloads_by_crawl_initialized_as_dict(self, strategy): + """_downloads_by_crawl should be a dict, not a list.""" + assert isinstance(strategy._downloads_by_crawl, dict) + assert len(strategy._downloads_by_crawl) == 0 + + @pytest.mark.asyncio + async def test_handle_download_appends_to_correct_crawl(self, strategy): + """Download should go to the crawl_id it was associated with.""" + crawl_id = uuid.uuid4().hex + strategy._downloads_by_crawl[crawl_id] = [] + + download = FakeDownload("test.pdf") + await strategy._handle_download(download, crawl_id) + + assert len(strategy._downloads_by_crawl[crawl_id]) == 1 + assert strategy._downloads_by_crawl[crawl_id][0].endswith("test.pdf") + + @pytest.mark.asyncio + async def test_two_crawls_isolated(self, strategy): + """Downloads from crawl A must not appear in crawl B.""" + crawl_a = uuid.uuid4().hex + crawl_b = uuid.uuid4().hex + strategy._downloads_by_crawl[crawl_a] = [] + strategy._downloads_by_crawl[crawl_b] = [] + + await strategy._handle_download(FakeDownload("file_a.pdf"), crawl_a) + await strategy._handle_download(FakeDownload("file_b.xlsx"), crawl_b) + + assert len(strategy._downloads_by_crawl[crawl_a]) == 1 + assert "file_a.pdf" in strategy._downloads_by_crawl[crawl_a][0] + assert len(strategy._downloads_by_crawl[crawl_b]) == 1 + assert "file_b.xlsx" in strategy._downloads_by_crawl[crawl_b][0] + + @pytest.mark.asyncio + async def test_late_download_does_not_contaminate(self, strategy): + """If crawl A finishes (pops its list), a late download for A should not go to B.""" + crawl_a = uuid.uuid4().hex + crawl_b = uuid.uuid4().hex + strategy._downloads_by_crawl[crawl_a] = [] + strategy._downloads_by_crawl[crawl_b] = [] + + # Simulate crawl A finishing — pop its list + result_a = strategy._downloads_by_crawl.pop(crawl_a, []) + assert result_a == [] + + # Late download for crawl A completes + await strategy._handle_download(FakeDownload("late_file.pdf"), crawl_a) + + # Crawl B's list must be untouched + assert len(strategy._downloads_by_crawl[crawl_b]) == 0 + + @pytest.mark.asyncio + async def test_late_download_logged_not_lost(self, strategy): + """Late download should still be saved to disk, just not attached to results.""" + crawl_id = uuid.uuid4().hex + # Don't create the crawl entry — simulate it already popped + download = FakeDownload("orphan.pdf") + await strategy._handle_download(download, crawl_id) + + # File was still saved (download.save_as was called) + assert download._saved + # But no crawl list was contaminated + assert crawl_id not in strategy._downloads_by_crawl + + @pytest.mark.asyncio + async def test_multiple_downloads_same_crawl(self, strategy): + """Multiple downloads for the same crawl should all be captured.""" + crawl_id = uuid.uuid4().hex + strategy._downloads_by_crawl[crawl_id] = [] + + for i in range(5): + await strategy._handle_download(FakeDownload(f"doc_{i}.pdf"), crawl_id) + + assert len(strategy._downloads_by_crawl[crawl_id]) == 5 + + @pytest.mark.asyncio + async def test_pop_returns_files_and_cleans_up(self, strategy): + """Popping the crawl_id list should return files and remove the entry.""" + crawl_id = uuid.uuid4().hex + strategy._downloads_by_crawl[crawl_id] = [] + + await strategy._handle_download(FakeDownload("report.pdf"), crawl_id) + + files = strategy._downloads_by_crawl.pop(crawl_id, []) + assert len(files) == 1 + assert "report.pdf" in files[0] + assert crawl_id not in strategy._downloads_by_crawl + + +class TestConcurrentRace: + """Simulate the actual race condition described in #1889.""" + + @pytest.mark.asyncio + async def test_concurrent_downloads_no_crosscontamination(self, strategy): + """ + Simulate: crawl A starts slow download, crawl B starts and finishes, + then A's download completes. B must not get A's file. + """ + crawl_a = uuid.uuid4().hex + crawl_b = uuid.uuid4().hex + strategy._downloads_by_crawl[crawl_a] = [] + + # Start slow download for A (won't complete immediately) + slow_download = FakeDownload("big_file.xlsx", delay=0.2) + task_a = asyncio.create_task( + strategy._handle_download(slow_download, crawl_a) + ) + + # Crawl B starts and finishes before A's download completes + strategy._downloads_by_crawl[crawl_b] = [] + await strategy._handle_download(FakeDownload("page_b.html"), crawl_b) + result_b = strategy._downloads_by_crawl.pop(crawl_b, []) + + # Wait for A's download to finish + await task_a + result_a = strategy._downloads_by_crawl.pop(crawl_a, []) + + # A has its file, B has its file — no cross-contamination + assert len(result_a) == 1 + assert "big_file.xlsx" in result_a[0] + assert len(result_b) == 1 + assert "page_b.html" in result_b[0] + + @pytest.mark.asyncio + async def test_late_download_after_pop_no_crash(self, strategy): + """ + Simulate: crawl A starts download, crawl A finishes (pops), + then download completes. Should not crash or contaminate. + """ + crawl_a = uuid.uuid4().hex + strategy._downloads_by_crawl[crawl_a] = [] + + # Start slow download + slow_download = FakeDownload("late.pdf", delay=0.1) + task = asyncio.create_task( + strategy._handle_download(slow_download, crawl_a) + ) + + # Crawl A finishes before download completes + result_a = strategy._downloads_by_crawl.pop(crawl_a, []) + assert result_a == [] # download hadn't completed yet + + # Download finishes — should not crash + await task + + # Dict is clean + assert crawl_a not in strategy._downloads_by_crawl