unclecode · hafezparast · Apr 1, 2026
diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py
@@ -50,7 +50,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
     Attributes:
         browser_config (BrowserConfig): Configuration object containing browser settings.
         logger (AsyncLogger): Logger instance for recording events and errors.
-        _downloaded_files (List[str]): List of downloaded file paths.
+        _downloads_by_crawl (Dict[str, List[str]]): Downloaded file paths keyed by crawl invocation ID.
         hooks (Dict[str, Callable]): Dictionary of hooks for custom behavior.
         browser_manager (BrowserManager): Manager for browser creation and management.
 
@@ -95,7 +95,7 @@ def __init__(
         self.adapter = browser_adapter or PlaywrightAdapter()
 
         # Initialize session management
-        self._downloaded_files = []
+        self._downloads_by_crawl: Dict[str, List[str]] = {}
 
         # Initialize hooks system
         self.hooks = {
@@ -532,8 +532,9 @@ async def _crawl_web(
         redirected_url = url
         redirected_status_code = None
 
-        # Reset downloaded files list for new crawl
-        self._downloaded_files = []
+        # Scope downloaded files to this crawl invocation
+        crawl_id = uuid.uuid4().hex
+        self._downloads_by_crawl[crawl_id] = []
 
         # Initialize capture lists
         captured_requests = []
@@ -711,12 +712,12 @@ async def handle_request_failed_capture(request):
             if config.fetch_ssl_certificate:
                 ssl_cert = SSLCertificate.from_url(url)
 
-            # Set up download handling
+            # Set up download handling scoped to this crawl invocation
             if self.browser_config.accept_downloads:
                 page.on(
                     "download",
-                    lambda download: asyncio.create_task(
-                        self._handle_download(download)
+                    lambda download, _cid=crawl_id: asyncio.create_task(
+                        self._handle_download(download, _cid)
                     ),
                 )
 
@@ -1155,7 +1156,7 @@ async def get_delayed_content(delay: float = 5.0) -> str:
                 get_delayed_content=get_delayed_content,
                 ssl_certificate=ssl_cert,
                 downloaded_files=(
-                    self._downloaded_files if self._downloaded_files else None
+                    self._downloads_by_crawl.pop(crawl_id, []) or None
                 ),
                 redirected_url=redirected_url,
                 redirected_status_code=redirected_status_code,
@@ -1454,23 +1455,13 @@ async def _handle_virtual_scroll(self, page: Page, config: "VirtualScrollConfig"
             )
             # Continue with normal flow even if virtual scroll fails
 
-    async def _handle_download(self, download):
+    async def _handle_download(self, download, crawl_id: str):
         """
-        Handle file downloads.
-
-        How it works:
-        1. Get the suggested filename.
-        2. Get the download path.
-        3. Log the download.
-        4. Start the download.
-        5. Save the downloaded file.
-        6. Log the completion.
+        Handle file downloads, scoped to a specific crawl invocation.
 
         Args:
             download (Download): The Playwright download object
-
-        Returns:
-            None
+            crawl_id (str): Unique ID of the crawl invocation that triggered this download
         """
         try:
             suggested_filename = download.suggested_filename
@@ -1485,7 +1476,18 @@ async def _handle_download(self, download):
             start_time = time.perf_counter()
             await download.save_as(download_path)
             end_time = time.perf_counter()
-            self._downloaded_files.append(download_path)
+
+            # Append to the correct crawl's list (if it still exists).
+            # If the crawl already returned and popped its list, the download
+            # was too late — log it but don't corrupt another crawl's results.
+            if crawl_id in self._downloads_by_crawl:
+                self._downloads_by_crawl[crawl_id].append(download_path)
+            else:
+                self.logger.warning(
+                    message="Download {filename} completed after crawl finished, not attached to any result",
+                    tag="DOWNLOAD",
+                    params={"filename": suggested_filename},
+                )
 
             self.logger.success(
                 message="Downloaded {filename} successfully",

diff --git a/tests/test_download_race_condition_1889.py b/tests/test_download_race_condition_1889.py
@@ -0,0 +1,193 @@
+"""
+Tests for #1889: downloaded_files race condition — cross-contamination between CrawlResults.
+
+Verifies that downloads are scoped per crawl invocation using crawl_id,
+and that late-completing downloads don't leak to subsequent results.
+"""
+import asyncio
+import os
+import uuid
+import pytest
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import sys
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
+from crawl4ai.async_configs import BrowserConfig
+
+
+class FakeDownload:
+    """Mock Playwright Download object."""
+    def __init__(self, filename, delay=0):
+        self.suggested_filename = filename
+        self._delay = delay
+        self._saved = False
+
+    async def save_as(self, path):
+        if self._delay:
+            await asyncio.sleep(self._delay)
+        self._saved = True
+
+
+@pytest.fixture
+def strategy():
+    """Create a strategy with downloads enabled."""
+    config = BrowserConfig(
+        accept_downloads=True,
+        downloads_path="/tmp/test-downloads",
+    )
+    s = AsyncPlaywrightCrawlerStrategy(browser_config=config)
+    return s
+
+
+class TestDownloadScoping:
+    """Verify downloads are scoped per crawl invocation."""
+
+    def test_downloads_by_crawl_initialized_as_dict(self, strategy):
+        """_downloads_by_crawl should be a dict, not a list."""
+        assert isinstance(strategy._downloads_by_crawl, dict)
+        assert len(strategy._downloads_by_crawl) == 0
+
+    @pytest.mark.asyncio
+    async def test_handle_download_appends_to_correct_crawl(self, strategy):
+        """Download should go to the crawl_id it was associated with."""
+        crawl_id = uuid.uuid4().hex
+        strategy._downloads_by_crawl[crawl_id] = []
+
+        download = FakeDownload("test.pdf")
+        await strategy._handle_download(download, crawl_id)
+
+        assert len(strategy._downloads_by_crawl[crawl_id]) == 1
+        assert strategy._downloads_by_crawl[crawl_id][0].endswith("test.pdf")
+
+    @pytest.mark.asyncio
+    async def test_two_crawls_isolated(self, strategy):
+        """Downloads from crawl A must not appear in crawl B."""
+        crawl_a = uuid.uuid4().hex
+        crawl_b = uuid.uuid4().hex
+        strategy._downloads_by_crawl[crawl_a] = []
+        strategy._downloads_by_crawl[crawl_b] = []
+
+        await strategy._handle_download(FakeDownload("file_a.pdf"), crawl_a)
+        await strategy._handle_download(FakeDownload("file_b.xlsx"), crawl_b)
+
+        assert len(strategy._downloads_by_crawl[crawl_a]) == 1
+        assert "file_a.pdf" in strategy._downloads_by_crawl[crawl_a][0]
+        assert len(strategy._downloads_by_crawl[crawl_b]) == 1
+        assert "file_b.xlsx" in strategy._downloads_by_crawl[crawl_b][0]
+
+    @pytest.mark.asyncio
+    async def test_late_download_does_not_contaminate(self, strategy):
+        """If crawl A finishes (pops its list), a late download for A should not go to B."""
+        crawl_a = uuid.uuid4().hex
+        crawl_b = uuid.uuid4().hex
+        strategy._downloads_by_crawl[crawl_a] = []
+        strategy._downloads_by_crawl[crawl_b] = []
+
+        # Simulate crawl A finishing — pop its list
+        result_a = strategy._downloads_by_crawl.pop(crawl_a, [])
+        assert result_a == []
+
+        # Late download for crawl A completes
+        await strategy._handle_download(FakeDownload("late_file.pdf"), crawl_a)
+
+        # Crawl B's list must be untouched
+        assert len(strategy._downloads_by_crawl[crawl_b]) == 0
+
+    @pytest.mark.asyncio
+    async def test_late_download_logged_not_lost(self, strategy):
+        """Late download should still be saved to disk, just not attached to results."""
+        crawl_id = uuid.uuid4().hex
+        # Don't create the crawl entry — simulate it already popped
+        download = FakeDownload("orphan.pdf")
+        await strategy._handle_download(download, crawl_id)
+
+        # File was still saved (download.save_as was called)
+        assert download._saved
+        # But no crawl list was contaminated
+        assert crawl_id not in strategy._downloads_by_crawl
+
+    @pytest.mark.asyncio
+    async def test_multiple_downloads_same_crawl(self, strategy):
+        """Multiple downloads for the same crawl should all be captured."""
+        crawl_id = uuid.uuid4().hex
+        strategy._downloads_by_crawl[crawl_id] = []
+
+        for i in range(5):
+            await strategy._handle_download(FakeDownload(f"doc_{i}.pdf"), crawl_id)
+
+        assert len(strategy._downloads_by_crawl[crawl_id]) == 5
+
+    @pytest.mark.asyncio
+    async def test_pop_returns_files_and_cleans_up(self, strategy):
+        """Popping the crawl_id list should return files and remove the entry."""
+        crawl_id = uuid.uuid4().hex
+        strategy._downloads_by_crawl[crawl_id] = []
+
+        await strategy._handle_download(FakeDownload("report.pdf"), crawl_id)
+
+        files = strategy._downloads_by_crawl.pop(crawl_id, [])
+        assert len(files) == 1
+        assert "report.pdf" in files[0]
+        assert crawl_id not in strategy._downloads_by_crawl
+
+
+class TestConcurrentRace:
+    """Simulate the actual race condition described in #1889."""
+
+    @pytest.mark.asyncio
+    async def test_concurrent_downloads_no_crosscontamination(self, strategy):
+        """
+        Simulate: crawl A starts slow download, crawl B starts and finishes,
+        then A's download completes. B must not get A's file.
+        """
+        crawl_a = uuid.uuid4().hex
+        crawl_b = uuid.uuid4().hex
+        strategy._downloads_by_crawl[crawl_a] = []
+
+        # Start slow download for A (won't complete immediately)
+        slow_download = FakeDownload("big_file.xlsx", delay=0.2)
+        task_a = asyncio.create_task(
+            strategy._handle_download(slow_download, crawl_a)
+        )
+
+        # Crawl B starts and finishes before A's download completes
+        strategy._downloads_by_crawl[crawl_b] = []
+        await strategy._handle_download(FakeDownload("page_b.html"), crawl_b)
+        result_b = strategy._downloads_by_crawl.pop(crawl_b, [])
+
+        # Wait for A's download to finish
+        await task_a
+        result_a = strategy._downloads_by_crawl.pop(crawl_a, [])
+
+        # A has its file, B has its file — no cross-contamination
+        assert len(result_a) == 1
+        assert "big_file.xlsx" in result_a[0]
+        assert len(result_b) == 1
+        assert "page_b.html" in result_b[0]
+
+    @pytest.mark.asyncio
+    async def test_late_download_after_pop_no_crash(self, strategy):
+        """
+        Simulate: crawl A starts download, crawl A finishes (pops),
+        then download completes. Should not crash or contaminate.
+        """
+        crawl_a = uuid.uuid4().hex
+        strategy._downloads_by_crawl[crawl_a] = []
+
+        # Start slow download
+        slow_download = FakeDownload("late.pdf", delay=0.1)
+        task = asyncio.create_task(
+            strategy._handle_download(slow_download, crawl_a)
+        )
+
+        # Crawl A finishes before download completes
+        result_a = strategy._downloads_by_crawl.pop(crawl_a, [])
+        assert result_a == []  # download hadn't completed yet
+
+        # Download finishes — should not crash
+        await task
+
+        # Dict is clean
+        assert crawl_a not in strategy._downloads_by_crawl