Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions crawl4ai/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
LLMContentFilter,
RelevantContentFilter,
)
from .document_extraction_strategy import DocumentExtractionStrategy, DocumentExtractionResult
from .models import CrawlResult, MarkdownGenerationResult, DisplayMode
from .components.crawler_monitor import CrawlerMonitor
from .link_preview import LinkPreview
Expand Down
2 changes: 2 additions & 0 deletions crawl4ai/async_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -1403,6 +1403,7 @@ def __init__(
extraction_strategy: ExtractionStrategy = None,
chunking_strategy: ChunkingStrategy = RegexChunking(),
markdown_generator: MarkdownGenerationStrategy = DefaultMarkdownGenerator(),
document_extraction_strategy=None,
only_text: bool = False,
css_selector: str = None,
target_elements: List[str] = None,
Expand Down Expand Up @@ -1526,6 +1527,7 @@ def __init__(
self.extraction_strategy = extraction_strategy
self.chunking_strategy = chunking_strategy
self.markdown_generator = markdown_generator
self.document_extraction_strategy = document_extraction_strategy
self.only_text = only_text
self.css_selector = css_selector
self.target_elements = target_elements or []
Expand Down
116 changes: 77 additions & 39 deletions crawl4ai/async_webcrawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -455,45 +455,83 @@ async def arun(
async_response = await self.crawler_strategy.crawl(
url, config=config)

html = sanitize_input_encode(async_response.html)
screenshot_data = async_response.screenshot
pdf_data = async_response.pdf_data
js_execution_result = async_response.js_execution_result

self.logger.url_status(
url=cache_context.display_url,
success=bool(html),
timing=time.perf_counter() - t1,
tag="FETCH",
)

crawl_result = await self.aprocess_html(
url=url, html=html,
extracted_content=extracted_content,
config=config,
screenshot_data=screenshot_data,
pdf_data=pdf_data,
verbose=config.verbose,
is_raw_html=True if url.startswith("raw:") else False,
redirected_url=async_response.redirected_url,
original_scheme=urlparse(url).scheme,
**kwargs,
)

crawl_result.status_code = async_response.status_code
is_raw_url = url.startswith("raw:") or url.startswith("raw://")
crawl_result.redirected_url = async_response.redirected_url or (None if is_raw_url else url)
crawl_result.redirected_status_code = async_response.redirected_status_code
crawl_result.response_headers = async_response.response_headers
crawl_result.downloaded_files = async_response.downloaded_files
crawl_result.js_execution_result = js_execution_result
crawl_result.mhtml = async_response.mhtml_data
crawl_result.ssl_certificate = async_response.ssl_certificate
crawl_result.network_requests = async_response.network_requests
crawl_result.console_messages = async_response.console_messages
crawl_result.success = bool(html)
crawl_result.session_id = getattr(config, "session_id", None)
crawl_result.cache_status = "miss"
# Document extraction: detect binary documents before HTML processing
doc_strategy = getattr(config, "document_extraction_strategy", None)
if doc_strategy and doc_strategy.detect(async_response):
self.logger.info(
message="Document detected for {url}, using document extraction",
tag="DOCUMENT",
params={"url": url},
)
doc_result = await doc_strategy.extract(async_response, url)
crawl_result = CrawlResult(
url=url,
html="",
success=True,
cleaned_html="",
_markdown=MarkdownGenerationResult(
raw_markdown=doc_result.content,
markdown_with_citations=doc_result.content,
references_markdown="",
fit_markdown="",
fit_html="",
),
metadata={
"is_document": True,
"content_type": doc_result.content_type,
**(doc_result.metadata or {}),
},
status_code=async_response.status_code,
response_headers=async_response.response_headers,
downloaded_files=async_response.downloaded_files,
redirected_url=async_response.redirected_url,
redirected_status_code=async_response.redirected_status_code,
ssl_certificate=async_response.ssl_certificate,
network_requests=async_response.network_requests,
console_messages=async_response.console_messages,
session_id=getattr(config, "session_id", None),
cache_status="miss",
)
else:
html = sanitize_input_encode(async_response.html)
screenshot_data = async_response.screenshot
pdf_data = async_response.pdf_data
js_execution_result = async_response.js_execution_result

self.logger.url_status(
url=cache_context.display_url,
success=bool(html),
timing=time.perf_counter() - t1,
tag="FETCH",
)

crawl_result = await self.aprocess_html(
url=url, html=html,
extracted_content=extracted_content,
config=config,
screenshot_data=screenshot_data,
pdf_data=pdf_data,
verbose=config.verbose,
is_raw_html=True if url.startswith("raw:") else False,
redirected_url=async_response.redirected_url,
original_scheme=urlparse(url).scheme,
**kwargs,
)

crawl_result.status_code = async_response.status_code
is_raw_url = url.startswith("raw:") or url.startswith("raw://")
crawl_result.redirected_url = async_response.redirected_url or (None if is_raw_url else url)
crawl_result.redirected_status_code = async_response.redirected_status_code
crawl_result.response_headers = async_response.response_headers
crawl_result.downloaded_files = async_response.downloaded_files
crawl_result.js_execution_result = js_execution_result
crawl_result.mhtml = async_response.mhtml_data
crawl_result.ssl_certificate = async_response.ssl_certificate
crawl_result.network_requests = async_response.network_requests
crawl_result.console_messages = async_response.console_messages
crawl_result.success = bool(html)
crawl_result.session_id = getattr(config, "session_id", None)
crawl_result.cache_status = "miss"

# Check if blocked (skip for raw: URLs —
# caller-provided content, anti-bot N/A)
Expand Down
90 changes: 90 additions & 0 deletions crawl4ai/document_extraction_strategy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
"""
Document Extraction Strategy — abstract base for detecting and extracting
text from binary documents (PDF, DOCX, XLSX, etc.) during the crawl pipeline.

When configured on CrawlerRunConfig, the strategy is checked after browser
navigation but before HTML content scraping. If it detects a document, it
extracts text directly — skipping the HTML pipeline entirely.
"""

from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional, TYPE_CHECKING

if TYPE_CHECKING:
from .models import AsyncCrawlResponse


@dataclass
class DocumentExtractionResult:
    """Outcome of pulling text out of a binary document.

    Attributes:
        content: Extracted text (plain text or markdown).
        content_type: MIME type or bare file extension, e.g.
            ``'application/pdf'`` or ``'pdf'``.
        source_path: Local path of the downloaded document, when one exists.
        metadata: Free-form extras such as title, author, or page count.
    """

    content: str
    content_type: str
    source_path: Optional[Path] = None
    metadata: dict = field(default_factory=dict)


class DocumentExtractionStrategy(ABC):
    """
    Pluggable detector/extractor for binary documents encountered mid-crawl.

    Concrete subclasses supply both ``detect()`` and ``extract()``, backed by
    whatever extraction library they prefer (Kreuzberg, PyMuPDF, Docling, ...).

    Example::

        class KreuzbergDocumentStrategy(DocumentExtractionStrategy):
            DOCUMENT_TYPES = {"application/pdf", "application/msword", ...}

            def detect(self, response):
                if response.downloaded_files:
                    return True
                ct = (response.response_headers or {}).get("content-type", "")
                return ct.split(";")[0].strip() in self.DOCUMENT_TYPES

            async def extract(self, response, url):
                from kreuzberg import extract_file
                path = Path(response.downloaded_files[0])
                result = await extract_file(str(path))
                return DocumentExtractionResult(
                    content=result.content,
                    content_type=path.suffix.lstrip("."),
                    source_path=path,
                )
    """

    @abstractmethod
    def detect(self, response: "AsyncCrawlResponse") -> bool:
        """Decide whether *response* carries a binary document.

        Typical signals an implementation may inspect:

        - ``response.downloaded_files`` — the browser saved a file
        - the ``Content-Type`` entry in ``response.response_headers``
        - ``response.status_code`` — a failed navigation
        - file-extension heuristics on the URL
        """
        ...

    @abstractmethod
    async def extract(
        self, response: "AsyncCrawlResponse", url: str
    ) -> DocumentExtractionResult:
        """Produce the document's text content.

        Args:
            response: The crawl response; may reference downloaded file paths.
            url: The original URL that was crawled.

        Returns:
            A DocumentExtractionResult holding the extracted text.
        """
        ...
Loading