Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions crawl4ai/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
LLMContentFilter,
RelevantContentFilter,
)
from .document_extraction_strategy import DocumentExtractionStrategy, DocumentExtractionResult
from .models import CrawlResult, MarkdownGenerationResult, DisplayMode
from .components.crawler_monitor import CrawlerMonitor
from .link_preview import LinkPreview
Expand Down
2 changes: 2 additions & 0 deletions crawl4ai/async_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -1403,6 +1403,7 @@ def __init__(
extraction_strategy: ExtractionStrategy = None,
chunking_strategy: ChunkingStrategy = RegexChunking(),
markdown_generator: MarkdownGenerationStrategy = DefaultMarkdownGenerator(),
document_extraction_strategy=None,
only_text: bool = False,
css_selector: str = None,
target_elements: List[str] = None,
Expand Down Expand Up @@ -1526,6 +1527,7 @@ def __init__(
self.extraction_strategy = extraction_strategy
self.chunking_strategy = chunking_strategy
self.markdown_generator = markdown_generator
self.document_extraction_strategy = document_extraction_strategy
self.only_text = only_text
self.css_selector = css_selector
self.target_elements = target_elements or []
Expand Down
116 changes: 77 additions & 39 deletions crawl4ai/async_webcrawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -455,45 +455,83 @@ async def arun(
async_response = await self.crawler_strategy.crawl(
url, config=config)

html = sanitize_input_encode(async_response.html)
screenshot_data = async_response.screenshot
pdf_data = async_response.pdf_data
js_execution_result = async_response.js_execution_result

self.logger.url_status(
url=cache_context.display_url,
success=bool(html),
timing=time.perf_counter() - t1,
tag="FETCH",
)

crawl_result = await self.aprocess_html(
url=url, html=html,
extracted_content=extracted_content,
config=config,
screenshot_data=screenshot_data,
pdf_data=pdf_data,
verbose=config.verbose,
is_raw_html=True if url.startswith("raw:") else False,
redirected_url=async_response.redirected_url,
original_scheme=urlparse(url).scheme,
**kwargs,
)

crawl_result.status_code = async_response.status_code
is_raw_url = url.startswith("raw:") or url.startswith("raw://")
crawl_result.redirected_url = async_response.redirected_url or (None if is_raw_url else url)
crawl_result.redirected_status_code = async_response.redirected_status_code
crawl_result.response_headers = async_response.response_headers
crawl_result.downloaded_files = async_response.downloaded_files
crawl_result.js_execution_result = js_execution_result
crawl_result.mhtml = async_response.mhtml_data
crawl_result.ssl_certificate = async_response.ssl_certificate
crawl_result.network_requests = async_response.network_requests
crawl_result.console_messages = async_response.console_messages
crawl_result.success = bool(html)
crawl_result.session_id = getattr(config, "session_id", None)
crawl_result.cache_status = "miss"
# Document extraction: detect binary documents before HTML processing
doc_strategy = getattr(config, "document_extraction_strategy", None)
if doc_strategy and doc_strategy.detect(async_response):
self.logger.info(
message="Document detected for {url}, using document extraction",
tag="DOCUMENT",
params={"url": url},
)
doc_result = await doc_strategy.extract(async_response, url)
crawl_result = CrawlResult(
url=url,
html="",
success=True,
cleaned_html="",
_markdown=MarkdownGenerationResult(
raw_markdown=doc_result.content,
markdown_with_citations=doc_result.content,
references_markdown="",
fit_markdown="",
fit_html="",
),
metadata={
"is_document": True,
"content_type": doc_result.content_type,
**(doc_result.metadata or {}),
},
status_code=async_response.status_code,
response_headers=async_response.response_headers,
downloaded_files=async_response.downloaded_files,
redirected_url=async_response.redirected_url,
redirected_status_code=async_response.redirected_status_code,
ssl_certificate=async_response.ssl_certificate,
network_requests=async_response.network_requests,
console_messages=async_response.console_messages,
session_id=getattr(config, "session_id", None),
cache_status="miss",
)
else:
html = sanitize_input_encode(async_response.html)
screenshot_data = async_response.screenshot
pdf_data = async_response.pdf_data
js_execution_result = async_response.js_execution_result

self.logger.url_status(
url=cache_context.display_url,
success=bool(html),
timing=time.perf_counter() - t1,
tag="FETCH",
)

crawl_result = await self.aprocess_html(
url=url, html=html,
extracted_content=extracted_content,
config=config,
screenshot_data=screenshot_data,
pdf_data=pdf_data,
verbose=config.verbose,
is_raw_html=True if url.startswith("raw:") else False,
redirected_url=async_response.redirected_url,
original_scheme=urlparse(url).scheme,
**kwargs,
)

crawl_result.status_code = async_response.status_code
is_raw_url = url.startswith("raw:") or url.startswith("raw://")
crawl_result.redirected_url = async_response.redirected_url or (None if is_raw_url else url)
crawl_result.redirected_status_code = async_response.redirected_status_code
crawl_result.response_headers = async_response.response_headers
crawl_result.downloaded_files = async_response.downloaded_files
crawl_result.js_execution_result = js_execution_result
crawl_result.mhtml = async_response.mhtml_data
crawl_result.ssl_certificate = async_response.ssl_certificate
crawl_result.network_requests = async_response.network_requests
crawl_result.console_messages = async_response.console_messages
crawl_result.success = bool(html)
crawl_result.session_id = getattr(config, "session_id", None)
crawl_result.cache_status = "miss"

# Check if blocked (skip for raw: URLs —
# caller-provided content, anti-bot N/A)
Expand Down
90 changes: 90 additions & 0 deletions crawl4ai/document_extraction_strategy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
"""
Document Extraction Strategy — abstract base for detecting and extracting
text from binary documents (PDF, DOCX, XLSX, etc.) during the crawl pipeline.

When configured on CrawlerRunConfig, the strategy is checked after browser
navigation but before HTML content scraping. If it detects a document, it
extracts text directly — skipping the HTML pipeline entirely.
"""

from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional, TYPE_CHECKING

if TYPE_CHECKING:
from .models import AsyncCrawlResponse


@dataclass
class DocumentExtractionResult:
    """Outcome of pulling text out of a binary document.

    Attributes:
        content: Extracted text (plain text or markdown).
        content_type: MIME type or bare file extension, e.g.
            ``'application/pdf'`` or ``'pdf'``.
        source_path: Local path of the downloaded document, when one exists.
        metadata: Free-form extras such as title, author, or page count.
    """

    content: str
    content_type: str
    source_path: Optional[Path] = None
    metadata: dict = field(default_factory=dict)


class DocumentExtractionStrategy(ABC):
    """
    Pluggable detector/extractor for binary documents encountered mid-crawl.

    Concrete subclasses supply both ``detect()`` and ``extract()``, backed by
    whatever extraction library they prefer (Kreuzberg, PyMuPDF, Docling, ...).

    Example::

        class KreuzbergDocumentStrategy(DocumentExtractionStrategy):
            DOCUMENT_TYPES = {"application/pdf", "application/msword", ...}

            def detect(self, response):
                if response.downloaded_files:
                    return True
                ct = (response.response_headers or {}).get("content-type", "")
                return ct.split(";")[0].strip() in self.DOCUMENT_TYPES

            async def extract(self, response, url):
                from kreuzberg import extract_file
                path = Path(response.downloaded_files[0])
                result = await extract_file(str(path))
                return DocumentExtractionResult(
                    content=result.content,
                    content_type=path.suffix.lstrip("."),
                    source_path=path,
                )
    """

    @abstractmethod
    def detect(self, response: "AsyncCrawlResponse") -> bool:
        """Decide whether *response* carries a binary document.

        Typical signals an implementation may inspect:

        - ``response.downloaded_files`` — the browser saved a file
        - the ``Content-Type`` entry in ``response.response_headers``
        - ``response.status_code`` — a failed navigation
        - file-extension heuristics on the URL
        """
        ...

    @abstractmethod
    async def extract(
        self, response: "AsyncCrawlResponse", url: str
    ) -> DocumentExtractionResult:
        """Produce the document's text content.

        Args:
            response: The crawl response; may reference downloaded file paths.
            url: The original URL that was crawled.

        Returns:
            A DocumentExtractionResult holding the extracted text.
        """
        ...
Loading