Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 22 additions & 11 deletions crawl4ai/models.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from pydantic import BaseModel, HttpUrl, PrivateAttr, Field, ConfigDict, BeforeValidator
from pydantic import BaseModel, HttpUrl, PrivateAttr, Field, ConfigDict, BeforeValidator, field_validator
from typing import Annotated
from typing import List, Dict, Optional, Callable, Awaitable, Union, Any
from typing import AsyncGenerator
Expand Down Expand Up @@ -140,6 +140,12 @@ class CrawlResult(BaseModel):
screenshot: Optional[str] = None
pdf: Optional[bytes] = None
mhtml: Optional[str] = None
markdown_data: Optional[MarkdownGenerationResult] = Field(
default=None,
alias="markdown",
exclude=True,
repr=False,
)
_markdown: Optional[MarkdownGenerationResult] = PrivateAttr(default=None)
extracted_content: Optional[str] = None
metadata: Optional[dict] = None
Expand All @@ -163,7 +169,16 @@ class CrawlResult(BaseModel):

model_config = ConfigDict(arbitrary_types_allowed=True)

# NOTE: The StringCompatibleMarkdown class, custom __init__ method, property getters/setters,
@field_validator("markdown_data", mode="before")
Comment on lines 170 to +172
Copy link

Copilot AI Mar 30, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The PR metadata links/mentions Issue #123 (“Timeout setting”), but the changes here are focused on CrawlResult markdown deserialization. If this PR is intended to fix #123, the code changes don’t appear to address timeouts; otherwise consider removing/clarifying the issue reference in the PR description.

Copilot uses AI. Check for mistakes.
@classmethod
def validate_markdown(cls, v):
if isinstance(v, dict):
# This converts a raw dictionary (from cache/JSON)
# into the structured Pydantic object
return MarkdownGenerationResult(**v)
Copy link

Copilot AI Mar 30, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

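validate_markdown only converts dicts. If the input markdown is a plain string (which can happen with legacy cache hydration), Pydantic will try to validate that string as a MarkdownGenerationResult for markdown_data and raise a validation error. Consider handling str here (e.g., wrap it into a MarkdownGenerationResult with raw_markdown set and the other fields empty) to preserve backward compatibility. Note that if MarkdownGenerationResult declares any of those other fields as required, they must be passed explicitly (for example as empty strings); setting raw_markdown alone would then fail validation as well.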

Suggested change
return MarkdownGenerationResult(**v)
return MarkdownGenerationResult(**v)
if isinstance(v, str):
# Legacy backward compatibility: markdown provided as a plain string
# Wrap it into MarkdownGenerationResult, using raw_markdown
return MarkdownGenerationResult(raw_markdown=v)

Copilot uses AI. Check for mistakes.
return v

# NOTE: The StringCompatibleMarkdown class, model_post_init hook, property getters/setters,
# and model_dump override all exist to support a smooth transition from markdown as a string
# to markdown as a MarkdownGenerationResult object, while maintaining backward compatibility.
#
Expand All @@ -175,15 +190,8 @@ class CrawlResult(BaseModel):
# When backward compatibility is no longer needed in future versions, this entire mechanism
# can be simplified to a standard field with no custom accessors or serialization logic.

def __init__(self, **data):
    """
    Build the result while keeping any ``markdown`` payload out of the
    regular field handling.

    The ``markdown`` entry is removed from ``data`` before the remaining
    keyword arguments are passed to the parent constructor. A dict payload
    is expanded into a ``MarkdownGenerationResult``; any other non-None
    value is stored unchanged in the private ``_markdown`` attribute.
    """
    markdown_payload = data.pop('markdown', None)
    super().__init__(**data)

    # Nothing was supplied: leave the private attribute at its default.
    if markdown_payload is None:
        return

    # Raw dictionaries are turned into the structured result object first.
    if isinstance(markdown_payload, dict):
        markdown_payload = MarkdownGenerationResult(**markdown_payload)
    self._markdown = markdown_payload
# Post-init hook: copy the `markdown_data` field into the private `_markdown`
# attribute so both hold the same object once the model has been built.
# `__context` is accepted to match the hook's signature and is not used here.
def model_post_init(self, __context):
self._markdown = self.markdown_data

@property
def markdown(self):
Expand All @@ -203,7 +211,10 @@ def markdown(self, value):
"""
Setter for the markdown property.
"""
if isinstance(value, dict):
value = MarkdownGenerationResult(**value)
self._markdown = value
self.markdown_data = value

@property
def markdown_v2(self):
Expand Down
22 changes: 22 additions & 0 deletions tests/unit/test_crawl_result_markdown_validator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from crawl4ai.models import CrawlResult


def test_crawl_result_converts_markdown_dict_input():
result = CrawlResult(
url="https://example.com",
html="<html></html>",
success=True,
markdown={
"raw_markdown": "# Hello",
"markdown_with_citations": "# Hello",
"references_markdown": "",
"fit_markdown": "Hello",
"fit_html": "<p>Hello</p>",
},
)
Comment on lines +4 to +16
Copy link

Copilot AI Mar 30, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test covers dict input, but it doesn’t cover legacy string hydration (e.g., CrawlResult(..., markdown="# Hello")). Adding that case would protect backward compatibility and catch the validation regression described in the review comment on validate_markdown.

Copilot uses AI. Check for mistakes.

assert result.markdown is not None
assert result.markdown.raw_markdown == "# Hello"
assert str(result.markdown) == "# Hello"
assert "Hello" in result.markdown
assert result.model_dump()["markdown"]["raw_markdown"] == "# Hello"