-
-
Notifications
You must be signed in to change notification settings - Fork 6.5k
fix: resolve dict attribute error in MarkdownGenerator deserialization #1887
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -1,4 +1,4 @@ | ||||||||||||||
| from pydantic import BaseModel, HttpUrl, PrivateAttr, Field, ConfigDict, BeforeValidator | ||||||||||||||
| from pydantic import BaseModel, HttpUrl, PrivateAttr, Field, ConfigDict, BeforeValidator, field_validator | ||||||||||||||
| from typing import Annotated | ||||||||||||||
| from typing import List, Dict, Optional, Callable, Awaitable, Union, Any | ||||||||||||||
| from typing import AsyncGenerator | ||||||||||||||
|
|
@@ -140,6 +140,12 @@ class CrawlResult(BaseModel): | |||||||||||||
| screenshot: Optional[str] = None | ||||||||||||||
| pdf: Optional[bytes] = None | ||||||||||||||
| mhtml: Optional[str] = None | ||||||||||||||
| markdown_data: Optional[MarkdownGenerationResult] = Field( | ||||||||||||||
| default=None, | ||||||||||||||
| alias="markdown", | ||||||||||||||
| exclude=True, | ||||||||||||||
| repr=False, | ||||||||||||||
| ) | ||||||||||||||
| _markdown: Optional[MarkdownGenerationResult] = PrivateAttr(default=None) | ||||||||||||||
| extracted_content: Optional[str] = None | ||||||||||||||
| metadata: Optional[dict] = None | ||||||||||||||
|
|
@@ -163,7 +169,16 @@ class CrawlResult(BaseModel): | |||||||||||||
|
|
||||||||||||||
| model_config = ConfigDict(arbitrary_types_allowed=True) | ||||||||||||||
|
|
||||||||||||||
| # NOTE: The StringCompatibleMarkdown class, custom __init__ method, property getters/setters, | ||||||||||||||
| @field_validator("markdown_data", mode="before") | ||||||||||||||
| @classmethod | ||||||||||||||
| def validate_markdown(cls, v): | ||||||||||||||
| if isinstance(v, dict): | ||||||||||||||
| # This converts a raw dictionary (from cache/JSON) | ||||||||||||||
| # into the structured Pydantic object | ||||||||||||||
| return MarkdownGenerationResult(**v) | ||||||||||||||
|
||||||||||||||
| return MarkdownGenerationResult(**v) | |
| return MarkdownGenerationResult(**v) | |
| if isinstance(v, str): | |
| # Legacy backward compatibility: markdown provided as a plain string | |
| # Wrap it into MarkdownGenerationResult, using raw_markdown | |
| return MarkdownGenerationResult(raw_markdown=v) |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,22 @@ | ||
| from crawl4ai.models import CrawlResult | ||
|
|
||
|
|
||
| def test_crawl_result_converts_markdown_dict_input(): | ||
| result = CrawlResult( | ||
| url="https://example.com", | ||
| html="<html></html>", | ||
| success=True, | ||
| markdown={ | ||
| "raw_markdown": "# Hello", | ||
| "markdown_with_citations": "# Hello", | ||
| "references_markdown": "", | ||
| "fit_markdown": "Hello", | ||
| "fit_html": "<p>Hello</p>", | ||
| }, | ||
| ) | ||
|
Comment on lines
+4
to
+16
|
||
|
|
||
| assert result.markdown is not None | ||
| assert result.markdown.raw_markdown == "# Hello" | ||
| assert str(result.markdown) == "# Hello" | ||
| assert "Hello" in result.markdown | ||
| assert result.model_dump()["markdown"]["raw_markdown"] == "# Hello" | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The PR metadata links/mentions Issue #123 (“Timeout setting”), but the changes here are focused on
`CrawlResult` markdown deserialization. If this PR is intended to fix #123, the code changes don’t appear to address timeouts; otherwise, consider removing or clarifying the issue reference in the PR description.