From 7344c05c271bcd59a959954c3f429425bf1f7c46 Mon Sep 17 00:00:00 2001 From: MTahirKleem Date: Tue, 31 Mar 2026 01:39:59 +0500 Subject: [PATCH] fix: resolve dict attribute error in MarkdownGenerator deserialization - Added 'markdown_data' alias field to CrawlResult for robust Pydantic hydration. - Implemented field_validator to convert raw dicts to MarkdownGenerationResult. - Used model_post_init to sync internal private state. - Maintained backward compatibility for string-based markdown access. - Added unit test for validation guard. --- crawl4ai/models.py | 33 ++++++++++++------- .../test_crawl_result_markdown_validator.py | 22 +++++++++++++ 2 files changed, 44 insertions(+), 11 deletions(-) create mode 100644 tests/unit/test_crawl_result_markdown_validator.py diff --git a/crawl4ai/models.py b/crawl4ai/models.py index 506538970..82c19e241 100644 --- a/crawl4ai/models.py +++ b/crawl4ai/models.py @@ -1,4 +1,4 @@ -from pydantic import BaseModel, HttpUrl, PrivateAttr, Field, ConfigDict, BeforeValidator +from pydantic import BaseModel, HttpUrl, PrivateAttr, Field, ConfigDict, BeforeValidator, field_validator from typing import Annotated from typing import List, Dict, Optional, Callable, Awaitable, Union, Any from typing import AsyncGenerator @@ -140,6 +140,12 @@ class CrawlResult(BaseModel): screenshot: Optional[str] = None pdf: Optional[bytes] = None mhtml: Optional[str] = None + markdown_data: Optional[MarkdownGenerationResult] = Field( + default=None, + alias="markdown", + exclude=True, + repr=False, + ) _markdown: Optional[MarkdownGenerationResult] = PrivateAttr(default=None) extracted_content: Optional[str] = None metadata: Optional[dict] = None @@ -163,7 +169,16 @@ class CrawlResult(BaseModel): model_config = ConfigDict(arbitrary_types_allowed=True) -# NOTE: The StringCompatibleMarkdown class, custom __init__ method, property getters/setters, + @field_validator("markdown_data", mode="before") + @classmethod + def validate_markdown(cls, v): + if isinstance(v, dict): + # This converts a raw dictionary (from cache/JSON) + # into the structured Pydantic object + return MarkdownGenerationResult(**v) + return v + +# NOTE: The StringCompatibleMarkdown class, model_post_init hook, property getters/setters, # and model_dump override all exist to support a smooth transition from markdown as a string # to markdown as a MarkdownGenerationResult object, while maintaining backward compatibility. # @@ -175,15 +190,8 @@ class CrawlResult(BaseModel): # When backward compatibility is no longer needed in future versions, this entire mechanism # can be simplified to a standard field with no custom accessors or serialization logic. - def __init__(self, **data): - markdown_result = data.pop('markdown', None) - super().__init__(**data) - if markdown_result is not None: - self._markdown = ( - MarkdownGenerationResult(**markdown_result) - if isinstance(markdown_result, dict) - else markdown_result - ) + def model_post_init(self, __context): + self._markdown = self.markdown_data @property def markdown(self): @@ -203,7 +211,10 @@ def markdown(self, value): """ Setter for the markdown property. """ + if isinstance(value, dict): + value = MarkdownGenerationResult(**value) self._markdown = value + self.markdown_data = value @property def markdown_v2(self): diff --git a/tests/unit/test_crawl_result_markdown_validator.py b/tests/unit/test_crawl_result_markdown_validator.py new file mode 100644 index 000000000..d49210660 --- /dev/null +++ b/tests/unit/test_crawl_result_markdown_validator.py @@ -0,0 +1,22 @@ +from crawl4ai.models import CrawlResult + + +def test_crawl_result_converts_markdown_dict_input(): + result = CrawlResult( + url="https://example.com", + html="", + success=True, + markdown={ + "raw_markdown": "# Hello", + "markdown_with_citations": "# Hello", + "references_markdown": "", + "fit_markdown": "Hello", + "fit_html": "

Hello

", + }, + ) + + assert result.markdown is not None + assert result.markdown.raw_markdown == "# Hello" + assert str(result.markdown) == "# Hello" + assert "Hello" in result.markdown + assert result.model_dump()["markdown"]["raw_markdown"] == "# Hello"