diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index 44d3040b5..8d42c047e 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -584,6 +584,18 @@ class BrowserConfig: light_mode (bool): Disables certain background features for performance gains. Default: False. extra_args (list): Additional command-line arguments passed to the browser. Default: []. + executable_path (str or None): Path to a custom browser executable (e.g., ungoogled-chromium, + Brave, or a stealth-patched binary). If None, uses the default + Playwright-managed browser. Default: None. + ignore_default_args (list or None): List of default Chromium flags to exclude from launch. + Passed directly to Playwright's chromium.launch(). Default: None. + skip_default_browser_args (bool): If True, skips the hardcoded browser args in build_browser_flags() and _build_browser_args() + and only uses extra_args. Useful when a custom browser binary manages + its own flags and the defaults cause conflicts. Default: False. + skip_default_headers (bool): If True, skips the forced User-Agent and sec-ch-ua header overrides + in setup_context(). Useful when a custom browser binary manages its own + fingerprint and Crawl4AI's overrides create detectable mismatches. + Default: False. enable_stealth (bool): If True, applies playwright-stealth to bypass basic bot detection. Cannot be used with use_undetected browser mode. Default: False. 
memory_saving_mode (bool): If True, adds aggressive cache discard and V8 heap cap flags @@ -644,6 +656,10 @@ def __init__( text_mode: bool = False, light_mode: bool = False, extra_args: list = None, + executable_path: str = None, + ignore_default_args: list = None, + skip_default_browser_args: bool = False, + skip_default_headers: bool = False, debugging_port: int = 9222, host: str = "localhost", enable_stealth: bool = False, @@ -709,6 +725,10 @@ def __init__( self.text_mode = text_mode self.light_mode = light_mode self.extra_args = extra_args if extra_args is not None else [] + self.executable_path = executable_path + self.ignore_default_args = ignore_default_args + self.skip_default_browser_args = skip_default_browser_args + self.skip_default_headers = skip_default_headers self.sleep_on_close = sleep_on_close self.verbose = verbose self.debugging_port = debugging_port @@ -804,6 +824,10 @@ def to_dict(self): "text_mode": self.text_mode, "light_mode": self.light_mode, "extra_args": self.extra_args, + "executable_path": self.executable_path, + "ignore_default_args": self.ignore_default_args, + "skip_default_browser_args": self.skip_default_browser_args, + "skip_default_headers": self.skip_default_headers, "sleep_on_close": self.sleep_on_close, "verbose": self.verbose, "debugging_port": self.debugging_port, diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py index 0b429c34d..23bba2379 100644 --- a/crawl4ai/browser_manager.py +++ b/crawl4ai/browser_manager.py @@ -69,51 +69,56 @@ class ManagedBrowser: @staticmethod def build_browser_flags(config: BrowserConfig) -> List[str]: """Common CLI flags for launching Chromium""" - flags = [ - "--no-sandbox", - "--disable-dev-shm-usage", - "--no-first-run", - "--no-default-browser-check", - "--disable-infobars", - "--window-position=0,0", - "--ignore-certificate-errors", - "--ignore-certificate-errors-spki-list", - "--disable-blink-features=AutomationControlled", - "--window-position=400,0", - 
"--disable-renderer-backgrounding", - "--disable-ipc-flooding-protection", - "--force-color-profile=srgb", - "--mute-audio", - "--disable-background-timer-throttling", - # Memory-saving flags: disable unused Chrome features - "--disable-features=OptimizationHints,MediaRouter,DialMediaRouteProvider", - "--disable-component-update", - "--disable-domain-reliability", - ] - # GPU flags disable WebGL which anti-bot sensors detect as headless. - # Keep WebGL working (via SwiftShader) when stealth mode is active. - if not config.enable_stealth: - flags.extend([ - "--disable-gpu", - "--disable-gpu-compositing", - "--disable-software-rasterizer", - ]) - if config.memory_saving_mode: - flags.extend([ - "--aggressive-cache-discard", - '--js-flags=--max-old-space-size=512', - ]) - if config.light_mode: - flags.extend(BROWSER_DISABLE_OPTIONS) - if config.text_mode: - flags.extend([ - "--blink-settings=imagesEnabled=false", - "--disable-remote-fonts", - "--disable-images", - "--disable-javascript", - "--disable-software-rasterizer", + if config.skip_default_browser_args: + flags = list(config.extra_args) if config.extra_args else [] + else: + flags = [ + "--no-sandbox", "--disable-dev-shm-usage", - ]) + "--no-first-run", + "--no-default-browser-check", + "--disable-infobars", + "--window-position=0,0", + "--ignore-certificate-errors", + "--ignore-certificate-errors-spki-list", + "--disable-blink-features=AutomationControlled", + "--window-position=400,0", + "--disable-renderer-backgrounding", + "--disable-ipc-flooding-protection", + "--force-color-profile=srgb", + "--mute-audio", + "--disable-background-timer-throttling", + # Memory-saving flags: disable unused Chrome features + "--disable-features=OptimizationHints,MediaRouter,DialMediaRouteProvider", + "--disable-component-update", + "--disable-domain-reliability", + ] + # GPU flags disable WebGL which anti-bot sensors detect as headless. + # Keep WebGL working (via SwiftShader) when stealth mode is active. 
+ if not config.enable_stealth: + flags.extend([ + "--disable-gpu", + "--disable-gpu-compositing", + "--disable-software-rasterizer", + ]) + if config.memory_saving_mode: + flags.extend([ + "--aggressive-cache-discard", + '--js-flags=--max-old-space-size=512', + ]) + if config.light_mode: + flags.extend(BROWSER_DISABLE_OPTIONS) + if config.text_mode: + flags.extend([ + "--blink-settings=imagesEnabled=false", + "--disable-remote-fonts", + "--disable-images", + "--disable-javascript", + "--disable-software-rasterizer", + "--disable-dev-shm-usage", + ]) + if config.extra_args: + flags.extend(config.extra_args) # proxy support — only pass server URL, never credentials. # Chromium's --proxy-server flag silently ignores inline user:pass@. # Auth credentials are handled at the Playwright context level instead. @@ -1056,62 +1061,72 @@ async def _verify_cdp_ready(self, cdp_url: str) -> bool: def _build_browser_args(self) -> dict: """Build browser launch arguments from config.""" - args = [ - "--disable-gpu", - "--disable-gpu-compositing", - "--disable-software-rasterizer", - "--no-sandbox", - "--disable-dev-shm-usage", - "--no-first-run", - "--no-default-browser-check", - "--disable-infobars", - "--window-position=0,0", - "--ignore-certificate-errors", - "--ignore-certificate-errors-spki-list", - "--disable-blink-features=AutomationControlled", - "--window-position=400,0", - "--disable-renderer-backgrounding", - "--disable-ipc-flooding-protection", - "--force-color-profile=srgb", - "--mute-audio", - "--disable-background-timer-throttling", - # Memory-saving flags: disable unused Chrome features - "--disable-features=OptimizationHints,MediaRouter,DialMediaRouteProvider", - "--disable-component-update", - "--disable-domain-reliability", - # "--single-process", - f"--window-size={self.config.viewport_width},{self.config.viewport_height}", - ] - - if self.config.memory_saving_mode: - args.extend([ - "--aggressive-cache-discard", - '--js-flags=--max-old-space-size=512', - ]) - - 
if self.config.light_mode: - args.extend(BROWSER_DISABLE_OPTIONS) + if self.config.skip_default_browser_args: + # Skip all hardcoded args — only use extra_args + args = list(self.config.extra_args) if self.config.extra_args else [] + else: + args = [ + "--disable-gpu", + "--disable-gpu-compositing", + "--disable-software-rasterizer", + "--no-sandbox", + "--disable-dev-shm-usage", + "--no-first-run", + "--no-default-browser-check", + "--disable-infobars", + "--window-position=0,0", + "--ignore-certificate-errors", + "--ignore-certificate-errors-spki-list", + "--disable-blink-features=AutomationControlled", + "--window-position=400,0", + "--disable-renderer-backgrounding", + "--disable-ipc-flooding-protection", + "--force-color-profile=srgb", + "--mute-audio", + "--disable-background-timer-throttling", + # Memory-saving flags: disable unused Chrome features + "--disable-features=OptimizationHints,MediaRouter,DialMediaRouteProvider", + "--disable-component-update", + "--disable-domain-reliability", + # "--single-process", + f"--window-size={self.config.viewport_width},{self.config.viewport_height}", + ] - if self.config.text_mode: - args.extend( - [ - "--blink-settings=imagesEnabled=false", - "--disable-remote-fonts", - "--disable-images", - "--disable-javascript", - "--disable-software-rasterizer", - "--disable-dev-shm-usage", - ] - ) + if self.config.memory_saving_mode: + args.extend([ + "--aggressive-cache-discard", + '--js-flags=--max-old-space-size=512', + ]) + + if self.config.light_mode: + args.extend(BROWSER_DISABLE_OPTIONS) + + if self.config.text_mode: + args.extend( + [ + "--blink-settings=imagesEnabled=false", + "--disable-remote-fonts", + "--disable-images", + "--disable-javascript", + "--disable-software-rasterizer", + "--disable-dev-shm-usage", + ] + ) - if self.config.extra_args: - args.extend(self.config.extra_args) + if self.config.extra_args: + args.extend(self.config.extra_args) # Deduplicate args args = list(dict.fromkeys(args)) browser_args = 
{"headless": self.config.headless, "args": args} + if self.config.executable_path: + browser_args["executable_path"] = self.config.executable_path + + if self.config.ignore_default_args: + browser_args["ignore_default_args"] = self.config.ignore_default_args + if self.config.chrome_channel: browser_args["channel"] = self.config.chrome_channel @@ -1191,7 +1206,7 @@ async def setup_context( ] = self.config.downloads_path # Handle user agent and browser hints - if self.config.user_agent: + if self.config.user_agent and not self.config.skip_default_headers: combined_headers = { "User-Agent": self.config.user_agent, "sec-ch-ua": self.config.browser_hint,