From 06f78f2d03c7e68ed1cdc0ebf3c7e958d5812cea Mon Sep 17 00:00:00 2001
From: Max Bohomolov
Date: Sun, 14 Jun 2026 17:42:35 +0000
Subject: [PATCH 1/3] Add `AiCrawler` with AI-powered HTML extraction
---
pyproject.toml | 3 +-
src/crawlee/crawlers/__init__.py | 42 ++
src/crawlee/crawlers/_ai/__init__.py | 42 ++
src/crawlee/crawlers/_ai/_ai_crawler.py | 173 +++++
.../crawlers/_ai/_ai_crawling_context.py | 44 ++
src/crawlee/crawlers/_ai/_base_distiller.py | 66 ++
src/crawlee/crawlers/_ai/_base_extractor.py | 113 ++++
.../crawlers/_ai/_clean_html_distiller.py | 260 +++++++
src/crawlee/crawlers/_ai/_direct_extractor.py | 144 ++++
src/crawlee/crawlers/_ai/_prompts.py | 47 ++
.../crawlers/_ai/_selector_extractor.py | 633 ++++++++++++++++++
.../crawlers/_ai/_skeleton_distiller.py | 216 ++++++
src/crawlee/crawlers/_ai/_types.py | 132 ++++
src/crawlee/crawlers/_ai/_utils.py | 28 +
uv.lock | 455 ++++++++++++-
15 files changed, 2395 insertions(+), 3 deletions(-)
create mode 100644 src/crawlee/crawlers/_ai/__init__.py
create mode 100644 src/crawlee/crawlers/_ai/_ai_crawler.py
create mode 100644 src/crawlee/crawlers/_ai/_ai_crawling_context.py
create mode 100644 src/crawlee/crawlers/_ai/_base_distiller.py
create mode 100644 src/crawlee/crawlers/_ai/_base_extractor.py
create mode 100644 src/crawlee/crawlers/_ai/_clean_html_distiller.py
create mode 100644 src/crawlee/crawlers/_ai/_direct_extractor.py
create mode 100644 src/crawlee/crawlers/_ai/_prompts.py
create mode 100644 src/crawlee/crawlers/_ai/_selector_extractor.py
create mode 100644 src/crawlee/crawlers/_ai/_skeleton_distiller.py
create mode 100644 src/crawlee/crawlers/_ai/_types.py
create mode 100644 src/crawlee/crawlers/_ai/_utils.py
diff --git a/pyproject.toml b/pyproject.toml
index 811622d742..16c43e1df4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -50,7 +50,7 @@ dependencies = [
]
[project.optional-dependencies]
-all = ["crawlee[adaptive-crawler,beautifulsoup,cli,curl-impersonate,httpx,parsel,playwright,otel,sql_sqlite,sql_postgres,sql_mysql,stagehand,redis]"]
+all = ["crawlee[adaptive-crawler,ai,beautifulsoup,cli,curl-impersonate,httpx,parsel,playwright,otel,sql_sqlite,sql_postgres,sql_mysql,stagehand,redis]"]
adaptive-crawler = [
"jaro-winkler>=2.0.3",
"playwright>=1.27.0",
@@ -58,6 +58,7 @@ adaptive-crawler = [
"apify_fingerprint_datapoints>=0.0.3",
"browserforge>=1.2.4"
]
+ai = ["pydantic-ai-slim[openai]>=1.106.0", "parsel>=1.10.0", "lxml[html_clean]>=5.2.0"]
beautifulsoup = ["beautifulsoup4[lxml]>=4.12.0", "html5lib>=1.0"]
cli = ["cookiecutter>=2.6.0", "inquirer>=3.3.0", "rich>=13.9.0", "typer>=0.12.0"]
curl-impersonate = ["curl-cffi>=0.9.0"]
diff --git a/src/crawlee/crawlers/__init__.py b/src/crawlee/crawlers/__init__.py
index ac97581bb0..2e67efa985 100644
--- a/src/crawlee/crawlers/__init__.py
+++ b/src/crawlee/crawlers/__init__.py
@@ -65,6 +65,36 @@
StagehandPreNavCrawlingContext,
)
+with _try_import(
+ __name__,
+ 'AiCleanHtmlDistiller',
+ 'AiCrawler',
+ 'AiCrawlingContext',
+ 'AiDirectExtractor',
+ 'AiHtmlDistiller',
+ 'AiHtmlExtractor',
+ 'AiSelectorExtractor',
+ 'AiSkeletonDistiller',
+ 'AiUsageStats',
+ 'BaseAiHtmlDistiller',
+ 'BaseAiHtmlExtractor',
+ 'get_basic_ai_cleaner',
+):
+ from ._ai import (
+ AiCleanHtmlDistiller,
+ AiCrawler,
+ AiCrawlingContext,
+ AiDirectExtractor,
+ AiHtmlDistiller,
+ AiHtmlExtractor,
+ AiSelectorExtractor,
+ AiSkeletonDistiller,
+ AiUsageStats,
+ BaseAiHtmlDistiller,
+ BaseAiHtmlExtractor,
+ get_basic_ai_cleaner,
+ )
+
__all__ = [
'AbstractHttpCrawler',
@@ -74,6 +104,17 @@
'AdaptivePlaywrightCrawlingContext',
'AdaptivePlaywrightPostNavCrawlingContext',
'AdaptivePlaywrightPreNavCrawlingContext',
+ 'AiCleanHtmlDistiller',
+ 'AiCrawler',
+ 'AiCrawlingContext',
+ 'AiDirectExtractor',
+ 'AiHtmlDistiller',
+ 'AiHtmlExtractor',
+ 'AiSelectorExtractor',
+ 'AiSkeletonDistiller',
+ 'AiUsageStats',
+ 'BaseAiHtmlDistiller',
+ 'BaseAiHtmlExtractor',
'BasicCrawler',
'BasicCrawlerOptions',
'BasicCrawlingContext',
@@ -99,4 +140,5 @@
'StagehandCrawlingContext',
'StagehandPostNavCrawlingContext',
'StagehandPreNavCrawlingContext',
+ 'get_basic_ai_cleaner',
]
diff --git a/src/crawlee/crawlers/_ai/__init__.py b/src/crawlee/crawlers/_ai/__init__.py
new file mode 100644
index 0000000000..90571efc04
--- /dev/null
+++ b/src/crawlee/crawlers/_ai/__init__.py
@@ -0,0 +1,42 @@
+from crawlee._utils.try_import import install_import_hook as _install_import_hook
+from crawlee._utils.try_import import try_import as _try_import
+
+_install_import_hook(__name__)
+
+# The following imports are wrapped in try_import to handle optional dependencies (the `ai` extra),
+# ensuring the module can still function even if these dependencies are missing.
+with _try_import(__name__, 'AiCrawler'):
+ from ._ai_crawler import AiCrawler
+with _try_import(__name__, 'AiCrawlingContext'):
+ from ._ai_crawling_context import AiCrawlingContext
+with _try_import(__name__, 'BaseAiHtmlExtractor'):
+ from ._base_extractor import BaseAiHtmlExtractor
+with _try_import(__name__, 'AiDirectExtractor'):
+ from ._direct_extractor import AiDirectExtractor
+with _try_import(__name__, 'AiSelectorExtractor'):
+ from ._selector_extractor import AiSelectorExtractor
+with _try_import(__name__, 'BaseAiHtmlDistiller'):
+ from ._base_distiller import BaseAiHtmlDistiller
+with _try_import(__name__, 'AiCleanHtmlDistiller'):
+ from ._clean_html_distiller import AiCleanHtmlDistiller
+with _try_import(__name__, 'AiSkeletonDistiller'):
+ from ._skeleton_distiller import AiSkeletonDistiller
+with _try_import(__name__, 'AiHtmlDistiller', 'AiHtmlExtractor', 'AiUsageStats'):
+ from ._types import AiHtmlDistiller, AiHtmlExtractor, AiUsageStats
+with _try_import(__name__, 'get_basic_ai_cleaner'):
+ from ._utils import get_basic_ai_cleaner
+
+__all__ = [
+ 'AiCleanHtmlDistiller',
+ 'AiCrawler',
+ 'AiCrawlingContext',
+ 'AiDirectExtractor',
+ 'AiHtmlDistiller',
+ 'AiHtmlExtractor',
+ 'AiSelectorExtractor',
+ 'AiSkeletonDistiller',
+ 'AiUsageStats',
+ 'BaseAiHtmlDistiller',
+ 'BaseAiHtmlExtractor',
+ 'get_basic_ai_cleaner',
+]
diff --git a/src/crawlee/crawlers/_ai/_ai_crawler.py b/src/crawlee/crawlers/_ai/_ai_crawler.py
new file mode 100644
index 0000000000..5c89d20e1f
--- /dev/null
+++ b/src/crawlee/crawlers/_ai/_ai_crawler.py
@@ -0,0 +1,173 @@
+from __future__ import annotations
+
+import warnings
+from contextlib import AbstractAsyncContextManager
+from logging import getLogger
+from typing import TYPE_CHECKING
+
+from parsel import Selector
+
+from crawlee._utils.docs import docs_group
+from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions
+from crawlee.crawlers._parsel._parsel_crawling_context import ParselCrawlingContext
+from crawlee.crawlers._parsel._parsel_parser import ParselParser
+
+from ._ai_crawling_context import AiCrawlingContext
+from ._direct_extractor import AiDirectExtractor
+
+if TYPE_CHECKING:
+ from collections.abc import AsyncGenerator
+
+ from pydantic_ai.models import Model
+ from typing_extensions import Unpack
+
+ from crawlee import Request
+ from crawlee.crawlers._abstract_http import ParsedHttpCrawlingContext
+
+ from ._types import AiHtmlExtractor, AiUsageStats, ExtractFunction, TSchema
+
+
+logger = getLogger(__name__)
+
+
+@docs_group('Crawlers')
+class AiCrawler(AbstractHttpCrawler[AiCrawlingContext, Selector, Selector]):
+    """A web crawler that extracts structured data from pages using an AI model.
+
+    Builds on `AbstractHttpCrawler` and parses responses with Parsel, so the request handler has both the usual
+    Parsel `selector` and the AI-powered `extract` helper: pass a Pydantic model and get a validated instance back.
+
+    The model layer is Pydantic AI, so any provider it supports (OpenAI, Anthropic, Gemini, Ollama, ...) works
+    through the `model` argument. The default extractor is an `AiDirectExtractor`: each page is distilled and sent
+    to the model in one call. For cached CSS-selector extraction at near-zero LLM cost, pass an `AiSelectorExtractor`
+    through the `extractor` argument.
+
+    Warning:
+        This is an experimental crawler. Its public API may change in future versions.
+
+    ### Usage
+
+    ```python
+    from pydantic import BaseModel
+    from pydantic_ai.models.openai import OpenAIChatModel
+    from pydantic_ai.providers.openai import OpenAIProvider
+
+    from crawlee.crawlers import AiCrawler, AiCrawlingContext
+
+
+    class Article(BaseModel):
+        title: str
+        author: str | None
+
+
+    crawler = AiCrawler(model=OpenAIChatModel('gpt-5.4-nano', provider=OpenAIProvider(api_key='...')))
+
+
+    @crawler.router.default_handler
+    async def request_handler(context: AiCrawlingContext) -> None:
+        article = await context.extract(Article)
+        await context.push_data(article.model_dump())
+
+
+    await crawler.run(['https://crawlee.dev/'])
+    ```
+    """
+
+    def __init__(
+        self,
+        *,
+        model: str | Model | None = None,
+        extractor: AiHtmlExtractor | None = None,
+        **kwargs: Unpack[HttpCrawlerOptions[AiCrawlingContext]],
+    ) -> None:
+        """Initialize a new instance.
+
+        Args:
+            model: The model used for extraction, given to the default extractor (`AiDirectExtractor`). A
+                provider-prefixed name (e.g. `'openai:gpt-5.4-nano'`) or a Pydantic AI `Model` instance. When given
+                as a string, the provider reads credentials from its environment variable (e.g. `OPENAI_API_KEY`).
+                Pass a `Model` instance to supply them explicitly. Provide exactly one of `model` or `extractor`.
+            extractor: A pre-configured `AiHtmlExtractor`, for full control over the distiller, instructions,
+                caching, usage limits, and model fallback. Pass an `AiSelectorExtractor` here for cached-selector
+                extraction. Provide exactly one of `model` or `extractor`.
+            kwargs: Additional keyword arguments to pass to the underlying `AbstractHttpCrawler`.
+
+        Raises:
+            ValueError: If `model` and `extractor` are both provided or both omitted.
+        """
+        if (model is None) == (extractor is None):
+            raise ValueError('Provide exactly one of `model` or `extractor`.')
+
+        if extractor is None and model is not None:
+            extractor = AiDirectExtractor(model)
+
+        # Identity check, not truthiness: a valid extractor defining `__len__`/`__bool__` may evaluate as falsy.
+        # The checks above make this branch unreachable at runtime; it narrows the type for static checkers.
+        if extractor is None:
+            raise ValueError('Extractor initialization failed; check the provided model or extractor configuration.')
+
+        # Warn at construction time, i.e. once per crawler instance rather than on every `extract` call.
+        # `stacklevel=2` attributes the warning to the caller of `__init__` instead of this module.
+        warnings.warn(
+            'The AiCrawler is experimental and its public API may change in future releases.',
+            category=UserWarning,
+            stacklevel=2,
+        )
+
+        self._ai_usage = extractor.ai_usage
+        self._extractor = extractor
+
+        async def final_step(
+            context: ParsedHttpCrawlingContext[Selector],
+        ) -> AsyncGenerator[AiCrawlingContext, None]:
+            """Enhance `ParsedHttpCrawlingContext[Selector]` with the `extract` helper and `ai_usage`."""
+            parsel_context = ParselCrawlingContext.from_parsed_http_crawling_context(context)
+            yield AiCrawlingContext.from_parsel_crawling_context(
+                parsel_context,
+                extract=self._create_extract_function(parsel_context.selector, parsel_context.request),
+                ai_usage=self._ai_usage,
+            )
+
+        kwargs['_context_pipeline'] = self._create_static_content_crawler_pipeline().compose(final_step)
+
+        # If the extractor is an async context manager, add it to the crawler's additional context managers so it's
+        # properly entered and exited around the crawl.
+        if isinstance(extractor, AbstractAsyncContextManager):
+            kwargs['_additional_context_managers'] = [*kwargs.get('_additional_context_managers', []), extractor]
+        super().__init__(parser=ParselParser(), **kwargs)
+
+    @property
+    def extractor(self) -> AiHtmlExtractor:
+        """The extractor used to turn pages into structured data."""
+        return self._extractor
+
+    @property
+    def ai_usage(self) -> AiUsageStats:
+        """Accumulated token usage across extraction calls."""
+        return self._ai_usage
+
+    def _create_extract_function(self, selector: Selector, request: Request) -> ExtractFunction:
+        """Build an `extract` helper bound to the page's parsed tree.
+
+        When the caller omits `cache_tag`, it defaults to `request.label` so an `AiSelectorExtractor` buckets
+        selectors per route without extra wiring. An explicit `cache_tag` overrides this.
+        """
+
+        async def extract(
+            schema: type[TSchema],
+            *,
+            scope: str | None = None,
+            cache_tag: str | None = None,
+            additional_instructions: str | None = None,
+        ) -> TSchema:
+            # `AiHtmlExtractor.extract` accepts a Selector directly, so the already-parsed tree is handed over
+            # without a serialize round trip.
+            return await self._extractor.extract(
+                selector,
+                schema,
+                scope=scope,
+                cache_tag=cache_tag if cache_tag is not None else request.label,
+                additional_instructions=additional_instructions,
+            )
+
+        return extract
diff --git a/src/crawlee/crawlers/_ai/_ai_crawling_context.py b/src/crawlee/crawlers/_ai/_ai_crawling_context.py
new file mode 100644
index 0000000000..18377a6644
--- /dev/null
+++ b/src/crawlee/crawlers/_ai/_ai_crawling_context.py
@@ -0,0 +1,44 @@
+from __future__ import annotations
+
+from dataclasses import dataclass, fields
+from typing import TYPE_CHECKING
+
+from crawlee._utils.docs import docs_group
+from crawlee.crawlers._parsel._parsel_crawling_context import ParselCrawlingContext
+
+if TYPE_CHECKING:
+ from typing_extensions import Self
+
+ from ._types import AiUsageStats, ExtractFunction
+
+
+@dataclass(frozen=True)
+@docs_group('Crawling contexts')
+class AiCrawlingContext(ParselCrawlingContext):
+    """Crawling context handed to the request handlers of an `AiCrawler`.
+
+    As a `ParselCrawlingContext` subclass it keeps the full Parsel `selector` (and `enqueue_links`) available
+    next to the AI-powered `extract` helper, so a handler can combine cheap hand-written selectors with AI
+    extraction on the same page.
+    """
+
+    extract: ExtractFunction
+    """Extract a structured Pydantic model from the page using the configured AI extractor."""
+
+    ai_usage: AiUsageStats
+    """The cumulative token usage stats of the extractor across calls in this crawl."""
+
+    @classmethod
+    def from_parsel_crawling_context(
+        cls,
+        context: ParselCrawlingContext,
+        *,
+        extract: ExtractFunction,
+        ai_usage: AiUsageStats,
+    ) -> Self:
+        """Build an instance that copies every field of `context` and adds `extract` and `ai_usage`."""
+        inherited = {
+            field.name: getattr(context, field.name)
+            for field in fields(context)
+        }
+        return cls(**inherited, extract=extract, ai_usage=ai_usage)
diff --git a/src/crawlee/crawlers/_ai/_base_distiller.py b/src/crawlee/crawlers/_ai/_base_distiller.py
new file mode 100644
index 0000000000..3567054167
--- /dev/null
+++ b/src/crawlee/crawlers/_ai/_base_distiller.py
@@ -0,0 +1,66 @@
+from __future__ import annotations
+
+import re
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING
+
+from crawlee._utils.docs import docs_group
+
+if TYPE_CHECKING:
+ from lxml.html import HtmlElement
+
+
+# Placeholder tag used to hide JSON scripts from the cleaning pass. The cleaner removes `Item
Body
'
+
+
+def test_drops_noise_tags() -> None:
+ distilled_html = AiCleanHtmlDistiller().distill('
'
+ distilled_html = AiCleanHtmlDistiller().distill(html)
+
+ assert distilled_html == html
+
+
+def test_drops_data_uri_attribute() -> None:
+ distilled_html = AiCleanHtmlDistiller().distill('
')
+
+ assert distilled_html == '
'
+
+
+def test_limited_class_attribute() -> None:
+ distilled_html = AiCleanHtmlDistiller(max_classes=2).distill('
'
+
+
+def test_drops_empty_class_attribute() -> None:
+ distilled_html = AiCleanHtmlDistiller().distill('
'
+
+
+def test_truncates_long_attribute_values() -> None:
+ distilled_html = AiCleanHtmlDistiller(max_attr_len=5).distill(f'link')
+
+ assert distilled_html == f'link'
+
+
+def test_truncates_json_payload() -> None:
+ distilled_html = AiCleanHtmlDistiller(max_json_len=5).distill(
+ '