diff --git a/docs/guides/ai_crawler.mdx b/docs/guides/ai_crawler.mdx
new file mode 100644
index 0000000000..17d030fe89
--- /dev/null
+++ b/docs/guides/ai_crawler.mdx
@@ -0,0 +1,150 @@
+---
+id: ai-crawler
+title: AI crawler
+description: Learn how to use AiCrawler to extract structured data from HTML pages with an LLM.
+---
+
+import ApiLink from '@site/src/components/ApiLink';
+import CodeBlock from '@theme/CodeBlock';
+
+import BasicExample from '!!raw-loader!./code_examples/ai_crawler/basic_example.py';
+import AdditionalInstructionsExample from '!!raw-loader!./code_examples/ai_crawler/additional_instructions_example.py';
+import CustomDistillerExample from '!!raw-loader!./code_examples/ai_crawler/custom_distiller_example.py';
+import SelectorExtractorExample from '!!raw-loader!./code_examples/ai_crawler/selector_extractor_example.py';
+import UsageLimitExample from '!!raw-loader!./code_examples/ai_crawler/usage_limit_example.py';
+
+An `AiCrawler` extracts structured data from a page with an LLM. It fetches each page over plain HTTP and parses it with Parsel, then exposes an `extract` helper: pass a Pydantic model and get a validated instance back. Instead of writing CSS selectors for every field, you describe the data with a schema and the model fills it in.
+
+The model layer is [Pydantic AI](https://ai.pydantic.dev/), so any provider it supports (OpenAI, Anthropic, Gemini, Ollama, ...) works through the `model` argument. The context is an `AiCrawlingContext`, which extends the `ParselCrawlingContext`, so the manual `selector` and `enqueue_links` stay available next to `extract`.
+
+:::caution Experimental
+
+`AiCrawler` is experimental. Its public API may change in future releases.
+
+:::
+
+## When to use AiCrawler
+
+Use `AiCrawler` when:
+
+- Selectors are unknown or brittle. The model reads the content, so it tolerates markup that varies or changes.
+- One schema spans many layouts. A single Pydantic model fits differently structured pages, with no per-page selectors.
+- Rapid prototyping. You describe the data with a schema instead of writing selectors.
+
+For pages with a stable, known structure, a plain `ParselCrawler` or `BeautifulSoupCrawler` is cheaper, since it runs no model calls.
+
+`AiCrawler` fetches pages over plain HTTP and does not render JavaScript. For pages that need a browser, or for complex multi-step interactions, use `StagehandCrawler`. See the [Stagehand crawler guide](./stagehand-crawler).
+
+## Installation
+
+`AiCrawler` requires the `ai` optional dependency group:
+
+```bash
+pip install 'crawlee[ai]'
+```
+
+or with uv:
+
+```bash
+uv add 'crawlee[ai]'
+```
+
+The `ai` extra installs the OpenAI integration by default. To use another provider, add the matching [pydantic-ai-slim](https://ai.pydantic.dev/install/#use-with-pydantic-ai-slim) extra. For example, for Anthropic:
+
+```bash
+pip install 'crawlee[ai]' 'pydantic-ai-slim[anthropic]'
+```
+
+## Basic usage
+
+Provide a `model` and call `context.extract` with a Pydantic model inside the handler. The example below extracts an article and pushes it to the dataset.
+
+<CodeBlock className="language-python">
+    {BasicExample}
+</CodeBlock>
+
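+The `model` argument builds the crawler's default extractor, an `AiDirectExtractor`. When neither `model` nor `extractor` is given, the crawler uses the default model `openai:gpt-5.4-nano`. Passing both raises a `ValueError`; to use a specific model with a custom extractor, pass the model to the extractor itself.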
+
+The `model` argument accepts a provider-prefixed name or a Pydantic AI `Model` instance.
+
+```python
+# A provider-prefixed name reads credentials from the provider's environment variable (e.g. OPENAI_API_KEY).
+crawler = AiCrawler(model='openai:gpt-5.4-nano')
+
+# A Model instance takes credentials explicitly.
+from pydantic_ai.models.openai import OpenAIChatModel
+from pydantic_ai.providers.openai import OpenAIProvider
+
+model = OpenAIChatModel('gpt-5.4-nano', provider=OpenAIProvider(api_key='...'))
+crawler = AiCrawler(model=model)
+```
+
+## Extractors
+
+An extractor turns a page into your schema. Extractors implement different strategies for working with the LLM, and each one uses an `AiHtmlDistiller` to shape the model's input. Crawlee ships two.
+
+### AiDirectExtractor
+
+`AiDirectExtractor` sends the distilled page to the model in one call. The schema is the model's output type. Pydantic AI validates the result; on a mismatch, it sends the error back to the model to fix, bounded by `retries`.
+
+It reads each page on its own, so extraction is accurate per page. It accepts schemas of any shape: nested models, lists, dictionaries, unions, and deep nesting. The cost is one model call per page, which scales poorly on a large site.
+
+Use `additional_instructions` to focus the model on the data you want:
+
+<CodeBlock className="language-python">
+    {AdditionalInstructionsExample}
+</CodeBlock>
+
+### AiSelectorExtractor
+
+`AiSelectorExtractor` asks the model for reusable CSS selectors on the first page of a route, caches them, and reuses them with no model call on later pages of the same layout, so it scales to large sites. When a page matches none of the cached selectors (a different markup variant), it generates and caches a new set, so one bucket can hold several variants. If selector generation fails, or the schema shape is unsupported, it degrades to the `fallback` extractor when one is set, and raises otherwise. Selectors are bucketed by `cache_tag`, which defaults to the request label, so each route keeps its own set. The cache is persisted to a `KeyValueStore`, so a later run reuses selectors learned earlier.
+
+<CodeBlock className="language-python">
+    {SelectorExtractorExample}
+</CodeBlock>
+
+It supports schemas built from scalar fields, lists of scalars, lists of items, and a single nested item, one level deep. For shapes it cannot serve (such as a `dict` field), set a `fallback` or use `AiDirectExtractor`.
+
+Both extractors share two more knobs. `retries` caps how many times the model may fix output that fails schema validation (default 1 for `AiDirectExtractor`, 3 for `AiSelectorExtractor`). `instructions` replaces the base task instructions entirely.
+
+## Distillers
+
+A distiller reduces raw HTML to a compact representation the model reads cheaply. Each extractor uses one. To use a different distiller, pass it through the extractor's `distiller` argument (the crawler itself has no `distiller` argument).
+
+`AiDirectExtractor` defaults to an `AiCleanHtmlDistiller`: cleaned, structure-preserving HTML that keeps the full page text. `AiSelectorExtractor` uses an `AiSkeletonDistiller` internally to ask the model for selectors; you rarely set it yourself.
+
+### Custom distiller
+
+Subclass `BaseAiHtmlDistiller` and implement `distill` to send a different representation. Set `prompt_notes` so the model knows the input format. The extractor appends the notes to its instructions.
+
+The example below converts the cleaned page to Markdown with [html-to-markdown](https://pypi.org/project/html-to-markdown/), an extra dependency:
+
+```bash
+pip install html-to-markdown
+```
+
+<CodeBlock className="language-python">
+    {CustomDistillerExample}
+</CodeBlock>
+
+## Extract options
+
+`context.extract` takes options alongside the schema:
+
+- `scope` - a CSS selector that restricts extraction to the first matching subtree (e.g. `main` or `article.post`). It saves tokens and keeps the model away from unrelated parts of the page.
+- `cache_tag` - the bucket for cached selectors. It defaults to the request label.
+- `additional_instructions` - extra instructions for this call, appended to the base instructions. With `AiSelectorExtractor` they steer the one-time selector generation, not each extraction, so use them to point the model at the right region.
+
+## Usage and cost
+
+Token usage accumulates in a single `AiUsageStats` object that is exposed both as `crawler.ai_usage` and as `context.ai_usage`, so either one reports the running total for the whole crawl. It provides `requests`, `input_tokens`, `output_tokens`, and `total_tokens`.
+
+To cap spend, pass `usage_limits` (a pydantic-ai `UsageLimits`) to an extractor. It applies to every model run, and `extract` raises `UsageLimitExceeded` when a page needs more. The example below caps each extraction, logs and skips pages that exceed it, and stops the whole crawl once a token budget is spent.
+
+<CodeBlock className="language-python">
+    {UsageLimitExample}
+</CodeBlock>
+
+## Conclusion
+
+This guide introduced `AiCrawler` and its `extract` helper, the `AiDirectExtractor` and `AiSelectorExtractor` strategies, the built-in and custom distillers, the extract options, and how to track and cap token usage. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/crawlee-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping!
diff --git a/docs/guides/architecture_overview.mdx b/docs/guides/architecture_overview.mdx
index f9c4b764fb..f86f5041da 100644
--- a/docs/guides/architecture_overview.mdx
+++ b/docs/guides/architecture_overview.mdx
@@ -49,6 +49,8 @@ class ParselCrawler
class BeautifulSoupCrawler
+class AiCrawler
+
class PlaywrightCrawler
class AdaptivePlaywrightCrawler
@@ -65,6 +67,7 @@ BasicCrawler --|> AdaptivePlaywrightCrawler
AbstractHttpCrawler --|> HttpCrawler
AbstractHttpCrawler --|> ParselCrawler
AbstractHttpCrawler --|> BeautifulSoupCrawler
+AbstractHttpCrawler --|> AiCrawler
PlaywrightCrawler --|> StagehandCrawler
```
@@ -72,11 +75,12 @@ PlaywrightCrawler --|> StagehandCrawler
HTTP crawlers use HTTP clients to fetch pages and parse them with HTML parsing libraries. They are fast and efficient for sites that do not require JavaScript rendering. HTTP clients are Crawlee components that wrap around HTTP libraries like [httpx](https://www.python-httpx.org/), [curl-impersonate](https://github.com/lwthiker/curl-impersonate) or [impit](https://apify.github.io/impit) and handle HTTP communication for requests and responses. You can learn more about them in the [HTTP clients guide](./http-clients).
-HTTP crawlers inherit from `AbstractHttpCrawler` and there are three crawlers that belong to this category:
+HTTP crawlers inherit from `AbstractHttpCrawler` and there are four crawlers that belong to this category:
- `BeautifulSoupCrawler` utilizes the [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) HTML parser.
- `ParselCrawler` utilizes [Parsel](https://github.com/scrapy/parsel) for parsing HTML.
- `HttpCrawler` does not parse HTTP responses at all and is used when no content parsing is required.
+- `AiCrawler` parses HTML with Parsel and uses an LLM to extract structured data into a validated Pydantic model.
You can learn more about HTTP crawlers in the [HTTP crawlers guide](./http-crawlers).
@@ -120,6 +124,8 @@ class ParselCrawlingContext
class BeautifulSoupCrawlingContext
+class AiCrawlingContext
+
class PlaywrightPreNavCrawlingContext
class PlaywrightCrawlingContext
@@ -148,6 +154,8 @@ ParsedHttpCrawlingContext --|> ParselCrawlingContext
ParsedHttpCrawlingContext --|> BeautifulSoupCrawlingContext
+ParselCrawlingContext --|> AiCrawlingContext
+
BasicCrawlingContext --|> PlaywrightPreNavCrawlingContext
PlaywrightPreNavCrawlingContext --|> PlaywrightCrawlingContext
@@ -168,6 +176,7 @@ They have a similar inheritance structure as the crawlers, with the base class b
- `ParsedHttpCrawlingContext` for HTTP crawlers with parsed responses.
- `ParselCrawlingContext` for HTTP crawlers that use [Parsel](https://github.com/scrapy/parsel) for parsing.
- `BeautifulSoupCrawlingContext` for HTTP crawlers that use [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) for parsing.
+- `AiCrawlingContext` for the AI crawler, extending the Parsel context with an `extract` helper.
- `PlaywrightPreNavCrawlingContext` for Playwright crawlers before the page is navigated.
- `PlaywrightCrawlingContext` for Playwright crawlers.
- `AdaptivePlaywrightPreNavCrawlingContext` for Adaptive Playwright crawlers before the page is navigated.
diff --git a/docs/guides/code_examples/ai_crawler/additional_instructions_example.py b/docs/guides/code_examples/ai_crawler/additional_instructions_example.py
new file mode 100644
index 0000000000..aae0397da9
--- /dev/null
+++ b/docs/guides/code_examples/ai_crawler/additional_instructions_example.py
@@ -0,0 +1,44 @@
+import asyncio
+
+from pydantic import BaseModel
+from pydantic_ai.models.openai import OpenAIChatModel
+from pydantic_ai.providers.openai import OpenAIProvider
+
+from crawlee.crawlers import AiCrawler, AiCrawlingContext
+
+
+class Post(BaseModel):
+ """Model representing a single post."""
+
+ title: str
+ url: str
+
+
+class Posts(BaseModel):
+ """Model representing the extracted list of posts."""
+
+ posts: list[Post]
+
+
+async def main() -> None:
+    """Extract a list of posts from Hacker News, steering the model with an extra instruction."""
+    # The provider receives the API key explicitly; replace the placeholder with a real key.
+    provider = OpenAIProvider(api_key='your-openai-api-key')
+    llm = OpenAIChatModel('gpt-5.4-nano', provider=provider)
+
+    crawler = AiCrawler(model=llm, max_requests_per_crawl=5)
+
+    @crawler.router.default_handler
+    async def extract_posts(context: AiCrawlingContext) -> None:
+        # The instruction narrows what the model returns from the page.
+        instruction = 'Extract only the top five posts on the page.'
+        top_posts = await context.extract(Posts, additional_instructions=instruction)
+
+        await context.push_data(top_posts.model_dump())
+
+    start_urls = ['https://news.ycombinator.com']
+    await crawler.run(start_urls)
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
diff --git a/docs/guides/code_examples/ai_crawler/basic_example.py b/docs/guides/code_examples/ai_crawler/basic_example.py
new file mode 100644
index 0000000000..7cdd458ce7
--- /dev/null
+++ b/docs/guides/code_examples/ai_crawler/basic_example.py
@@ -0,0 +1,41 @@
+import asyncio
+
+from pydantic import BaseModel
+from pydantic_ai.models.openai import OpenAIChatModel
+from pydantic_ai.providers.openai import OpenAIProvider
+
+from crawlee.crawlers import AiCrawler, AiCrawlingContext
+
+
+class Article(BaseModel):
+ """Model representing the extracted data for an article."""
+
+ title: str
+ short_text: str
+
+
+async def main() -> None:
+    """Crawl crawlee.dev and extract an `Article` from every visited page."""
+    # Set the provider with the API key explicitly.
+    provider = OpenAIProvider(api_key='your-openai-api-key')
+    llm = OpenAIChatModel('gpt-5.4-nano', provider=provider)
+
+    crawler = AiCrawler(model=llm, max_requests_per_crawl=5)
+
+    @crawler.router.default_handler
+    async def extract_article(context: AiCrawlingContext) -> None:
+        context.log.info(f'Processing {context.request.url} ...')
+
+        # `extract` takes the schema class and returns a validated instance of it.
+        article = await context.extract(Article)
+
+        await context.push_data(article.model_dump())
+
+        # Enqueue the links found on the page so the crawl continues.
+        await context.enqueue_links()
+
+    await crawler.run(['https://crawlee.dev/'])
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
diff --git a/docs/guides/code_examples/ai_crawler/custom_distiller_example.py b/docs/guides/code_examples/ai_crawler/custom_distiller_example.py
new file mode 100644
index 0000000000..fb1faca290
--- /dev/null
+++ b/docs/guides/code_examples/ai_crawler/custom_distiller_example.py
@@ -0,0 +1,67 @@
+import asyncio
+
+from html_to_markdown import convert
+from lxml_html_clean import Cleaner
+from pydantic import BaseModel
+from pydantic_ai.models.openai import OpenAIChatModel
+from pydantic_ai.providers.openai import OpenAIProvider
+
+from crawlee.crawlers import (
+ AiCrawler,
+ AiCrawlingContext,
+ AiDirectExtractor,
+ BaseAiHtmlDistiller,
+ get_basic_ai_cleaner,
+)
+
+# Notes appended to the model instructions so it knows the input format.
+MARKDOWN_PROMPT_NOTES = 'The document is Markdown converted from the HTML page.'
+
+
+class MarkdownDistiller(BaseAiHtmlDistiller):
+    """Distiller that cleans the page HTML and then renders it as Markdown."""
+
+    def __init__(self, cleaner: Cleaner | None = None) -> None:
+        super().__init__(prompt_notes=MARKDOWN_PROMPT_NOTES)
+        # Clean the page before conversion; without a custom cleaner the basic AI cleaner is used.
+        self._cleaner = get_basic_ai_cleaner() if not cleaner else cleaner
+
+    def distill(self, html: str) -> str:
+        cleaned_html = self._cleaner.clean_html(html)
+        return convert(cleaned_html).content or ''
+
+
+class Article(BaseModel):
+ """Model representing the extracted data for an article."""
+
+ title: str
+ short_text: str
+
+
+async def main() -> None:
+    """Run an `AiCrawler` whose extractor reads Markdown produced by the custom distiller."""
+    # Set the provider with the API key explicitly.
+    provider = OpenAIProvider(api_key='your-openai-api-key')
+    llm = OpenAIChatModel('gpt-5.4-nano', provider=provider)
+
+    # The distiller is configured on the extractor, which is then handed to the crawler.
+    markdown_extractor = AiDirectExtractor(model=llm, distiller=MarkdownDistiller())
+    crawler = AiCrawler(extractor=markdown_extractor, max_requests_per_crawl=5)
+
+    @crawler.router.default_handler
+    async def extract_article(context: AiCrawlingContext) -> None:
+        # Pass a Pydantic model and get a validated instance back.
+        article = await context.extract(Article)
+
+        await context.push_data(article.model_dump())
+
+        # Link enqueueing works as usual; distillation and extraction do not
+        # affect the rest of the crawling logic.
+        await context.enqueue_links()
+
+    start_urls = ['https://crawlee.dev/']
+    await crawler.run(start_urls)
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
diff --git a/docs/guides/code_examples/ai_crawler/selector_extractor_example.py b/docs/guides/code_examples/ai_crawler/selector_extractor_example.py
new file mode 100644
index 0000000000..f6dd98eb39
--- /dev/null
+++ b/docs/guides/code_examples/ai_crawler/selector_extractor_example.py
@@ -0,0 +1,56 @@
+import asyncio
+
+from pydantic import BaseModel
+from pydantic_ai.models.openai import OpenAIChatModel
+from pydantic_ai.providers.openai import OpenAIProvider
+
+from crawlee import Glob
+from crawlee.crawlers import (
+ AiCrawler,
+ AiCrawlingContext,
+ AiDirectExtractor,
+ AiSelectorExtractor,
+)
+
+
+class Article(BaseModel):
+ """Model representing the extracted data for an article."""
+
+ title: str
+ main_text: str
+
+
+async def main() -> None:
+    """Crawl the Crawlee blog and extract articles with cached, model-generated selectors."""
+    # The provider receives the API key explicitly; replace the placeholder with a real key.
+    provider = OpenAIProvider(api_key='your-openai-api-key')
+    llm = OpenAIChatModel('gpt-5.4-nano', provider=provider)
+
+    # When selector generation fails, extraction degrades to the direct extractor.
+    direct_fallback = AiDirectExtractor(model=llm)
+    selector_extractor = AiSelectorExtractor(model=llm, fallback=direct_fallback)
+
+    crawler = AiCrawler(extractor=selector_extractor, max_requests_per_crawl=10)
+
+    @crawler.router.default_handler
+    async def enqueue_articles(context: AiCrawlingContext) -> None:
+        # Enqueue blog article pages; the article handler extracts the data.
+        blog_posts = [Glob('https://crawlee.dev/blog/*')]
+
+        await context.enqueue_links(include=blog_posts, label='article')
+
+    @crawler.router.handler('article')
+    async def extract_article(context: AiCrawlingContext) -> None:
+        # No `cache_tag` is passed, so selectors are bucketed under the `article` label:
+        # the first page generates them and later pages reuse them with no LLM call.
+        article = await context.extract(Article)
+
+        await context.push_data(article.model_dump())
+
+    # Start from the blog index; the default handler enqueues the article pages.
+    start_urls = ['https://crawlee.dev/blog']
+    await crawler.run(start_urls)
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
diff --git a/docs/guides/code_examples/ai_crawler/usage_limit_example.py b/docs/guides/code_examples/ai_crawler/usage_limit_example.py
new file mode 100644
index 0000000000..7b0985af2f
--- /dev/null
+++ b/docs/guides/code_examples/ai_crawler/usage_limit_example.py
@@ -0,0 +1,57 @@
+import asyncio
+
+from pydantic import BaseModel
+from pydantic_ai.exceptions import UsageLimitExceeded
+from pydantic_ai.models.openai import OpenAIChatModel
+from pydantic_ai.providers.openai import OpenAIProvider
+from pydantic_ai.usage import UsageLimits
+
+from crawlee.crawlers import AiCrawler, AiCrawlingContext, AiDirectExtractor
+
+# Stop the whole crawl once this many tokens have been spent.
+TOKEN_BUDGET = 50_000
+
+
+class Article(BaseModel):
+ """Model representing the extracted data for an article."""
+
+ title: str
+ short_text: str
+
+
+async def main() -> None:
+    """Crawl with a per-extraction token cap and stop once the crawl-wide token budget is spent."""
+    model = OpenAIChatModel(
+        'gpt-5.4-nano',
+        provider=OpenAIProvider(api_key='your-openai-api-key'),
+    )
+    crawler = AiCrawler(
+        # Cap each extraction so a single oversized page cannot use an excessive number of tokens.
+        extractor=AiDirectExtractor(
+            model=model,
+            usage_limits=UsageLimits(total_tokens_limit=10_000),
+        ),
+        max_requests_per_crawl=5,
+    )
+
+    @crawler.router.default_handler
+    async def handler(context: AiCrawlingContext) -> None:
+        # `>=` so the crawl also stops when cumulative usage lands exactly on the budget.
+        if context.ai_usage.total_tokens >= TOKEN_BUDGET:
+            context.log.info('Token budget exhausted, stopping the crawler.')
+            crawler.stop()
+            return
+
+        try:
+            article = await context.extract(Article)
+        except UsageLimitExceeded:
+            # The page needs more tokens than the per-extraction limit allows; log and skip it.
+            context.log.warning(f'Content at {context.request.url} is too large.')
+        else:
+            await context.push_data(article.model_dump())
+
+    await crawler.run(['https://crawlee.dev/'])
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
diff --git a/pyproject.toml b/pyproject.toml
index 8a62da9e02..0593ffe3a8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -50,7 +50,7 @@ dependencies = [
]
[project.optional-dependencies]
-all = ["crawlee[adaptive-crawler,beautifulsoup,cli,curl-impersonate,httpx,parsel,playwright,otel,sql_sqlite,sql_postgres,sql_mysql,stagehand,redis]"]
+all = ["crawlee[adaptive-crawler,ai,beautifulsoup,cli,curl-impersonate,httpx,parsel,playwright,otel,sql_sqlite,sql_postgres,sql_mysql,stagehand,redis]"]
adaptive-crawler = [
"jaro-winkler>=2.0.3",
"playwright>=1.27.0",
@@ -58,6 +58,7 @@ adaptive-crawler = [
"apify_fingerprint_datapoints>=0.0.3",
"browserforge>=1.2.4"
]
+ai = ["pydantic-ai-slim[openai]>=1.106.0", "parsel>=1.10.0", "lxml[html_clean]>=5.2.0"]
beautifulsoup = ["beautifulsoup4[lxml]>=4.12.0", "html5lib>=1.0"]
cli = ["cookiecutter>=2.6.0", "inquirer>=3.3.0", "rich>=13.9.0", "typer>=0.12.0"]
curl-impersonate = ["curl-cffi>=0.9.0"]
diff --git a/src/crawlee/crawlers/__init__.py b/src/crawlee/crawlers/__init__.py
index ac97581bb0..2e67efa985 100644
--- a/src/crawlee/crawlers/__init__.py
+++ b/src/crawlee/crawlers/__init__.py
@@ -65,6 +65,36 @@
StagehandPreNavCrawlingContext,
)
+with _try_import(
+ __name__,
+ 'AiCleanHtmlDistiller',
+ 'AiCrawler',
+ 'AiCrawlingContext',
+ 'AiDirectExtractor',
+ 'AiHtmlDistiller',
+ 'AiHtmlExtractor',
+ 'AiSelectorExtractor',
+ 'AiSkeletonDistiller',
+ 'AiUsageStats',
+ 'BaseAiHtmlDistiller',
+ 'BaseAiHtmlExtractor',
+ 'get_basic_ai_cleaner',
+):
+ from ._ai import (
+ AiCleanHtmlDistiller,
+ AiCrawler,
+ AiCrawlingContext,
+ AiDirectExtractor,
+ AiHtmlDistiller,
+ AiHtmlExtractor,
+ AiSelectorExtractor,
+ AiSkeletonDistiller,
+ AiUsageStats,
+ BaseAiHtmlDistiller,
+ BaseAiHtmlExtractor,
+ get_basic_ai_cleaner,
+ )
+
__all__ = [
'AbstractHttpCrawler',
@@ -74,6 +104,17 @@
'AdaptivePlaywrightCrawlingContext',
'AdaptivePlaywrightPostNavCrawlingContext',
'AdaptivePlaywrightPreNavCrawlingContext',
+ 'AiCleanHtmlDistiller',
+ 'AiCrawler',
+ 'AiCrawlingContext',
+ 'AiDirectExtractor',
+ 'AiHtmlDistiller',
+ 'AiHtmlExtractor',
+ 'AiSelectorExtractor',
+ 'AiSkeletonDistiller',
+ 'AiUsageStats',
+ 'BaseAiHtmlDistiller',
+ 'BaseAiHtmlExtractor',
'BasicCrawler',
'BasicCrawlerOptions',
'BasicCrawlingContext',
@@ -99,4 +140,5 @@
'StagehandCrawlingContext',
'StagehandPostNavCrawlingContext',
'StagehandPreNavCrawlingContext',
+ 'get_basic_ai_cleaner',
]
diff --git a/src/crawlee/crawlers/_ai/__init__.py b/src/crawlee/crawlers/_ai/__init__.py
new file mode 100644
index 0000000000..90571efc04
--- /dev/null
+++ b/src/crawlee/crawlers/_ai/__init__.py
@@ -0,0 +1,42 @@
+from crawlee._utils.try_import import install_import_hook as _install_import_hook
+from crawlee._utils.try_import import try_import as _try_import
+
+_install_import_hook(__name__)
+
+# The following imports are wrapped in try_import to handle optional dependencies (the `ai` extra),
+# ensuring the module can still function even if these dependencies are missing.
+with _try_import(__name__, 'AiCrawler'):
+ from ._ai_crawler import AiCrawler
+with _try_import(__name__, 'AiCrawlingContext'):
+ from ._ai_crawling_context import AiCrawlingContext
+with _try_import(__name__, 'BaseAiHtmlExtractor'):
+ from ._base_extractor import BaseAiHtmlExtractor
+with _try_import(__name__, 'AiDirectExtractor'):
+ from ._direct_extractor import AiDirectExtractor
+with _try_import(__name__, 'AiSelectorExtractor'):
+ from ._selector_extractor import AiSelectorExtractor
+with _try_import(__name__, 'BaseAiHtmlDistiller'):
+ from ._base_distiller import BaseAiHtmlDistiller
+with _try_import(__name__, 'AiCleanHtmlDistiller'):
+ from ._clean_html_distiller import AiCleanHtmlDistiller
+with _try_import(__name__, 'AiSkeletonDistiller'):
+ from ._skeleton_distiller import AiSkeletonDistiller
+with _try_import(__name__, 'AiHtmlDistiller', 'AiHtmlExtractor', 'AiUsageStats'):
+ from ._types import AiHtmlDistiller, AiHtmlExtractor, AiUsageStats
+with _try_import(__name__, 'get_basic_ai_cleaner'):
+ from ._utils import get_basic_ai_cleaner
+
+__all__ = [
+ 'AiCleanHtmlDistiller',
+ 'AiCrawler',
+ 'AiCrawlingContext',
+ 'AiDirectExtractor',
+ 'AiHtmlDistiller',
+ 'AiHtmlExtractor',
+ 'AiSelectorExtractor',
+ 'AiSkeletonDistiller',
+ 'AiUsageStats',
+ 'BaseAiHtmlDistiller',
+ 'BaseAiHtmlExtractor',
+ 'get_basic_ai_cleaner',
+]
diff --git a/src/crawlee/crawlers/_ai/_ai_crawler.py b/src/crawlee/crawlers/_ai/_ai_crawler.py
new file mode 100644
index 0000000000..c4a92ea56e
--- /dev/null
+++ b/src/crawlee/crawlers/_ai/_ai_crawler.py
@@ -0,0 +1,174 @@
+from __future__ import annotations
+
+import warnings
+from contextlib import AbstractAsyncContextManager
+from logging import getLogger
+from typing import TYPE_CHECKING
+
+from parsel import Selector
+
+from crawlee._utils.docs import docs_group
+from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions
+from crawlee.crawlers._parsel._parsel_crawling_context import ParselCrawlingContext
+from crawlee.crawlers._parsel._parsel_parser import ParselParser
+
+from ._ai_crawling_context import AiCrawlingContext
+from ._direct_extractor import AiDirectExtractor
+
+if TYPE_CHECKING:
+ from collections.abc import AsyncGenerator
+
+ from pydantic_ai.models import Model
+ from typing_extensions import Unpack
+
+ from crawlee import Request
+ from crawlee.crawlers._abstract_http import ParsedHttpCrawlingContext
+
+ from ._types import AiHtmlExtractor, AiUsageStats, ExtractFunction, TSchema
+
+
+logger = getLogger(__name__)
+
+# Model used by the default extractor when neither `model` nor `extractor` is passed to `AiCrawler`.
+_DEFAULT_AI_MODEL = 'openai:gpt-5.4-nano'
+
+
+@docs_group('Crawlers')
+class AiCrawler(AbstractHttpCrawler[AiCrawlingContext, Selector, Selector]):
+ """A web crawler that extracts structured data from pages using an AI model.
+
+ Builds on `AbstractHttpCrawler` and parses responses with Parsel, so the request handler has both the usual
+ Parsel `selector` and the AI-powered `extract` helper: pass a Pydantic model and get a validated instance back.
+
+ The model layer is Pydantic AI, so any provider it supports (OpenAI, Anthropic, Gemini, Ollama, ...) works
+ through the `model` argument. The default extractor is an `AiDirectExtractor`: each page is distilled and sent
+ to the model in one call. For cached CSS-selector extraction at near-zero LLM cost, pass an `AiSelectorExtractor`
+ through the `extractor` argument.
+
+ Warning:
+ This is an experimental crawler. Its public API may change in future versions.
+
+ ### Usage
+
+ ```python
+ from pydantic import BaseModel
+ from pydantic_ai.models.openai import OpenAIChatModel
+ from pydantic_ai.providers.openai import OpenAIProvider
+
+ from crawlee.crawlers import AiCrawler, AiCrawlingContext
+
+
+ class Article(BaseModel):
+ title: str
+ author: str | None
+
+
+ crawler = AiCrawler(model=OpenAIChatModel('gpt-5.4-nano', provider=OpenAIProvider(api_key='...')))
+
+
+ @crawler.router.default_handler
+ async def request_handler(context: AiCrawlingContext) -> None:
+ article = await context.extract(Article)
+ await context.push_data(article.model_dump())
+
+
+ await crawler.run(['https://crawlee.dev/'])
+ ```
+ """
+
+    def __init__(
+        self,
+        *,
+        model: str | Model | None = None,
+        extractor: AiHtmlExtractor | None = None,
+        **kwargs: Unpack[HttpCrawlerOptions[AiCrawlingContext]],
+    ) -> None:
+        """Initialize a new instance.
+
+        Args:
+            model: The model handed to the default extractor (`AiDirectExtractor`): either a provider-prefixed
+                name (e.g. `'openai:gpt-5.4-nano'`) or a Pydantic AI `Model` instance. A name makes the provider
+                read its credentials from the environment (e.g. `OPENAI_API_KEY`); a `Model` instance carries
+                them explicitly. When neither `model` nor `extractor` is given, `'openai:gpt-5.4-nano'` is used.
+            extractor: A pre-configured `AiHtmlExtractor`, for full control over the distiller, instructions,
+                caching, usage limits, and model fallback. Pass an `AiSelectorExtractor` here for
+                cached-selector extraction.
+            kwargs: Additional keyword arguments to pass to the underlying `AbstractHttpCrawler`.
+
+        Raises:
+            ValueError: If both `model` and `extractor` are provided.
+        """
+        if not (model is None or extractor is None):
+            raise ValueError('Provide at most one of `model` or `extractor`.')
+
+        if extractor is None:
+            chosen_model = _DEFAULT_AI_MODEL if model is None else model
+            extractor = AiDirectExtractor(chosen_model)
+
+        # `stacklevel=2` attributes the experimental-API warning to the caller of `__init__`.
+        warnings.warn(
+            'The AiCrawler is experimental and its public API may change in future releases.',
+            category=UserWarning,
+            stacklevel=2,
+        )
+
+        self._extractor = extractor
+        self._ai_usage = extractor.ai_usage
+
+        async def final_step(
+            context: ParsedHttpCrawlingContext[Selector],
+        ) -> AsyncGenerator[AiCrawlingContext, None]:
+            """Enhance `ParsedHttpCrawlingContext[Selector]` with the `extract` helper and `ai_usage`."""
+            parsel_context = ParselCrawlingContext.from_parsed_http_crawling_context(context)
+            extract = self._create_extract_function(parsel_context.selector, parsel_context.request)
+            yield AiCrawlingContext.from_parsel_crawling_context(
+                parsel_context, extract=extract, ai_usage=self._ai_usage
+            )
+
+        pipeline = self._create_static_content_crawler_pipeline()
+        kwargs['_context_pipeline'] = pipeline.compose(final_step)
+
+        # An extractor that is an async context manager is registered with the crawler so that it is
+        # entered and exited around the crawl.
+        if isinstance(extractor, AbstractAsyncContextManager):
+            managers = list(kwargs.get('_additional_context_managers', []))
+            managers.append(extractor)
+            kwargs['_additional_context_managers'] = managers
+
+        super().__init__(parser=ParselParser(), **kwargs)
+
+ @property
+ def extractor(self) -> AiHtmlExtractor:
+ """The extractor that turns pages into structured data: the one passed in, or the default `AiDirectExtractor`."""
+ return self._extractor
+
+ @property
+ def ai_usage(self) -> AiUsageStats:
+ """Token usage accumulated by the extractor; the same object is exposed on every `AiCrawlingContext`."""
+ return self._ai_usage
+
+    def _create_extract_function(self, selector: Selector, request: Request) -> ExtractFunction:
+        """Build the `extract` helper bound to one page's parsed tree and its request.
+
+        An omitted `cache_tag` falls back to `request.label`, so an `AiSelectorExtractor` buckets selectors
+        per route without extra wiring. An explicit `cache_tag` takes precedence.
+        """
+
+        async def extract(
+            schema: type[TSchema],
+            *,
+            scope: str | None = None,
+            cache_tag: str | None = None,
+            additional_instructions: str | None = None,
+        ) -> TSchema:
+            effective_cache_tag = request.label if cache_tag is None else cache_tag
+            # The extractor accepts a `Selector`, so the parsed tree is passed without re-serializing it.
+            return await self._extractor.extract(
+                selector,
+                schema,
+                scope=scope,
+                cache_tag=effective_cache_tag,
+                additional_instructions=additional_instructions,
+            )
+
+        return extract
diff --git a/src/crawlee/crawlers/_ai/_ai_crawling_context.py b/src/crawlee/crawlers/_ai/_ai_crawling_context.py
new file mode 100644
index 0000000000..18377a6644
--- /dev/null
+++ b/src/crawlee/crawlers/_ai/_ai_crawling_context.py
@@ -0,0 +1,44 @@
+from __future__ import annotations
+
+from dataclasses import dataclass, fields
+from typing import TYPE_CHECKING
+
+from crawlee._utils.docs import docs_group
+from crawlee.crawlers._parsel._parsel_crawling_context import ParselCrawlingContext
+
+if TYPE_CHECKING:
+ from typing_extensions import Self
+
+ from ._types import AiUsageStats, ExtractFunction
+
+
+@dataclass(frozen=True)
+@docs_group('Crawling contexts')
+class AiCrawlingContext(ParselCrawlingContext):
+    """The crawling context used by the `AiCrawler`.
+
+    Because it extends `ParselCrawlingContext`, the Parsel `selector` and `enqueue_links` stay available
+    next to the AI-powered `extract` helper, so a handler can combine cheap manual selectors with AI
+    extraction on the same page.
+    """
+
+    extract: ExtractFunction
+    """Extract a structured Pydantic model from the page using the configured AI extractor."""
+
+    ai_usage: AiUsageStats
+    """The cumulative token usage stats of the extractor across calls in this crawl."""
+
+    @classmethod
+    def from_parsel_crawling_context(
+        cls,
+        context: ParselCrawlingContext,
+        *,
+        extract: ExtractFunction,
+        ai_usage: AiUsageStats,
+    ) -> Self:
+        """Create a new context from an existing `ParselCrawlingContext`.
+
+        Every field of `context` is copied over; `extract` and `ai_usage` are added on top.
+        """
+        inherited = {field.name: getattr(context, field.name) for field in fields(context)}
+        return cls(**inherited, extract=extract, ai_usage=ai_usage)
diff --git a/src/crawlee/crawlers/_ai/_base_distiller.py b/src/crawlee/crawlers/_ai/_base_distiller.py
new file mode 100644
index 0000000000..3567054167
--- /dev/null
+++ b/src/crawlee/crawlers/_ai/_base_distiller.py
@@ -0,0 +1,66 @@
+from __future__ import annotations
+
+import re
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING
+
+from crawlee._utils.docs import docs_group
+
+if TYPE_CHECKING:
+ from lxml.html import HtmlElement
+
+
+# Placeholder tag used to hide JSON scripts from the cleaning pass. The cleaner removes `
Item
Body
'
+
+
+def test_drops_noise_tags() -> None:
+ distilled_html = AiCleanHtmlDistiller().distill('
'
+ distilled_html = AiCleanHtmlDistiller().distill(html)
+
+ assert distilled_html == html
+
+
+def test_drops_data_uri_attribute() -> None:
+ distilled_html = AiCleanHtmlDistiller().distill('
')
+
+ assert distilled_html == '
'
+
+
+def test_limited_class_attribute() -> None:
+ distilled_html = AiCleanHtmlDistiller(max_classes=2).distill('
'
+
+
+def test_drops_empty_class_attribute() -> None:
+ distilled_html = AiCleanHtmlDistiller().distill('
'
+
+
+def test_truncates_long_attribute_values() -> None:
+ distilled_html = AiCleanHtmlDistiller(max_attr_len=5).distill(f'link')
+
+ assert distilled_html == f'link'
+
+
+def test_truncates_json_payload() -> None:
+ distilled_html = AiCleanHtmlDistiller(max_json_len=5).distill(
+ '