From 0b8d10a36d345893c1c653004f8b9347bf627d37 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Thu, 18 Jun 2026 15:03:40 +0200 Subject: [PATCH 1/2] docs: unify crawl caps and fix runnable examples Lower the page cap from 50 to 10 across all crawling examples so the browser-based ones finish within the runnable-demo timeout. Make Selenium (snippet too large for the Run-on-Apify URL) and Browser Use (needs an LLM API key) non-runnable with explanatory comments, keep both Scrapling examples runnable, and have the Pydantic example fail cleanly via `Actor.fail` instead of re-raising into a raw traceback. --- docs/03_guides/04_selenium.mdx | 9 +++++---- docs/03_guides/06_scrapy.mdx | 1 + docs/03_guides/07_scrapling.mdx | 7 +++---- docs/03_guides/09_browser_use.mdx | 9 +++++---- docs/03_guides/11_pydantic.mdx | 2 +- docs/03_guides/code/01_beautifulsoup_httpx.py | 2 +- docs/03_guides/code/02_parsel_impit.py | 2 +- docs/03_guides/code/03_playwright.py | 2 +- docs/03_guides/code/04_selenium.py | 2 +- docs/03_guides/code/05_crawlee_beautifulsoup.py | 2 +- docs/03_guides/code/05_crawlee_parsel.py | 2 +- docs/03_guides/code/05_crawlee_playwright.py | 2 +- docs/03_guides/code/07_scrapling.py | 2 +- docs/03_guides/code/07_scrapling_browser.py | 2 +- docs/03_guides/code/08_crawl4ai.py | 2 +- docs/03_guides/code/11_pydantic.py | 5 +++-- 16 files changed, 28 insertions(+), 25 deletions(-) diff --git a/docs/03_guides/04_selenium.mdx b/docs/03_guides/04_selenium.mdx index ea1a6519..01d71a05 100644 --- a/docs/03_guides/04_selenium.mdx +++ b/docs/03_guides/04_selenium.mdx @@ -4,9 +4,9 @@ title: Browser automation with Selenium description: Build an Apify Actor that scrapes dynamic web pages using Selenium WebDriver. 
--- -import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; +import CodeBlock from '@theme/CodeBlock'; -import SeleniumExample from '!!raw-loader!roa-loader!./code/04_selenium.py'; +import SeleniumExample from '!!raw-loader!./code/04_selenium.py'; In this guide, you'll learn how to use [Selenium](https://www.selenium.dev/) for browser automation and web scraping in your Apify Actors. @@ -36,9 +36,10 @@ This is a simple Actor that recursively scrapes data from linked pages on the sa It uses Selenium ChromeDriver to open the pages in an automated Chrome browser, and to extract the title, headings, and links after the pages load. - +{/* Not runnable from the docs: the "Run on Apify" link encodes the whole snippet into the URL, and this Actor (with its inline proxy-auth extension) is large enough to exceed the URL length limit and fail with an HTTP 414. */} + {SeleniumExample} - + ## Using Apify Proxy diff --git a/docs/03_guides/06_scrapy.mdx b/docs/03_guides/06_scrapy.mdx index a690908a..ea235e1c 100644 --- a/docs/03_guides/06_scrapy.mdx +++ b/docs/03_guides/06_scrapy.mdx @@ -73,6 +73,7 @@ For further details, see the [Scrapy migration guide](https://docs.apify.com/cli The following example shows a Scrapy Actor that scrapes page titles and enqueues links found on each page. This example aligns with the structure provided in the Apify Actor templates. +{/* Not runnable from the docs: a Scrapy Actor is a multi-file project, while the "Run on Apify" runner executes a single self-contained snippet. */} diff --git a/docs/03_guides/07_scrapling.mdx b/docs/03_guides/07_scrapling.mdx index 8d384df2..e3e4a174 100644 --- a/docs/03_guides/07_scrapling.mdx +++ b/docs/03_guides/07_scrapling.mdx @@ -4,11 +4,10 @@ title: Adaptive scraping with Scrapling description: Build an Apify Actor that scrapes web pages using the Scrapling adaptive web scraping library. 
--- -import CodeBlock from '@theme/CodeBlock'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import ScraplingExample from '!!raw-loader!roa-loader!./code/07_scrapling.py'; -import ScraplingBrowserScraper from '!!raw-loader!./code/07_scrapling_browser.py'; +import ScraplingBrowserScraper from '!!raw-loader!roa-loader!./code/07_scrapling_browser.py'; In this guide, you'll learn how to use the [Scrapling](https://scrapling.readthedocs.io/) library for adaptive web scraping in your Apify Actors. @@ -101,9 +100,9 @@ scrapling install To switch the example from HTTP to a real browser, fetch each page through a browser session instead of `AsyncFetcher`. Opening a fresh browser for every page would be wasteful, so `main` enters an `AsyncDynamicSession` once and reuses it for the whole crawl, while `scrape_page` fetches with `session.fetch`. The parsing API is identical, so the extraction code stays the same: - + {ScraplingBrowserScraper} - + Note that: diff --git a/docs/03_guides/09_browser_use.mdx b/docs/03_guides/09_browser_use.mdx index e6b32d13..30237394 100644 --- a/docs/03_guides/09_browser_use.mdx +++ b/docs/03_guides/09_browser_use.mdx @@ -4,9 +4,9 @@ title: Browser AI agents with Browser Use description: Build an Apify Actor that automates a browser with an LLM agent using the Browser Use library. --- -import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; +import CodeBlock from '@theme/CodeBlock'; -import BrowserUseExample from '!!raw-loader!roa-loader!./code/09_browser_use.py'; +import BrowserUseExample from '!!raw-loader!./code/09_browser_use.py'; In this guide, you'll learn how to use the [Browser Use](https://browser-use.com/) library to drive a browser with an LLM agent in your Apify Actors. @@ -46,9 +46,10 @@ The following Actor runs a Browser Use agent for a single task and stores its st The whole Actor fits in a single file. 
A `run_agent_task` helper holds the Browser Use-specific logic: it defines the output schema and builds the LLM, browser, and agent. The `main` coroutine handles the [Actor](https://docs.apify.com/platform/actors) lifecycle, reads the input, sets up [Apify Proxy](https://docs.apify.com/platform/proxy), runs the agent, and stores the result: - +{/* Not runnable from the docs: the agent needs an LLM API key (OPENAI_API_KEY) that the shared example runner does not provide. */} + {BrowserUseExample} - + Note that: diff --git a/docs/03_guides/11_pydantic.mdx b/docs/03_guides/11_pydantic.mdx index bb3b5d9b..abc38571 100644 --- a/docs/03_guides/11_pydantic.mdx +++ b/docs/03_guides/11_pydantic.mdx @@ -56,7 +56,7 @@ The following Actor declares its input as a Pydantic `BaseModel`, validates the ### About the validation - `model_validate` parses the raw dictionary into a typed `ActorInput` instance. It fills in defaults and guarantees every field is valid, or raises a `ValidationError` that describes every problem at once. -- Catching that error, logging a readable summary, and re-raising makes the Actor fail fast with a clear explanation right at the start, rather than crashing with an obscure error somewhere deep in the run. Because the body runs inside `async with Actor:`, the re-raised exception automatically marks the run as `FAILED`. +- Catching that error, logging a readable summary, and failing the run with `Actor.fail` marks the run as `FAILED` with a clear status message. It fails fast right at the start with a readable explanation, instead of crashing with a raw traceback deeper in the run. - The error messages refer to the fields by their input-schema aliases. 
For invalid input like `{"searchTerms": [], "maxResults": 999, "outputFormat": "xml"}`, the log shows exactly what's wrong: ```text diff --git a/docs/03_guides/code/01_beautifulsoup_httpx.py b/docs/03_guides/code/01_beautifulsoup_httpx.py index e4ea6373..791b7fc9 100644 --- a/docs/03_guides/code/01_beautifulsoup_httpx.py +++ b/docs/03_guides/code/01_beautifulsoup_httpx.py @@ -82,7 +82,7 @@ async def main() -> None: await request_queue.add_request(Request.from_url(url)) # Cap the crawl. Raise or remove the limit to follow more pages. - max_requests = 50 + max_requests = 10 handled_requests = 0 while handled_requests < max_requests and ( diff --git a/docs/03_guides/code/02_parsel_impit.py b/docs/03_guides/code/02_parsel_impit.py index 9e9efe1d..669dac92 100644 --- a/docs/03_guides/code/02_parsel_impit.py +++ b/docs/03_guides/code/02_parsel_impit.py @@ -82,7 +82,7 @@ async def main() -> None: await request_queue.add_request(Request.from_url(url)) # Cap the crawl. Raise or remove the limit to follow more pages. - max_requests = 50 + max_requests = 10 handled_requests = 0 while handled_requests < max_requests and ( diff --git a/docs/03_guides/code/03_playwright.py b/docs/03_guides/code/03_playwright.py index 0e61f76c..aa38cd42 100644 --- a/docs/03_guides/code/03_playwright.py +++ b/docs/03_guides/code/03_playwright.py @@ -94,7 +94,7 @@ async def main() -> None: await request_queue.add_request(Request.from_url(url)) # Cap the crawl. Raise or remove the limit to follow more pages. - max_requests = 50 + max_requests = 10 handled_requests = 0 Actor.log.info('Launching Playwright...') diff --git a/docs/03_guides/code/04_selenium.py b/docs/03_guides/code/04_selenium.py index 5e0c77a3..c01f336c 100644 --- a/docs/03_guides/code/04_selenium.py +++ b/docs/03_guides/code/04_selenium.py @@ -151,7 +151,7 @@ async def main() -> None: await request_queue.add_request(Request.from_url(url)) # Cap the crawl. Raise or remove the limit to follow more pages. 
- max_requests = 50 + max_requests = 10 handled_requests = 0 # Fresh proxy URL for the run (None if no proxy). diff --git a/docs/03_guides/code/05_crawlee_beautifulsoup.py b/docs/03_guides/code/05_crawlee_beautifulsoup.py index fe527b9f..bdd1603b 100644 --- a/docs/03_guides/code/05_crawlee_beautifulsoup.py +++ b/docs/03_guides/code/05_crawlee_beautifulsoup.py @@ -51,7 +51,7 @@ async def main() -> None: proxy_configuration=proxy_configuration, request_handler=router, # Cap the crawl. Remove or increase the limit to follow all links. - max_requests_per_crawl=50, + max_requests_per_crawl=10, ) await crawler.run(start_urls) diff --git a/docs/03_guides/code/05_crawlee_parsel.py b/docs/03_guides/code/05_crawlee_parsel.py index cadfa1ec..8ccb2440 100644 --- a/docs/03_guides/code/05_crawlee_parsel.py +++ b/docs/03_guides/code/05_crawlee_parsel.py @@ -51,7 +51,7 @@ async def main() -> None: proxy_configuration=proxy_configuration, request_handler=router, # Cap the crawl. Remove or increase the limit to follow all links. - max_requests_per_crawl=50, + max_requests_per_crawl=10, ) await crawler.run(start_urls) diff --git a/docs/03_guides/code/05_crawlee_playwright.py b/docs/03_guides/code/05_crawlee_playwright.py index 737bd453..df91b3f0 100644 --- a/docs/03_guides/code/05_crawlee_playwright.py +++ b/docs/03_guides/code/05_crawlee_playwright.py @@ -54,7 +54,7 @@ async def main() -> None: proxy_configuration=proxy_configuration, request_handler=router, # Cap the crawl. Remove or increase the limit to follow all links. - max_requests_per_crawl=50, + max_requests_per_crawl=10, headless=True, browser_launch_options={'args': browser_args}, ) diff --git a/docs/03_guides/code/07_scrapling.py b/docs/03_guides/code/07_scrapling.py index 7195165e..167cb922 100644 --- a/docs/03_guides/code/07_scrapling.py +++ b/docs/03_guides/code/07_scrapling.py @@ -84,7 +84,7 @@ async def main() -> None: await request_queue.add_request(Request.from_url(url)) # Cap the crawl. 
Raise or remove the limit to follow more pages. - max_requests = 50 + max_requests = 10 handled_requests = 0 while handled_requests < max_requests and ( diff --git a/docs/03_guides/code/07_scrapling_browser.py b/docs/03_guides/code/07_scrapling_browser.py index 8c9b63b6..ca1b836c 100644 --- a/docs/03_guides/code/07_scrapling_browser.py +++ b/docs/03_guides/code/07_scrapling_browser.py @@ -79,7 +79,7 @@ async def main() -> None: await request_queue.add_request(Request.from_url(url)) # Cap the crawl. Raise or remove the limit to follow more pages. - max_requests = 50 + max_requests = 10 handled_requests = 0 # Open the browser once and reuse it for every page in the crawl. diff --git a/docs/03_guides/code/08_crawl4ai.py b/docs/03_guides/code/08_crawl4ai.py index 1c7884c1..4731f959 100644 --- a/docs/03_guides/code/08_crawl4ai.py +++ b/docs/03_guides/code/08_crawl4ai.py @@ -82,7 +82,7 @@ async def main() -> None: await request_queue.add_request(Request.from_url(url)) # Cap the crawl; raise or remove to follow more pages. - max_requests = 50 + max_requests = 10 handled_requests = 0 # Reuse one headless browser-backed crawler for every request. diff --git a/docs/03_guides/code/11_pydantic.py b/docs/03_guides/code/11_pydantic.py index 4626b2d4..fbb33f00 100644 --- a/docs/03_guides/code/11_pydantic.py +++ b/docs/03_guides/code/11_pydantic.py @@ -44,9 +44,10 @@ async def main() -> None: try: actor_input = ActorInput.model_validate(raw_input) except ValidationError as exc: - # Log a per-field summary, then re-raise to fail the run. + # Log a per-field summary and fail the run cleanly, without a raw traceback. Actor.log.error('The Actor input is invalid:\n%s', exc) - raise + await Actor.fail(status_message='The Actor input is invalid.') + return # Work with typed attributes from here on. 
Actor.log.info('Input passed validation: %s', actor_input.model_dump()) From 8a854f2149b5c8e1a6f76af18683baa3f67c2418 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Thu, 18 Jun 2026 15:24:48 +0200 Subject: [PATCH 2/2] docs: make Selenium example runnable, proxy in a separate section --- docs/03_guides/04_selenium.mdx | 19 ++++-- docs/03_guides/code/04_selenium.py | 72 +--------------------- docs/03_guides/code/04_selenium_proxy.py | 78 ++++++++++++++++++++++++ 3 files changed, 94 insertions(+), 75 deletions(-) create mode 100644 docs/03_guides/code/04_selenium_proxy.py diff --git a/docs/03_guides/04_selenium.mdx b/docs/03_guides/04_selenium.mdx index 01d71a05..ef29baae 100644 --- a/docs/03_guides/04_selenium.mdx +++ b/docs/03_guides/04_selenium.mdx @@ -5,8 +5,10 @@ description: Build an Apify Actor that scrapes dynamic web pages using Selenium --- import CodeBlock from '@theme/CodeBlock'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; -import SeleniumExample from '!!raw-loader!./code/04_selenium.py'; +import SeleniumExample from '!!raw-loader!roa-loader!./code/04_selenium.py'; +import SeleniumProxyExample from '!!raw-loader!./code/04_selenium_proxy.py'; In this guide, you'll learn how to use [Selenium](https://www.selenium.dev/) for browser automation and web scraping in your Apify Actors. @@ -36,16 +38,21 @@ This is a simple Actor that recursively scrapes data from linked pages on the sa It uses Selenium ChromeDriver to open the pages in an automated Chrome browser, and to extract the title, headings, and links after the pages load. -{/* Not runnable from the docs: the "Run on Apify" link encodes the whole snippet into the URL, and this Actor (with its inline proxy-auth extension) is large enough to exceed the URL length limit and fail with an HTTP 414. 
*/} - + {SeleniumExample} - + ## Using Apify Proxy -Running on the Apify platform gives your scraper access to [Apify Proxy](https://docs.apify.com/platform/proxy), which rotates IP addresses to avoid rate limiting and blocking. The example creates a proxy configuration with `Actor.create_proxy_configuration` and routes the browser through it for the whole run. +Running on the Apify platform gives your scraper access to [Apify Proxy](https://docs.apify.com/platform/proxy), which rotates IP addresses to avoid rate limiting and blocking. The runnable example above skips the proxy to stay simple. This section extends it to route the browser through Apify Proxy. The snippet below isn't a complete, runnable Actor on its own. It shows only the proxy-specific parts you add to the example above. + +Chrome ignores the credentials passed in the `--proxy-server` flag. To use an authenticated proxy such as Apify Proxy, configure it from inside a small extension. The `proxy_auth_extension` helper builds one at runtime. Its service worker sets the proxy server and answers the browser's authentication challenge with the username and password. The proxy-aware `build_chrome_driver` below replaces the simple one from the example above and loads that extension. The new headless mode (`--headless=new`) is required for Chrome to load it. + + + {SeleniumProxyExample} + -Chrome ignores the credentials passed in the `--proxy-server` flag. Because of that, configure an authenticated proxy such as Apify Proxy from inside a small extension. The `proxy_auth_extension` helper builds one at runtime: its service worker sets the proxy server and answers the browser's authentication challenge with the username and password. Note that the new headless mode (`--headless=new`) is required for Chrome to load the extension. To select specific proxy groups or a country, pass the relevant arguments to `Actor.create_proxy_configuration`. For details, see [Proxy management](../concepts/proxy-management). 
+To wire it in, create the proxy configuration in `main` with `Actor.create_proxy_configuration`, get a URL with `await proxy_configuration.new_url()`, and pass it to `build_chrome_driver`. To select specific proxy groups or a country, pass the relevant arguments to `Actor.create_proxy_configuration`. For details, see [Proxy management](../concepts/proxy-management). ## Conclusion diff --git a/docs/03_guides/code/04_selenium.py b/docs/03_guides/code/04_selenium.py index c01f336c..bd5278a3 100644 --- a/docs/03_guides/code/04_selenium.py +++ b/docs/03_guides/code/04_selenium.py @@ -1,10 +1,6 @@ import asyncio -import json -from pathlib import Path -from tempfile import mkdtemp from typing import Any from urllib.parse import urljoin, urlsplit -from zipfile import ZipFile from selenium import webdriver from selenium.webdriver.chrome.options import Options as ChromeOptions @@ -18,71 +14,17 @@ # On the Apify platform, it's already in the Actor's Docker image. -def proxy_auth_extension(proxy_url: str) -> str: - """Build a Chrome extension that routes Chrome through an authenticated proxy.""" - parts = urlsplit(proxy_url) - - manifest = { - 'name': 'Apify Proxy', - 'version': '1.0.0', - 'manifest_version': 3, - 'permissions': ['proxy', 'webRequest', 'webRequestAuthProvider'], - 'host_permissions': ['<all_urls>'], - 'background': {'service_worker': 'background.js'}, - 'minimum_chrome_version': '108', - } - - # The service worker sets the proxy and answers the auth challenge. 
- proxy_config = json.dumps( - { - 'mode': 'fixed_servers', - 'rules': { - 'singleProxy': { - 'scheme': parts.scheme, - 'host': parts.hostname, - 'port': parts.port, - }, - }, - } - ) - credentials = json.dumps( - {'username': parts.username or '', 'password': parts.password or ''} - ) - background = ( - 'chrome.proxy.settings.set(' - '{value: ' + proxy_config + ', scope: "regular"});\n' - 'chrome.webRequest.onAuthRequired.addListener(\n' - ' () => ({authCredentials: ' + credentials + '}),\n' - ' {urls: ["<all_urls>"]},\n' - ' ["blocking"],\n' - ');\n' - ) - - extension_path = Path(mkdtemp()) / 'apify_proxy.zip' - with ZipFile(extension_path, 'w') as archive: - archive.writestr('manifest.json', json.dumps(manifest)) - archive.writestr('background.js', background) - return str(extension_path) - - -def build_chrome_driver(proxy_url: str | None = None) -> webdriver.Chrome: - """Create a headless Chrome WebDriver, optionally routed through a proxy.""" +def build_chrome_driver() -> webdriver.Chrome: + """Create a headless Chrome WebDriver suitable for a container.""" chrome_options = ChromeOptions() if Actor.configuration.headless: - # The new headless mode is required to load the proxy extension. chrome_options.add_argument('--headless=new') chrome_options.add_argument('--no-sandbox') chrome_options.add_argument('--disable-dev-shm-usage') chrome_options.add_argument('--disable-gpu') - if proxy_url: - chrome_options.add_extension(proxy_auth_extension(proxy_url)) - chrome_options.add_argument( - '--disable-features=DisableLoadExtensionCommandLineSwitch' - ) - return webdriver.Chrome(options=chrome_options) @@ -140,9 +82,6 @@ async def main() -> None: Actor.log.info('No start URLs specified in Actor input, exiting...') await Actor.exit() - # Selenium proxies at the browser level, so one URL is shared per run. - proxy_configuration = await Actor.create_proxy_configuration() - # Open the request queue and enqueue the start URLs (crawl depth 0). 
request_queue = await Actor.open_request_queue() for start_url in start_urls: @@ -154,13 +93,8 @@ async def main() -> None: max_requests = 10 handled_requests = 0 - # Fresh proxy URL for the run (None if no proxy). - proxy_url = None - if proxy_configuration: - proxy_url = await proxy_configuration.new_url() - Actor.log.info('Launching Chrome WebDriver...') - driver = build_chrome_driver(proxy_url) + driver = build_chrome_driver() while handled_requests < max_requests and ( request := await request_queue.fetch_next_request() diff --git a/docs/03_guides/code/04_selenium_proxy.py b/docs/03_guides/code/04_selenium_proxy.py new file mode 100644 index 00000000..dd3cd62a --- /dev/null +++ b/docs/03_guides/code/04_selenium_proxy.py @@ -0,0 +1,78 @@ +import json +from pathlib import Path +from tempfile import mkdtemp +from urllib.parse import urlsplit +from zipfile import ZipFile + +from selenium import webdriver +from selenium.webdriver.chrome.options import Options as ChromeOptions + +from apify import Actor + + +def proxy_auth_extension(proxy_url: str) -> str: + """Build a Chrome extension that routes Chrome through an authenticated proxy.""" + parts = urlsplit(proxy_url) + + manifest = { + 'name': 'Apify Proxy', + 'version': '1.0.0', + 'manifest_version': 3, + 'permissions': ['proxy', 'webRequest', 'webRequestAuthProvider'], + 'host_permissions': [''], + 'background': {'service_worker': 'background.js'}, + 'minimum_chrome_version': '108', + } + + # The service worker sets the proxy and answers the auth challenge. 
+ proxy_config = json.dumps( + { + 'mode': 'fixed_servers', + 'rules': { + 'singleProxy': { + 'scheme': parts.scheme, + 'host': parts.hostname, + 'port': parts.port, + }, + }, + } + ) + credentials = json.dumps( + {'username': parts.username or '', 'password': parts.password or ''} + ) + background = ( + 'chrome.proxy.settings.set(' + '{value: ' + proxy_config + ', scope: "regular"});\n' + 'chrome.webRequest.onAuthRequired.addListener(\n' + ' () => ({authCredentials: ' + credentials + '}),\n' + ' {urls: ["<all_urls>"]},\n' + ' ["blocking"],\n' + ');\n' + ) + + extension_path = Path(mkdtemp()) / 'apify_proxy.zip' + with ZipFile(extension_path, 'w') as archive: + archive.writestr('manifest.json', json.dumps(manifest)) + archive.writestr('background.js', background) + return str(extension_path) + + +def build_chrome_driver(proxy_url: str) -> webdriver.Chrome: + """Create a headless Chrome WebDriver routed through an authenticated proxy.""" + chrome_options = ChromeOptions() + + if Actor.configuration.headless: + # The new headless mode is required to load the proxy extension. + chrome_options.add_argument('--headless=new') + + chrome_options.add_argument('--no-sandbox') + chrome_options.add_argument('--disable-dev-shm-usage') + chrome_options.add_argument('--disable-gpu') + + # Load the proxy extension and keep it enabled in headless mode. + chrome_options.add_extension(proxy_auth_extension(proxy_url)) + chrome_options.add_argument( + '--disable-features=DisableLoadExtensionCommandLineSwitch' + ) + + return webdriver.Chrome(options=chrome_options)