From 86325a95e4e9cd111f33e83ffe25358716ea142d Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Thu, 3 Apr 2025 12:24:38 +0000 Subject: [PATCH 1/6] add helper function `send_request` for playwright --- .../_adaptive_playwright_crawling_context.py | 27 +---------- .../_playwright/_playwright_crawler.py | 16 +++++-- src/crawlee/crawlers/_playwright/_types.py | 31 ++++++++++++- src/crawlee/crawlers/_playwright/_utils.py | 18 ++++++++ .../_playwright/test_playwright_crawler.py | 45 +++++++++++++++++++ 5 files changed, 108 insertions(+), 29 deletions(-) diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py index 91f3d86a92..1c886d3393 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py @@ -6,10 +6,10 @@ from playwright.async_api import TimeoutError as PlaywrightTimeoutError -from crawlee import HttpHeaders from crawlee._types import BasicCrawlingContext from crawlee._utils.docs import docs_group from crawlee.crawlers import AbstractHttpParser, ParsedHttpCrawlingContext, PlaywrightCrawlingContext +from crawlee.crawlers._playwright._types import PlaywrightHttpResponse if TYPE_CHECKING: from collections.abc import Awaitable, Callable, Sequence @@ -186,7 +186,7 @@ async def from_playwright_crawling_context( context_kwargs['_infinite_scroll'] = context_kwargs.pop('infinite_scroll') # This might not be always available. protocol_guess = await context_kwargs['_page'].evaluate('() => performance.getEntries()[0].nextHopProtocol') - http_response = await _PlaywrightHttpResponse.from_playwright_response( + http_response = await PlaywrightHttpResponse.from_playwright_response( response=context.response, protocol=protocol_guess or '' ) # block_requests is useful only on pre-navigation contexts. It is useless here. @@ -240,26 +240,3 @@ async def dummy_block_requests( context_kwargs['block_requests'] = context_kwargs.pop('block_requests', dummy_block_requests) return cls(**context_kwargs) - - -@dataclass(frozen=True) -class _PlaywrightHttpResponse: - """Wrapper class for playwright `Response` object to implement `HttpResponse` protocol.""" - - http_version: str - status_code: int - headers: HttpHeaders - _content: bytes - - def read(self) -> bytes: - return self._content - - @classmethod - async def from_playwright_response(cls, response: Response, protocol: str) -> Self: - headers = HttpHeaders(response.headers) - status_code = response.status - # Used http protocol version cannot be obtained from `Response` and has to be passed as additional argument. 
- http_version = protocol - _content = await response.body() - - return cls(http_version=http_version, status_code=status_code, headers=headers, _content=_content) diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py index 48e5a521e7..a0ea1e8c2b 100644 --- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py +++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py @@ -20,7 +20,7 @@ from ._playwright_crawling_context import PlaywrightCrawlingContext from ._playwright_pre_nav_crawling_context import PlaywrightPreNavCrawlingContext -from ._utils import block_requests, infinite_scroll +from ._utils import block_requests, infinite_scroll, prepare_send_request_function TCrawlingContext = TypeVar('TCrawlingContext', bound=PlaywrightCrawlingContext) TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState) @@ -168,6 +168,8 @@ def __init__( kwargs.setdefault('_logger', logging.getLogger(__name__)) self._pre_navigation_hooks: list[Callable[[PlaywrightPreNavCrawlingContext], Awaitable[None]]] = [] + self._use_http_client = bool(kwargs.get('http_client')) + super().__init__(**kwargs) async def _open_page( @@ -180,11 +182,15 @@ async def _open_page( # Create a new browser page crawlee_page = await self._browser_pool.new_page(proxy_info=context.proxy_info) + send_request = ( + context.send_request if self._use_http_client else prepare_send_request_function(crawlee_page.page) + ) + pre_navigation_context = PlaywrightPreNavCrawlingContext( request=context.request, session=context.session, add_requests=context.add_requests, - send_request=context.send_request, + send_request=send_request, push_data=context.push_data, use_state=context.use_state, proxy_info=context.proxy_info, @@ -289,11 +295,15 @@ async def enqueue_links( await context.add_requests(requests, **kwargs) + send_request = ( + context.send_request if self._use_http_client else prepare_send_request_function(context.page) + ) + yield PlaywrightCrawlingContext( request=context.request, session=context.session, add_requests=context.add_requests, - send_request=context.send_request, + send_request=send_request, push_data=context.push_data, use_state=context.use_state, proxy_info=context.proxy_info, diff --git a/src/crawlee/crawlers/_playwright/_types.py b/src/crawlee/crawlers/_playwright/_types.py index ef0c5a560e..427c7fe05d 100644 --- a/src/crawlee/crawlers/_playwright/_types.py +++ b/src/crawlee/crawlers/_playwright/_types.py @@ -1,9 +1,15 @@ from __future__ import annotations -from typing import Protocol +from dataclasses import dataclass +from typing import TYPE_CHECKING, Protocol +from crawlee import HttpHeaders from crawlee._utils.docs import docs_group +if TYPE_CHECKING: + from playwright.async_api import APIResponse, Response + from typing_extensions import Self + @docs_group('Functions') class BlockRequestsFunction(Protocol): @@ -22,3 +28,26 @@ async def __call__( url_patterns: List of URL patterns to block. If None, uses default patterns. extra_url_patterns: Additional URL patterns to append to the main patterns list. 
""" + + +@dataclass(frozen=True) +class PlaywrightHttpResponse: + """Wrapper class for playwright `Response` and `APIResponse` objects to implement `HttpResponse` protocol.""" + + http_version: str + status_code: int + headers: HttpHeaders + _content: bytes + + def read(self) -> bytes: + return self._content + + @classmethod + async def from_playwright_response(cls, response: Response | APIResponse, protocol: str) -> Self: + headers = HttpHeaders(response.headers) + status_code = response.status + # Used http protocol version cannot be obtained from `Response` and has to be passed as additional argument. + http_version = protocol + _content = await response.body() + + return cls(http_version=http_version, status_code=status_code, headers=headers, _content=_content) diff --git a/src/crawlee/crawlers/_playwright/_utils.py b/src/crawlee/crawlers/_playwright/_utils.py index 956b269e13..bab68340d5 100644 --- a/src/crawlee/crawlers/_playwright/_utils.py +++ b/src/crawlee/crawlers/_playwright/_utils.py @@ -4,10 +4,14 @@ from contextlib import suppress from typing import TYPE_CHECKING +from ._types import PlaywrightHttpResponse + if TYPE_CHECKING: from playwright.async_api import Page from playwright.async_api import Request as PlaywrightRequest + from crawlee._types import HttpHeaders, HttpMethod, SendRequestFunction + _DEFAULT_BLOCK_REQUEST_URL_PATTERNS = [ '.css', '.webp', @@ -108,3 +112,17 @@ async def block_requests( if specific_files: await page.route(f'**/{{{",".join(specific_files)}}}*', lambda route, _: route.abort()) + + +def prepare_send_request_function(page: Page) -> SendRequestFunction: + async def send_request( + url: str, *, method: HttpMethod = 'GET', headers: HttpHeaders | dict[str, str] | None = None + ) -> PlaywrightHttpResponse: + # It is necessary to pass `set_extra_http_headers` passed earlier to `Playwright` + # TODO: https://github.com/apify/crawlee-python/issues/1055 + headers = dict(headers) if headers else None + # `request` is done based on the browser context and uses the same cookies and proxies + response = await page.request.fetch(url_or_request=url, method=method, headers=headers) + return await PlaywrightHttpResponse.from_playwright_response(response=response, protocol='') + + return send_request diff --git a/tests/unit/crawlers/_playwright/test_playwright_crawler.py b/tests/unit/crawlers/_playwright/test_playwright_crawler.py index 8132cbe41d..110f61c58b 100644 --- a/tests/unit/crawlers/_playwright/test_playwright_crawler.py +++ b/tests/unit/crawlers/_playwright/test_playwright_crawler.py @@ -21,6 +21,7 @@ ) from crawlee.fingerprint_suite._browserforge_adapter import get_available_header_values from crawlee.fingerprint_suite._consts import BROWSER_TYPE_HEADER_KEYWORD +from crawlee.http_clients import HttpxHttpClient from crawlee.proxy_configuration import ProxyConfiguration from crawlee.sessions import SessionPool @@ -469,3 +470,47 @@ async def some_hook(context: PlaywrightPreNavCrawlingContext) -> None: assert fingerprints['window.navigator.userAgent'] assert 'headless' not in fingerprints['window.navigator.userAgent'].lower() + + +async def test_send_request(server_url: URL) -> None: + """Check that the persist context works with fingerprints.""" + check_data: dict[str, Any] = {} + + crawler = PlaywrightCrawler() + + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + response = await context.response.text() + check_data['default'] = dict(json.loads(response)) + send_request_response = 
await context.send_request(str(server_url / 'user-agent')) + check_data['send_request'] = dict(json.loads(send_request_response.read())) + + await crawler.run([str(server_url / 'user-agent')]) + + assert check_data['default'].get('user-agent') is not None + assert check_data['send_request'].get('user-agent') is not None + + assert check_data['default'] == check_data['send_request'] + + +async def test_send_request_with_client(server_url: URL) -> None: + """Check that the persist context works with fingerprints.""" + check_data: dict[str, Any] = {} + + crawler = PlaywrightCrawler( + http_client=HttpxHttpClient(header_generator=None, headers={'user-agent': 'My User-Agent'}) + ) + + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + response = await context.response.text() + check_data['default'] = dict(json.loads(response)) + send_request_response = await context.send_request(str(server_url / 'user-agent')) + check_data['send_request'] = dict(json.loads(send_request_response.read())) + + await crawler.run([str(server_url / 'user-agent')]) + + assert check_data['default'].get('user-agent') is not None + assert check_data['send_request']['user-agent'] == 'My User-Agent' + + assert check_data['default'] != check_data['send_request'] From fa532d11b01a3f53c5c41398cce48bc736d40127 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Fri, 18 Apr 2025 17:36:42 +0000 Subject: [PATCH 2/6] change helper to inner http-client --- .../_playwright/_playwright_crawler.py | 62 +++++++------- .../_playwright/_playwright_http_client.py | 82 +++++++++++++++++++ src/crawlee/crawlers/_playwright/_utils.py | 18 ---- .../_playwright/test_playwright_crawler.py | 6 ++ 4 files changed, 119 insertions(+), 49 deletions(-) create mode 100644 src/crawlee/crawlers/_playwright/_playwright_http_client.py diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py index 139f903052..d7d3865653 100644 --- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py +++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py @@ -19,8 +19,9 @@ from crawlee.statistics import StatisticsState from ._playwright_crawling_context import PlaywrightCrawlingContext +from ._playwright_http_client import PlaywrightHttpClient, browser_page_context_var from ._playwright_pre_nav_crawling_context import PlaywrightPreNavCrawlingContext -from ._utils import block_requests, infinite_scroll, prepare_send_request_function +from ._utils import block_requests, infinite_scroll TCrawlingContext = TypeVar('TCrawlingContext', bound=PlaywrightCrawlingContext) TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState) @@ -168,7 +169,7 @@ def __init__( kwargs.setdefault('_logger', logging.getLogger(__name__)) self._pre_navigation_hooks: list[Callable[[PlaywrightPreNavCrawlingContext], Awaitable[None]]] = [] - self._use_http_client = bool(kwargs.get('http_client')) + kwargs['http_client'] = PlaywrightHttpClient() if not kwargs.get('http_client') else kwargs['http_client'] super().__init__(**kwargs) @@ -182,15 +183,11 @@ async def _open_page( # Create a new browser page crawlee_page = await self._browser_pool.new_page(proxy_info=context.proxy_info) - send_request = ( - context.send_request if self._use_http_client else prepare_send_request_function(crawlee_page.page) - ) - pre_navigation_context = PlaywrightPreNavCrawlingContext( request=context.request, session=context.session, 
add_requests=context.add_requests, - send_request=send_request, + send_request=context.send_request, push_data=context.push_data, use_state=context.use_state, proxy_info=context.proxy_info, @@ -200,9 +197,12 @@ async def _open_page( block_requests=partial(block_requests, page=crawlee_page.page), ) - for hook in self._pre_navigation_hooks: - await hook(pre_navigation_context) - + try: + context_http_client_var = browser_page_context_var.set(crawlee_page.page) + for hook in self._pre_navigation_hooks: + await hook(pre_navigation_context) + finally: + browser_page_context_var.reset(context_http_client_var) yield pre_navigation_context async def _navigate( @@ -244,27 +244,27 @@ async def _navigate( extract_links = self._create_extract_links_function(context) - send_request = ( - context.send_request if self._use_http_client else prepare_send_request_function(context.page) - ) - - error = yield PlaywrightCrawlingContext( - request=context.request, - session=context.session, - add_requests=context.add_requests, - send_request=send_request, - push_data=context.push_data, - use_state=context.use_state, - proxy_info=context.proxy_info, - get_key_value_store=context.get_key_value_store, - log=context.log, - page=context.page, - infinite_scroll=lambda: infinite_scroll(context.page), - response=response, - extract_links=extract_links, - enqueue_links=self._create_enqueue_links_function(context, extract_links), - block_requests=partial(block_requests, page=context.page), - ) + try: + context_http_client_var = browser_page_context_var.set(context.page) + error = yield PlaywrightCrawlingContext( + request=context.request, + session=context.session, + add_requests=context.add_requests, + send_request=context.send_request, + push_data=context.push_data, + use_state=context.use_state, + proxy_info=context.proxy_info, + get_key_value_store=context.get_key_value_store, + log=context.log, + page=context.page, + infinite_scroll=lambda: infinite_scroll(context.page), + response=response, + extract_links=extract_links, + enqueue_links=self._create_enqueue_links_function(context, extract_links), + block_requests=partial(block_requests, page=context.page), + ) + finally: + browser_page_context_var.reset(context_http_client_var) # Collect data in case of errors, before the page object is closed. if error: diff --git a/src/crawlee/crawlers/_playwright/_playwright_http_client.py b/src/crawlee/crawlers/_playwright/_playwright_http_client.py new file mode 100644 index 0000000000..6457b05ac0 --- /dev/null +++ b/src/crawlee/crawlers/_playwright/_playwright_http_client.py @@ -0,0 +1,82 @@ +from __future__ import annotations + +import contextvars +from typing import TYPE_CHECKING + +from typing_extensions import override + +from crawlee._types import HttpHeaders +from crawlee.crawlers._playwright._types import PlaywrightHttpResponse +from crawlee.http_clients import HttpClient, HttpCrawlingResult, HttpResponse + +if TYPE_CHECKING: + from playwright.async_api import Page + + from crawlee import Request + from crawlee._types import HttpMethod, HttpPayload + from crawlee.proxy_configuration import ProxyInfo + from crawlee.sessions import Session + from crawlee.statistics import Statistics + + +browser_page_context_var: contextvars.ContextVar[Page | None] = contextvars.ContextVar('browser_context', default=None) + + +class PlaywrightHttpClient(HttpClient): + """HTTP client based on the Playwright library. 
+
+    This client uses the Playwright library to perform HTTP requests in crawlers (`BasicCrawler` subclasses)
+    and to manage sessions, proxies, and error handling.
+
+    See the `HttpClient` class for more information about HTTP clients.
+
+    ### Usage
+
+    ```python
+    from crawlee.crawlers import HttpCrawler  # or any other HTTP client-based crawler
+    from crawlee.http_clients import PlaywrightHttpClient
+
+    http_client = PlaywrightHttpClient()
+    crawler = HttpCrawler(http_client=http_client)
+    ```
+    """
+
+    def __init__(self) -> None:
+        """Initialize a new instance."""
+
+    @override
+    async def crawl(
+        self,
+        request: Request,
+        *,
+        session: Session | None = None,
+        proxy_info: ProxyInfo | None = None,
+        statistics: Statistics | None = None,
+    ) -> HttpCrawlingResult:
+        raise NotImplementedError('The `crawl` method should not be used for `PlaywrightHttpClient`')
+
+    @override
+    async def send_request(
+        self,
+        url: str,
+        *,
+        method: HttpMethod = 'GET',
+        headers: HttpHeaders | dict[str, str] | None = None,
+        payload: HttpPayload | None = None,
+        session: Session | None = None,
+        proxy_info: ProxyInfo | None = None,
+    ) -> HttpResponse:
+        if isinstance(headers, dict) or headers is None:
+            headers = HttpHeaders(headers or {})
+
+        browser_context = browser_page_context_var.get()
+
+        if browser_context is None:
+            raise RuntimeError('Unable to create an `APIRequestContext` outside the browser context')
+
+        # Proxies appropriate to the browser context are used
+        response = await browser_context.request.fetch(
+            url_or_request=url, method=method.lower(), headers=dict(headers) if headers else None, data=payload
+        )
+
+        return await PlaywrightHttpResponse.from_playwright_response(response, protocol='')
diff --git a/src/crawlee/crawlers/_playwright/_utils.py b/src/crawlee/crawlers/_playwright/_utils.py
index bab68340d5..956b269e13 100644
--- a/src/crawlee/crawlers/_playwright/_utils.py
+++ b/src/crawlee/crawlers/_playwright/_utils.py
@@ -4,14 +4,10 @@
 from contextlib import suppress
 from typing import TYPE_CHECKING
 
-from ._types import PlaywrightHttpResponse
-
 if TYPE_CHECKING:
     from playwright.async_api import Page
     from playwright.async_api import Request as PlaywrightRequest
 
-    from crawlee._types import HttpHeaders, HttpMethod, SendRequestFunction
-
 _DEFAULT_BLOCK_REQUEST_URL_PATTERNS = [
     '.css',
     '.webp',
@@ -112,17 +108,3 @@ async def block_requests(
 
     if specific_files:
         await page.route(f'**/{{{",".join(specific_files)}}}*', lambda route, _: route.abort())
-
-
-def prepare_send_request_function(page: Page) -> SendRequestFunction:
-    async def send_request(
-        url: str, *, method: HttpMethod = 'GET', headers: HttpHeaders | dict[str, str] | None = None
-    ) -> PlaywrightHttpResponse:
-        # Headers set on the page earlier via `set_extra_http_headers` still need to be passed to Playwright here.
-        # TODO: https://github.com/apify/crawlee-python/issues/1055
-        headers = dict(headers) if headers else None
-        # The request is made from the browser context, so it uses the same cookies and proxies.
-        response = await page.request.fetch(url_or_request=url, method=method, headers=headers)
-        return await PlaywrightHttpResponse.from_playwright_response(response=response, protocol='')
-
-    return send_request
diff --git a/tests/unit/crawlers/_playwright/test_playwright_crawler.py b/tests/unit/crawlers/_playwright/test_playwright_crawler.py
index d03210af91..efe2b8531e 100644
--- a/tests/unit/crawlers/_playwright/test_playwright_crawler.py
+++ b/tests/unit/crawlers/_playwright/test_playwright_crawler.py
@@ -572,6 +572,11 @@ async def test_send_request(server_url: URL) -> None:
 
     crawler = PlaywrightCrawler()
 
+    @crawler.pre_navigation_hook
+    async def some_hook(context: PlaywrightPreNavCrawlingContext) -> None:
+        send_request_response = await context.send_request(str(server_url / 'user-agent'))
+        check_data['pre_send_request'] = dict(json.loads(send_request_response.read()))
+
     @crawler.router.default_handler
     async def request_handler(context: PlaywrightCrawlingContext) -> None:
         response = await context.response.text()
@@ -583,6 +588,7 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
 
     assert check_data['default'].get('user-agent') is not None
     assert check_data['send_request'].get('user-agent') is not None
+    assert check_data['pre_send_request'] == check_data['send_request']
 
     assert check_data['default'] == check_data['send_request']

From 25d1cb3fe3455a9044b8015927a9b0a608551bc4 Mon Sep 17 00:00:00 2001
From: Max Bohomolov
Date: Fri, 18 Apr 2025 17:44:16 +0000
Subject: [PATCH 3/6] update docs

---
 .../crawlers/_playwright/_playwright_http_client.py | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/src/crawlee/crawlers/_playwright/_playwright_http_client.py b/src/crawlee/crawlers/_playwright/_playwright_http_client.py
index 6457b05ac0..c5a9d21e04 100644
--- a/src/crawlee/crawlers/_playwright/_playwright_http_client.py
+++ b/src/crawlee/crawlers/_playwright/_playwright_http_client.py
@@ -30,15 +30,7 @@ class PlaywrightHttpClient(HttpClient):
 
     See the `HttpClient` class for more information about HTTP clients.
 
-    ### Usage
-
-    ```python
-    from crawlee.crawlers import HttpCrawler  # or any other HTTP client-based crawler
-    from crawlee.http_clients import PlaywrightHttpClient
-
-    http_client = PlaywrightHttpClient()
-    crawler = HttpCrawler(http_client=http_client)
-    ```
+    Note: This class is intended for use only within `PlaywrightCrawler`.
     """
 
     def __init__(self) -> None:

From 35c9d6bf80813a833b28591fbb26f9ec7c1c1495 Mon Sep 17 00:00:00 2001
From: Max Bohomolov
Date: Wed, 23 Apr 2025 12:53:41 +0000
Subject: [PATCH 4/6] add comments

---
 src/crawlee/crawlers/_playwright/_playwright_http_client.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/crawlee/crawlers/_playwright/_playwright_http_client.py b/src/crawlee/crawlers/_playwright/_playwright_http_client.py
index c5a9d21e04..09f22bbe7b 100644
--- a/src/crawlee/crawlers/_playwright/_playwright_http_client.py
+++ b/src/crawlee/crawlers/_playwright/_playwright_http_client.py
@@ -58,6 +58,10 @@ async def send_request(
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
     ) -> HttpResponse:
+        # `proxy_info` is not used because `APIRequestContext` inherits the proxy from `BrowserContext`.
+        # TODO: Use `session` to restore all the fingerprint headers according to the `BrowserContext`, once
+        # https://github.com/apify/crawlee-python/issues/1055 is resolved.
+
         if isinstance(headers, dict) or headers is None:
             headers = HttpHeaders(headers or {})

From 89504fefb9d7d9ec6c1417498f08b4d20998e685 Mon Sep 17 00:00:00 2001
From: Max Bohomolov
Date: Wed, 23 Apr 2025 15:30:44 +0000
Subject: [PATCH 5/6] add context manager for managing the ContextVar

---
 .../crawlers/_playwright/_playwright_crawler.py | 12 +++---------
 .../_playwright/_playwright_http_client.py      | 17 +++++++++++++++--
 2 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py
index d7d3865653..7233745075 100644
--- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py
+++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py
@@ -19,7 +19,7 @@
 from crawlee.statistics import StatisticsState
 
 from ._playwright_crawling_context import PlaywrightCrawlingContext
-from ._playwright_http_client import PlaywrightHttpClient, browser_page_context_var
+from ._playwright_http_client import PlaywrightHttpClient, browser_page_context
 from ._playwright_pre_nav_crawling_context import PlaywrightPreNavCrawlingContext
 from ._utils import block_requests, infinite_scroll
 
@@ -197,12 +197,9 @@ async def _open_page(
             block_requests=partial(block_requests, page=crawlee_page.page),
         )
 
-        try:
-            context_http_client_var = browser_page_context_var.set(crawlee_page.page)
+        async with browser_page_context(crawlee_page.page):
             for hook in self._pre_navigation_hooks:
                 await hook(pre_navigation_context)
-        finally:
-            browser_page_context_var.reset(context_http_client_var)
 
         yield pre_navigation_context
 
@@ -244,8 +241,7 @@ async def _navigate(
 
         extract_links = self._create_extract_links_function(context)
 
-        try:
-            context_http_client_var = browser_page_context_var.set(context.page)
+        async with browser_page_context(context.page):
             error = yield PlaywrightCrawlingContext(
                 request=context.request,
                 session=context.session,
@@ -263,8 +259,6 @@ async def _navigate(
                 enqueue_links=self._create_enqueue_links_function(context, extract_links),
                 block_requests=partial(block_requests, page=context.page),
             )
-        finally:
-            browser_page_context_var.reset(context_http_client_var)
 
         # Collect data in case of errors, before the page object is closed.
         if error:
diff --git a/src/crawlee/crawlers/_playwright/_playwright_http_client.py b/src/crawlee/crawlers/_playwright/_playwright_http_client.py
index 09f22bbe7b..d8a51af40e 100644
--- a/src/crawlee/crawlers/_playwright/_playwright_http_client.py
+++ b/src/crawlee/crawlers/_playwright/_playwright_http_client.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import contextvars
+from contextlib import asynccontextmanager
 from typing import TYPE_CHECKING
 
 from typing_extensions import override
@@ -10,6 +11,8 @@
 from crawlee.http_clients import HttpClient, HttpCrawlingResult, HttpResponse
 
 if TYPE_CHECKING:
+    from collections.abc import AsyncGenerator
+
     from playwright.async_api import Page
 
     from crawlee import Request
@@ -19,7 +22,17 @@
     from crawlee.statistics import Statistics
 
 
-browser_page_context_var: contextvars.ContextVar[Page | None] = contextvars.ContextVar('browser_context', default=None)
+_browser_page_context_var: contextvars.ContextVar[Page | None] = contextvars.ContextVar('browser_context', default=None)
+
+
+@asynccontextmanager
+async def browser_page_context(page: Page) -> AsyncGenerator[None, None]:
+    """Asynchronous context manager for setting the current Playwright page in the context variable."""
+    token = _browser_page_context_var.set(page)
+    try:
+        yield
+    finally:
+        _browser_page_context_var.reset(token)
 
 
 class PlaywrightHttpClient(HttpClient):
@@ -65,7 +78,7 @@ async def send_request(
         if isinstance(headers, dict) or headers is None:
             headers = HttpHeaders(headers or {})
 
-        browser_context = browser_page_context_var.get()
+        browser_context = _browser_page_context_var.get()
 
         if browser_context is None:
             raise RuntimeError('Unable to create an `APIRequestContext` outside the browser context')

From 5ce33203ea2300d3232c98d16c37fc8890eeb418 Mon Sep 17 00:00:00 2001
From: Max Bohomolov
Date: Thu, 24 Apr 2025 21:05:34 +0000
Subject: [PATCH 6/6] workaround for robots.txt requests

---
 src/crawlee/crawlers/_basic/_basic_crawler.py     | 10 +++++++++-
 .../crawlers/_playwright/_playwright_crawler.py   | 12 ++++++++++++
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py
index 49b28c043e..3573033149 100644
--- a/src/crawlee/crawlers/_basic/_basic_crawler.py
+++ b/src/crawlee/crawlers/_basic/_basic_crawler.py
@@ -1340,6 +1340,14 @@ async def _get_robots_txt_file_for_url(self, url: str) -> RobotsTxtFile | None:
             return robots_txt_file
 
         # If not cached, fetch the robots.txt file
-        robots_txt_file = await RobotsTxtFile.find(url, self._http_client)
+        robots_txt_file = await self._find_txt_file_for_url(url)
         self._robots_txt_file_cache[origin_url] = robots_txt_file
         return robots_txt_file
+
+    async def _find_txt_file_for_url(self, url: str) -> RobotsTxtFile:
+        """Find the robots.txt file for a given URL.
+
+        Args:
+            url: The URL whose domain will be used to locate and fetch the corresponding robots.txt file.
+        """
+        return await RobotsTxtFile.find(url, self._http_client)
diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py
index f728e254b0..dfa2f0dfcc 100644
--- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py
+++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py
@@ -10,11 +10,13 @@
 from crawlee._request import Request, RequestOptions
 from crawlee._utils.blocked import RETRY_CSS_SELECTORS
 from crawlee._utils.docs import docs_group
+from crawlee._utils.robots import RobotsTxtFile
 from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
 from crawlee.browsers import BrowserPool
 from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
 from crawlee.errors import SessionError
 from crawlee.fingerprint_suite import DefaultFingerprintGenerator, FingerprintGenerator, HeaderGeneratorOptions
+from crawlee.http_clients import HttpxHttpClient
 from crawlee.sessions._cookies import PlaywrightCookieParam
 from crawlee.statistics import StatisticsState
 
@@ -445,6 +447,16 @@ async def _update_cookies(self, page: Page, cookies: list[PlaywrightCookieParam]
         """Update the cookies in the page context."""
         await page.context.add_cookies([{**cookie} for cookie in cookies])
 
+    async def _find_txt_file_for_url(self, url: str) -> RobotsTxtFile:
+        """Find the robots.txt file for a given URL.
+
+        Args:
+            url: The URL whose domain will be used to locate and fetch the corresponding robots.txt file.
+        """
+        http_client = HttpxHttpClient() if isinstance(self._http_client, PlaywrightHttpClient) else self._http_client
+
+        return await RobotsTxtFile.find(url, http_client=http_client)
+
 
 class _PlaywrightCrawlerAdditionalOptions(TypedDict):
    """Additional arguments for the `PlaywrightCrawler` constructor.
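
With the series applied, `context.send_request` in a `PlaywrightCrawler` defaults to the browser-backed `PlaywrightHttpClient`, so ad-hoc requests share the open page's cookies, proxy, and user agent. A minimal usage sketch follows; it is not part of the patches, relies only on the public API exercised by the tests above, and uses illustrative URLs:

```python
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    # No explicit `http_client` is given, so the crawler falls back to
    # `PlaywrightHttpClient` and routes `send_request` through the browser page.
    crawler = PlaywrightCrawler()

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        # The request goes through Playwright's `APIRequestContext`, sharing
        # cookies and the proxy with the page opened for this request.
        response = await context.send_request('https://crawlee.dev/robots.txt')
        context.log.info(response.read().decode())

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```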