feat: add an internal HttpClient to be used in send_request for PlaywrightCrawler using APIRequestContext bound to the browser context #1134

Merged · 9 commits · Apr 25, 2025
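
In practice, the change means `send_request` in a `PlaywrightCrawler` is served by Playwright's `APIRequestContext`, bound to the page's browser context, so it shares the browser's cookies, fingerprint headers, and proxy instead of going through a separate HTTP client. A minimal usage sketch mirroring the tests added below (the URLs are placeholders):

    import asyncio

    from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


    async def main() -> None:
        # With no explicit `http_client`, the crawler now defaults to the internal
        # `PlaywrightHttpClient`, so `send_request` goes through the browser's
        # `APIRequestContext` rather than a separate HTTP client.
        crawler = PlaywrightCrawler()

        @crawler.router.default_handler
        async def handler(context: PlaywrightCrawlingContext) -> None:
            response = await context.send_request('https://example.com/api/data')
            context.log.info(f'Fetched {len(response.read())} bytes via the browser context')

        await crawler.run(['https://example.com'])


    asyncio.run(main())
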
@@ -6,10 +6,10 @@

 from playwright.async_api import TimeoutError as PlaywrightTimeoutError

-from crawlee import HttpHeaders
 from crawlee._types import BasicCrawlingContext
 from crawlee._utils.docs import docs_group
 from crawlee.crawlers import AbstractHttpParser, ParsedHttpCrawlingContext, PlaywrightCrawlingContext
+from crawlee.crawlers._playwright._types import PlaywrightHttpResponse

 if TYPE_CHECKING:
     from collections.abc import Awaitable, Callable, Sequence
@@ -186,7 +186,7 @@ async def from_playwright_crawling_context(
         context_kwargs['_infinite_scroll'] = context_kwargs.pop('infinite_scroll')
         # This might not be always available.
         protocol_guess = await context_kwargs['_page'].evaluate('() => performance.getEntries()[0].nextHopProtocol')
-        http_response = await _PlaywrightHttpResponse.from_playwright_response(
+        http_response = await PlaywrightHttpResponse.from_playwright_response(
             response=context.response, protocol=protocol_guess or ''
         )
         # block_requests is useful only on pre-navigation contexts. It is useless here.
@@ -240,26 +240,3 @@ async def dummy_block_requests(

         context_kwargs['block_requests'] = context_kwargs.pop('block_requests', dummy_block_requests)
         return cls(**context_kwargs)
-
-
-@dataclass(frozen=True)
-class _PlaywrightHttpResponse:
-    """Wrapper class for playwright `Response` object to implement `HttpResponse` protocol."""
-
-    http_version: str
-    status_code: int
-    headers: HttpHeaders
-    _content: bytes
-
-    def read(self) -> bytes:
-        return self._content
-
-    @classmethod
-    async def from_playwright_response(cls, response: Response, protocol: str) -> Self:
-        headers = HttpHeaders(response.headers)
-        status_code = response.status
-        # Used http protocol version cannot be obtained from `Response` and has to be passed as additional argument.
-        http_version = protocol
-        _content = await response.body()
-
-        return cls(http_version=http_version, status_code=status_code, headers=headers, _content=_content)
10 changes: 9 additions & 1 deletion src/crawlee/crawlers/_basic/_basic_crawler.py
@@ -1340,6 +1340,14 @@ async def _get_robots_txt_file_for_url(self, url: str) -> RobotsTxtFile | None:
             return robots_txt_file

         # If not cached, fetch the robots.txt file
-        robots_txt_file = await RobotsTxtFile.find(url, self._http_client)
+        robots_txt_file = await self._find_txt_file_for_url(url)
         self._robots_txt_file_cache[origin_url] = robots_txt_file
         return robots_txt_file
+
+    async def _find_txt_file_for_url(self, url: str) -> RobotsTxtFile:
+        """Find the robots.txt file for a given URL.
+
+        Args:
+            url: The URL whose domain will be used to locate and fetch the corresponding robots.txt file.
+        """
+        return await RobotsTxtFile.find(url, self._http_client)
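
Factoring the fetch into a dedicated `_find_txt_file_for_url` hook lets subclasses choose the client used for robots.txt. `PlaywrightCrawler` overrides it below: robots.txt is fetched outside any open page, and the new `PlaywrightHttpClient` refuses to send requests without one, so an `HttpxHttpClient` is substituted in that case.
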
56 changes: 36 additions & 20 deletions src/crawlee/crawlers/_playwright/_playwright_crawler.py
@@ -10,15 +10,18 @@
 from crawlee._request import Request, RequestOptions
 from crawlee._utils.blocked import RETRY_CSS_SELECTORS
 from crawlee._utils.docs import docs_group
+from crawlee._utils.robots import RobotsTxtFile
 from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
 from crawlee.browsers import BrowserPool
 from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
 from crawlee.errors import SessionError
 from crawlee.fingerprint_suite import DefaultFingerprintGenerator, FingerprintGenerator, HeaderGeneratorOptions
+from crawlee.http_clients import HttpxHttpClient
 from crawlee.sessions._cookies import PlaywrightCookieParam
 from crawlee.statistics import StatisticsState

 from ._playwright_crawling_context import PlaywrightCrawlingContext
+from ._playwright_http_client import PlaywrightHttpClient, browser_page_context
 from ._playwright_pre_nav_crawling_context import PlaywrightPreNavCrawlingContext
 from ._utils import block_requests, infinite_scroll

@@ -168,6 +171,8 @@ def __init__(
         kwargs.setdefault('_logger', logging.getLogger(__name__))
         self._pre_navigation_hooks: list[Callable[[PlaywrightPreNavCrawlingContext], Awaitable[None]]] = []

+        kwargs['http_client'] = PlaywrightHttpClient() if not kwargs.get('http_client') else kwargs['http_client']
+
         super().__init__(**kwargs)

     async def _open_page(
@@ -194,9 +199,9 @@ async def _open_page(
             block_requests=partial(block_requests, page=crawlee_page.page),
         )

-        for hook in self._pre_navigation_hooks:
-            await hook(pre_navigation_context)
-
+        async with browser_page_context(crawlee_page.page):
+            for hook in self._pre_navigation_hooks:
+                await hook(pre_navigation_context)
         yield pre_navigation_context

     async def _navigate(
@@ -234,23 +239,24 @@ async def _navigate(

         extract_links = self._create_extract_links_function(context)

-        error = yield PlaywrightCrawlingContext(
-            request=context.request,
-            session=context.session,
-            add_requests=context.add_requests,
-            send_request=context.send_request,
-            push_data=context.push_data,
-            use_state=context.use_state,
-            proxy_info=context.proxy_info,
-            get_key_value_store=context.get_key_value_store,
-            log=context.log,
-            page=context.page,
-            infinite_scroll=lambda: infinite_scroll(context.page),
-            response=response,
-            extract_links=extract_links,
-            enqueue_links=self._create_enqueue_links_function(context, extract_links),
-            block_requests=partial(block_requests, page=context.page),
-        )
+        async with browser_page_context(context.page):
+            error = yield PlaywrightCrawlingContext(
+                request=context.request,
+                session=context.session,
+                add_requests=context.add_requests,
+                send_request=context.send_request,
+                push_data=context.push_data,
+                use_state=context.use_state,
+                proxy_info=context.proxy_info,
+                get_key_value_store=context.get_key_value_store,
+                log=context.log,
+                page=context.page,
+                infinite_scroll=lambda: infinite_scroll(context.page),
+                response=response,
+                extract_links=extract_links,
+                enqueue_links=self._create_enqueue_links_function(context, extract_links),
+                block_requests=partial(block_requests, page=context.page),
+            )

         if context.session:
             pw_cookies = await self._get_cookies(context.page)
@@ -441,6 +447,16 @@ async def _update_cookies(self, page: Page, cookies: list[PlaywrightCookieParam]
         """Update the cookies in the page context."""
         await page.context.add_cookies([{**cookie} for cookie in cookies])

+    async def _find_txt_file_for_url(self, url: str) -> RobotsTxtFile:
+        """Find the robots.txt file for a given URL.
+
+        Args:
+            url: The URL whose domain will be used to locate and fetch the corresponding robots.txt file.
+        """
+        http_client = HttpxHttpClient() if isinstance(self._http_client, PlaywrightHttpClient) else self._http_client
+
+        return await RobotsTxtFile.find(url, http_client=http_client)
+

 class _PlaywrightCrawlerAdditionalOptions(TypedDict):
     """Additional arguments for the `PlaywrightCrawler` constructor.
91 changes: 91 additions & 0 deletions src/crawlee/crawlers/_playwright/_playwright_http_client.py
@@ -0,0 +1,91 @@
+from __future__ import annotations
+
+import contextvars
+from contextlib import asynccontextmanager
+from typing import TYPE_CHECKING
+
+from typing_extensions import override
+
+from crawlee._types import HttpHeaders
+from crawlee.crawlers._playwright._types import PlaywrightHttpResponse
+from crawlee.http_clients import HttpClient, HttpCrawlingResult, HttpResponse
+
+if TYPE_CHECKING:
+    from collections.abc import AsyncGenerator
+
+    from playwright.async_api import Page
+
+    from crawlee import Request
+    from crawlee._types import HttpMethod, HttpPayload
+    from crawlee.proxy_configuration import ProxyInfo
+    from crawlee.sessions import Session
+    from crawlee.statistics import Statistics
+
+
+_browser_page_context_var: contextvars.ContextVar[Page | None] = contextvars.ContextVar('browser_context', default=None)
+
+
+@asynccontextmanager
+async def browser_page_context(page: Page) -> AsyncGenerator[None, None]:
+    """Asynchronous context manager for setting the current Playwright page in the context variable."""
+    token = _browser_page_context_var.set(page)
+    try:
+        yield
+    finally:
+        _browser_page_context_var.reset(token)
+
+
+class PlaywrightHttpClient(HttpClient):
+    """HTTP client based on the Playwright library.
+
+    This client uses the Playwright library to perform HTTP requests in crawlers (`BasicCrawler` subclasses)
+    and to manage sessions, proxies, and error handling.
+
+    See the `HttpClient` class for general information about HTTP clients.
+
+    Note: This class is intended for use in `PlaywrightCrawler` only.
+    """
+
+    def __init__(self) -> None:
+        """Initialize a new instance."""
+
+    @override
+    async def crawl(
+        self,
+        request: Request,
+        *,
+        session: Session | None = None,
+        proxy_info: ProxyInfo | None = None,
+        statistics: Statistics | None = None,
+    ) -> HttpCrawlingResult:
+        raise NotImplementedError('The `crawl` method should not be used for `PlaywrightHttpClient`')
+
+    @override
+    async def send_request(
+        self,
+        url: str,
+        *,
+        method: HttpMethod = 'GET',
+        headers: HttpHeaders | dict[str, str] | None = None,
+        payload: HttpPayload | None = None,
+        session: Session | None = None,
+        proxy_info: ProxyInfo | None = None,
+    ) -> HttpResponse:
+        # `proxy_info` is not used because `APIRequestContext` inherits the proxy from `BrowserContext`.
+        # TODO: Use `session` to restore all the fingerprint headers according to the `BrowserContext`,
+        # once https://github.com/apify/crawlee-python/issues/1055 is resolved.
+
+        if isinstance(headers, dict) or headers is None:
+            headers = HttpHeaders(headers or {})
+
+        browser_context = _browser_page_context_var.get()
+
+        if browser_context is None:
+            raise RuntimeError('Unable to create an `APIRequestContext` outside the browser context')
+
+        # Proxies appropriate to the browser context are used
+        response = await browser_context.request.fetch(
+            url_or_request=url, method=method.lower(), headers=dict(headers) if headers else None, data=payload
+        )
+
+        return await PlaywrightHttpResponse.from_playwright_response(response, protocol='')
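
The `contextvars` machinery above is what keeps concurrent page handlers isolated: each asyncio task entering `browser_page_context` sees only the page it set, even while other tasks interleave. A self-contained sketch of that isolation property, with plain strings standing in for Playwright `Page` objects:

    import asyncio
    import contextvars
    from collections.abc import AsyncGenerator
    from contextlib import asynccontextmanager

    # A string stands in for a Playwright `Page`; the scoping mechanism is the same.
    _page_var: contextvars.ContextVar[str | None] = contextvars.ContextVar('page', default=None)


    @asynccontextmanager
    async def page_context(page: str) -> AsyncGenerator[None, None]:
        token = _page_var.set(page)
        try:
            yield
        finally:
            _page_var.reset(token)


    async def handle(name: str) -> None:
        async with page_context(name):
            await asyncio.sleep(0)  # let other tasks run in between
            # Each task still observes its own page, not a neighbour's.
            assert _page_var.get() == name
        assert _page_var.get() is None  # value is reset on exit


    async def main() -> None:
        await asyncio.gather(*(handle(f'page-{i}') for i in range(3)))


    asyncio.run(main())
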
31 changes: 30 additions & 1 deletion src/crawlee/crawlers/_playwright/_types.py
@@ -1,9 +1,15 @@
 from __future__ import annotations

-from typing import Protocol
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Protocol

+from crawlee import HttpHeaders
 from crawlee._utils.docs import docs_group

+if TYPE_CHECKING:
+    from playwright.async_api import APIResponse, Response
+    from typing_extensions import Self
+

 @docs_group('Functions')
 class BlockRequestsFunction(Protocol):
@@ -22,3 +28,26 @@ async def __call__(
         url_patterns: List of URL patterns to block. If None, uses default patterns.
         extra_url_patterns: Additional URL patterns to append to the main patterns list.
         """
+
+
+@dataclass(frozen=True)
+class PlaywrightHttpResponse:
+    """Wrapper class for playwright `Response` and `APIResponse` objects to implement `HttpResponse` protocol."""
+
+    http_version: str
+    status_code: int
+    headers: HttpHeaders
+    _content: bytes
+
+    def read(self) -> bytes:
+        return self._content
+
+    @classmethod
+    async def from_playwright_response(cls, response: Response | APIResponse, protocol: str) -> Self:
+        headers = HttpHeaders(response.headers)
+        status_code = response.status
+        # The HTTP protocol version cannot be obtained from the response and has to be passed as an additional argument.
+        http_version = protocol
+        _content = await response.body()
+
+        return cls(http_version=http_version, status_code=status_code, headers=headers, _content=_content)
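
Note that `from_playwright_response` reads the body eagerly: Playwright's `body()` is a coroutine, while crawlee's `HttpResponse` protocol expects a synchronous `read()`, so the content is awaited once in the factory and `read()` simply returns the cached bytes.
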
51 changes: 51 additions & 0 deletions tests/unit/crawlers/_playwright/test_playwright_crawler.py
@@ -21,6 +21,7 @@
 )
 from crawlee.fingerprint_suite._browserforge_adapter import get_available_header_values
 from crawlee.fingerprint_suite._consts import BROWSER_TYPE_HEADER_KEYWORD
+from crawlee.http_clients import HttpxHttpClient
 from crawlee.proxy_configuration import ProxyConfiguration
 from crawlee.sessions import Session, SessionPool
 from crawlee.statistics import Statistics
@@ -616,3 +617,53 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
         str(server_url / 'start_enqueue'),
         str(server_url / 'sub_index'),
     }
+
+
+async def test_send_request(server_url: URL) -> None:
+    """Check that `send_request` goes through the browser's `APIRequestContext` and matches the page's fingerprint."""
+    check_data: dict[str, Any] = {}
+
+    crawler = PlaywrightCrawler()
+
+    @crawler.pre_navigation_hook
+    async def some_hook(context: PlaywrightPreNavCrawlingContext) -> None:
+        send_request_response = await context.send_request(str(server_url / 'user-agent'))
+        check_data['pre_send_request'] = dict(json.loads(send_request_response.read()))
+
+    @crawler.router.default_handler
+    async def request_handler(context: PlaywrightCrawlingContext) -> None:
+        response = await context.response.text()
+        check_data['default'] = dict(json.loads(response))
+        send_request_response = await context.send_request(str(server_url / 'user-agent'))
+        check_data['send_request'] = dict(json.loads(send_request_response.read()))
+
+    await crawler.run([str(server_url / 'user-agent')])
+
+    assert check_data['default'].get('user-agent') is not None
+    assert check_data['send_request'].get('user-agent') is not None
+    assert check_data['pre_send_request'] == check_data['send_request']
+
+    assert check_data['default'] == check_data['send_request']
+
+
+async def test_send_request_with_client(server_url: URL) -> None:
+    """Check that an explicitly provided HTTP client is used for `send_request` instead of the browser."""
+    check_data: dict[str, Any] = {}
+
+    crawler = PlaywrightCrawler(
+        http_client=HttpxHttpClient(header_generator=None, headers={'user-agent': 'My User-Agent'})
+    )
+
+    @crawler.router.default_handler
+    async def request_handler(context: PlaywrightCrawlingContext) -> None:
+        response = await context.response.text()
+        check_data['default'] = dict(json.loads(response))
+        send_request_response = await context.send_request(str(server_url / 'user-agent'))
+        check_data['send_request'] = dict(json.loads(send_request_response.read()))
+
+    await crawler.run([str(server_url / 'user-agent')])
+
+    assert check_data['default'].get('user-agent') is not None
+    assert check_data['send_request']['user-agent'] == 'My User-Agent'
+
+    assert check_data['default'] != check_data['send_request']