Commit e794f49
feat: add an internal HttpClient to be used in send_request for PlaywrightCrawler using APIRequestContext bound to the browser context (#1134)
### Description

This PR adds an internal `HttpClient` implementation for `PlaywrightCrawler` using the `APIRequestContext` from Playwright. This ensures that HTTP requests made via `send_request` use the same proxies as the browser context.

### Issues

- Closes: #928
1 parent 9a18065 · commit e794f49
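To illustrate the effect of this change (a minimal sketch; the URLs and handler are placeholders, not part of this commit): when no explicit `http_client` is passed, `PlaywrightCrawler` now defaults to the browser-backed client, so `context.send_request` is served by the browser's `APIRequestContext` and shares the proxy of the browser context instead of opening a separate connection.

```python
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    # With no explicit http_client, PlaywrightCrawler now falls back to PlaywrightHttpClient.
    crawler = PlaywrightCrawler()

    @crawler.router.default_handler
    async def handler(context: PlaywrightCrawlingContext) -> None:
        # Sent via the browser's APIRequestContext, so it uses the same
        # proxy settings and cookies as the page navigation itself.
        response = await context.send_request('https://example.com/api/health')
        context.log.info(f'status={response.status_code}, body={response.read()[:100]!r}')

    await crawler.run(['https://example.com'])


if __name__ == '__main__':
    asyncio.run(main())
```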

File tree: 6 files changed, +219 −47 lines changed

src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py

Lines changed: 2 additions & 25 deletions
```diff
@@ -6,10 +6,10 @@
 
 from playwright.async_api import TimeoutError as PlaywrightTimeoutError
 
-from crawlee import HttpHeaders
 from crawlee._types import BasicCrawlingContext
 from crawlee._utils.docs import docs_group
 from crawlee.crawlers import AbstractHttpParser, ParsedHttpCrawlingContext, PlaywrightCrawlingContext
+from crawlee.crawlers._playwright._types import PlaywrightHttpResponse
 
 if TYPE_CHECKING:
     from collections.abc import Awaitable, Callable, Sequence
@@ -186,7 +186,7 @@ async def from_playwright_crawling_context(
         context_kwargs['_infinite_scroll'] = context_kwargs.pop('infinite_scroll')
         # This might not be always available.
         protocol_guess = await context_kwargs['_page'].evaluate('() => performance.getEntries()[0].nextHopProtocol')
-        http_response = await _PlaywrightHttpResponse.from_playwright_response(
+        http_response = await PlaywrightHttpResponse.from_playwright_response(
             response=context.response, protocol=protocol_guess or ''
         )
         # block_requests is useful only on pre-navigation contexts. It is useless here.
@@ -240,26 +240,3 @@ async def dummy_block_requests(
 
         context_kwargs['block_requests'] = context_kwargs.pop('block_requests', dummy_block_requests)
         return cls(**context_kwargs)
-
-
-@dataclass(frozen=True)
-class _PlaywrightHttpResponse:
-    """Wrapper class for playwright `Response` object to implement `HttpResponse` protocol."""
-
-    http_version: str
-    status_code: int
-    headers: HttpHeaders
-    _content: bytes
-
-    def read(self) -> bytes:
-        return self._content
-
-    @classmethod
-    async def from_playwright_response(cls, response: Response, protocol: str) -> Self:
-        headers = HttpHeaders(response.headers)
-        status_code = response.status
-        # Used http protocol version cannot be obtained from `Response` and has to be passed as additional argument.
-        http_version = protocol
-        _content = await response.body()
-
-        return cls(http_version=http_version, status_code=status_code, headers=headers, _content=_content)
```

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 9 additions & 1 deletion
```diff
@@ -1340,6 +1340,14 @@ async def _get_robots_txt_file_for_url(self, url: str) -> RobotsTxtFile | None:
            return robots_txt_file
 
        # If not cached, fetch the robots.txt file
-        robots_txt_file = await RobotsTxtFile.find(url, self._http_client)
+        robots_txt_file = await self._find_txt_file_for_url(url)
        self._robots_txt_file_cache[origin_url] = robots_txt_file
        return robots_txt_file
+
+    async def _find_txt_file_for_url(self, url: str) -> RobotsTxtFile:
+        """Find the robots.txt file for a given URL.
+
+        Args:
+            url: The URL whose domain will be used to locate and fetch the corresponding robots.txt file.
+        """
+        return await RobotsTxtFile.find(url, self._http_client)
```
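Extracting the lookup into `_find_txt_file_for_url` gives subclasses a seam for fetching robots.txt with a different client without touching the caching logic in `_get_robots_txt_file_for_url`. A hypothetical override (the `CustomCrawler` class is illustrative only, not part of the commit):

```python
from crawlee._utils.robots import RobotsTxtFile
from crawlee.crawlers import PlaywrightCrawler
from crawlee.http_clients import HttpxHttpClient


class CustomCrawler(PlaywrightCrawler):
    async def _find_txt_file_for_url(self, url: str) -> RobotsTxtFile:
        # Always fetch robots.txt over plain HTTP instead of through the browser.
        return await RobotsTxtFile.find(url, http_client=HttpxHttpClient())
```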

src/crawlee/crawlers/_playwright/_playwright_crawler.py

Lines changed: 36 additions & 20 deletions
```diff
@@ -10,15 +10,18 @@
 from crawlee._request import Request, RequestOptions
 from crawlee._utils.blocked import RETRY_CSS_SELECTORS
 from crawlee._utils.docs import docs_group
+from crawlee._utils.robots import RobotsTxtFile
 from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
 from crawlee.browsers import BrowserPool
 from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
 from crawlee.errors import SessionError
 from crawlee.fingerprint_suite import DefaultFingerprintGenerator, FingerprintGenerator, HeaderGeneratorOptions
+from crawlee.http_clients import HttpxHttpClient
 from crawlee.sessions._cookies import PlaywrightCookieParam
 from crawlee.statistics import StatisticsState
 
 from ._playwright_crawling_context import PlaywrightCrawlingContext
+from ._playwright_http_client import PlaywrightHttpClient, browser_page_context
 from ._playwright_pre_nav_crawling_context import PlaywrightPreNavCrawlingContext
 from ._utils import block_requests, infinite_scroll
 
@@ -168,6 +171,8 @@ def __init__(
         kwargs.setdefault('_logger', logging.getLogger(__name__))
         self._pre_navigation_hooks: list[Callable[[PlaywrightPreNavCrawlingContext], Awaitable[None]]] = []
 
+        kwargs['http_client'] = PlaywrightHttpClient() if not kwargs.get('http_client') else kwargs['http_client']
+
         super().__init__(**kwargs)
 
     async def _open_page(
@@ -194,9 +199,9 @@ async def _open_page(
             block_requests=partial(block_requests, page=crawlee_page.page),
         )
 
-        for hook in self._pre_navigation_hooks:
-            await hook(pre_navigation_context)
-
+        async with browser_page_context(crawlee_page.page):
+            for hook in self._pre_navigation_hooks:
+                await hook(pre_navigation_context)
         yield pre_navigation_context
 
     async def _navigate(
@@ -234,23 +239,24 @@ async def _navigate(
 
         extract_links = self._create_extract_links_function(context)
 
-        error = yield PlaywrightCrawlingContext(
-            request=context.request,
-            session=context.session,
-            add_requests=context.add_requests,
-            send_request=context.send_request,
-            push_data=context.push_data,
-            use_state=context.use_state,
-            proxy_info=context.proxy_info,
-            get_key_value_store=context.get_key_value_store,
-            log=context.log,
-            page=context.page,
-            infinite_scroll=lambda: infinite_scroll(context.page),
-            response=response,
-            extract_links=extract_links,
-            enqueue_links=self._create_enqueue_links_function(context, extract_links),
-            block_requests=partial(block_requests, page=context.page),
-        )
+        async with browser_page_context(context.page):
+            error = yield PlaywrightCrawlingContext(
+                request=context.request,
+                session=context.session,
+                add_requests=context.add_requests,
+                send_request=context.send_request,
+                push_data=context.push_data,
+                use_state=context.use_state,
+                proxy_info=context.proxy_info,
+                get_key_value_store=context.get_key_value_store,
+                log=context.log,
+                page=context.page,
+                infinite_scroll=lambda: infinite_scroll(context.page),
+                response=response,
+                extract_links=extract_links,
+                enqueue_links=self._create_enqueue_links_function(context, extract_links),
+                block_requests=partial(block_requests, page=context.page),
+            )
 
         if context.session:
             pw_cookies = await self._get_cookies(context.page)
@@ -441,6 +447,16 @@ async def _update_cookies(self, page: Page, cookies: list[PlaywrightCookieParam]
         """Update the cookies in the page context."""
         await page.context.add_cookies([{**cookie} for cookie in cookies])
 
+    async def _find_txt_file_for_url(self, url: str) -> RobotsTxtFile:
+        """Find the robots.txt file for a given URL.
+
+        Args:
+            url: The URL whose domain will be used to locate and fetch the corresponding robots.txt file.
+        """
+        http_client = HttpxHttpClient() if isinstance(self._http_client, PlaywrightHttpClient) else self._http_client
+
+        return await RobotsTxtFile.find(url, http_client=http_client)
+
 
 class _PlaywrightCrawlerAdditionalOptions(TypedDict):
     """Additional arguments for the `PlaywrightCrawler` constructor.
```
src/crawlee/crawlers/_playwright/_playwright_http_client.py

Lines changed: 91 additions & 0 deletions

```diff
@@ -0,0 +1,91 @@
+from __future__ import annotations
+
+import contextvars
+from contextlib import asynccontextmanager
+from typing import TYPE_CHECKING
+
+from typing_extensions import override
+
+from crawlee._types import HttpHeaders
+from crawlee.crawlers._playwright._types import PlaywrightHttpResponse
+from crawlee.http_clients import HttpClient, HttpCrawlingResult, HttpResponse
+
+if TYPE_CHECKING:
+    from collections.abc import AsyncGenerator
+
+    from playwright.async_api import Page
+
+    from crawlee import Request
+    from crawlee._types import HttpMethod, HttpPayload
+    from crawlee.proxy_configuration import ProxyInfo
+    from crawlee.sessions import Session
+    from crawlee.statistics import Statistics
+
+
+_browser_page_context_var: contextvars.ContextVar[Page | None] = contextvars.ContextVar('browser_context', default=None)
+
+
+@asynccontextmanager
+async def browser_page_context(page: Page) -> AsyncGenerator[None, None]:
+    """Asynchronous context manager for setting the current Playwright page in the context variable."""
+    token = _browser_page_context_var.set(page)
+    try:
+        yield
+    finally:
+        _browser_page_context_var.reset(token)
+
+
+class PlaywrightHttpClient(HttpClient):
+    """HTTP client based on the Playwright library.
+
+    This client uses the Playwright library to perform HTTP requests in crawlers (`BasicCrawler` subclasses)
+    and to manage sessions, proxies, and error handling.
+
+    See the `HttpClient` class for more common information about HTTP clients.
+
+    Note: This class is pre-designated for use in `PlaywrightCrawler` only.
+    """
+
+    def __init__(self) -> None:
+        """Initialize a new instance."""
+
+    @override
+    async def crawl(
+        self,
+        request: Request,
+        *,
+        session: Session | None = None,
+        proxy_info: ProxyInfo | None = None,
+        statistics: Statistics | None = None,
+    ) -> HttpCrawlingResult:
+        raise NotImplementedError('The `crawl` method should not be used for `PlaywrightHttpClient`')
+
+    @override
+    async def send_request(
+        self,
+        url: str,
+        *,
+        method: HttpMethod = 'GET',
+        headers: HttpHeaders | dict[str, str] | None = None,
+        payload: HttpPayload | None = None,
+        session: Session | None = None,
+        proxy_info: ProxyInfo | None = None,
+    ) -> HttpResponse:
+        # `proxy_info` is not used because `APIRequestContext` inherits the proxy from `BrowserContext`
+        # TODO: Use `session` to restore all the fingerprint headers according to the `BrowserContext`, after
+        # https://github.com/apify/crawlee-python/issues/1055 is resolved
+
+        if isinstance(headers, dict) or headers is None:
+            headers = HttpHeaders(headers or {})
+
+        browser_context = _browser_page_context_var.get()
+
+        if browser_context is None:
+            raise RuntimeError('Unable to create an `APIRequestContext` outside the browser context')
+
+        # Proxies appropriate to the browser context are used
+        response = await browser_context.request.fetch(
+            url_or_request=url, method=method.lower(), headers=dict(headers) if headers else None, data=payload
+        )
+
+        return await PlaywrightHttpResponse.from_playwright_response(response, protocol='')
```
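A minimal standalone sketch of the two pieces above, assuming direct access to the internal module path `crawlee.crawlers._playwright._playwright_http_client` (inside `PlaywrightCrawler` this binding happens automatically in `_open_page` and `_navigate`):

```python
import asyncio

from playwright.async_api import async_playwright

from crawlee.crawlers._playwright._playwright_http_client import (
    PlaywrightHttpClient,
    browser_page_context,
)


async def main() -> None:
    client = PlaywrightHttpClient()

    async with async_playwright() as pw:
        browser = await pw.chromium.launch()
        page = await browser.new_page()

        # Outside browser_page_context the client has no page bound and raises RuntimeError;
        # inside it, the request is issued through the page's APIRequestContext.
        async with browser_page_context(page):
            response = await client.send_request('https://example.com')
            print(response.status_code, len(response.read()))

        await browser.close()


if __name__ == '__main__':
    asyncio.run(main())
```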

src/crawlee/crawlers/_playwright/_types.py

Lines changed: 30 additions & 1 deletion
```diff
@@ -1,9 +1,15 @@
 from __future__ import annotations
 
-from typing import Protocol
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Protocol
 
+from crawlee import HttpHeaders
 from crawlee._utils.docs import docs_group
 
+if TYPE_CHECKING:
+    from playwright.async_api import APIResponse, Response
+    from typing_extensions import Self
+
 
 @docs_group('Functions')
 class BlockRequestsFunction(Protocol):
@@ -22,3 +28,26 @@ async def __call__(
             url_patterns: List of URL patterns to block. If None, uses default patterns.
             extra_url_patterns: Additional URL patterns to append to the main patterns list.
         """
+
+
+@dataclass(frozen=True)
+class PlaywrightHttpResponse:
+    """Wrapper class for playwright `Response` and `APIResponse` objects to implement `HttpResponse` protocol."""
+
+    http_version: str
+    status_code: int
+    headers: HttpHeaders
+    _content: bytes
+
+    def read(self) -> bytes:
+        return self._content
+
+    @classmethod
+    async def from_playwright_response(cls, response: Response | APIResponse, protocol: str) -> Self:
+        headers = HttpHeaders(response.headers)
+        status_code = response.status
+        # Used http protocol version cannot be obtained from `Response` and has to be passed as additional argument.
+        http_version = protocol
+        _content = await response.body()
+
+        return cls(http_version=http_version, status_code=status_code, headers=headers, _content=_content)
```
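Since `PlaywrightHttpResponse` structurally satisfies the `HttpResponse` protocol, code written against the protocol works with both browser-backed and client-backed responses. A small illustrative helper (not part of the commit):

```python
from crawlee.http_clients import HttpResponse


def summarize_response(response: HttpResponse) -> str:
    # Accepts PlaywrightHttpResponse or any other HttpResponse implementation.
    protocol = response.http_version or 'unknown protocol'
    return f'{response.status_code} ({protocol}), {len(response.read())} bytes'
```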

tests/unit/crawlers/_playwright/test_playwright_crawler.py

Lines changed: 51 additions & 0 deletions
```diff
@@ -21,6 +21,7 @@
 )
 from crawlee.fingerprint_suite._browserforge_adapter import get_available_header_values
 from crawlee.fingerprint_suite._consts import BROWSER_TYPE_HEADER_KEYWORD
+from crawlee.http_clients import HttpxHttpClient
 from crawlee.proxy_configuration import ProxyConfiguration
 from crawlee.sessions import Session, SessionPool
 from crawlee.statistics import Statistics
@@ -616,3 +617,53 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
         str(server_url / 'start_enqueue'),
         str(server_url / 'sub_index'),
     }
+
+
+async def test_send_request(server_url: URL) -> None:
+    """Check that `send_request` shares the browser context in both pre-navigation hooks and handlers."""
+    check_data: dict[str, Any] = {}
+
+    crawler = PlaywrightCrawler()
+
+    @crawler.pre_navigation_hook
+    async def some_hook(context: PlaywrightPreNavCrawlingContext) -> None:
+        send_request_response = await context.send_request(str(server_url / 'user-agent'))
+        check_data['pre_send_request'] = dict(json.loads(send_request_response.read()))
+
+    @crawler.router.default_handler
+    async def request_handler(context: PlaywrightCrawlingContext) -> None:
+        response = await context.response.text()
+        check_data['default'] = dict(json.loads(response))
+        send_request_response = await context.send_request(str(server_url / 'user-agent'))
+        check_data['send_request'] = dict(json.loads(send_request_response.read()))
+
+    await crawler.run([str(server_url / 'user-agent')])
+
+    assert check_data['default'].get('user-agent') is not None
+    assert check_data['send_request'].get('user-agent') is not None
+    assert check_data['pre_send_request'] == check_data['send_request']
+
+    assert check_data['default'] == check_data['send_request']
+
+
+async def test_send_request_with_client(server_url: URL) -> None:
+    """Check that an explicitly provided HTTP client is used for `send_request` instead of the browser."""
+    check_data: dict[str, Any] = {}
+
+    crawler = PlaywrightCrawler(
+        http_client=HttpxHttpClient(header_generator=None, headers={'user-agent': 'My User-Agent'})
+    )
+
+    @crawler.router.default_handler
+    async def request_handler(context: PlaywrightCrawlingContext) -> None:
+        response = await context.response.text()
+        check_data['default'] = dict(json.loads(response))
+        send_request_response = await context.send_request(str(server_url / 'user-agent'))
+        check_data['send_request'] = dict(json.loads(send_request_response.read()))
+
+    await crawler.run([str(server_url / 'user-agent')])
+
+    assert check_data['default'].get('user-agent') is not None
+    assert check_data['send_request']['user-agent'] == 'My User-Agent'
+
+    assert check_data['default'] != check_data['send_request']
```
