diff --git a/src/crawlee/_request.py b/src/crawlee/_request.py index ee29212f8c..cce5dd063f 100644 --- a/src/crawlee/_request.py +++ b/src/crawlee/_request.py @@ -243,7 +243,11 @@ class Request(BaseRequestData): The recommended way to create a new instance is by using the `Request.from_url` constructor, which automatically generates a unique key and identifier based on the URL and request parameters. + ### Usage + ```python + from crawlee import Request + request = Request.from_url('https://crawlee.dev') ``` """ diff --git a/src/crawlee/basic_crawler/_basic_crawler.py b/src/crawlee/basic_crawler/_basic_crawler.py index d6d61d4dcf..87687dba3d 100644 --- a/src/crawlee/basic_crawler/_basic_crawler.py +++ b/src/crawlee/basic_crawler/_basic_crawler.py @@ -88,14 +88,27 @@ class BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]): class BasicCrawler(Generic[TCrawlingContext]): - """Provides a simple framework for parallel crawling of web pages. - - The URLs to crawl are fed either from a static list of URLs or from a dynamic queue of URLs enabling recursive - crawling of websites. - - `BasicCrawler` is a low-level tool that requires the user to implement the page download and data extraction - functionality themselves. If we want a crawler that already facilitates this functionality, we should consider using - one of its subclasses. + """A basic web crawler providing a framework for crawling websites. + + The `BasicCrawler` provides low-level functionality for crawling websites, allowing users to define their + own page download and data extraction logic. It is designed mostly to be subclassed by crawlers with specific + purposes. In most cases, you will want to use a more specialized crawler, such as `HttpCrawler`, + `BeautifulSoupCrawler`, `ParselCrawler`, or `PlaywrightCrawler`. If you are an advanced user and want full + control over the crawling process, you can subclass the `BasicCrawler` and implement the request-handling logic + yourself. + + The crawling process begins with URLs provided by a `RequestProvider` instance. Each request is then + handled by a user-defined `request_handler` function, which processes the page and extracts the data. + + The `BasicCrawler` includes several common features for crawling, such as: + - automatic scaling based on the system resources, + - retries for failed requests, + - session management, + - statistics tracking, + - request routing via labels, + - proxy rotation, + - direct storage interaction helpers, + - and more. """ def __init__( @@ -517,8 +530,8 @@ async def _push_data( dataset = await self.get_dataset(id=dataset_id, name=dataset_name) await dataset.push_data(data, **kwargs) - def _should_retry_request(self, crawling_context: BasicCrawlingContext, error: Exception) -> bool: - if crawling_context.request.no_retry: + def _should_retry_request(self, context: BasicCrawlingContext, error: Exception) -> bool: + if context.request.no_retry: return False # Do not retry on client errors.
@@ -526,31 +539,29 @@ def _should_retry_request(self, crawling_context: BasicCrawlingContext, error: E return False if isinstance(error, SessionError): - return ((crawling_context.request.session_rotation_count or 0) + 1) < self._max_session_rotations + return ((context.request.session_rotation_count or 0) + 1) < self._max_session_rotations - max_request_retries = crawling_context.request.max_retries + max_request_retries = context.request.max_retries if max_request_retries is None: max_request_retries = self._max_request_retries - return (crawling_context.request.retry_count + 1) < max_request_retries + return (context.request.retry_count + 1) < max_request_retries - async def _check_url_after_redirects( - self, crawling_context: TCrawlingContext - ) -> AsyncGenerator[TCrawlingContext, None]: + async def _check_url_after_redirects(self, context: TCrawlingContext) -> AsyncGenerator[TCrawlingContext, None]: """Invoked at the end of the context pipeline to make sure that the `loaded_url` still matches enqueue_strategy. This is done to filter out links that redirect outside of the crawled domain. """ - if crawling_context.request.loaded_url is not None and not self._check_enqueue_strategy( - crawling_context.request.enqueue_strategy, - origin_url=urlparse(crawling_context.request.url), - target_url=urlparse(crawling_context.request.loaded_url), + if context.request.loaded_url is not None and not self._check_enqueue_strategy( + context.request.enqueue_strategy, + origin_url=urlparse(context.request.url), + target_url=urlparse(context.request.loaded_url), ): raise ContextPipelineInterruptedError( - f'Skipping URL {crawling_context.request.loaded_url} (redirected from {crawling_context.request.url})' + f'Skipping URL {context.request.loaded_url} (redirected from {context.request.url})' ) - yield crawling_context + yield context def _check_enqueue_strategy( self, @@ -611,19 +622,19 @@ def _check_url_patterns( async def _handle_request_retries( self, - crawling_context: TCrawlingContext | BasicCrawlingContext, + context: TCrawlingContext | BasicCrawlingContext, error: Exception, ) -> None: request_provider = await self.get_request_provider() - request = crawling_context.request + request = context.request - if self._should_retry_request(crawling_context, error): + if self._should_retry_request(context, error): request.retry_count += 1 self._statistics.error_tracker.add(error) if self._error_handler: try: - new_request = await self._error_handler(crawling_context, error) + new_request = await self._error_handler(context, error) except Exception as e: raise UserDefinedErrorHandlerError('Exception thrown in user-defined request error handler') from e else: @@ -633,35 +644,31 @@ async def _handle_request_retries( await request_provider.reclaim_request(request) else: await wait_for( - lambda: request_provider.mark_request_as_handled(crawling_context.request), + lambda: request_provider.mark_request_as_handled(context.request), timeout=self._internal_timeout, timeout_message='Marking request as handled timed out after ' f'{self._internal_timeout.total_seconds()} seconds', logger=self._logger, max_retries=3, ) - await self._handle_failed_request(crawling_context, error) + await self._handle_failed_request(context, error) self._statistics.record_request_processing_failure(request.id or request.unique_key) - async def _handle_request_error( - self, - crawling_context: TCrawlingContext | BasicCrawlingContext, - error: Exception, - ) -> None: + async def _handle_request_error(self, context: TCrawlingContext 
| BasicCrawlingContext, error: Exception) -> None: try: - crawling_context.request.state = RequestState.ERROR_HANDLER + context.request.state = RequestState.ERROR_HANDLER await wait_for( - partial(self._handle_request_retries, crawling_context, error), + partial(self._handle_request_retries, context, error), timeout=self._internal_timeout, timeout_message='Handling request failure timed out after ' f'{self._internal_timeout.total_seconds()} seconds', logger=self._logger, ) - crawling_context.request.state = RequestState.DONE + context.request.state = RequestState.DONE except UserDefinedErrorHandlerError: - crawling_context.request.state = RequestState.ERROR + context.request.state = RequestState.ERROR raise except Exception as secondary_error: self._logger.exception( @@ -669,21 +676,19 @@ async def _handle_request_error( 'and its underlying storages into an unknown state and crawling will be terminated.', exc_info=secondary_error, ) - crawling_context.request.state = RequestState.ERROR + context.request.state = RequestState.ERROR raise - if crawling_context.session: - crawling_context.session.mark_bad() + if context.session: + context.session.mark_bad() - async def _handle_failed_request( - self, crawling_context: TCrawlingContext | BasicCrawlingContext, error: Exception - ) -> None: + async def _handle_failed_request(self, context: TCrawlingContext | BasicCrawlingContext, error: Exception) -> None: self._logger.exception('Request failed and reached maximum retries', exc_info=error) self._statistics.error_tracker.add(error) if self._failed_request_handler: try: - await self._failed_request_handler(crawling_context, error) + await self._failed_request_handler(context, error) except Exception as e: raise UserDefinedErrorHandlerError('Exception thrown in user-defined failed request handler') from e @@ -803,7 +808,7 @@ async def __run_task_function(self) -> None: proxy_info = await self._get_proxy_info(request, session) result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store) - crawling_context = BasicCrawlingContext( + context = BasicCrawlingContext( request=request, session=session, proxy_info=proxy_info, @@ -821,17 +826,17 @@ async def __run_task_function(self) -> None: request.state = RequestState.REQUEST_HANDLER await wait_for( - lambda: self.__run_request_handler(crawling_context), + lambda: self.__run_request_handler(context), timeout=self._request_handler_timeout, timeout_message='Request handler timed out after ' f'{self._request_handler_timeout.total_seconds()} seconds', logger=self._logger, ) - await self._commit_request_handler_result(crawling_context, result) + await self._commit_request_handler_result(context, result) await wait_for( - lambda: request_provider.mark_request_as_handled(crawling_context.request), + lambda: request_provider.mark_request_as_handled(context.request), timeout=self._internal_timeout, timeout_message='Marking request as handled timed out after ' f'{self._internal_timeout.total_seconds()} seconds', @@ -841,8 +846,8 @@ async def __run_task_function(self) -> None: request.state = RequestState.DONE - if crawling_context.session: - crawling_context.session.mark_good() + if context.session: + context.session.mark_good() self._statistics.record_request_processing_finish(statistics_id) @@ -858,20 +863,20 @@ async def __run_task_function(self) -> None: await self._handle_request_error(primary_error.crawling_context, primary_error.wrapped_exception) except SessionError as session_error: - if not crawling_context.session: + if not 
context.session: raise RuntimeError('SessionError raised in a crawling context without a session') from session_error if self._error_handler: - await self._error_handler(crawling_context, session_error) + await self._error_handler(context, session_error) - if self._should_retry_request(crawling_context, session_error): + if self._should_retry_request(context, session_error): self._logger.warning('Encountered a session error, rotating session and retrying') - crawling_context.session.retire() + context.session.retire() - if crawling_context.request.session_rotation_count is None: - crawling_context.request.session_rotation_count = 0 - crawling_context.request.session_rotation_count += 1 + if context.request.session_rotation_count is None: + context.request.session_rotation_count = 0 + context.request.session_rotation_count += 1 await request_provider.reclaim_request(request) self._statistics.error_tracker_retry.add(session_error) @@ -879,7 +884,7 @@ async def __run_task_function(self) -> None: self._logger.exception('Request failed and reached maximum retries', exc_info=session_error) await wait_for( - lambda: request_provider.mark_request_as_handled(crawling_context.request), + lambda: request_provider.mark_request_as_handled(context.request), timeout=self._internal_timeout, timeout_message='Marking request as handled timed out after ' f'{self._internal_timeout.total_seconds()} seconds', @@ -894,7 +899,7 @@ async def __run_task_function(self) -> None: self._logger.debug('The context pipeline was interrupted', exc_info=interrupted_error) await wait_for( - lambda: request_provider.mark_request_as_handled(crawling_context.request), + lambda: request_provider.mark_request_as_handled(context.request), timeout=self._internal_timeout, timeout_message='Marking request as handled timed out after ' f'{self._internal_timeout.total_seconds()} seconds', @@ -907,7 +912,7 @@ async def __run_task_function(self) -> None: 'An exception occurred during the initialization of crawling context', exc_info=initialization_error, ) - await self._handle_request_error(crawling_context, initialization_error.wrapped_exception) + await self._handle_request_error(context, initialization_error.wrapped_exception) except Exception as internal_error: self._logger.exception( @@ -917,5 +922,5 @@ async def __run_task_function(self) -> None: ) raise - async def __run_request_handler(self, crawling_context: BasicCrawlingContext) -> None: - await self._context_pipeline(crawling_context, self.router) + async def __run_request_handler(self, context: BasicCrawlingContext) -> None: + await self._context_pipeline(context, self.router) diff --git a/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawler.py b/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawler.py index 4b9d6ef5ed..b970388b7a 100644 --- a/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawler.py +++ b/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawler.py @@ -23,7 +23,40 @@ class BeautifulSoupCrawler(BasicCrawler[BeautifulSoupCrawlingContext]): - """A crawler that fetches the request URL using `httpx` and parses the result with `BeautifulSoup`.""" + """A web crawler for performing HTTP requests and parsing HTML/XML content. + + The `BeautifulSoupCrawler` builds on top of the `BasicCrawler`, which means it inherits all of its features. + On top of that it implements the HTTP communication using the HTTP clients and HTML/XML parsing using the + `BeautifulSoup` library. 
The class allows integration with any HTTP client that implements the `BaseHttpClient` + interface. The HTTP client is provided to the crawler as an input parameter to the constructor. + + The HTTP client-based crawlers are ideal for websites that do not require JavaScript execution. However, + if you need to execute client-side JavaScript, consider using a browser-based crawler like the `PlaywrightCrawler`. + + ### Usage + + ```python + from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext + + crawler = BeautifulSoupCrawler() + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Extract data from the page. + data = { + 'url': context.request.url, + 'title': context.soup.title.string if context.soup.title else None, + } + + # Push the extracted data to the default dataset. + await context.push_data(data) + + await crawler.run(['https://crawlee.dev/']) + ``` + """ def __init__( self, @@ -37,11 +70,11 @@ def __init__( Args: parser: The type of parser that should be used by `BeautifulSoup`. - additional_http_error_status_codes: HTTP status codes that should be considered errors (and trigger - a retry). - ignore_http_error_status_codes: HTTP status codes that are normally considered errors but we want to treat - them as successful. - kwargs: Arguments to be forwarded to the underlying `BasicCrawler`. + additional_http_error_status_codes: Additional HTTP status codes to treat as errors, triggering + automatic retries when encountered. + ignore_http_error_status_codes: HTTP status codes typically considered errors but to be treated + as successful responses. + kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`. """ self._parser = parser @@ -65,6 +98,14 @@ def __init__( super().__init__(**kwargs) async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenerator[HttpCrawlingContext, None]: + """Executes an HTTP request using a configured HTTP client. + + Args: + context: The crawling context from the `BasicCrawler`. + + Yields: + The enhanced crawling context with the HTTP response. + """ result = await self._http_client.crawl( request=context.request, session=context.session, @@ -85,16 +126,28 @@ async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenera ) async def _handle_blocked_request( - self, crawling_context: BeautifulSoupCrawlingContext + self, + context: BeautifulSoupCrawlingContext, ) -> AsyncGenerator[BeautifulSoupCrawlingContext, None]: + """Try to detect if the request is blocked based on the HTTP status code or the response content. + + Args: + context: The current crawling context. + + Raises: + SessionError: If the request is considered blocked. + + Yields: + The original crawling context if no errors are detected.
+ """ if self._retry_on_blocked: - status_code = crawling_context.http_response.status_code + status_code = context.http_response.status_code - if crawling_context.session and crawling_context.session.is_blocked_status_code(status_code=status_code): + if context.session and context.session.is_blocked_status_code(status_code=status_code): raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}') matched_selectors = [ - selector for selector in RETRY_CSS_SELECTORS if crawling_context.soup.select_one(selector) is not None + selector for selector in RETRY_CSS_SELECTORS if context.soup.select_one(selector) is not None ] if matched_selectors: @@ -103,12 +156,20 @@ async def _handle_blocked_request( f"HTTP response matched the following selectors: {'; '.join(matched_selectors)}" ) - yield crawling_context + yield context async def _parse_http_response( self, context: HttpCrawlingContext, ) -> AsyncGenerator[BeautifulSoupCrawlingContext, None]: + """Parse the HTTP response using the `BeautifulSoup` library and implement the `enqueue_links` function. + + Args: + context: The current crawling context. + + Yields: + The enhanced crawling context with the `BeautifulSoup` selector and the `enqueue_links` function. + """ soup = await asyncio.to_thread(lambda: BeautifulSoup(context.http_response.read(), self._parser)) async def enqueue_links( diff --git a/src/crawlee/http_clients/_httpx.py b/src/crawlee/http_clients/_httpx.py index 4d004c322b..321fd33a47 100644 --- a/src/crawlee/http_clients/_httpx.py +++ b/src/crawlee/http_clients/_httpx.py @@ -77,6 +77,16 @@ class HttpxHttpClient(BaseHttpClient): and to manage sessions, proxies, and error handling. See the `BaseHttpClient` class for more common information about HTTP clients. + + ### Usage + + ```python + from crawlee.http_clients import HttpxHttpClient + from crawlee.http_crawler import HttpCrawler # or any other HTTP client-based crawler + + http_client = HttpxHttpClient() + crawler = HttpCrawler(http_client=http_client) + ``` """ _DEFAULT_HEADER_GENERATOR = HeaderGenerator() diff --git a/src/crawlee/http_clients/curl_impersonate.py b/src/crawlee/http_clients/curl_impersonate.py index bbbd872a42..51e434ff9c 100644 --- a/src/crawlee/http_clients/curl_impersonate.py +++ b/src/crawlee/http_clients/curl_impersonate.py @@ -77,6 +77,16 @@ class CurlImpersonateHttpClient(BaseHttpClient): and to manage sessions, proxies, and error handling. See the `BaseHttpClient` class for more common information about HTTP clients. + + ### Usage + + ```python + from crawlee.http_clients.curl_impersonate import CurlImpersonateHttpClient + from crawlee.http_crawler import HttpCrawler # or any other HTTP client-based crawler + + http_client = CurlImpersonateHttpClient() + crawler = HttpCrawler(http_client=http_client) + ``` """ def __init__( diff --git a/src/crawlee/http_crawler/_http_crawler.py b/src/crawlee/http_crawler/_http_crawler.py index 5f7904fd1a..c2ac6de7d9 100644 --- a/src/crawlee/http_crawler/_http_crawler.py +++ b/src/crawlee/http_crawler/_http_crawler.py @@ -15,7 +15,40 @@ class HttpCrawler(BasicCrawler[HttpCrawlingContext]): - """A crawler that fetches the request URL using `httpx`.""" + """A web crawler for performing HTTP requests. + + The `HttpCrawler` builds on top of the `BasicCrawler`, which means it inherits all of its features. On top + of that it implements the HTTP communication using the HTTP clients. The class allows integration with + any HTTP client that implements the `BaseHttpClient` interface.
The HTTP client is provided to the crawler + as an input parameter to the constructor. + + The HTTP client-based crawlers are ideal for websites that do not require JavaScript execution. However, + if you need to execute client-side JavaScript, consider using a browser-based crawler like the `PlaywrightCrawler`. + + ### Usage + + ```python + from crawlee.http_crawler import HttpCrawler, HttpCrawlingContext + + crawler = HttpCrawler() + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: HttpCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Extract data from the page. + data = { + 'url': context.request.url, + 'response': context.http_response.read().decode()[:100], + } + + # Push the extracted data to the default dataset. + await context.push_data(data) + + await crawler.run(['https://crawlee.dev/']) + ``` + """ def __init__( self, @@ -27,11 +60,11 @@ def __init__( """A default constructor. Args: - additional_http_error_status_codes: HTTP status codes that should be considered errors (and trigger - a retry). - ignore_http_error_status_codes: HTTP status codes that are normally considered errors but we want to treat - them as successful. - kwargs: Arguments to be forwarded to the underlying `BasicCrawler`. + additional_http_error_status_codes: Additional HTTP status codes to treat as errors, triggering + automatic retries when encountered. + ignore_http_error_status_codes: HTTP status codes typically considered errors but to be treated + as successful responses. + kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`. """ kwargs['_context_pipeline'] = ( ContextPipeline().compose(self._make_http_request).compose(self._handle_blocked_request) @@ -50,6 +83,14 @@ def __init__( super().__init__(**kwargs) async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenerator[HttpCrawlingContext, None]: + """Executes an HTTP request using a configured HTTP client. + + Args: + context: The crawling context from the `BasicCrawler`. + + Yields: + The enhanced crawling context with the HTTP response. + """ result = await self._http_client.crawl( request=context.request, session=context.session, @@ -70,6 +111,17 @@ async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenera ) async def _handle_blocked_request(self, context: HttpCrawlingContext) -> AsyncGenerator[HttpCrawlingContext, None]: + """Try to detect if the request is blocked based on the HTTP status code. + + Args: + context: The current crawling context. + + Raises: + SessionError: If the request is considered blocked. + + Yields: + The original crawling context if no errors are detected. + """ if self._retry_on_blocked: status_code = context.http_response.status_code diff --git a/src/crawlee/parsel_crawler/_parsel_crawler.py b/src/crawlee/parsel_crawler/_parsel_crawler.py index 90a692ab45..0af7d4e412 100644 --- a/src/crawlee/parsel_crawler/_parsel_crawler.py +++ b/src/crawlee/parsel_crawler/_parsel_crawler.py @@ -23,7 +23,40 @@ class ParselCrawler(BasicCrawler[ParselCrawlingContext]): - """A crawler that fetches the request URL using `httpx` and parses the result with `Parsel`.""" + """A web crawler for performing HTTP requests and parsing HTML/XML content. + + The `ParselCrawler` builds on top of the `BasicCrawler`, which means it inherits all of its features. 
+ On top of that it implements the HTTP communication using the HTTP clients and HTML/XML parsing using the + `Parsel` library. The class allows integration with any HTTP client that implements the `BaseHttpClient` + interface. The HTTP client is provided to the crawler as an input parameter to the constructor. + + The HTTP client-based crawlers are ideal for websites that do not require JavaScript execution. However, + if you need to execute client-side JavaScript, consider using a browser-based crawler like the `PlaywrightCrawler`. + + ### Usage + + ```python + from crawlee.parsel_crawler import ParselCrawler, ParselCrawlingContext + + crawler = ParselCrawler() + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: ParselCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Extract data from the page. + data = { + 'url': context.request.url, + 'title': context.selector.css('title').get(), + } + + # Push the extracted data to the default dataset. + await context.push_data(data) + + await crawler.run(['https://crawlee.dev/']) + ``` + """ def __init__( self, @@ -35,11 +68,11 @@ def __init__( """A default constructor. Args: - additional_http_error_status_codes: HTTP status codes that should be considered errors (and trigger - a retry). - ignore_http_error_status_codes: HTTP status codes that are normally considered errors but we want to treat - them as successful. - kwargs: Arguments to be forwarded to the underlying `BasicCrawler`. + additional_http_error_status_codes: Additional HTTP status codes to treat as errors, triggering + automatic retries when encountered. + ignore_http_error_status_codes: HTTP status codes typically considered errors but to be treated + as successful responses. + kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`. """ kwargs['_context_pipeline'] = ( ContextPipeline() @@ -61,6 +94,14 @@ def __init__( super().__init__(**kwargs) async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenerator[HttpCrawlingContext, None]: + """Executes an HTTP request using a configured HTTP client. + + Args: + context: The crawling context from the `BasicCrawler`. + + Yields: + The enhanced crawling context with the HTTP response. + """ result = await self._http_client.crawl( request=context.request, session=context.session, @@ -81,15 +122,26 @@ async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenera ) async def _handle_blocked_request( - self, crawling_context: ParselCrawlingContext + self, context: ParselCrawlingContext ) -> AsyncGenerator[ParselCrawlingContext, None]: + """Try to detect if the request is blocked based on the HTTP status code or the response content. + + Args: + context: The current crawling context. + + Raises: + SessionError: If the request is considered blocked. + + Yields: + The original crawling context if no errors are detected.
+ """ if self._retry_on_blocked: - status_code = crawling_context.http_response.status_code + status_code = context.http_response.status_code - if crawling_context.session and crawling_context.session.is_blocked_status_code(status_code=status_code): + if context.session and context.session.is_blocked_status_code(status_code=status_code): raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}') - parsel = crawling_context.selector + parsel = context.selector matched_selectors = [ selector @@ -103,12 +155,20 @@ async def _handle_blocked_request( f"HTTP response matched the following selectors: {'; '.join(matched_selectors)}" ) - yield crawling_context + yield context async def _parse_http_response( self, context: HttpCrawlingContext, ) -> AsyncGenerator[ParselCrawlingContext, None]: + """Parse the HTTP response using the `Parsel` library and implement the `enqueue_links` function. + + Args: + context: The current crawling context. + + Yields: + The enhanced crawling context with the `Parsel` selector and the `enqueue_links` function. + """ parsel_selector = await asyncio.to_thread(lambda: Selector(body=context.http_response.read())) async def enqueue_links( diff --git a/src/crawlee/playwright_crawler/_playwright_crawler.py b/src/crawlee/playwright_crawler/_playwright_crawler.py index eef1f5f572..acc8c115ef 100644 --- a/src/crawlee/playwright_crawler/_playwright_crawler.py +++ b/src/crawlee/playwright_crawler/_playwright_crawler.py @@ -24,23 +24,43 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext]): - """A crawler that leverages the [Playwright](https://playwright.dev/python/) browser automation library. + """A web crawler that leverages the `Playwright` browser automation library. - `PlaywrightCrawler` is a subclass of `BasicCrawler`, inheriting all its features, such as autoscaling of requests, - request routing, and utilization of `RequestProvider`. Additionally, it offers Playwright-specific methods and - properties, like the `page` property for user data extraction, and the `enqueue_links` method for crawling - other pages. + The `PlaywrightCrawler` builds on top of the `BasicCrawler`, which means it inherits all of its features. + On top of that it provides a high-level web crawling interface using the `Playwright` library. To be more + specific, it uses Crawlee's `BrowserPool` to manage Playwright's browser instances and the pages they + open. You can create your own `BrowserPool` instance and pass it to the `PlaywrightCrawler` constructor, or let + the crawler create a new instance with the default settings. - This crawler is ideal for crawling websites that require JavaScript execution, as it uses headless browsers - to download web pages and extract data. For websites that do not require JavaScript, consider using - `BeautifulSoupCrawler`, which uses raw HTTP requests, and it is much faster. + This crawler is ideal for crawling websites that require JavaScript execution, as it uses real browsers + to download web pages and extract data. For websites that do not require JavaScript, consider using one of the + HTTP client-based crawlers, such as the `HttpCrawler`, `ParselCrawler`, or `BeautifulSoupCrawler`. They use + raw HTTP requests, which means they are much faster. - `PlaywrightCrawler` opens a new browser page (i.e., tab) for each `Request` object and invokes the user-provided - request handler function via the `Router`. Users can interact with the page and extract the data using - the Playwright API.
+ ### Usage - Note that the pool of browser instances used by `PlaywrightCrawler`, and the pages they open, is internally - managed by the `BrowserPool`. + ```python + from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext + + crawler = PlaywrightCrawler() + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Extract data from the page. + data = { + 'url': context.request.url, + 'title': await context.page.title(), + 'response': (await context.response.text())[:100], + } + + # Push the extracted data to the default dataset. + await context.push_data(data) + + await crawler.run(['https://crawlee.dev/']) + ``` """ def __init__( @@ -58,7 +78,7 @@ def __init__( This option should not be used if `browser_pool` is provided. headless: Whether to run the browser in headless mode. This option should not be used if `browser_pool` is provided. - kwargs: Additional arguments to be forwarded to the underlying `BasicCrawler`. + kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`. """ if browser_pool: # Raise an exception if browser_pool is provided together with headless or browser_type arguments. @@ -86,7 +106,7 @@ async def _make_http_request( self, context: BasicCrawlingContext, ) -> AsyncGenerator[PlaywrightCrawlingContext, None]: - """Enhance the crawling context with making an HTTP request using Playwright. + """Executes an HTTP request utilizing the `BrowserPool` and the `Playwright` library. Args: context: The basic crawling context to be enhanced. @@ -96,7 +116,8 @@ async def _make_http_request( SessionError: If the URL cannot be loaded by the browser. Yields: - An enhanced crawling context with Playwright-specific features. + The enhanced crawling context with the Playwright-specific features (page, response, enqueue_links, and + infinite_scroll). """ if self._browser_pool is None: raise ValueError('Browser pool is not initialized.') @@ -174,28 +195,28 @@ async def enqueue_links( async def _handle_blocked_request( self, - crawling_context: PlaywrightCrawlingContext, + context: PlaywrightCrawlingContext, ) -> AsyncGenerator[PlaywrightCrawlingContext, None]: - """Enhance the crawling context with handling of blocked requests. + """Try to detect if the request is blocked based on the HTTP status code or the response content. Args: - crawling_context: The crawling context to be checked for blocking. + context: The current crawling context. Raises: - SessionError: If the session is blocked based on the HTTP status code or the response content. + SessionError: If the request is considered blocked. Yields: - The original crawling context if the session is not blocked. + The original crawling context if no errors are detected. """ if self._retry_on_blocked: - status_code = crawling_context.response.status + status_code = context.response.status # Check if the session is blocked based on the HTTP status code. 
- if crawling_context.session and crawling_context.session.is_blocked_status_code(status_code=status_code): + if context.session and context.session.is_blocked_status_code(status_code=status_code): raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}.') matched_selectors = [ - selector for selector in RETRY_CSS_SELECTORS if (await crawling_context.page.query_selector(selector)) + selector for selector in RETRY_CSS_SELECTORS if (await context.page.query_selector(selector)) ] # Check if the session is blocked based on the response content @@ -205,4 +226,4 @@ async def _handle_blocked_request( f"HTTP response matched the following selectors: {'; '.join(matched_selectors)}" ) - yield crawling_context + yield context diff --git a/src/crawlee/storages/_dataset.py b/src/crawlee/storages/_dataset.py index a2fa83f542..61c1614077 100644 --- a/src/crawlee/storages/_dataset.py +++ b/src/crawlee/storages/_dataset.py @@ -100,8 +100,11 @@ class Dataset(BaseStorage): not exist will raise an error; however, if accessed by `name`, the dataset will be created if it doesn't already exist. - Usage: + ### Usage + ```python + from crawlee.storages import Dataset + dataset = await Dataset.open(name='my_dataset') ``` """ diff --git a/src/crawlee/storages/_key_value_store.py b/src/crawlee/storages/_key_value_store.py index 9d480fdc6f..fb80211869 100644 --- a/src/crawlee/storages/_key_value_store.py +++ b/src/crawlee/storages/_key_value_store.py @@ -39,8 +39,11 @@ class KeyValueStore(BaseStorage): that does not exist will raise an error; however, if accessed by `name`, the store will be created if it does not already exist. - Usage: + ### Usage + ```python + from crawlee.storages import KeyValueStore + kvs = await KeyValueStore.open(name='my_kvs') ``` """ diff --git a/src/crawlee/storages/_request_queue.py b/src/crawlee/storages/_request_queue.py index 8510631427..bd1127d488 100644 --- a/src/crawlee/storages/_request_queue.py +++ b/src/crawlee/storages/_request_queue.py @@ -85,8 +85,11 @@ class RequestQueue(BaseStorage, RequestProvider): persist indefinitely, while unnamed queues expire after 7 days unless specified otherwise. The queue supports mutable operations, allowing URLs to be added and removed as needed. - Usage: + ### Usage + ```python + from crawlee.storages import RequestQueue + rq = await RequestQueue.open(name='my_rq') ``` """
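Unlike its subclasses above, `BasicCrawler` itself does not gain a `### Usage` snippet in this diff. For reference, here is a minimal sketch of what direct usage could look like, mirroring the pattern of the other examples. It assumes that `BasicCrawler` and `BasicCrawlingContext` are importable from `crawlee.basic_crawler` and that the base context exposes the same `log` and `push_data` helpers used in the subclass examples; since `BasicCrawler` performs no page download itself, that step is left to the handler.

```python
import asyncio

from crawlee.basic_crawler import BasicCrawler, BasicCrawlingContext


async def main() -> None:
    crawler = BasicCrawler()

    # Define the default request handler, which will be called for every request.
    # BasicCrawler does not download pages for you, so fetching and parsing
    # happen inside the handler (e.g. via an HTTP client of your choice).
    @crawler.router.default_handler
    async def request_handler(context: BasicCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Push whatever data the handler extracted to the default dataset.
        await context.push_data({'url': context.request.url})

    await crawler.run(['https://crawlee.dev/'])


asyncio.run(main())
```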