diff --git a/.travis.yml b/.travis.yml
index 17ddb3f..8e8e322 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -2,13 +2,13 @@ language: python
 matrix:
   include:
-  - python: 3.5
-    env: TOXENV=py35
   - python: 3.6
     env: TOXENV=py36
   - python: 3.7
     env: TOXENV=py37
-  - python: 3.7
+  - python: 3.8
+    env: TOXENV=py38
+  - python: 3.8
     env: TOXENV=flake8
 
 # command to install dependencies
diff --git a/README.rst b/README.rst
index d31dc31..ab32512 100644
--- a/README.rst
+++ b/README.rst
@@ -28,11 +28,32 @@ Installation
 
     pip install scrapy-autoextract
 
-scrapy-autoextract requires Python 3.5+
+scrapy-autoextract requires Python 3.6+
 
+Usage
+=====
+
+There are two different ways to consume the AutoExtract API with this library:
+
+* using our Scrapy middleware
+* using our Page Object providers
+
+The middleware
+--------------
+
+The middleware is opt-in and can be explicitly enabled per request,
+with the ``{'autoextract': {'enabled': True}}`` request meta.
+All the options below can be set either in the project settings file,
+or just for specific spiders, in the ``custom_settings`` dict.
+
+Within the spider, consuming the AutoExtract result is as easy as::
+
+    def parse(self, response):
+        yield response.meta['autoextract']
+
 Configuration
-=============
+^^^^^^^^^^^^^
 
 Add the AutoExtract downloader middleware in the settings file::
 
@@ -42,16 +63,132 @@ Add the AutoExtract downloader middleware in the settings file::
 
 Note that this should be the last downloader middleware to be executed.
 
+The providers
+-------------
 
-Usage
-=====
+Another way of consuming the AutoExtract API is using the Page Objects pattern
+proposed by the `web-poet`_ library and implemented by `scrapy-poet`_.
 
-The middleware is opt-in and can be explicitly enabled per request,
-with the ``{'autoextract': {'enabled': True}}`` request meta.
-All the options below can be set either in the project settings file,
-or just for specific spiders, in the ``custom_settings`` dict.
+Page Objects and their returned Items are defined by the `autoextract-poet`_
+library.
+
+Within the spider, consuming the AutoExtract result is as easy as::
+
+    import scrapy
+    from autoextract_poet.page_inputs import AutoExtractArticleData
+
+    class SampleSpider(scrapy.Spider):
+
+        name = "sample"
+
+        def parse(self, response, article: AutoExtractArticleData):
+            # We're making two requests here:
+            # - one through Scrapy to build the response argument
+            # - another through providers to build the article argument
+            yield article.to_item()
+
+Note that in the example above, we're going to perform two requests:
+
+* one goes through Scrapy (it might use Crawlera, Splash or no proxy at all, depending on your configuration)
+* another goes through the AutoExtract API using `scrapinghub-autoextract`_
+
+If you don't need the additional request going through Scrapy,
+you can annotate the response argument of your callback with ``DummyResponse``.
+This way, the Scrapy request is skipped and only the AutoExtract API request is made.
+
+For example::
+
+    import scrapy
+    from autoextract_poet.page_inputs import AutoExtractArticleData
+    from scrapy_poet.utils import DummyResponse
 
-Available settings:
+    class SampleSpider(scrapy.Spider):
+
+        name = "sample"
+
+        def parse(self, response: DummyResponse, article: AutoExtractArticleData):
+            # We're making a single request here to build the article argument
+            yield article.to_item()
+
+Configuration
+^^^^^^^^^^^^^
+
+First, configure scrapy-poet as described in `scrapy-poet's documentation`_,
+then enable the AutoExtract providers by adding the following code to Scrapy's
+``settings.py`` file::
+
+    # Install AutoExtract providers
+    import scrapy_autoextract.providers
+    scrapy_autoextract.providers.install()
+
+    # Enable scrapy-poet's provider injection middleware
+    DOWNLOADER_MIDDLEWARES = {
+        'scrapy_poet.InjectionMiddleware': 543,
+    }
+
+    # Configure Twisted's reactor for asyncio support on Scrapy
+    TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'
+
+Currently, our providers are implemented using asyncio.
+Scrapy has had asyncio support since version 2.0,
+but as of Scrapy 2.3 you still need to enable it manually by configuring
+Twisted's default reactor.
+Check `Scrapy's asyncio documentation`_ for more information.
+
+Checklist:
+
+* scrapy-poet is installed and its injection downloader middleware is configured
+* autoextract-poet is installed (page inputs are imported from this library)
+* providers are installed in ``settings.py``
+* Scrapy's asyncio support is enabled in ``settings.py``
+
+Now you should be ready to use our AutoExtract providers.
+
+Exceptions
+^^^^^^^^^^
+
+While fetching data from the AutoExtract API, providers might raise some exceptions.
+Those exceptions might come from scrapy-autoextract providers themselves,
+`scrapinghub-autoextract`_, or Tenacity, the library used to implement retries.
+For example:
+
+* ``autoextract.aio.errors.RequestError``: raised when a `Request-level error`_ is returned
+* ``tenacity.RetryError``: raised when an error persists even after the retries
+
+Check `scrapinghub-autoextract's async errors`_ for exception definitions.
+
+You can capture those exceptions using an error callback (``errback``)::
+
+    import scrapy
+    from autoextract.aio.errors import RequestError, QueryRetryError
+    from autoextract_poet.page_inputs import AutoExtractArticleData
+    from scrapy_poet.utils import DummyResponse
+    from tenacity import RetryError
+    from twisted.python.failure import Failure
+
+    class SampleSpider(scrapy.Spider):
+
+        name = "sample"
+        urls = [...]
+
+        def start_requests(self):
+            for url in self.urls:
+                yield scrapy.Request(url, callback=self.parse_article, errback=self.errback_article)
+
+        def parse_article(self, response: DummyResponse, article: AutoExtractArticleData):
+            yield article.to_item()
+
+        def errback_article(self, failure: Failure):
+            if failure.check(RequestError):
+                self.logger.error(f"RequestError on {failure.request.url}")
+
+            if failure.check(RetryError):
+                self.logger.error(f"RetryError on {failure.request.url}")
+
+See the `Scrapy documentation <https://docs.scrapy.org/en/latest/topics/request-response.html#using-errbacks-to-catch-exceptions-in-request-processing>`_
+for more details on how to capture exceptions using the request's errback.
+
+Settings
+========
+
+Middleware settings
+-------------------
 
 - ``AUTOEXTRACT_USER`` [mandatory] is your AutoExtract API key
 - ``AUTOEXTRACT_URL`` [optional] the AutoExtract service url. Defaults to autoextract.scrapinghub.com.
@@ -67,11 +204,12 @@ Available settings:
 - If set to ``SlotPolicy.PER_DOMAIN``, then consider setting
   ``SCHEDULER_PRIORITY_QUEUE = 'scrapy.pqueues.DownloaderAwarePriorityQueue'``
   to make better usage of AutoExtract concurrency and avoid delays.
 
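+For reference, here is a minimal sketch that combines these settings with the
+per-request opt-in described above. The API key below is a placeholder, and the
+downloader middleware entry from the Configuration section is assumed to be in
+place::
+
+    # settings.py (sketch)
+    AUTOEXTRACT_USER = 'your-autoextract-api-key'  # placeholder, use your own key
+    SCHEDULER_PRIORITY_QUEUE = 'scrapy.pqueues.DownloaderAwarePriorityQueue'
+
+    # myspider.py (sketch)
+    import scrapy
+
+    class SampleSpider(scrapy.Spider):
+
+        name = "sample"
+
+        def start_requests(self):
+            # Opt in to AutoExtract for this request via the request meta
+            yield scrapy.Request(
+                "http://example.com",
+                meta={'autoextract': {'enabled': True}},
+            )
+
+        def parse(self, response):
+            # The extraction result is available in the response meta
+            yield response.meta['autoextract']
+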
-Within the spider, consuming the AutoExtract result is as easy as::
-
-    def parse(self, response):
-        yield response.meta['autoextract']
+Provider settings
+-----------------
 
+- ``AUTOEXTRACT_USER`` [optional] is your AutoExtract API key. Defaults to the ``SCRAPINGHUB_AUTOEXTRACT_KEY`` environment variable.
+- ``AUTOEXTRACT_URL`` [optional] the AutoExtract service url. Defaults to the official AutoExtract endpoint.
+- ``AUTOEXTRACT_MAX_QUERY_ERROR_RETRIES`` [optional] Max number of retries for Query-level errors. Defaults to ``3``.
 
 Limitations
 ===========
@@ -89,8 +227,26 @@ When using the AutoExtract middleware, there are some limitations.
 
   so it's best to use ``AUTHTHROTTLE_ENABLED=False`` in the settings.
 * Redirects are handled by AutoExtract, not by Scrapy,
   so these kinds of middlewares might have no effect
-* Retries should be disabled, because AutoExtract handles them internally
-  (use ``RETRY_ENABLED=False`` in the settings)
-  There is an exception, if there are too many requests sent in
-  a short amount of time and AutoExtract returns HTTP code 429.
-  For that case it's best to use ``RETRY_HTTP_CODES=[429]``.
+* 429 errors can be handled as standard retries when using the Scrapy middleware,
+  but they are handled properly and automatically with the scrapy-poet
+  integration, as it relies on `scrapinghub-autoextract`_.
+  You may lose some responses with the middleware approach.
+* Overall, retries behave better with the scrapy-poet integration, which also
+  supports automatic retries for Query-level errors without any need to change
+  ``RETRY_HTTP_CODES``.
+
+When using the AutoExtract providers, be aware that:
+
+* With the scrapy-poet integration, retry requests don't go through Scrapy
+* Not all data types are supported with scrapy-poet;
+  currently only Articles and Products are supported (see the sketch below)
+
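+For instance, a provider-based Product spider follows the same shape as the
+Article examples above; only the page input class changes. A minimal sketch
+(the spider name here is illustrative)::
+
+    import scrapy
+    from autoextract_poet.page_inputs import AutoExtractProductData
+    from scrapy_poet.utils import DummyResponse
+
+    class ProductSpider(scrapy.Spider):
+
+        name = "products"
+
+        def parse(self, response: DummyResponse, product: AutoExtractProductData):
+            # A single AutoExtract request is made to build the product argument
+            yield product.to_item()
+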
+.. _`web-poet`: https://github.com/scrapinghub/web-poet
+.. _`scrapy-poet`: https://github.com/scrapinghub/scrapy-poet
+.. _`autoextract-poet`: https://github.com/scrapinghub/autoextract-poet
+.. _`scrapinghub-autoextract`: https://github.com/scrapinghub/scrapinghub-autoextract
+.. _`scrapinghub-autoextract's async errors`: https://github.com/scrapinghub/scrapinghub-autoextract/blob/master/autoextract/aio/errors.py
+.. _`scrapy-poet's documentation`: https://scrapy-poet.readthedocs.io/en/latest/intro/tutorial.html#configuring-the-project
+.. _`Scrapy's asyncio documentation`: https://docs.scrapy.org/en/latest/topics/asyncio.html
+.. _`Request-level error`: https://doc.scrapinghub.com/autoextract.html#request-level
+.. _`Query-level error`: https://doc.scrapinghub.com/autoextract.html#query-level
diff --git a/scrapy_autoextract/providers.py b/scrapy_autoextract/providers.py
new file mode 100644
index 0000000..e6ddb28
--- /dev/null
+++ b/scrapy_autoextract/providers.py
@@ -0,0 +1,114 @@
+from typing import ClassVar, Type
+
+from autoextract.aio import request_raw
+from autoextract.request import Request as AutoExtractRequest
+from autoextract_poet.page_inputs import (
+    AutoExtractArticleData,
+    AutoExtractProductData,
+)
+from scrapy import Request
+from scrapy.settings import Settings
+from scrapy.statscollectors import StatsCollector
+from scrapy_poet.page_input_providers import (
+    PageObjectInputProvider,
+    register,
+)
+
+
+class QueryError(Exception):
+
+    def __init__(self, query: dict, message: str):
+        self.query = query
+        self.message = message
+
+    def __str__(self):
+        return f"QueryError: query={self.query}, message='{self.message}'"
+
+
+class _Provider(PageObjectInputProvider):
+    """An interface that describes a generic AutoExtract Provider.
+
+    It should not be used publicly as it serves the purpose of being a base
+    class for more specific providers such as Article and Product providers.
+    """
+
+    provided_class: ClassVar[Type]  # needs item_key attr and to_item method
+
+    def __init__(
+        self,
+        request: Request,
+        settings: Settings,
+        stats: StatsCollector,
+    ):
+        """Initialize provider storing its dependencies as attributes."""
+        self.request = request
+        self.stats = stats
+        self.settings = settings
+
+    async def __call__(self):
+        """Make an AutoExtract request and build a Page Input of provided class
+        based on API response data.
+        """
+        page_type = self.get_page_type()
+        self.stats.inc_value(f"autoextract/{page_type}/total")
+
+        request = AutoExtractRequest(
+            url=self.request.url,
+            pageType=page_type,
+        )
+        api_key = self.settings.get("AUTOEXTRACT_USER")
+        endpoint = self.settings.get("AUTOEXTRACT_URL")
+        max_query_error_retries = self.settings.getint(
+            "AUTOEXTRACT_MAX_QUERY_ERROR_RETRIES", 3
+        )
+
+        try:
+            response = await request_raw(
+                [request],
+                api_key=api_key,
+                endpoint=endpoint,
+                max_query_error_retries=max_query_error_retries
+            )
+        except Exception:
+            self.stats.inc_value(f"autoextract/{page_type}/error/request")
+            raise
+
+        data = response[0]
+
+        if "error" in data:
+            self.stats.inc_value(f"autoextract/{page_type}/error/query")
+            raise QueryError(data["query"], data["error"])
+
+        self.stats.inc_value(f"autoextract/{page_type}/success")
+        return self.provided_class(data=data)
+
+    @classmethod
+    def register(cls):
+        """Register this provider for its provided class on scrapy-poet
+        registry. This will make it possible to declare provided class as
+        a callback dependency when writing Scrapy spiders.
+        """
+        register(cls, cls.provided_class)
+
+    @classmethod
+    def get_page_type(cls) -> str:
+        """Page type is defined by the class attribute `item_key` available on
+        `autoextract_poet.page_inputs` classes.
+        """
+        return cls.provided_class.item_key
+
+
+class ArticleDataProvider(_Provider):
+
+    provided_class = AutoExtractArticleData
+
+
+class ProductDataProvider(_Provider):
+
+    provided_class = AutoExtractProductData
+
+
+def install():
+    """Register all providers for their respective provided classes."""
+    ArticleDataProvider.register()
+    ProductDataProvider.register()
diff --git a/setup.py b/setup.py
index 7e3e307..caa82f1 100644
--- a/setup.py
+++ b/setup.py
@@ -26,6 +26,11 @@ def get_version():
     long_description=open('README.rst').read(),
     url='https://github.com/scrapinghub/scrapy-autoextract',
     packages=find_packages(),
+    install_requires=[
+        'autoextract-poet>=0.0.1',
+        'scrapinghub-autoextract>=0.5.1',
+        'scrapy-poet>=0.0.3',
+    ],
     keywords='scrapy autoextract middleware',
     classifiers=[
         'Development Status :: 4 - Beta',
@@ -35,9 +40,9 @@ def get_version():
         'Operating System :: OS Independent',
         'License :: OSI Approved :: BSD License',
         'Programming Language :: Python :: 3 :: Only',
-        'Programming Language :: Python :: 3.5',
         'Programming Language :: Python :: 3.6',
         'Programming Language :: Python :: 3.7',
+        'Programming Language :: Python :: 3.8',
         'Framework :: Scrapy',
     ],
 )
diff --git a/test/test_providers.py b/test/test_providers.py
new file mode 100644
index 0000000..1f7dd50
--- /dev/null
+++ b/test/test_providers.py
@@ -0,0 +1,42 @@
+import pytest
+
+from scrapy_poet.page_input_providers import providers
+from scrapy_autoextract.providers import (
+    ArticleDataProvider,
+    ProductDataProvider,
+    QueryError,
+    install,
+)
+
+
+PROVIDERS = (
+    ArticleDataProvider,
+    ProductDataProvider,
+)
+
+
+def test_install():
+    # Given an uninitialized scrapy-poet repository,
+    # our AutoExtract providers should not be registered by default
+    for provider in PROVIDERS:
+        assert providers.get(provider.provided_class) is None
+
+    # After installing AutoExtract providers...
+    install()
+
+    # Our AutoExtract providers should be registered now
+    for provider in PROVIDERS:
+        assert providers.get(provider.provided_class) is provider
+
+
+@pytest.mark.parametrize("provider, page_type", (
+    (ArticleDataProvider, "article"),
+    (ProductDataProvider, "product"),
+))
+def test_get_page_type(provider, page_type):
+    assert provider.get_page_type() == page_type
+
+
+def test_query_error():
+    exc = QueryError({"foo": "bar"}, "sample error")
+    assert str(exc) == "QueryError: query={'foo': 'bar'}, message='sample error'"
diff --git a/tox.ini b/tox.ini
index 1ee0c16..7c0054a 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,5 +1,5 @@
 [tox]
-envlist = py35, py36, py37
+envlist = py36, py37, py38
 
 [testenv]
 deps =