From 515e8a5cacc6b01d07cf502e08adccec484a04a6 Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Thu, 13 Aug 2020 17:39:30 -0300 Subject: [PATCH 01/51] Create Page Object Input Providers --- scrapy_autoextract/providers.py | 61 +++++++++++++++++++++++++++++++++ setup.py | 7 ++++ 2 files changed, 68 insertions(+) create mode 100644 scrapy_autoextract/providers.py diff --git a/scrapy_autoextract/providers.py b/scrapy_autoextract/providers.py new file mode 100644 index 0000000..07db68e --- /dev/null +++ b/scrapy_autoextract/providers.py @@ -0,0 +1,61 @@ +from typing import ClassVar, Optional, Type + +from autoextract.aio import request_raw +from autoextract.request import Request as AutoExtractRequest +from autoextract_poet.page_inputs import ( + AutoExtractArticleData, + AutoExtractProductData, +) +from scrapy import Request +from scrapy.settings import Settings +from scrapy.statscollectors import StatsCollector +from scrapy_poet.page_input_providers import PageObjectInputProvider + + +class _Provider(PageObjectInputProvider): + + page_type: ClassVar[str] + provided_class: ClassVar[Optional[Type]] + + def __init__( + self, + request: Request, + settings: Settings, + stats: StatsCollector, + ): + self.request = request + self.stats = stats + self.settings = settings + + async def __call__(self): + self.stats.inc_value(f"autoextract/{self.page_type}/total") + + request = AutoExtractRequest( + url=self.request.url, + pageType=self.page_type, + ) + + try: + data = await request_raw( + [request.as_dict()], + api_key=self.settings.get('AUTOEXTRACT_USER'), + max_query_error_retries=3 + )[0] + except Exception: + self.stats.inc_value(f"autoextract/{self.page_type}/error") + raise + + self.stats.inc_value(f"autoextract/{self.page_type}/success") + return self.provided_class(data=data) + + +class ArticleResponseDataProvider(_Provider): + + page_type = "article" + provided_class = AutoExtractArticleData + + +class ProductResponseDataProvider(_Provider): + + page_type = "product" + provided_class = AutoExtractProductData diff --git a/setup.py b/setup.py index 7e3e307..78773df 100644 --- a/setup.py +++ b/setup.py @@ -26,6 +26,13 @@ def get_version(): long_description=open('README.rst').read(), url='https://github.com/scrapinghub/scrapy-autoextract', packages=find_packages(), + install_requires=[ + # FIXME: uncomment after release + # 'autoextract-poet', + 'scrapinghub-autoextract==0.3.0', + 'scrapy', + 'scrapy-poet==0.0.3', + ], keywords='scrapy autoextract middleware', classifiers=[ 'Development Status :: 4 - Beta', From 18940f23017dc8ceb957085bf747ad9e8e0dd01b Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Fri, 14 Aug 2020 17:09:49 -0300 Subject: [PATCH 02/51] Update scrapy-poet dependency on setup.py Instead of fixing a specific version, ask for a version greater than or equal our minimum, therefore skipping versions that are known to be incompatible. 
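As a usage sketch for the new providers module: scrapy-poet constructs a provider with the request, settings and stats, then awaits it to obtain the page input. The stand-in stats object and the placeholder API key below are assumptions for illustration only; in a real crawl these arguments come from the running crawler::

    from scrapy import Request
    from scrapy.settings import Settings

    from scrapy_autoextract.providers import ArticleResponseDataProvider


    class RecordingStats:
        """Minimal stand-in for Scrapy's StatsCollector; only inc_value is used here."""

        def __init__(self):
            self.stats = {}

        def inc_value(self, key, count=1):
            self.stats[key] = self.stats.get(key, 0) + count


    async def fetch_article(url):
        provider = ArticleResponseDataProvider(
            request=Request(url),
            settings=Settings({"AUTOEXTRACT_USER": "<your AutoExtract API key>"}),
            stats=RecordingStats(),
        )
        return await provider()  # -> AutoExtractArticleData

    # e.g. asyncio.run(fetch_article("https://example.com/article"))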
Co-authored-by: Mikhail Korobov --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 78773df..fbaaf77 100644 --- a/setup.py +++ b/setup.py @@ -31,7 +31,7 @@ def get_version(): # 'autoextract-poet', 'scrapinghub-autoextract==0.3.0', 'scrapy', - 'scrapy-poet==0.0.3', + 'scrapy-poet>=0.0.3', ], keywords='scrapy autoextract middleware', classifiers=[ From 8fea07cd7e370a3092e50f41ab1a5861a9554f05 Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Fri, 14 Aug 2020 17:15:57 -0300 Subject: [PATCH 03/51] Update scrapinghub-autoextract dependency on setup.py Instead of fixing a specific version, ask for a version greater than or equal our minimum, therefore skipping versions that are known to be incompatible. --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index fbaaf77..f27985d 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ def get_version(): install_requires=[ # FIXME: uncomment after release # 'autoextract-poet', - 'scrapinghub-autoextract==0.3.0', + 'scrapinghub-autoextract>=0.3.0', 'scrapy', 'scrapy-poet>=0.0.3', ], From 751d13e12c65eb119e0402b64882546dd4eb4f77 Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Fri, 14 Aug 2020 17:20:54 -0300 Subject: [PATCH 04/51] Remove Scrapy from setup.py dependencies According to @kmike, it's safer not to have Scrapy listed as a project dependency because it could trigger an undesired update of the framework when this library is used combined with a Scrapy project. https://github.com/scrapinghub/scrapy-autoextract/pull/13#discussion_r470268170 --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index f27985d..933ec49 100644 --- a/setup.py +++ b/setup.py @@ -30,7 +30,6 @@ def get_version(): # FIXME: uncomment after release # 'autoextract-poet', 'scrapinghub-autoextract>=0.3.0', - 'scrapy', 'scrapy-poet>=0.0.3', ], keywords='scrapy autoextract middleware', From 3505fc01d093738e8a38ff0302f301d04bf3398b Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Fri, 14 Aug 2020 17:27:10 -0300 Subject: [PATCH 05/51] Use current scrapy-poet's providers API Keep in mind that we need to import scrapy-autoextract's providers module in order to effectively register its providers. 
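In practice that import would typically live in the project's ``settings.py``; a sketch (the middleware priority value is illustrative and mirrors scrapy-poet's documented setup)::

    # settings.py (sketch)
    import scrapy_autoextract.providers  # noqa: F401  -- importing runs the @provides registration

    DOWNLOADER_MIDDLEWARES = {
        "scrapy_poet.InjectionMiddleware": 543,
    }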
--- scrapy_autoextract/providers.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/scrapy_autoextract/providers.py b/scrapy_autoextract/providers.py index 07db68e..bc12b11 100644 --- a/scrapy_autoextract/providers.py +++ b/scrapy_autoextract/providers.py @@ -9,7 +9,10 @@ from scrapy import Request from scrapy.settings import Settings from scrapy.statscollectors import StatsCollector -from scrapy_poet.page_input_providers import PageObjectInputProvider +from scrapy_poet.page_input_providers import ( + PageObjectInputProvider, + provides, +) class _Provider(PageObjectInputProvider): @@ -49,13 +52,13 @@ async def __call__(self): return self.provided_class(data=data) +@provides(AutoExtractArticleData) class ArticleResponseDataProvider(_Provider): page_type = "article" - provided_class = AutoExtractArticleData +@provides(AutoExtractProductData) class ProductResponseDataProvider(_Provider): page_type = "product" - provided_class = AutoExtractProductData From 2d5dfba47f214c9a5cdcea79d7d184a539433216 Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Mon, 17 Aug 2020 09:41:16 -0300 Subject: [PATCH 06/51] Remove "Response" from class name As we had already discussed, this Page Input is more related to a data object than to a regular response. --- scrapy_autoextract/providers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scrapy_autoextract/providers.py b/scrapy_autoextract/providers.py index bc12b11..6a87a61 100644 --- a/scrapy_autoextract/providers.py +++ b/scrapy_autoextract/providers.py @@ -53,12 +53,12 @@ async def __call__(self): @provides(AutoExtractArticleData) -class ArticleResponseDataProvider(_Provider): +class ArticleDataProvider(_Provider): page_type = "article" @provides(AutoExtractProductData) -class ProductResponseDataProvider(_Provider): +class ProductDataProvider(_Provider): page_type = "product" From 7356d56fd3d4f6a0306b1c3c7240b581b12509d9 Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Mon, 17 Aug 2020 09:45:09 -0300 Subject: [PATCH 07/51] Replace "provides" decorator with an install method that makes use of "register" --- scrapy_autoextract/providers.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/scrapy_autoextract/providers.py b/scrapy_autoextract/providers.py index 6a87a61..067ce4c 100644 --- a/scrapy_autoextract/providers.py +++ b/scrapy_autoextract/providers.py @@ -11,7 +11,7 @@ from scrapy.statscollectors import StatsCollector from scrapy_poet.page_input_providers import ( PageObjectInputProvider, - provides, + register, ) @@ -52,13 +52,16 @@ async def __call__(self): return self.provided_class(data=data) -@provides(AutoExtractArticleData) class ArticleDataProvider(_Provider): page_type = "article" -@provides(AutoExtractProductData) class ProductDataProvider(_Provider): page_type = "product" + + +def install(): + register(ArticleDataProvider, AutoExtractArticleData) + register(ProductDataProvider, AutoExtractProductData) From adbbd4d9cd6dbfd3b9db4e64d8700fccf19f16ba Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Mon, 17 Aug 2020 10:06:26 -0300 Subject: [PATCH 08/51] Avoid duplication of provided class name I've created an auxiliary class method to register providers. This way, we can avoid duplicating the provided class name by keeping it on a single place. Whenever it needs to be changed, it could be done in a single place. 
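Once registered, a provided class can be declared as a callback dependency and scrapy-poet resolves it through the matching provider. A sketch with the product page input (spider name and URL are placeholders)::

    import scrapy
    from autoextract_poet.page_inputs import AutoExtractProductData


    class ProductSpider(scrapy.Spider):
        name = "product-example"                      # placeholder
        start_urls = ["https://example.com/product"]  # placeholder

        def parse(self, response, product: AutoExtractProductData):
            # `product` is built by ProductDataProvider from the AutoExtract response
            yield product.to_item()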
--- scrapy_autoextract/providers.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/scrapy_autoextract/providers.py b/scrapy_autoextract/providers.py index 067ce4c..a947c5c 100644 --- a/scrapy_autoextract/providers.py +++ b/scrapy_autoextract/providers.py @@ -51,17 +51,23 @@ async def __call__(self): self.stats.inc_value(f"autoextract/{self.page_type}/success") return self.provided_class(data=data) + @classmethod + def register(cls): + register(cls, cls.provided_class) + class ArticleDataProvider(_Provider): page_type = "article" + provided_class = AutoExtractArticleData class ProductDataProvider(_Provider): page_type = "product" + provided_class = AutoExtractProductData def install(): - register(ArticleDataProvider, AutoExtractArticleData) - register(ProductDataProvider, AutoExtractProductData) + ArticleDataProvider.register() + ProductDataProvider.register() From a3e844bfdf439c286ff5355dce19184c9b51c8ca Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Mon, 17 Aug 2020 10:35:55 -0300 Subject: [PATCH 09/51] Add docstrings --- scrapy_autoextract/providers.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/scrapy_autoextract/providers.py b/scrapy_autoextract/providers.py index a947c5c..46c4f33 100644 --- a/scrapy_autoextract/providers.py +++ b/scrapy_autoextract/providers.py @@ -16,6 +16,11 @@ class _Provider(PageObjectInputProvider): + """An interface that describes a generic AutoExtract Provider. + + It should not be used publicly as it serves the purpose of being a base + class for more specific providers such as Article and Product providers. + """ page_type: ClassVar[str] provided_class: ClassVar[Optional[Type]] @@ -26,11 +31,15 @@ def __init__( settings: Settings, stats: StatsCollector, ): + """Initialize provider storing its dependencies as attributes.""" self.request = request self.stats = stats self.settings = settings async def __call__(self): + """Make an AutoExtract request and build a Page Input of provided class + based on API response data. + """ self.stats.inc_value(f"autoextract/{self.page_type}/total") request = AutoExtractRequest( @@ -53,6 +62,10 @@ async def __call__(self): @classmethod def register(cls): + """Register this provider for its provided class on scrapy-poet + registry. This will make it possible to declare provided class as + a callback dependency when writing Scrapy spiders. 
+ """ register(cls, cls.provided_class) @@ -69,5 +82,6 @@ class ProductDataProvider(_Provider): def install(): + """Register all providers for their respective provided classes.""" ArticleDataProvider.register() ProductDataProvider.register() From c6783dbc0f64b24275116b3ad47809c92ff9e8c3 Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Mon, 17 Aug 2020 10:36:03 -0300 Subject: [PATCH 10/51] Add tests --- test/test_providers.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 test/test_providers.py diff --git a/test/test_providers.py b/test/test_providers.py new file mode 100644 index 0000000..a2c8419 --- /dev/null +++ b/test/test_providers.py @@ -0,0 +1,26 @@ +from scrapy_poet.page_input_providers import providers +from scrapy_autoextract.providers import ( + ArticleDataProvider, + ProductDataProvider, + install, +) + + +PROVIDERS = ( + ArticleDataProvider, + ProductDataProvider, +) + + +def test_install(): + # Given an uninitialized scrapy-poet repository, + # our AutoExtract should not be registered by default + for provider in PROVIDERS: + assert providers.get(provider.provided_class) is None + + # After installing AutoExtract providers... + install() + + # Our AutoExtract providers should be registered now + for provider in PROVIDERS: + assert providers.get(provider.provided_class) is provider From 25c4d0d6b67f449d092840b3ef10821e7062b65f Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Mon, 17 Aug 2020 10:43:58 -0300 Subject: [PATCH 11/51] Temporarily fix setup.py using git while autoextract-poet is not available on PyPI --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 933ec49..dfbb25a 100644 --- a/setup.py +++ b/setup.py @@ -27,8 +27,8 @@ def get_version(): url='https://github.com/scrapinghub/scrapy-autoextract', packages=find_packages(), install_requires=[ - # FIXME: uncomment after release - # 'autoextract-poet', + # FIXME: change from git to pypi after first release + 'autoextract-poet @ git+ssh://git@github.com/scrapinghub/autoextract-poet.git#egg=autoextract_poet', 'scrapinghub-autoextract>=0.3.0', 'scrapy-poet>=0.0.3', ], From 30af1cb626c3d569565cac87514d0ac9f5b850d4 Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Mon, 17 Aug 2020 11:16:20 -0300 Subject: [PATCH 12/51] Temporarily fix setup.py using git while autoextract-poet is not available on PyPI --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index dfbb25a..291e92a 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ def get_version(): packages=find_packages(), install_requires=[ # FIXME: change from git to pypi after first release - 'autoextract-poet @ git+ssh://git@github.com/scrapinghub/autoextract-poet.git#egg=autoextract_poet', + 'autoextract-poet @ git+ssh://git@github.com/scrapinghub/autoextract-poet.git@page-inputs#egg=autoextract_poet', 'scrapinghub-autoextract>=0.3.0', 'scrapy-poet>=0.0.3', ], From f612c2f9e5589beaff2d04975145a1d1e2a05acd Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Mon, 17 Aug 2020 11:17:54 -0300 Subject: [PATCH 13/51] Change protocol from git to https since it's a public repository --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 291e92a..42ae7c3 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ def get_version(): packages=find_packages(), install_requires=[ # FIXME: change from git to pypi after first 
release - 'autoextract-poet @ git+ssh://git@github.com/scrapinghub/autoextract-poet.git@page-inputs#egg=autoextract_poet', + 'autoextract-poet @ git+https://github.com/scrapinghub/autoextract-poet.git@page-inputs#egg=autoextract_poet', 'scrapinghub-autoextract>=0.3.0', 'scrapy-poet>=0.0.3', ], From dc5e194d5d302a3eb53fec470f1f4997516114a3 Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Tue, 18 Aug 2020 11:06:21 -0300 Subject: [PATCH 14/51] Fix autoextract-poet dependency after its release on PyPI --- setup.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 42ae7c3..84f930b 100644 --- a/setup.py +++ b/setup.py @@ -27,8 +27,7 @@ def get_version(): url='https://github.com/scrapinghub/scrapy-autoextract', packages=find_packages(), install_requires=[ - # FIXME: change from git to pypi after first release - 'autoextract-poet @ git+https://github.com/scrapinghub/autoextract-poet.git@page-inputs#egg=autoextract_poet', + 'autoextract-poet>=0.0.1', 'scrapinghub-autoextract>=0.3.0', 'scrapy-poet>=0.0.3', ], From 2365429b9932bc21c5339f4c6dc3445aeea029f6 Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Tue, 18 Aug 2020 11:17:56 -0300 Subject: [PATCH 15/51] Remove page_type class attribute Page type is defined by the class attribute `item_key` available on `autoextract_poet.page_inputs` classes. --- scrapy_autoextract/providers.py | 19 ++++++++++++------- test/test_providers.py | 10 ++++++++++ 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/scrapy_autoextract/providers.py b/scrapy_autoextract/providers.py index 46c4f33..3e9f4ba 100644 --- a/scrapy_autoextract/providers.py +++ b/scrapy_autoextract/providers.py @@ -22,7 +22,6 @@ class _Provider(PageObjectInputProvider): class for more specific providers such as Article and Product providers. """ - page_type: ClassVar[str] provided_class: ClassVar[Optional[Type]] def __init__( @@ -40,11 +39,12 @@ async def __call__(self): """Make an AutoExtract request and build a Page Input of provided class based on API response data. """ - self.stats.inc_value(f"autoextract/{self.page_type}/total") + page_type = self.get_page_type() + self.stats.inc_value(f"autoextract/{page_type}/total") request = AutoExtractRequest( url=self.request.url, - pageType=self.page_type, + pageType=page_type, ) try: @@ -54,10 +54,10 @@ async def __call__(self): max_query_error_retries=3 )[0] except Exception: - self.stats.inc_value(f"autoextract/{self.page_type}/error") + self.stats.inc_value(f"autoextract/{page_type}/error") raise - self.stats.inc_value(f"autoextract/{self.page_type}/success") + self.stats.inc_value(f"autoextract/{page_type}/success") return self.provided_class(data=data) @classmethod @@ -68,16 +68,21 @@ def register(cls): """ register(cls, cls.provided_class) + @classmethod + def get_page_type(cls) -> str: + """Page type is defined by the class attribute `item_key` available on + `autoextract_poet.page_inputs` classes. 
+ """ + return cls.provided_class.item_key + class ArticleDataProvider(_Provider): - page_type = "article" provided_class = AutoExtractArticleData class ProductDataProvider(_Provider): - page_type = "product" provided_class = AutoExtractProductData diff --git a/test/test_providers.py b/test/test_providers.py index a2c8419..01b6c04 100644 --- a/test/test_providers.py +++ b/test/test_providers.py @@ -1,3 +1,5 @@ +import pytest + from scrapy_poet.page_input_providers import providers from scrapy_autoextract.providers import ( ArticleDataProvider, @@ -24,3 +26,11 @@ def test_install(): # Our AutoExtract providers should be registered now for provider in PROVIDERS: assert providers.get(provider.provided_class) is provider + + +@pytest.mark.parametrize("provider, page_type", ( + (ArticleDataProvider, "article"), + (ProductDataProvider, "product"), +)) +def test_get_page_type(provider, page_type): + assert provider.get_page_type() == page_type From 5de020c0da13d175278e4821bcd65ad308b53bb6 Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Tue, 18 Aug 2020 11:22:30 -0300 Subject: [PATCH 16/51] Drop Python 3.5 and add Python 3.8 support We're ditching Python 3.5 because one of our dependencies, scrapy-poet, is not compatible with this version anymore. We're also introducing Python 3.8 to our list of supported version and our pipelines. --- .travis.yml | 6 +++--- setup.py | 2 +- tox.ini | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.travis.yml b/.travis.yml index 17ddb3f..8e8e322 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,13 +2,13 @@ language: python matrix: include: - - python: 3.5 - env: TOXENV=py35 - python: 3.6 env: TOXENV=py36 - python: 3.7 env: TOXENV=py37 - - python: 3.7 + - python: 3.8 + env: TOXENV=py38 + - python: 3.8 env: TOXENV=flake8 # command to install dependencies diff --git a/setup.py b/setup.py index 84f930b..c2d68e7 100644 --- a/setup.py +++ b/setup.py @@ -40,9 +40,9 @@ def get_version(): 'Operating System :: OS Independent', 'License :: OSI Approved :: BSD License', 'Programming Language :: Python :: 3 :: Only', - 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.8', 'Framework :: Scrapy', ], ) diff --git a/tox.ini b/tox.ini index 1ee0c16..7c0054a 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py35, py36, py37 +envlist = py36, py37, py38 [testenv] deps = From eb99fb76b237f6288d3f2d7947f7c48a34d15912 Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Wed, 19 Aug 2020 09:47:07 -0300 Subject: [PATCH 17/51] Calling `.as_dict()` here is not needed, we can just pass the request object `autoextract.request.query_as_dict_list()` function is already calling `as_dict()` on this object. --- scrapy_autoextract/providers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapy_autoextract/providers.py b/scrapy_autoextract/providers.py index 3e9f4ba..c76d6a2 100644 --- a/scrapy_autoextract/providers.py +++ b/scrapy_autoextract/providers.py @@ -49,7 +49,7 @@ async def __call__(self): try: data = await request_raw( - [request.as_dict()], + [request], api_key=self.settings.get('AUTOEXTRACT_USER'), max_query_error_retries=3 )[0] From 38ce3fa0453c0db2851417bbb5d9947184d404e7 Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Wed, 19 Aug 2020 09:59:02 -0300 Subject: [PATCH 18/51] Create AUTOEXTRACT_MAX_QUERY_ERROR_RETRIES settings which defaults to 3 This way we can customize this parameter within Scrapy Settings. 
--- README.rst | 1 + scrapy_autoextract/providers.py | 8 ++++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index d31dc31..b2810ae 100644 --- a/README.rst +++ b/README.rst @@ -66,6 +66,7 @@ Available settings: - ``AUTOEXTRACT_SLOT_POLICY`` [optional] Download concurrency options. Defaults to ``SlotPolicy.PER_DOMAIN`` - If set to ``SlotPolicy.PER_DOMAIN``, then consider setting ``SCHEDULER_PRIORITY_QUEUE = 'scrapy.pqueues.DownloaderAwarePriorityQueue'`` to make better usage of AutoExtract concurrency and avoid delays. +- ``AUTOEXTRACT_MAX_QUERY_ERROR_RETRIES`` [optional] Max number of retries for Query-level errors. Defaults to ``3``. Within the spider, consuming the AutoExtract result is as easy as:: diff --git a/scrapy_autoextract/providers.py b/scrapy_autoextract/providers.py index c76d6a2..1e796fd 100644 --- a/scrapy_autoextract/providers.py +++ b/scrapy_autoextract/providers.py @@ -46,12 +46,16 @@ async def __call__(self): url=self.request.url, pageType=page_type, ) + api_key = self.settings.get('AUTOEXTRACT_USER') + max_query_error_retries = self.settings.getint( + 'AUTOEXTRACT_MAX_QUERY_ERROR_RETRIES', 3 + ) try: data = await request_raw( [request], - api_key=self.settings.get('AUTOEXTRACT_USER'), - max_query_error_retries=3 + api_key=api_key, + max_query_error_retries=max_query_error_retries )[0] except Exception: self.stats.inc_value(f"autoextract/{page_type}/error") From 76a0e9e7b57c07e90c267fed98eecfde35a7924d Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Wed, 19 Aug 2020 10:38:46 -0300 Subject: [PATCH 19/51] Improve documentation on readme file - better sectioning between content related to middleware and providers - describe what are those providers (web-poet, scrapy-poet...) - mention autoextract-poet - briefly explain how to configure your scrapy project to use providers --- README.rst | 76 +++++++++++++++++++++++++++------ scrapy_autoextract/providers.py | 1 + 2 files changed, 64 insertions(+), 13 deletions(-) diff --git a/README.rst b/README.rst index b2810ae..2c14852 100644 --- a/README.rst +++ b/README.rst @@ -31,8 +31,29 @@ Installation scrapy-autoextract requires Python 3.5+ +Usage +===== + +There are two different ways to consume the AutoExtract API with this library: + +* using our Scrapy middleware +* using our Page Object providers + +The middleware +-------------- + +The middleware is opt-in and can be explicitly enabled per request, +with the ``{'autoextract': {'enabled': True}}`` request meta. +All the options below can be set either in the project settings file, +or just for specific spiders, in the ``custom_settings`` dict. + +Within the spider, consuming the AutoExtract result is as easy as:: + + def parse(self, response): + yield response.meta['autoextract'] + Configuration -============= +^^^^^^^^^^^^^ Add the AutoExtract downloader middleware in the settings file:: @@ -42,19 +63,46 @@ Add the AutoExtract downloader middleware in the settings file:: Note that this should be the last downloader middleware to be executed. +The providers +------------- -Usage -===== +Another way of consuming AutoExtract API is using the Page Objects pattern +proposed by the _`web-poet` library and implemented by _`scrapy-poet`. -The middleware is opt-in and can be explicitly enabled per request, -with the ``{'autoextract': {'enabled': True}}`` request meta. -All the options below can be set either in the project settings file, -or just for specific spiders, in the ``custom_settings`` dict. 
+Page Objects their returned Items are defined by the _`autoextract-poet` +library. + +Within the spider, consuming the AutoExtract result is as easy as:: + + from autoextract_poet.page_inputs import AutoExtractArticleData + + def parse(self, response, article: AutoExtractArticleData): + yield article.to_item() + +Configuration +^^^^^^^^^^^^^ + +Make sure you have _`scrapy-poet` properly configured. + +Install the AutoExtract providers in the settings file:: + + import scrapy_autoextract.providers + scrapy_autoextract.providers.install() + +Now you should be ready to use our AutoExtract providers. -Available settings: +Settings +======== + +Common settings +--------------- - ``AUTOEXTRACT_USER`` [mandatory] is your AutoExtract API key - ``AUTOEXTRACT_URL`` [optional] the AutoExtract service url. Defaults to autoextract.scrapinghub.com. + +Middleware settings +------------------- + - ``AUTOEXTRACT_TIMEOUT`` [optional] sets the response timeout from AutoExtract. Defaults to 660 seconds. Can also be defined by setting the "download_timeout" in the request.meta. - ``AUTOEXTRACT_PAGE_TYPE`` [mandatory] defines the kind of document to be extracted. @@ -66,13 +114,11 @@ Available settings: - ``AUTOEXTRACT_SLOT_POLICY`` [optional] Download concurrency options. Defaults to ``SlotPolicy.PER_DOMAIN`` - If set to ``SlotPolicy.PER_DOMAIN``, then consider setting ``SCHEDULER_PRIORITY_QUEUE = 'scrapy.pqueues.DownloaderAwarePriorityQueue'`` to make better usage of AutoExtract concurrency and avoid delays. -- ``AUTOEXTRACT_MAX_QUERY_ERROR_RETRIES`` [optional] Max number of retries for Query-level errors. Defaults to ``3``. - -Within the spider, consuming the AutoExtract result is as easy as:: - def parse(self, response): - yield response.meta['autoextract'] +Provider settings +----------------- +- ``AUTOEXTRACT_MAX_QUERY_ERROR_RETRIES`` [optional] Max number of retries for Query-level errors. Defaults to ``3``. Limitations =========== @@ -95,3 +141,7 @@ When using the AutoExtract middleware, there are some limitations. There is an exception, if there are too many requests sent in a short amount of time and AutoExtract returns HTTP code 429. For that case it's best to use ``RETRY_HTTP_CODES=[429]``. + +.. _`web-poet`: https://github.com/scrapinghub/web-poet +.. _`scrapy-poet`: https://github.com/scrapinghub/scrapy-poet +.. 
_`autoextract-poet`: https://github.com/scrapinghub/autoextract-poet diff --git a/scrapy_autoextract/providers.py b/scrapy_autoextract/providers.py index 1e796fd..7dbf9b7 100644 --- a/scrapy_autoextract/providers.py +++ b/scrapy_autoextract/providers.py @@ -52,6 +52,7 @@ async def __call__(self): ) try: + # TODO: customize endpoint through settings data = await request_raw( [request], api_key=api_key, From 0e11b1b086c1a58187541982da2365c534cc7c9f Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Wed, 19 Aug 2020 10:40:51 -0300 Subject: [PATCH 20/51] Get AutoExtract endpoint from Scrapy settings --- scrapy_autoextract/providers.py | 7 ++++--- setup.py | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/scrapy_autoextract/providers.py b/scrapy_autoextract/providers.py index 7dbf9b7..2f2e640 100644 --- a/scrapy_autoextract/providers.py +++ b/scrapy_autoextract/providers.py @@ -46,16 +46,17 @@ async def __call__(self): url=self.request.url, pageType=page_type, ) - api_key = self.settings.get('AUTOEXTRACT_USER') + api_key = self.settings.get("AUTOEXTRACT_USER") + endpoint = self.settings.get("AUTOEXTRACT_URL") max_query_error_retries = self.settings.getint( - 'AUTOEXTRACT_MAX_QUERY_ERROR_RETRIES', 3 + "AUTOEXTRACT_MAX_QUERY_ERROR_RETRIES", 3 ) try: - # TODO: customize endpoint through settings data = await request_raw( [request], api_key=api_key, + endpoint=endpoint, max_query_error_retries=max_query_error_retries )[0] except Exception: diff --git a/setup.py b/setup.py index c2d68e7..5a9b7ae 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ def get_version(): packages=find_packages(), install_requires=[ 'autoextract-poet>=0.0.1', - 'scrapinghub-autoextract>=0.3.0', + 'scrapinghub-autoextract>=0.4.0', 'scrapy-poet>=0.0.3', ], keywords='scrapy autoextract middleware', From b39e6c77f60d36e6b1c79b58b4bce9f1b839f356 Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Wed, 19 Aug 2020 10:56:47 -0300 Subject: [PATCH 21/51] Fix rst links --- README.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 2c14852..786dd9f 100644 --- a/README.rst +++ b/README.rst @@ -67,9 +67,9 @@ The providers ------------- Another way of consuming AutoExtract API is using the Page Objects pattern -proposed by the _`web-poet` library and implemented by _`scrapy-poet`. +proposed by the `web-poet`_ library and implemented by `scrapy-poet`_. -Page Objects their returned Items are defined by the _`autoextract-poet` +Page Objects their returned Items are defined by the `autoextract-poet`_ library. Within the spider, consuming the AutoExtract result is as easy as:: @@ -82,7 +82,7 @@ Within the spider, consuming the AutoExtract result is as easy as:: Configuration ^^^^^^^^^^^^^ -Make sure you have _`scrapy-poet` properly configured. +Make sure you have `scrapy-poet`_ properly configured. Install the AutoExtract providers in the settings file:: From 3676d89dc61aadd26b49d624f2df212a1c32b5ce Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Thu, 20 Aug 2020 18:53:54 -0300 Subject: [PATCH 22/51] Improve documentation about providers configuration As proposed by @kmike, we're better separating the configuration of scrapy-poet from the configuration of our providers. We're also introducing a better link that takes the user directly to a section on scrapy-poet's documentation that explicitly talks about its configuration. 
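Putting the settings-related changes together, the API call made by a provider boils down to roughly the following (a condensed sketch, not the literal ``__call__`` body)::

    from autoextract.aio import request_raw
    from autoextract.request import Request as AutoExtractRequest


    async def autoextract_query(url, page_type, settings):
        # `settings` is a scrapy.settings.Settings instance
        request = AutoExtractRequest(url=url, pageType=page_type)
        results = await request_raw(
            [request],
            api_key=settings.get("AUTOEXTRACT_USER"),
            endpoint=settings.get("AUTOEXTRACT_URL"),
            max_query_error_retries=settings.getint(
                "AUTOEXTRACT_MAX_QUERY_ERROR_RETRIES", 3),
        )
        return results[0]  # one result dict per query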
--- README.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 786dd9f..9904595 100644 --- a/README.rst +++ b/README.rst @@ -82,9 +82,8 @@ Within the spider, consuming the AutoExtract result is as easy as:: Configuration ^^^^^^^^^^^^^ -Make sure you have `scrapy-poet`_ properly configured. - -Install the AutoExtract providers in the settings file:: +First, you need to configure scrapy-poet as described on `scrapy-poet's documentation`_. +Then, enable AutoExtract providers by putting the following code to Scrapy's ``settings.py`` file:: import scrapy_autoextract.providers scrapy_autoextract.providers.install() @@ -145,3 +144,4 @@ When using the AutoExtract middleware, there are some limitations. .. _`web-poet`: https://github.com/scrapinghub/web-poet .. _`scrapy-poet`: https://github.com/scrapinghub/scrapy-poet .. _`autoextract-poet`: https://github.com/scrapinghub/autoextract-poet +.. _`scrapy-poet's documentation` https://scrapy-poet.readthedocs.io/en/latest/intro/tutorial.html#configuring-the-project From a7059892fc2724ea5c863cfe2a3477d53a5ca7c7 Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Thu, 20 Aug 2020 19:11:14 -0300 Subject: [PATCH 23/51] Improve documentation about middleware and providers limitations Adding some useful comments about both strategies limitation proposed by @kmike. --- README.rst | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/README.rst b/README.rst index 9904595..07a8cf6 100644 --- a/README.rst +++ b/README.rst @@ -140,8 +140,22 @@ When using the AutoExtract middleware, there are some limitations. There is an exception, if there are too many requests sent in a short amount of time and AutoExtract returns HTTP code 429. For that case it's best to use ``RETRY_HTTP_CODES=[429]``. +* 429 errors are handled as standard retries when using Scrapy middleware, + but they're handled properly and automatically with scrapy-poet integration, + as it relies on `scrapinghub-autoextract`_. + You may loose some responses with the middleware. + With scrapy-poet, there is no need to change ``RETRY_HTTP_CODES``. +* Overall, retries have a better behavior with scrapy-poet integration + and it includes support for automatic Query-level errors retries + +When using the AutoExtract providers, be aware that: + +* With scrapy-poet integration, retry requests don't go through Scrapy +* Not all data types are supported with scrapy-poet, + currently only Articles and Products are supported .. _`web-poet`: https://github.com/scrapinghub/web-poet .. _`scrapy-poet`: https://github.com/scrapinghub/scrapy-poet .. _`autoextract-poet`: https://github.com/scrapinghub/autoextract-poet +.. _`scrapinghub-autoextract`: https://github.com/scrapinghub/scrapinghub-autoextract .. 
_`scrapy-poet's documentation` https://scrapy-poet.readthedocs.io/en/latest/intro/tutorial.html#configuring-the-project From 48f81d8fab027aab751b8c9a9e5ca9590efb3fea Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Fri, 21 Aug 2020 09:18:50 -0300 Subject: [PATCH 24/51] Update minimum Python version on readme file since Python 3.5 has been dropped --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 07a8cf6..251f715 100644 --- a/README.rst +++ b/README.rst @@ -28,7 +28,7 @@ Installation pip install scrapy-autoextract -scrapy-autoextract requires Python 3.5+ +scrapy-autoextract requires Python 3.6+ Usage From 3045184f8e8ec313c709d0b28326405211e8adc5 Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Fri, 21 Aug 2020 09:24:03 -0300 Subject: [PATCH 25/51] Add documentation about duplicate requests and how DummyResponse can help --- README.rst | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/README.rst b/README.rst index 251f715..81a6fcd 100644 --- a/README.rst +++ b/README.rst @@ -79,6 +79,23 @@ Within the spider, consuming the AutoExtract result is as easy as:: def parse(self, response, article: AutoExtractArticleData): yield article.to_item() +Note that on the example above, we're going to perform two requests: + +* one goes through Scrapy (it might use Crawlera, Splash or no proxy at all, depending on your configuration) +* another goes through AutoExtract API using `scrapinghub-autoextract`_ + +If you don't need the additional request going through Scrapy, +you can annotate the response argument of your callback with the DummyResponse type. +This will ignore the Scrapy request and only the AutoExtract API will be fetched. + +For example:: + + from autoextract_poet.page_inputs import AutoExtractArticleData + from scrapy_poet.utils import DummyResponse + + def parse(self, response: DummyResponse, article: AutoExtractArticleData): + yield article.to_item() + Configuration ^^^^^^^^^^^^^ From 7d4fb93e12e7ee8f9c20f78b0834ce676a16bf4a Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Fri, 21 Aug 2020 22:05:26 -0300 Subject: [PATCH 26/51] Fix TypeError: 'coroutine' object is not subscriptable --- scrapy_autoextract/providers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scrapy_autoextract/providers.py b/scrapy_autoextract/providers.py index 2f2e640..838942f 100644 --- a/scrapy_autoextract/providers.py +++ b/scrapy_autoextract/providers.py @@ -58,13 +58,13 @@ async def __call__(self): api_key=api_key, endpoint=endpoint, max_query_error_retries=max_query_error_retries - )[0] + ) except Exception: self.stats.inc_value(f"autoextract/{page_type}/error") raise self.stats.inc_value(f"autoextract/{page_type}/success") - return self.provided_class(data=data) + return self.provided_class(data=data[0]) @classmethod def register(cls): From 63cddd4a2cf89dd3ec0dbee60d6396ab96316038 Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Fri, 21 Aug 2020 23:32:00 -0300 Subject: [PATCH 27/51] Update scrapinghub-autoextract dependency This fixes a bug that was preventing calls to request_raw with endpoint defined as None --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 5a9b7ae..caa82f1 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ def get_version(): packages=find_packages(), install_requires=[ 'autoextract-poet>=0.0.1', - 'scrapinghub-autoextract>=0.4.0', + 'scrapinghub-autoextract>=0.5.1', 'scrapy-poet>=0.0.3', ], keywords='scrapy autoextract 
middleware', From 96bf8682068f8d66495d1125ce9fa70cbe8fdf8b Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Fri, 21 Aug 2020 23:44:29 -0300 Subject: [PATCH 28/51] Include documentatio about Scrapy's asyncio support and the need to configure Twisted's default reactor --- README.rst | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/README.rst b/README.rst index 81a6fcd..1983ddb 100644 --- a/README.rst +++ b/README.rst @@ -105,6 +105,18 @@ Then, enable AutoExtract providers by putting the following code to Scrapy's ``s import scrapy_autoextract.providers scrapy_autoextract.providers.install() +Currently, our providers are implemented using asyncio. +Scrapy has introduced asyncio support since version 2.0 +but you need to manually enable it by configuring Twisted's default reactor. +Check `Scrapy's asyncio documentation`_ for more information. + +Checklist: + +* scrapy-poet is installed and downloader/injector middleware is configured +* autoextract-poet is installed (page inputs are imported from this lib) +* providers are installed on settings.py +* Scrapy's asyncio support is enabled on settings.py + Now you should be ready to use our AutoExtract providers. Settings @@ -176,3 +188,4 @@ When using the AutoExtract providers, be aware that: .. _`autoextract-poet`: https://github.com/scrapinghub/autoextract-poet .. _`scrapinghub-autoextract`: https://github.com/scrapinghub/scrapinghub-autoextract .. _`scrapy-poet's documentation` https://scrapy-poet.readthedocs.io/en/latest/intro/tutorial.html#configuring-the-project +.. _`Scrapy's asyncio documentation` https://docs.scrapy.org/en/latest/topics/asyncio.html From 44836b0aa9c121c0ecb74e7a9b2245ed8ba1c0f0 Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Mon, 24 Aug 2020 10:06:20 -0300 Subject: [PATCH 29/51] provided_class class attribute should not be optional since we depend on item_key attribute --- scrapy_autoextract/providers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapy_autoextract/providers.py b/scrapy_autoextract/providers.py index 838942f..ef7dcb7 100644 --- a/scrapy_autoextract/providers.py +++ b/scrapy_autoextract/providers.py @@ -22,7 +22,7 @@ class _Provider(PageObjectInputProvider): class for more specific providers such as Article and Product providers. 
""" - provided_class: ClassVar[Optional[Type]] + provided_class: ClassVar[Type] def __init__( self, From cda77ac50230a5f95df87998edf5a4858752d2eb Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Mon, 24 Aug 2020 10:14:19 -0300 Subject: [PATCH 30/51] Remove unused import --- scrapy_autoextract/providers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapy_autoextract/providers.py b/scrapy_autoextract/providers.py index ef7dcb7..dab389a 100644 --- a/scrapy_autoextract/providers.py +++ b/scrapy_autoextract/providers.py @@ -1,4 +1,4 @@ -from typing import ClassVar, Optional, Type +from typing import ClassVar, Type from autoextract.aio import request_raw from autoextract.request import Request as AutoExtractRequest From 69cd2a740ef30802ddb4cdc6f42790cb7edc7d57 Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Mon, 24 Aug 2020 10:33:41 -0300 Subject: [PATCH 31/51] Raise QueryLevelError exceptions when response data contains an error --- scrapy_autoextract/providers.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/scrapy_autoextract/providers.py b/scrapy_autoextract/providers.py index dab389a..78bd53b 100644 --- a/scrapy_autoextract/providers.py +++ b/scrapy_autoextract/providers.py @@ -15,6 +15,11 @@ ) +class QueryLevelError(Exception): + + pass + + class _Provider(PageObjectInputProvider): """An interface that describes a generic AutoExtract Provider. @@ -53,18 +58,24 @@ async def __call__(self): ) try: - data = await request_raw( + response = await request_raw( [request], api_key=api_key, endpoint=endpoint, max_query_error_retries=max_query_error_retries ) except Exception: - self.stats.inc_value(f"autoextract/{page_type}/error") + self.stats.inc_value(f"autoextract/{page_type}/error/request") raise + data = response[0] + + if "error" in data: + self.stats.inc_value(f"autoextract/{page_type}/error/query") + raise QueryLevelError(data["error"]) + self.stats.inc_value(f"autoextract/{page_type}/success") - return self.provided_class(data=data[0]) + return self.provided_class(data=data) @classmethod def register(cls): From 7c983da603d550be3e65331b18e425a64787b8f4 Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Mon, 24 Aug 2020 10:43:35 -0300 Subject: [PATCH 32/51] Improve exception message --- scrapy_autoextract/providers.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scrapy_autoextract/providers.py b/scrapy_autoextract/providers.py index 78bd53b..b7e433c 100644 --- a/scrapy_autoextract/providers.py +++ b/scrapy_autoextract/providers.py @@ -72,7 +72,9 @@ async def __call__(self): if "error" in data: self.stats.inc_value(f"autoextract/{page_type}/error/query") - raise QueryLevelError(data["error"]) + raise QueryLevelError( + f"Error '{data['error']}' while processing {self.request}" + ) self.stats.inc_value(f"autoextract/{page_type}/success") return self.provided_class(data=data) From 686c31f77ff0975f47bf1068b972d16227bd3499 Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Mon, 24 Aug 2020 10:51:57 -0300 Subject: [PATCH 33/51] Add comment to provided_class attribute The goal is to make it more clear that we depend on an item_key attribute and to_item method. 
--- scrapy_autoextract/providers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapy_autoextract/providers.py b/scrapy_autoextract/providers.py index b7e433c..6f28203 100644 --- a/scrapy_autoextract/providers.py +++ b/scrapy_autoextract/providers.py @@ -27,7 +27,7 @@ class _Provider(PageObjectInputProvider): class for more specific providers such as Article and Product providers. """ - provided_class: ClassVar[Type] + provided_class: ClassVar[Type] # needs item_key attr and to_item method def __init__( self, From 22ef04368c34f001b17c0b8bed9676f68615b9ca Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Mon, 24 Aug 2020 12:09:46 -0300 Subject: [PATCH 34/51] Better documentation for AUTOEXTRACT_USER when using providers Mentioning that it fallbacks to SCRAPINGHUB_AUTOEXTRACT_KEY environment variable --- README.rst | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/README.rst b/README.rst index 1983ddb..46805e1 100644 --- a/README.rst +++ b/README.rst @@ -122,15 +122,11 @@ Now you should be ready to use our AutoExtract providers. Settings ======== -Common settings ---------------- - -- ``AUTOEXTRACT_USER`` [mandatory] is your AutoExtract API key -- ``AUTOEXTRACT_URL`` [optional] the AutoExtract service url. Defaults to autoextract.scrapinghub.com. - Middleware settings ------------------- +- ``AUTOEXTRACT_USER`` [mandatory] is your AutoExtract API key +- ``AUTOEXTRACT_URL`` [optional] the AutoExtract service url. Defaults to autoextract.scrapinghub.com. - ``AUTOEXTRACT_TIMEOUT`` [optional] sets the response timeout from AutoExtract. Defaults to 660 seconds. Can also be defined by setting the "download_timeout" in the request.meta. - ``AUTOEXTRACT_PAGE_TYPE`` [mandatory] defines the kind of document to be extracted. @@ -146,6 +142,8 @@ Middleware settings Provider settings ----------------- +- ``AUTOEXTRACT_USER`` [optional] is your AutoExtract API key. Defaults to ``SCRAPINGHUB_AUTOEXTRACT_KEY`` environment variable. +- ``AUTOEXTRACT_URL`` [optional] the AutoExtract service url. Defaults to autoextract.scrapinghub.com. - ``AUTOEXTRACT_MAX_QUERY_ERROR_RETRIES`` [optional] Max number of retries for Query-level errors. Defaults to ``3``. 
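For instance, a minimal ``settings.py`` for the provider-based setup could look like this (all values are placeholders; ``AUTOEXTRACT_USER`` may be omitted when the ``SCRAPINGHUB_AUTOEXTRACT_KEY`` environment variable is set)::

    # settings.py (sketch)
    AUTOEXTRACT_USER = "<your AutoExtract API key>"      # optional if SCRAPINGHUB_AUTOEXTRACT_KEY is set
    # AUTOEXTRACT_URL = "<custom AutoExtract endpoint>"  # optional; defaults to autoextract.scrapinghub.com
    AUTOEXTRACT_MAX_QUERY_ERROR_RETRIES = 5              # optional; defaults to 3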
Limitations From b1464b275fb6edf6e1f60411a62fb03f23dc1e34 Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Mon, 24 Aug 2020 12:19:56 -0300 Subject: [PATCH 35/51] Improve QueryError exception based on @ivanprado's suggestions --- scrapy_autoextract/providers.py | 13 ++++++++----- test/test_providers.py | 6 ++++++ 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/scrapy_autoextract/providers.py b/scrapy_autoextract/providers.py index 6f28203..e6ddb28 100644 --- a/scrapy_autoextract/providers.py +++ b/scrapy_autoextract/providers.py @@ -15,9 +15,14 @@ ) -class QueryLevelError(Exception): +class QueryError(Exception): - pass + def __init__(self, query: dict, message: str): + self.query = query + self.message = message + + def __str__(self): + return f"QueryError: query={self.query}, message='{self.message}'" class _Provider(PageObjectInputProvider): @@ -72,9 +77,7 @@ async def __call__(self): if "error" in data: self.stats.inc_value(f"autoextract/{page_type}/error/query") - raise QueryLevelError( - f"Error '{data['error']}' while processing {self.request}" - ) + raise QueryError(data["query"], data["error"]) self.stats.inc_value(f"autoextract/{page_type}/success") return self.provided_class(data=data) diff --git a/test/test_providers.py b/test/test_providers.py index 01b6c04..1f7dd50 100644 --- a/test/test_providers.py +++ b/test/test_providers.py @@ -4,6 +4,7 @@ from scrapy_autoextract.providers import ( ArticleDataProvider, ProductDataProvider, + QueryError, install, ) @@ -34,3 +35,8 @@ def test_install(): )) def test_get_page_type(provider, page_type): assert provider.get_page_type() == page_type + + +def test_query_error(): + exc = QueryError({"foo": "bar"}, "sample error") + assert str(exc) == "QueryError: query={'foo': 'bar'}, message='sample error'" From 2e950f1f9563672bac73318b7c460b54ec016612 Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Tue, 25 Aug 2020 10:01:39 -0300 Subject: [PATCH 36/51] Update README.rst Co-authored-by: Mikhail Korobov --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 46805e1..999640a 100644 --- a/README.rst +++ b/README.rst @@ -170,7 +170,7 @@ When using the AutoExtract middleware, there are some limitations. * 429 errors are handled as standard retries when using Scrapy middleware, but they're handled properly and automatically with scrapy-poet integration, as it relies on `scrapinghub-autoextract`_. - You may loose some responses with the middleware. + You may lose some responses with the middleware. With scrapy-poet, there is no need to change ``RETRY_HTTP_CODES``. * Overall, retries have a better behavior with scrapy-poet integration and it includes support for automatic Query-level errors retries From 54574f77a265d5b719149e6d0b27e3e5dd85d1f9 Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Tue, 25 Aug 2020 12:38:58 -0300 Subject: [PATCH 37/51] Improve documentation about middleware limitations --- README.rst | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/README.rst b/README.rst index 999640a..771a0de 100644 --- a/README.rst +++ b/README.rst @@ -162,18 +162,14 @@ When using the AutoExtract middleware, there are some limitations. so it's best to use ``AUTHTHROTTLE_ENABLED=False`` in the settings. 
* Redirects are handled by AutoExtract, not by Scrapy, so these kinds of middlewares might have no effect -* Retries should be disabled, because AutoExtract handles them internally - (use ``RETRY_ENABLED=False`` in the settings) - There is an exception, if there are too many requests sent in - a short amount of time and AutoExtract returns HTTP code 429. - For that case it's best to use ``RETRY_HTTP_CODES=[429]``. -* 429 errors are handled as standard retries when using Scrapy middleware, +* AutoExtract doesn't handle retries internally, they should be handled by its clients. +* 429 errors could be handled as standard retries when using Scrapy middleware, but they're handled properly and automatically with scrapy-poet integration, as it relies on `scrapinghub-autoextract`_. - You may lose some responses with the middleware. - With scrapy-poet, there is no need to change ``RETRY_HTTP_CODES``. + You may lose some responses with the middleware approach. * Overall, retries have a better behavior with scrapy-poet integration - and it includes support for automatic Query-level errors retries + and it includes support for automatic Query-level errors retries with + no need to change ``RETRY_HTTP_CODES``. When using the AutoExtract providers, be aware that: From 4de0834dc954e636f2c8c7f0abe077d9280c2138 Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Tue, 25 Aug 2020 22:48:15 -0300 Subject: [PATCH 38/51] Improve readme file's examples by surrounding parser by a real spider class definition --- README.rst | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/README.rst b/README.rst index 771a0de..f854785 100644 --- a/README.rst +++ b/README.rst @@ -74,10 +74,18 @@ library. Within the spider, consuming the AutoExtract result is as easy as:: + import scrapy from autoextract_poet.page_inputs import AutoExtractArticleData - def parse(self, response, article: AutoExtractArticleData): - yield article.to_item() + class SampleSpider(scrapy.Spider): + + name = "sample" + + def parse(self, response, article: AutoExtractArticleData): + # We're making two requests here: + # - one through Scrapy to build the response argument + # - another through providers to build the article argument + yield article.to_item() Note that on the example above, we're going to perform two requests: @@ -85,16 +93,21 @@ Note that on the example above, we're going to perform two requests: * another goes through AutoExtract API using `scrapinghub-autoextract`_ If you don't need the additional request going through Scrapy, -you can annotate the response argument of your callback with the DummyResponse type. +you can annotate the response argument of your callback with ``DummyResponse``. This will ignore the Scrapy request and only the AutoExtract API will be fetched. 
For example:: + import scrapy from autoextract_poet.page_inputs import AutoExtractArticleData from scrapy_poet.utils import DummyResponse - def parse(self, response: DummyResponse, article: AutoExtractArticleData): - yield article.to_item() + class SampleSpider(scrapy.Spider): + + name = "sample" + + def parse(self, response: DummyResponse, article: AutoExtractArticleData): + yield article.to_item() Configuration ^^^^^^^^^^^^^ From 5317c75478e84f5648a138596b9c3bf132614e8b Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Tue, 25 Aug 2020 23:08:10 -0300 Subject: [PATCH 39/51] Improve readme file by incrementing the settings.py example code snippet on providers configuration section --- README.rst | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index f854785..93a488f 100644 --- a/README.rst +++ b/README.rst @@ -112,15 +112,21 @@ For example:: Configuration ^^^^^^^^^^^^^ -First, you need to configure scrapy-poet as described on `scrapy-poet's documentation`_. -Then, enable AutoExtract providers by putting the following code to Scrapy's ``settings.py`` file:: +First, you need to configure scrapy-poet as described on `scrapy-poet's documentation`_ +and then enable AutoExtract providers by putting the following code to Scrapy's ``settings.py`` file:: import scrapy_autoextract.providers scrapy_autoextract.providers.install() + DOWNLOADER_MIDDLEWARES = { + 'scrapy_poet.InjectionMiddleware': 543, + } + + TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor' + Currently, our providers are implemented using asyncio. Scrapy has introduced asyncio support since version 2.0 -but you need to manually enable it by configuring Twisted's default reactor. +but as of Scrapy 2.3 you need to manually enable it by configuring Twisted's default reactor. Check `Scrapy's asyncio documentation`_ for more information. Checklist: From 5b26ca01d91b487cad4f75d9ebde9e354ac9a13e Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Tue, 25 Aug 2020 23:45:14 -0300 Subject: [PATCH 40/51] Improve readme file describing common exceptions and how to capture them using request errbacks (error callbacks). --- README.rst | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/README.rst b/README.rst index 93a488f..6e68dc4 100644 --- a/README.rst +++ b/README.rst @@ -107,6 +107,7 @@ For example:: name = "sample" def parse(self, response: DummyResponse, article: AutoExtractArticleData): + # We're making a single request here to build the article argument yield article.to_item() Configuration @@ -115,13 +116,16 @@ Configuration First, you need to configure scrapy-poet as described on `scrapy-poet's documentation`_ and then enable AutoExtract providers by putting the following code to Scrapy's ``settings.py`` file:: + # Install AutoExtract providers import scrapy_autoextract.providers scrapy_autoextract.providers.install() + # Enable scrapy-poet's provider injection middleware DOWNLOADER_MIDDLEWARES = { 'scrapy_poet.InjectionMiddleware': 543, } + # Configure Twisted's reactor for asyncio support on Scrapy TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor' Currently, our providers are implemented using asyncio. @@ -138,6 +142,52 @@ Checklist: Now you should be ready to use our AutoExtract providers. +Exceptions +^^^^^^^^^^ + +While trying to fetch AutoExtract API, providers might raise some exceptions. 
+Those exceptions will probably come from `scrapinghub-autoextract`_ +or Tenacity, the library used to implement retries. +For example: + +* ``autoextract.aio.errors.RequestError``: raised when a `Request-level error`_ is returned +* ``autoextract.aio.errors.QueryRetryError``: raised when it's not possible to retry a `Query-level error`_ +* ``tenacity.RetryError``: raised when it's not possible to retry an error + +Check `scrapinghub-autoextract's async errors`_ for exception definitions. + +You can capture those exceptions using an error callback (``errback``):: + + import scrapy + from autoextract.aio.errors import RequestError, QueryRetryError + from tenacity import RetryError + from twisted.python.failure import Failure + + class SampleSpider(scrapy.Spider): + + name = "sample" + urls = [...] + + def start_requests(self): + for url in self.urls: + yield scrapy.Request(url, callback=self.parse_article, errback=self.errback_article) + + def parse_article(self, response: DummyResponse, article: AutoExtractArticleData): + yield article.to_item() + + def errback_article(self, failure: Failure): + if failure.check(RequestError): + self.logger.error(f"RequestError on {failure.request.url}) + + if failure.check(QueryRetryError): + self.logger.error(f"QueryRetryError on {failure.request.url}) + + if failure.check(RetryError): + self.logger.error(f"RetryError on {failure.request.url}) + +See `Scrapy documentation `_ +for more details on how to capture exceptions using request's errback. + Settings ======== @@ -200,5 +250,8 @@ When using the AutoExtract providers, be aware that: .. _`scrapy-poet`: https://github.com/scrapinghub/scrapy-poet .. _`autoextract-poet`: https://github.com/scrapinghub/autoextract-poet .. _`scrapinghub-autoextract`: https://github.com/scrapinghub/scrapinghub-autoextract +.. _`scrapinghub-autoextract's async errors`: https://github.com/scrapinghub/scrapinghub-autoextract/blob/master/autoextract/aio/errors.py .. _`scrapy-poet's documentation` https://scrapy-poet.readthedocs.io/en/latest/intro/tutorial.html#configuring-the-project .. _`Scrapy's asyncio documentation` https://docs.scrapy.org/en/latest/topics/asyncio.html +.. _`Request-level error`: https://doc.scrapinghub.com/autoextract.html#request-level +.. _`Query-level error`: https://doc.scrapinghub.com/autoextract.html#query-level From fb6f0c3fb3ff18c905aa25a352abaaa4c60e10a6 Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Tue, 25 Aug 2020 23:55:15 -0300 Subject: [PATCH 41/51] Trying to escape single quotes --- README.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.rst b/README.rst index 6e68dc4..e2561ca 100644 --- a/README.rst +++ b/README.rst @@ -113,7 +113,7 @@ For example:: Configuration ^^^^^^^^^^^^^ -First, you need to configure scrapy-poet as described on `scrapy-poet's documentation`_ +First, you need to configure scrapy-poet as described on `scrapy-poet\'s documentation`_ and then enable AutoExtract providers by putting the following code to Scrapy's ``settings.py`` file:: # Install AutoExtract providers @@ -131,7 +131,7 @@ and then enable AutoExtract providers by putting the following code to Scrapy's Currently, our providers are implemented using asyncio. Scrapy has introduced asyncio support since version 2.0 but as of Scrapy 2.3 you need to manually enable it by configuring Twisted's default reactor. -Check `Scrapy's asyncio documentation`_ for more information. 
+Check `Scrapy\'s asyncio documentation`_ for more information. Checklist: @@ -251,7 +251,7 @@ When using the AutoExtract providers, be aware that: .. _`autoextract-poet`: https://github.com/scrapinghub/autoextract-poet .. _`scrapinghub-autoextract`: https://github.com/scrapinghub/scrapinghub-autoextract .. _`scrapinghub-autoextract's async errors`: https://github.com/scrapinghub/scrapinghub-autoextract/blob/master/autoextract/aio/errors.py -.. _`scrapy-poet's documentation` https://scrapy-poet.readthedocs.io/en/latest/intro/tutorial.html#configuring-the-project -.. _`Scrapy's asyncio documentation` https://docs.scrapy.org/en/latest/topics/asyncio.html +.. _`scrapy-poet\'s documentation` https://scrapy-poet.readthedocs.io/en/latest/intro/tutorial.html#configuring-the-project +.. _`Scrapy\'s asyncio documentation` https://docs.scrapy.org/en/latest/topics/asyncio.html .. _`Request-level error`: https://doc.scrapinghub.com/autoextract.html#request-level .. _`Query-level error`: https://doc.scrapinghub.com/autoextract.html#query-level From d3c54fa10491cd5ddfe30f35d0a56cc8c49098ae Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Tue, 25 Aug 2020 23:58:43 -0300 Subject: [PATCH 42/51] Trying to fix single quotes --- README.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.rst b/README.rst index e2561ca..6e68dc4 100644 --- a/README.rst +++ b/README.rst @@ -113,7 +113,7 @@ For example:: Configuration ^^^^^^^^^^^^^ -First, you need to configure scrapy-poet as described on `scrapy-poet\'s documentation`_ +First, you need to configure scrapy-poet as described on `scrapy-poet's documentation`_ and then enable AutoExtract providers by putting the following code to Scrapy's ``settings.py`` file:: # Install AutoExtract providers @@ -131,7 +131,7 @@ and then enable AutoExtract providers by putting the following code to Scrapy's Currently, our providers are implemented using asyncio. Scrapy has introduced asyncio support since version 2.0 but as of Scrapy 2.3 you need to manually enable it by configuring Twisted's default reactor. -Check `Scrapy\'s asyncio documentation`_ for more information. +Check `Scrapy's asyncio documentation`_ for more information. Checklist: @@ -251,7 +251,7 @@ When using the AutoExtract providers, be aware that: .. _`autoextract-poet`: https://github.com/scrapinghub/autoextract-poet .. _`scrapinghub-autoextract`: https://github.com/scrapinghub/scrapinghub-autoextract .. _`scrapinghub-autoextract's async errors`: https://github.com/scrapinghub/scrapinghub-autoextract/blob/master/autoextract/aio/errors.py -.. _`scrapy-poet\'s documentation` https://scrapy-poet.readthedocs.io/en/latest/intro/tutorial.html#configuring-the-project -.. _`Scrapy\'s asyncio documentation` https://docs.scrapy.org/en/latest/topics/asyncio.html +.. _`scrapy-poet's documentation` https://scrapy-poet.readthedocs.io/en/latest/intro/tutorial.html#configuring-the-project +.. _`Scrapy's asyncio documentation` https://docs.scrapy.org/en/latest/topics/asyncio.html .. _`Request-level error`: https://doc.scrapinghub.com/autoextract.html#request-level .. 
_`Query-level error`: https://doc.scrapinghub.com/autoextract.html#query-level

From 22078bd08c81f420dd69397bb8e8a4106bcb48d7 Mon Sep 17 00:00:00 2001
From: Victor Torres
Date: Wed, 26 Aug 2020 00:00:39 -0300
Subject: [PATCH 43/51] Fix typo with RST links/references

---
 README.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.rst b/README.rst
index 6e68dc4..b885bbf 100644
--- a/README.rst
+++ b/README.rst
@@ -251,7 +251,7 @@ When using the AutoExtract providers, be aware that:
 .. _`autoextract-poet`: https://github.com/scrapinghub/autoextract-poet
 .. _`scrapinghub-autoextract`: https://github.com/scrapinghub/scrapinghub-autoextract
 .. _`scrapinghub-autoextract's async errors`: https://github.com/scrapinghub/scrapinghub-autoextract/blob/master/autoextract/aio/errors.py
-.. _`scrapy-poet's documentation` https://scrapy-poet.readthedocs.io/en/latest/intro/tutorial.html#configuring-the-project
-.. _`Scrapy's asyncio documentation` https://docs.scrapy.org/en/latest/topics/asyncio.html
+.. _`scrapy-poet's documentation`: https://scrapy-poet.readthedocs.io/en/latest/intro/tutorial.html#configuring-the-project
+.. _`Scrapy's asyncio documentation`: https://docs.scrapy.org/en/latest/topics/asyncio.html
 .. _`Request-level error`: https://doc.scrapinghub.com/autoextract.html#request-level
 .. _`Query-level error`: https://doc.scrapinghub.com/autoextract.html#query-level

From a3dc46bbb12a76e28d8513c7cecde746d69bacd8 Mon Sep 17 00:00:00 2001
From: Victor Torres
Date: Wed, 26 Aug 2020 00:08:48 -0300
Subject: [PATCH 44/51] Add documentation about our QueryError exception

---
 README.rst | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/README.rst b/README.rst
index b885bbf..a64a8b3 100644
--- a/README.rst
+++ b/README.rst
@@ -146,10 +146,11 @@ Exceptions
 ^^^^^^^^^^
 
 While trying to fetch AutoExtract API, providers might raise some exceptions.
-Those exceptions will probably come from `scrapinghub-autoextract`_
-or Tenacity, the library used to implement retries.
+Those exceptions might come from scrapy-autoextract providers themselves,
+`scrapinghub-autoextract`_, or Tenacity, the library used to implement retries.
 For example:
 
+* ``scrapy_poet.providers.QueryError``: raised when a `Query-level error`_ is returned even after retrying the request
 * ``autoextract.aio.errors.RequestError``: raised when a `Request-level error`_ is returned
 * ``autoextract.aio.errors.QueryRetryError``: raised when it's not possible to retry a `Query-level error`_
 * ``tenacity.RetryError``: raised when it's not possible to retry an error

From c4bd3d0c5defa4acd9fc28d7f6d32df61b642dfa Mon Sep 17 00:00:00 2001
From: Victor Torres
Date: Mon, 31 Aug 2020 17:07:28 -0300
Subject: [PATCH 45/51] Remove QueryError exception because it's never raised
 to the user

When a QueryError exception happens, it's because retries are enabled
on the AutoExtract client.

If a QueryError exception is raised, Tenacity captures it and retries
the request.

If it's not possible to retry a request due to a QueryError, Tenacity
raises a QueryRetryError exception.

In short, the QueryError never reaches userland code. QueryRetryError
does.
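As a rough illustration of that flow with plain Tenacity (a simplified
sketch, not the actual client code: ``QueryError`` and ``fetch()`` below
are stand-ins, and the real client raises its own ``QueryRetryError``
rather than Tenacity's generic ``RetryError``)::

    from tenacity import RetryError, retry, retry_if_exception_type, stop_after_attempt

    class QueryError(Exception):
        """Stand-in for the Query-level error raised inside the client."""

    @retry(retry=retry_if_exception_type(QueryError), stop=stop_after_attempt(3))
    def fetch():
        # Every failing attempt raises QueryError; Tenacity catches it and retries.
        raise QueryError("query failed")

    try:
        fetch()
    except RetryError:
        # Only when retries are exhausted does an error reach the caller.
        print("retries exhausted")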
---
 README.rst | 1 -
 1 file changed, 1 deletion(-)

diff --git a/README.rst b/README.rst
index a64a8b3..058931a 100644
--- a/README.rst
+++ b/README.rst
@@ -150,7 +150,6 @@ Those exceptions might come from scrapy-autoextract providers themselves,
 `scrapinghub-autoextract`_, or Tenacity, the library used to implement retries.
 For example:
 
-* ``scrapy_poet.providers.QueryError``: raised when a `Query-level error`_ is returned even after retrying the request
 * ``autoextract.aio.errors.RequestError``: raised when a `Request-level error`_ is returned
 * ``autoextract.aio.errors.QueryRetryError``: raised when it's not possible to retry a `Query-level error`_
 * ``tenacity.RetryError``: raised when it's not possible to retry an error

From 2c930e4c998bae0e3f883d58c1dba0d32fcb185d Mon Sep 17 00:00:00 2001
From: Victor Torres
Date: Mon, 31 Aug 2020 17:07:59 -0300
Subject: [PATCH 46/51] Improve documentation message

---
 README.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.rst b/README.rst
index 058931a..657ad41 100644
--- a/README.rst
+++ b/README.rst
@@ -152,7 +152,7 @@ For example:
 
 * ``autoextract.aio.errors.RequestError``: raised when a `Request-level error`_ is returned
 * ``autoextract.aio.errors.QueryRetryError``: raised when it's not possible to retry a `Query-level error`_
-* ``tenacity.RetryError``: raised when it's not possible to retry an error
+* ``tenacity.RetryError``: raised when it's not possible to retry a generic error
 
 Check `scrapinghub-autoextract's async errors`_ for exception definitions.

From 76e2679c13eb7b476fd54e7a716806e24099c1f0 Mon Sep 17 00:00:00 2001
From: Victor Torres
Date: Tue, 1 Sep 2020 10:15:16 -0300
Subject: [PATCH 47/51] Remove QueryRetryError from list of exceptions

This exception never reaches userland code because it's captured, and
then the AutoExtract client returns the latest results available (even
if they're errors).

---
 README.rst | 1 -
 1 file changed, 1 deletion(-)

diff --git a/README.rst b/README.rst
index 657ad41..2801a28 100644
--- a/README.rst
+++ b/README.rst
@@ -151,7 +151,6 @@ Those exceptions might come from scrapy-autoextract providers themselves,
 For example:
 
 * ``autoextract.aio.errors.RequestError``: raised when a `Request-level error`_ is returned
-* ``autoextract.aio.errors.QueryRetryError``: raised when it's not possible to retry a `Query-level error`_
 * ``tenacity.RetryError``: raised when it's not possible to retry a generic error
 
 Check `scrapinghub-autoextract's async errors`_ for exception definitions.

From 300a0e2b47998bf09823588521fee8b033c3bf69 Mon Sep 17 00:00:00 2001
From: Victor Torres
Date: Tue, 1 Sep 2020 10:56:13 -0300
Subject: [PATCH 48/51] Update README.rst
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Iván de Prado

---
 README.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.rst b/README.rst
index 2801a28..0b74798 100644
--- a/README.rst
+++ b/README.rst
@@ -151,7 +151,7 @@ Those exceptions might come from scrapy-autoextract providers themselves,
 For example:
 
 * ``autoextract.aio.errors.RequestError``: raised when a `Request-level error`_ is returned
-* ``tenacity.RetryError``: raised when it's not possible to retry a generic error
+* ``tenacity.RetryError``: raised when an error persists even after the retries
 
 Check `scrapinghub-autoextract's async errors`_ for exception definitions.
From 581ca7ef7d52449d59433f46c63a1fb2863d0f20 Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Tue, 1 Sep 2020 11:03:28 -0300 Subject: [PATCH 49/51] Remove outdated example --- README.rst | 3 --- 1 file changed, 3 deletions(-) diff --git a/README.rst b/README.rst index 0b74798..a8100c1 100644 --- a/README.rst +++ b/README.rst @@ -178,9 +178,6 @@ You can capture those exceptions using an error callback (``errback``):: if failure.check(RequestError): self.logger.error(f"RequestError on {failure.request.url}) - if failure.check(QueryRetryError): - self.logger.error(f"QueryRetryError on {failure.request.url}) - if failure.check(RetryError): self.logger.error(f"RetryError on {failure.request.url}) From ac85a96f482ef9a0da5162708d7e8b044efa010a Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Tue, 1 Sep 2020 11:11:43 -0300 Subject: [PATCH 50/51] Update README.rst MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Iván de Prado --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index a8100c1..769453b 100644 --- a/README.rst +++ b/README.rst @@ -208,7 +208,7 @@ Provider settings ----------------- - ``AUTOEXTRACT_USER`` [optional] is your AutoExtract API key. Defaults to ``SCRAPINGHUB_AUTOEXTRACT_KEY`` environment variable. -- ``AUTOEXTRACT_URL`` [optional] the AutoExtract service url. Defaults to autoextract.scrapinghub.com. +- ``AUTOEXTRACT_URL`` [optional] the AutoExtract service url. Defaults to the official AutoExtract endpoint. - ``AUTOEXTRACT_MAX_QUERY_ERROR_RETRIES`` [optional] Max number of retries for Query-level errors. Defaults to ``3``. Limitations From aa260419559701d48b2a80c47c9ab612f7742ae0 Mon Sep 17 00:00:00 2001 From: Victor Torres Date: Tue, 1 Sep 2020 17:29:17 -0300 Subject: [PATCH 51/51] Remove one of the middleware limitations because according to @kmike, it's an API behavior, an implementation detail and not an scrapy-autoextract limitation. --- README.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/README.rst b/README.rst index 769453b..ab32512 100644 --- a/README.rst +++ b/README.rst @@ -227,7 +227,6 @@ When using the AutoExtract middleware, there are some limitations. so it's best to use ``AUTHTHROTTLE_ENABLED=False`` in the settings. * Redirects are handled by AutoExtract, not by Scrapy, so these kinds of middlewares might have no effect -* AutoExtract doesn't handle retries internally, they should be handled by its clients. * 429 errors could be handled as standard retries when using Scrapy middleware, but they're handled properly and automatically with scrapy-poet integration, as it relies on `scrapinghub-autoextract`_.