|
| 1 | +# TODO: type ignores and crawlee_storage_dir |
| 2 | +# https://github.com/apify/crawlee-python/issues/146 |
| 3 | + |
1 | 4 | from __future__ import annotations
|
2 | 5 |
|
| 6 | +from typing import TYPE_CHECKING |
| 7 | + |
3 | 8 | from crawlee.configuration import Configuration
|
| 9 | +from crawlee.http_crawler import HttpCrawler, HttpCrawlingContext |
| 10 | + |
| 11 | +if TYPE_CHECKING: |
| 12 | + from pathlib import Path |
4 | 13 |
|
5 | 14 |
|
def test_global_configuration_works() -> None:
    """The global configuration must behave as a process-wide singleton."""
    first = Configuration.get_global_configuration()
    second = Configuration.get_global_configuration()
    assert first is second
|
| 17 | + |
| 18 | + |
async def test_storage_not_persisted_when_disabled(tmp_path: Path) -> None:
    """With persistence disabled, a crawl must leave the storage directory empty."""
    # Persistence and metadata writing are both switched off for this run.
    config = Configuration(
        persist_storage=False,
        write_metadata=False,
        crawlee_storage_dir=str(tmp_path),  # type: ignore
    )

    crawler = HttpCrawler(configuration=config)

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext) -> None:
        # Push something so the dataset would be written if persistence were on.
        await context.push_data({'url': context.request.url})

    await crawler.run(['https://crawlee.dev'])

    # Nothing at all should have landed on disk.
    assert not any(tmp_path.iterdir()), 'Expected the storage directory to be empty, but it is not.'
| 37 | + |
| 38 | + |
async def test_storage_persisted_when_enabled(tmp_path: Path) -> None:
    """With persistence enabled, a crawl must write files into the storage directory."""
    # Persistence and metadata writing are both switched on for this run.
    config = Configuration(
        persist_storage=True,
        write_metadata=True,
        crawlee_storage_dir=str(tmp_path),  # type: ignore
    )

    crawler = HttpCrawler(configuration=config)

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext) -> None:
        # Push a record so the dataset is written to disk.
        await context.push_data({'url': context.request.url})

    await crawler.run(['https://crawlee.dev'])

    # At least one file/directory should now exist under the storage dir.
    assert any(tmp_path.iterdir()), 'Expected the storage directory to contain files, but it does not.'