diff --git a/src/crawlee/service_container.py b/src/crawlee/service_container.py
index 3f99682ab1..141b5d1d96 100644
--- a/src/crawlee/service_container.py
+++ b/src/crawlee/service_container.py
@@ -39,11 +39,16 @@ class _Services(TypedDict):
 _default_storage_client_type: StorageClientType = 'local'
 
 
-def get_storage_client(*, client_type: StorageClientType | None = None) -> BaseStorageClient:
+def get_storage_client(
+    *,
+    client_type: StorageClientType | None = None,
+    configuration: Configuration | None = None,
+) -> BaseStorageClient:
     """Get the storage client instance for the current environment.
 
     Args:
         client_type: Allows retrieving a specific storage client type, regardless of where we are running.
+        configuration: The configuration to use.
 
     Returns:
         The current storage client instance.
@@ -57,7 +62,7 @@ def get_storage_client(*, client_type: StorageClientType | None = None) -> BaseS
         return _services['cloud_storage_client']
 
     if 'local_storage_client' not in _services:
-        _services['local_storage_client'] = MemoryStorageClient()
+        _services['local_storage_client'] = MemoryStorageClient(configuration=configuration)
 
     return _services['local_storage_client']
 
diff --git a/src/crawlee/storages/_creation_management.py b/src/crawlee/storages/_creation_management.py
index dd768dd475..2da9e4bb3e 100644
--- a/src/crawlee/storages/_creation_management.py
+++ b/src/crawlee/storages/_creation_management.py
@@ -129,7 +129,7 @@ async def open_storage(
 ) -> TResource:
     """Open either a new storage or restore an existing one and return it."""
     configuration = configuration or Configuration.get_global_configuration()
-    storage_client = storage_client or service_container.get_storage_client()
+    storage_client = storage_client or service_container.get_storage_client(configuration=configuration)
 
     # Try to restore the storage from cache by name
     if name:
diff --git a/tests/unit/test_configuration.py b/tests/unit/test_configuration.py
index bfcf185a5e..a776615544 100644
--- a/tests/unit/test_configuration.py
+++ b/tests/unit/test_configuration.py
@@ -1,7 +1,56 @@
+# TODO: type ignores and crawlee_storage_dir
+# https://github.com/apify/crawlee-python/issues/146
+
 from __future__ import annotations
 
+from typing import TYPE_CHECKING
+
 from crawlee.configuration import Configuration
+from crawlee.http_crawler import HttpCrawler, HttpCrawlingContext
+
+if TYPE_CHECKING:
+    from pathlib import Path
 
 
 def test_global_configuration_works() -> None:
     assert Configuration.get_global_configuration() is Configuration.get_global_configuration()
+
+
+async def test_storage_not_persisted_when_disabled(tmp_path: Path) -> None:
+    # Configure the crawler to not persist storage or metadata.
+    configuration = Configuration(
+        persist_storage=False,
+        write_metadata=False,
+        crawlee_storage_dir=str(tmp_path),  # type: ignore
+    )
+
+    crawler = HttpCrawler(configuration=configuration)
+
+    @crawler.router.default_handler
+    async def default_handler(context: HttpCrawlingContext) -> None:
+        await context.push_data({'url': context.request.url})
+
+    await crawler.run(['https://crawlee.dev'])
+
+    # Verify that no files were created in the storage directory.
+    assert not any(tmp_path.iterdir()), 'Expected the storage directory to be empty, but it is not.'
+
+
+async def test_storage_persisted_when_enabled(tmp_path: Path) -> None:
+    # Configure the crawler to persist storage and metadata.
+    configuration = Configuration(
+        persist_storage=True,
+        write_metadata=True,
+        crawlee_storage_dir=str(tmp_path),  # type: ignore
+    )
+
+    crawler = HttpCrawler(configuration=configuration)
+
+    @crawler.router.default_handler
+    async def default_handler(context: HttpCrawlingContext) -> None:
+        await context.push_data({'url': context.request.url})
+
+    await crawler.run(['https://crawlee.dev'])
+
+    # Verify that files were created in the storage directory.
+    assert any(tmp_path.iterdir()), 'Expected the storage directory to contain files, but it does not.'
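For context (not part of the patch): a minimal sketch of how a caller could use the new `configuration` parameter added to `service_container.get_storage_client()`. It relies only on names visible in this diff (`Configuration`, the `crawlee.service_container` module, and the lazily created local `MemoryStorageClient`); the caching note in the comments follows from the `if 'local_storage_client' not in _services` check above.

```python
from crawlee import service_container
from crawlee.configuration import Configuration

# Build a configuration that keeps local storage purely in memory:
# no data files and no metadata files are written to disk.
configuration = Configuration(persist_storage=False, write_metadata=False)

# The first call constructs the local MemoryStorageClient with this configuration.
# Later calls return the cached instance, so a configuration passed afterwards
# has no effect on the already created client.
storage_client = service_container.get_storage_client(configuration=configuration)
```

The same flow is what the change in `open_storage()` enables: it forwards its resolved configuration to `get_storage_client()`, which is what lets the two new tests toggle `persist_storage`/`write_metadata` and then assert whether any files appear under `crawlee_storage_dir`.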