|
| 1 | +# TODO: type ignores and crawlee_storage_dir |
| 2 | +# https://github.com/apify/crawlee-python/issues/146 |
| 3 | + |
1 | 4 | from __future__ import annotations
|
2 | 5 |
|
| 6 | +from typing import TYPE_CHECKING |
| 7 | + |
3 | 8 | from crawlee.configuration import Configuration
|
| 9 | +from crawlee.http_crawler import HttpCrawler, HttpCrawlingContext |
| 10 | + |
| 11 | +if TYPE_CHECKING: |
| 12 | + from pathlib import Path |
4 | 13 |
|
5 | 14 |
|
def test_global_configuration_works() -> None:
    """The global configuration must behave as a process-wide singleton."""
    first = Configuration.get_global_configuration()
    second = Configuration.get_global_configuration()
    assert first is second
|
| 17 | + |
| 18 | + |
async def test_storage_not_persisted_when_disabled(tmp_path: Path) -> None:
    """With persistence disabled, a crawl must leave the storage directory empty."""
    # Persistence and metadata writing are both switched off for this run.
    config = Configuration(
        persist_storage=False,
        write_metadata=False,
        crawlee_storage_dir=str(tmp_path),  # type: ignore
    )

    crawler = HttpCrawler(configuration=config)

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext) -> None:
        # Push something so the dataset would be written if persistence were on.
        await context.push_data({'url': context.request.url})

    await crawler.run(['https://crawlee.dev'])

    # Nothing at all should have landed on disk.
    assert not any(tmp_path.iterdir()), 'Expected the storage directory to be empty, but it is not.'
| 37 | + |
| 38 | + |
async def test_storage_persisted_when_enabled(tmp_path: Path) -> None:
    """With persistence enabled, a crawl must write files into the storage directory."""
    # Persistence and metadata writing are both switched on for this run.
    config = Configuration(
        persist_storage=True,
        write_metadata=True,
        crawlee_storage_dir=str(tmp_path),  # type: ignore
    )

    crawler = HttpCrawler(configuration=config)

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext) -> None:
        # Push a record so the dataset is written to disk.
        await context.push_data({'url': context.request.url})

    await crawler.run(['https://crawlee.dev'])

    # At least one file/directory should now exist under the storage dir.
    assert any(tmp_path.iterdir()), 'Expected the storage directory to contain files, but it does not.'